Fix error-prone file-path handling; beautify the download progress bar

LeLe86 2020-03-03 13:10:58 +08:00
parent 7e4e77d958
commit 46182be1d9
3 changed files with 34 additions and 16 deletions

View File: README.md

@ -31,6 +31,8 @@ Fiddler's official site is sometimes unreachable; you can search for Fiddler4 on pc.qq.com and install it from there
 Setup is now complete. The square at the bottom-left of the window shows "Capturing", meaning Fiddler is ready to capture data; clicking it again pauses capturing. Leave it in the capturing state for now.
 ![avatar](http://img1.xiaokuake.com/p/wp-content/uploads/2019/08/2019080602082132.png)
+Some users may find that Fiddler does not capture HTTPS requests; carefully re-check the steps above. For other problems, almost all Fiddler-related issues can be solved by searching Baidu.
 ## c. Open the article history of a WeChat Official Account
 ![avatar](http://img1.xiaokuake.com/p/wp-content/uploads/2019/08/2019080602060364.png)

View File: config.json

@ -1,5 +1,5 @@
 {
-  "jsonDir": "C:/Users/kklwin10/Desktop/Dump-0103-20-14-29",
-  "htmlDir": "c:/vWeChatFiles/html/",
-  "pdfDir": "c:/vWeChatFiles/pdf/"
+  "jsonDir": "c:/Users/kklwin10/Desktop/Dump-0103-20-14-29",
+  "htmlDir": "c:/vWeChatFiles/html",
+  "pdfDir": "c:/vWeChatFiles/pdf"
 }
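Why the trailing slashes were dropped: the script now builds sub-paths by plain string concatenation (see the start.py hunks below), so a trailing slash in the config would produce doubled separators. A minimal sketch of the failure mode, using the config values above:

```python
html_dir_with_slash = "c:/vWeChatFiles/html/"  # old config value
html_dir_clean = "c:/vWeChatFiles/html"        # new config value

# start.py now builds paths like saveHtmlDir + "/images":
print(html_dir_with_slash + "/images")  # c:/vWeChatFiles/html//images (doubled slash)
print(html_dir_clean + "/images")       # c:/vWeChatFiles/html/images  (clean)
```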

View File: start.py

@ -8,6 +8,7 @@ from time import sleep
""" """
本项目开源地址 https://github.com/LeLe86/vWeChatCrawl 本项目开源地址 https://github.com/LeLe86/vWeChatCrawl
讨论QQ群 703431832
""" """
@ -34,6 +35,10 @@ def GetJson():
     jstxt = ReadFile("config.json")
     jstxt = jstxt.replace("\\\\","/").replace("\\","/") # replace backslashes with forward slashes so the JSON parses
     jsbd = json.loads(jstxt)
+    if jsbd["htmlDir"][-1]=="/":
+        jsbd["htmlDir"] = jsbd["htmlDir"][:-1]
+    if jsbd["jsonDir"][-1]=="/":
+        jsbd["jsonDir"] = jsbd["jsonDir"][:-1]
     return jsbd
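The new lines strip a single trailing slash by hand before the directories are used. A sketch of an equivalent, slightly more defensive form (an alternative, not the author's code) using `str.rstrip`, which also tolerates repeated trailing slashes:

```python
# Sketch: equivalent cleanup with rstrip; also handles "c:/x//" safely.
for key in ("jsonDir", "htmlDir", "pdfDir"):
    if isinstance(jsbd.get(key), str):
        jsbd[key] = jsbd[key].rstrip("/")
```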
@ -46,8 +51,8 @@ def DownLoadHtml(url):
         'Connection':'keep-alive',
         'Accept-Language':'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
     }
-    response = requests.get(url,headers = headers,proxies=None)
+    requests.packages.urllib3.disable_warnings()
+    response = requests.get(url,headers = headers,proxies=None,verify=False)
     if response.status_code == 200:
         htmltxt = response.text # the returned page body
         return htmltxt
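`verify=False` disables TLS certificate checking, which likely keeps downloads working when the traffic passes through Fiddler's man-in-the-middle certificate (the setup this project depends on), and `disable_warnings()` silences the `InsecureRequestWarning` that urllib3 would otherwise print on every request. A minimal standalone sketch of the pattern; the URL is just an example:

```python
import requests

# Silence urllib3's InsecureRequestWarning, which fires once per
# verify=False request and would flood the console during a crawl.
requests.packages.urllib3.disable_warnings()

# verify=False skips certificate validation, so requests still succeed
# when a debugging proxy such as Fiddler re-signs the TLS traffic.
resp = requests.get("https://mp.weixin.qq.com/", verify=False)  # example URL
print(resp.status_code)
```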
@ -63,7 +68,8 @@ def DownImg(url,savepath):
         'Connection':'keep-alive',
         'Accept-Language':'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
     }
-    r = requests.get(url,headers = headers,proxies=None)
+    requests.packages.urllib3.disable_warnings()
+    r = requests.get(url,headers = headers,proxies=None,verify=False)
     with open(savepath, 'wb') as f:
         f.write(r.content)
@ -84,7 +90,7 @@ def ChangeImgSrc(htmltxt,saveimgdir,htmlname):
         if originalURL.startswith("//"): # URLs starting with // need an http prefix
             originalURL = "http:" + originalURL
         if len(originalURL) > 0:
-            print("down img",imgindex)
+            print("\r down imgs " + "▇" * imgindex + " " + str(imgindex),end="")
             if "data-type" in img.attrs:
                 imgtype = img.attrs["data-type"]
             else:
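The new `print` draws an in-place progress bar: `\r` returns the cursor to the start of the line and `end=""` suppresses the newline, so each call overwrites the previous one and the bar grows by one block per downloaded image. A standalone sketch of the technique (the `▇` bar character is an assumption; the original glyph was lost in this page's rendering):

```python
import time

total = 15
for i in range(1, total + 1):
    # \r rewinds to the start of the line; end="" keeps the cursor there,
    # so the next print overwrites this one instead of stacking lines.
    print("\r down imgs " + "▇" * i + " " + str(i), end="")
    time.sleep(0.1)  # stand-in for downloading one image
print()  # move to a fresh line once the bar is done
```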
@ -109,7 +115,10 @@ def ChangeCssSrc(bs):
 def ChangeContent(bs):
     jscontent = bs.find(id="js_content")
-    jscontent.attrs["style"]=""
+    if jscontent:
+        jscontent.attrs["style"]=""
+    else:
+        print("----- the article may have been deleted -----")
 # Article class
 class Article():
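`bs.find(id="js_content")` returns `None` when the page has no article body, which is presumably what WeChat serves for deleted articles; the new guard avoids an `AttributeError` in that case. A sketch under that assumption, with a made-up tombstone page:

```python
from bs4 import BeautifulSoup

# Hypothetical page for a deleted article: no js_content div at all.
html = "<html><body><p>This content has been deleted.</p></body></html>"
bs = BeautifulSoup(html, "lxml")

jscontent = bs.find(id="js_content")  # None on the tombstone page
if jscontent:
    jscontent.attrs["style"] = ""     # normal case: clear inline styles
else:
    print("----- the article may have been deleted -----")
```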
@ -142,6 +151,7 @@ def GetArticleList(jsondir):
                 idx = artidx
                 title = app_msg_ext_info["title"]
                 art = Article(url,pubdate,idx,title)
-                ArtList.append(art)
-                print(len(ArtList),pubdate, idx, title)
+                if len(url)>3: # skip incomplete URLs
+                    ArtList.append(art)
+                    print(len(ArtList),pubdate, idx, title)
                 if app_msg_ext_info["is_multi"] == 1: # multiple articles in one push
@ -152,6 +162,7 @@ def GetArticleList(jsondir):
                     idx = artidx
                     title = subArt["title"]
                     art = Article(url,pubdate,idx,title)
-                    ArtList.append(art)
-                    print(len(ArtList),pubdate, idx, title)
+                    if len(url)>3:
+                        ArtList.append(art)
+                        print(len(ArtList),pubdate, idx, title)
     return ArtList
@ -160,7 +171,7 @@ def DownHtmlMain(jsonDir,saveHtmlDir):
     saveHtmlDir = jsbd["htmlDir"]
     if not os.path.exists(saveHtmlDir):
         os.makedirs(saveHtmlDir)
-    saveImgDir = os.path.join(saveHtmlDir, "images")
+    saveImgDir = saveHtmlDir + "/images"
     if not os.path.exists(saveImgDir):
         os.makedirs(saveImgDir)
     ArtList = GetArticleList(jsonDir)
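The switch away from `os.path.join` looks deliberate: on Windows, `os.path.join` inserts backslashes, so joining the forward-slash paths from config.json yields mixed separators, and later code such as `htmlpath.rindex("/")` in `PDFDir` assumes forward slashes. That mismatch is a likely source of the path errors the commit message mentions; plain `"/"` concatenation keeps every path uniform. A sketch of the mismatch (behavior shown is Windows'):

```python
import os

html_dir = "c:/vWeChatFiles/html"  # config paths use forward slashes

# On Windows, os.path.join uses "\\", giving mixed separators:
mixed = os.path.join(html_dir, "a.html")  # 'c:/vWeChatFiles/html\\a.html'
# mixed.rindex("/") now finds the slash before "html", not the final
# separator, so any filename split based on "/" goes wrong.

clean = html_dir + "/" + "a.html"         # 'c:/vWeChatFiles/html/a.html'
print(clean.rindex("/"))                  # points at the real last separator
```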
@ -171,7 +182,7 @@ def DownHtmlMain(jsonDir,saveHtmlDir):
         idx+=1
         artname = art.pubdate + "_" + str(art.idx)
         arthtmlname = artname + ".html"
-        arthtmlsavepath = os.path.join(saveHtmlDir,arthtmlname)
+        arthtmlsavepath = saveHtmlDir + "/" + arthtmlname
         print(idx,"of",totalCount,artname,art.title)
         # skip files that already exist, so an interrupted run can resume
         if os.path.exists(arthtmlsavepath):
@ -179,6 +190,7 @@ def DownHtmlMain(jsonDir,saveHtmlDir):
             continue
         arthtmlstr = DownLoadHtml(art.url)
         arthtmlstr = ChangeImgSrc(arthtmlstr,saveImgDir,artname)
+        print("\r",end="")
         SaveFile(arthtmlsavepath,arthtmlstr)
         sleep(3) # pause 3 seconds per article so WeChat doesn't block us for downloading too fast
@ -191,7 +203,7 @@ def PDFDir(htmldir,pdfdir):
     for f in flist:
         if (not f[-5:]==".html") or ("tmp" in f): # skip files that aren't html and files containing "tmp"
             continue
-        htmlpath = os.path.join(htmldir,f)
+        htmlpath = htmldir + "/" + f
         tmppath = htmlpath[:-5] + "_tmp.html" # temporary file used for the PDF conversion
         htmlstr = ReadFile(htmlpath)
         bs = BeautifulSoup(htmlstr, "lxml")
@ -201,8 +213,8 @@ def PDFDir(htmldir,pdfdir):
         if titleTag is not None:
             title = "_" + titleTag.get_text().replace(" ", "").replace(" ","").replace("\n","")
         ridx = htmlpath.rindex("/") + 1
-        htmlname = htmlpath[ridx:-5] + title
-        pdfpath = os.path.join(pdfdir, htmlname + ".pdf")
+        pdfname = htmlpath[ridx:-5] + title
+        pdfpath = pdfdir + "/" + pdfname + ".pdf"
         """
         Strip JS and the like so there is less to load when converting to PDF
@ -236,10 +248,14 @@ def PDFOne(htmlpath,pdfpath,skipExists=True,removehtml=True):
""" """
先去config.json文件设置 1.设置
先去config.json文件中设置
jsonDirFiddler生成的文件 jsonDirFiddler生成的文件
htmlDir保存html的目录路径中不能有空格 htmlDir保存html的目录路径中不能有空格
pdfDir保存pdf的目录路径中不能有空格 pdfDir保存pdf的目录路径中不能有空格
2.使用方法
运行 python start.py #开始下载html
运行 python start.py pdf #把下载的html转pdf
""" """
if __name__ == "__main__": if __name__ == "__main__":
if len(sys.argv)==1: if len(sys.argv)==1:
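The `__main__` block is cut off here, but together with the usage notes it implies a small `sys.argv` dispatch: no argument starts the html download, and a `pdf` argument runs the conversion. A sketch of that dispatch under those assumptions (the branch bodies are inferred, not shown in this diff):

```python
import sys

if __name__ == "__main__":
    if len(sys.argv) == 1:
        # "python start.py": download articles as html
        DownHtmlMain(None, None)  # actual directories come from config.json
    elif sys.argv[1] == "pdf":
        # "python start.py pdf": convert downloaded html to pdf
        PDFDir(None, None)        # actual directories come from config.json
```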