优化文件路径易出错的问题,美化下载进度条
This commit is contained in:
parent
7e4e77d958
commit
46182be1d9
@ -31,6 +31,8 @@ Fiddler的官网有时会连不上,可去pc.qq.com搜索Fiddler4 并安装
|
|||||||
至此配置完成了,点软件左下角的方块,会显示Capturing,表示它此时处在可以抓取数据的状态,再点一下会暂停抓取。此处先打开为抓取状态。
|
至此配置完成了,点软件左下角的方块,会显示Capturing,表示它此时处在可以抓取数据的状态,再点一下会暂停抓取。此处先打开为抓取状态。
|
||||||

|

|
||||||
|
|
||||||
|
有的朋友可能会在Fiddler中抓取不到HTTPS请求,请仔细按照上面流程检查。若有其他异常,绝大多数Fiddler相关的问题通过百度可以解决。
|
||||||
|
|
||||||
## c.打开某个微信公众号的历史文章列表
|
## c.打开某个微信公众号的历史文章列表
|
||||||

|

|
||||||
|
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
{
|
{
|
||||||
"jsonDir": "C:/Users/kklwin10/Desktop/Dump-0103-20-14-29",
|
"jsonDir": "c:/Users/kklwin10/Desktop/Dump-0103-20-14-29",
|
||||||
"htmlDir": "c:/vWeChatFiles/html/",
|
"htmlDir": "c:/vWeChatFiles/html",
|
||||||
"pdfDir": "c:/vWeChatFiles/pdf/"
|
"pdfDir": "c:/vWeChatFiles/pdf"
|
||||||
}
|
}
|
||||||
42
start.py
42
start.py
@ -8,6 +8,7 @@ from time import sleep
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
本项目开源地址 https://github.com/LeLe86/vWeChatCrawl
|
本项目开源地址 https://github.com/LeLe86/vWeChatCrawl
|
||||||
|
讨论QQ群 703431832
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@ -34,6 +35,10 @@ def GetJson():
|
|||||||
jstxt = ReadFile("config.json")
|
jstxt = ReadFile("config.json")
|
||||||
jstxt = jstxt.replace("\\\\","/").replace("\\","/") #防止json中有反斜杠 \ 导致无法解析
|
jstxt = jstxt.replace("\\\\","/").replace("\\","/") #防止json中有反斜杠 \ 导致无法解析
|
||||||
jsbd = json.loads(jstxt)
|
jsbd = json.loads(jstxt)
|
||||||
|
if jsbd["htmlDir"][-1]=="/":
|
||||||
|
jsbd["htmlDir"] = jsbd["htmlDir"][:-1]
|
||||||
|
if jsbd["jsonDir"][-1]=="/":
|
||||||
|
jsbd["jsonDir"]= jsbd["jsonDir"][:-1]
|
||||||
return jsbd
|
return jsbd
|
||||||
|
|
||||||
|
|
||||||
@ -46,8 +51,8 @@ def DownLoadHtml(url):
|
|||||||
'Connection':'keep-alive',
|
'Connection':'keep-alive',
|
||||||
'Accept-Language':'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
|
'Accept-Language':'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
|
||||||
}
|
}
|
||||||
|
requests.packages.urllib3.disable_warnings()
|
||||||
response = requests.get(url,headers = headers,proxies=None)
|
response = requests.get(url,headers = headers,proxies=None,verify=False)
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
htmltxt = response.text #返回的网页正文
|
htmltxt = response.text #返回的网页正文
|
||||||
return htmltxt
|
return htmltxt
|
||||||
@ -63,7 +68,8 @@ def DownImg(url,savepath):
|
|||||||
'Connection':'keep-alive',
|
'Connection':'keep-alive',
|
||||||
'Accept-Language':'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
|
'Accept-Language':'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
|
||||||
}
|
}
|
||||||
r = requests.get(url,headers = headers,proxies=None)
|
requests.packages.urllib3.disable_warnings()
|
||||||
|
r = requests.get(url,headers = headers,proxies=None,verify=False)
|
||||||
with open(savepath, 'wb') as f:
|
with open(savepath, 'wb') as f:
|
||||||
f.write(r.content)
|
f.write(r.content)
|
||||||
|
|
||||||
@ -84,7 +90,7 @@ def ChangeImgSrc(htmltxt,saveimgdir,htmlname):
|
|||||||
if originalURL.startswith("//"):#如果url以//开头,则需要添加http:
|
if originalURL.startswith("//"):#如果url以//开头,则需要添加http:
|
||||||
originalURL = "http:" + originalURL
|
originalURL = "http:" + originalURL
|
||||||
if len(originalURL) > 0:
|
if len(originalURL) > 0:
|
||||||
print("down img",imgindex)
|
print("\r down imgs " + "▇" * imgindex +" " + str(imgindex),end="")
|
||||||
if "data-type" in img.attrs:
|
if "data-type" in img.attrs:
|
||||||
imgtype = img.attrs["data-type"]
|
imgtype = img.attrs["data-type"]
|
||||||
else:
|
else:
|
||||||
@ -109,7 +115,10 @@ def ChangeCssSrc(bs):
|
|||||||
|
|
||||||
def ChangeContent(bs):
|
def ChangeContent(bs):
|
||||||
jscontent = bs.find(id="js_content")
|
jscontent = bs.find(id="js_content")
|
||||||
jscontent.attrs["style"]=""
|
if jscontent:
|
||||||
|
jscontent.attrs["style"]=""
|
||||||
|
else:
|
||||||
|
print("-----可能文章被删了-----")
|
||||||
|
|
||||||
#文章类
|
#文章类
|
||||||
class Article():
|
class Article():
|
||||||
@ -142,7 +151,8 @@ def GetArticleList(jsondir):
|
|||||||
idx = artidx
|
idx = artidx
|
||||||
title = app_msg_ext_info["title"]
|
title = app_msg_ext_info["title"]
|
||||||
art = Article(url,pubdate,idx,title)
|
art = Article(url,pubdate,idx,title)
|
||||||
ArtList.append(art)
|
if len(url)>3:#url不完整则跳过
|
||||||
|
ArtList.append(art)
|
||||||
print(len(ArtList),pubdate, idx, title)
|
print(len(ArtList),pubdate, idx, title)
|
||||||
if app_msg_ext_info["is_multi"] == 1: # 一次发多篇
|
if app_msg_ext_info["is_multi"] == 1: # 一次发多篇
|
||||||
artidx += 1
|
artidx += 1
|
||||||
@ -152,7 +162,8 @@ def GetArticleList(jsondir):
|
|||||||
idx =artidx
|
idx =artidx
|
||||||
title = subArt["title"]
|
title = subArt["title"]
|
||||||
art = Article(url,pubdate,idx,title)
|
art = Article(url,pubdate,idx,title)
|
||||||
ArtList.append(art)
|
if len(url)>3:
|
||||||
|
ArtList.append(art)
|
||||||
print(len(ArtList),pubdate, idx, title)
|
print(len(ArtList),pubdate, idx, title)
|
||||||
return ArtList
|
return ArtList
|
||||||
|
|
||||||
@ -160,7 +171,7 @@ def DownHtmlMain(jsonDir,saveHtmlDir):
|
|||||||
saveHtmlDir = jsbd["htmlDir"]
|
saveHtmlDir = jsbd["htmlDir"]
|
||||||
if not os.path.exists(saveHtmlDir):
|
if not os.path.exists(saveHtmlDir):
|
||||||
os.makedirs(saveHtmlDir)
|
os.makedirs(saveHtmlDir)
|
||||||
saveImgDir = os.path.join(saveHtmlDir, "images")
|
saveImgDir = saveHtmlDir+ "/images"
|
||||||
if not os.path.exists(saveImgDir):
|
if not os.path.exists(saveImgDir):
|
||||||
os.makedirs(saveImgDir)
|
os.makedirs(saveImgDir)
|
||||||
ArtList = GetArticleList(jsonDir)
|
ArtList = GetArticleList(jsonDir)
|
||||||
@ -171,7 +182,7 @@ def DownHtmlMain(jsonDir,saveHtmlDir):
|
|||||||
idx+=1
|
idx+=1
|
||||||
artname = art.pubdate + "_" + str(art.idx)
|
artname = art.pubdate + "_" + str(art.idx)
|
||||||
arthtmlname = artname + ".html"
|
arthtmlname = artname + ".html"
|
||||||
arthtmlsavepath = os.path.join(saveHtmlDir,arthtmlname)
|
arthtmlsavepath = saveHtmlDir+"/"+arthtmlname
|
||||||
print(idx,"of",totalCount,artname,art.title)
|
print(idx,"of",totalCount,artname,art.title)
|
||||||
# 如果已经有了则跳过,便于暂停后续传
|
# 如果已经有了则跳过,便于暂停后续传
|
||||||
if os.path.exists(arthtmlsavepath):
|
if os.path.exists(arthtmlsavepath):
|
||||||
@ -179,6 +190,7 @@ def DownHtmlMain(jsonDir,saveHtmlDir):
|
|||||||
continue
|
continue
|
||||||
arthtmlstr = DownLoadHtml(art.url)
|
arthtmlstr = DownLoadHtml(art.url)
|
||||||
arthtmlstr = ChangeImgSrc(arthtmlstr,saveImgDir,artname)
|
arthtmlstr = ChangeImgSrc(arthtmlstr,saveImgDir,artname)
|
||||||
|
print("\r",end="")
|
||||||
SaveFile(arthtmlsavepath,arthtmlstr)
|
SaveFile(arthtmlsavepath,arthtmlstr)
|
||||||
|
|
||||||
sleep(3) #防止下载过快被微信屏蔽,间隔3秒下载一篇
|
sleep(3) #防止下载过快被微信屏蔽,间隔3秒下载一篇
|
||||||
@ -191,7 +203,7 @@ def PDFDir(htmldir,pdfdir):
|
|||||||
for f in flist:
|
for f in flist:
|
||||||
if (not f[-5:]==".html") or ("tmp" in f): #不是html文件的不转换,含有tmp的不转换
|
if (not f[-5:]==".html") or ("tmp" in f): #不是html文件的不转换,含有tmp的不转换
|
||||||
continue
|
continue
|
||||||
htmlpath = os.path.join(htmldir,f)
|
htmlpath = htmldir+"/"+f
|
||||||
tmppath = htmlpath[:-5] + "_tmp.html"#生成临时文件,供转pdf用
|
tmppath = htmlpath[:-5] + "_tmp.html"#生成临时文件,供转pdf用
|
||||||
htmlstr = ReadFile(htmlpath)
|
htmlstr = ReadFile(htmlpath)
|
||||||
bs = BeautifulSoup(htmlstr, "lxml")
|
bs = BeautifulSoup(htmlstr, "lxml")
|
||||||
@ -201,8 +213,8 @@ def PDFDir(htmldir,pdfdir):
|
|||||||
if titleTag is not None:
|
if titleTag is not None:
|
||||||
title = "_" + titleTag.get_text().replace(" ", "").replace(" ","").replace("\n","")
|
title = "_" + titleTag.get_text().replace(" ", "").replace(" ","").replace("\n","")
|
||||||
ridx = htmlpath.rindex("/") + 1
|
ridx = htmlpath.rindex("/") + 1
|
||||||
htmlname = htmlpath[ridx:-5] + title
|
pdfname = htmlpath[ridx:-5] + title
|
||||||
pdfpath = os.path.join(pdfdir, htmlname + ".pdf")
|
pdfpath = pdfdir+"/"+ pdfname + ".pdf"
|
||||||
|
|
||||||
"""
|
"""
|
||||||
把js等去掉,减少转PDF时的加载项,
|
把js等去掉,减少转PDF时的加载项,
|
||||||
@ -236,10 +248,14 @@ def PDFOne(htmlpath,pdfpath,skipExists=True,removehtml=True):
|
|||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
先去config.json文件设置
|
1.设置:
|
||||||
|
先去config.json文件中设置
|
||||||
jsonDir:Fiddler生成的文件
|
jsonDir:Fiddler生成的文件
|
||||||
htmlDir:保存html的目录,路径中不能有空格
|
htmlDir:保存html的目录,路径中不能有空格
|
||||||
pdfDir:保存pdf的目录,路径中不能有空格
|
pdfDir:保存pdf的目录,路径中不能有空格
|
||||||
|
2.使用方法:
|
||||||
|
运行 python start.py #开始下载html
|
||||||
|
运行 python start.py pdf #把下载的html转pdf
|
||||||
"""
|
"""
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
if len(sys.argv)==1:
|
if len(sys.argv)==1:
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user