From 46182be1d9e203c02a575e792179457b8d31db19 Mon Sep 17 00:00:00 2001
From: LeLe86 <251192913@qq.com>
Date: Tue, 3 Mar 2020 13:10:58 +0800
Subject: [PATCH] Fix error-prone file path handling and polish the download
 progress bar
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md   |  2 ++
 config.json |  6 +++---
 start.py    | 42 +++++++++++++++++++++++++++++-------------
 3 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 226d9ef..48a81c2 100644
--- a/README.md
+++ b/README.md
@@ -31,6 +31,8 @@ Fiddler's official site is sometimes unreachable; you can search pc.qq.com for Fiddler4 and install it from there
 The configuration is now complete. Click the square at the lower left of the window; when it shows Capturing, Fiddler is recording traffic, and clicking it again pauses capturing. Leave capturing on for now.
 ![avatar](http://img1.xiaokuake.com/p/wp-content/uploads/2019/08/2019080602082132.png)
 
+Some users may find that Fiddler captures no HTTPS requests at all; please recheck each step above carefully. For anything else that goes wrong, most Fiddler-related problems can be solved by searching Baidu.
+
 ## c. Open the article history of a WeChat official account
 ![avatar](http://img1.xiaokuake.com/p/wp-content/uploads/2019/08/2019080602060364.png)
 
diff --git a/config.json b/config.json
index 4fd562c..445cfc1 100644
--- a/config.json
+++ b/config.json
@@ -1,5 +1,5 @@
 {
-  "jsonDir": "C:/Users/kklwin10/Desktop/Dump-0103-20-14-29",
-  "htmlDir": "c:/vWeChatFiles/html/",
-  "pdfDir": "c:/vWeChatFiles/pdf/"
+  "jsonDir": "c:/Users/kklwin10/Desktop/Dump-0103-20-14-29",
+  "htmlDir": "c:/vWeChatFiles/html",
+  "pdfDir": "c:/vWeChatFiles/pdf"
 }
\ No newline at end of file
diff --git a/start.py b/start.py
index 8f34311..eb992d6 100644
--- a/start.py
+++ b/start.py
@@ -8,6 +8,7 @@ from time import sleep
 
 """
 This project is open-sourced at https://github.com/LeLe86/vWeChatCrawl
+Discussion QQ group: 703431832
 """
 
 
@@ -34,6 +35,10 @@ def GetJson():
     jstxt = ReadFile("config.json")
     jstxt = jstxt.replace("\\\\","/").replace("\\","/")  # convert backslashes to "/" so paths in the JSON parse correctly
     jsbd = json.loads(jstxt)
+    if jsbd["htmlDir"][-1] == "/":  # strip any trailing slash so later "/" joins don't double up
+        jsbd["htmlDir"] = jsbd["htmlDir"][:-1]
+    if jsbd["jsonDir"][-1] == "/":
+        jsbd["jsonDir"] = jsbd["jsonDir"][:-1]
     return jsbd
 
 
@@ -46,8 +51,8 @@ def DownLoadHtml(url):
                'Connection':'keep-alive',
                'Accept-Language':'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
                }
-
-    response = requests.get(url,headers = headers,proxies=None)
+    requests.packages.urllib3.disable_warnings()  # silence the InsecureRequestWarning caused by verify=False
+    response = requests.get(url,headers = headers,proxies=None,verify=False)
     if response.status_code == 200:
         htmltxt = response.text  # body of the returned page
         return htmltxt
@@ -63,7 +68,8 @@ def DownImg(url,savepath):
                'Connection':'keep-alive',
                'Accept-Language':'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
                }
-    r = requests.get(url,headers = headers,proxies=None)
+    requests.packages.urllib3.disable_warnings()
+    r = requests.get(url,headers = headers,proxies=None,verify=False)
     with open(savepath, 'wb') as f:
         f.write(r.content)
 
@@ -84,7 +90,7 @@ def ChangeImgSrc(htmltxt,saveimgdir,htmlname):
         if originalURL.startswith("//"):  # a url starting with // needs an http: prefix
             originalURL = "http:" + originalURL
         if len(originalURL) > 0:
-            print("down img",imgindex)
+            print("\r down imgs " + "▇" * imgindex + " " + str(imgindex),end="")  # redraw a one-line progress bar in place
             if "data-type" in img.attrs:
                 imgtype = img.attrs["data-type"]
             else:
@@ -109,7 +115,10 @@ def ChangeCssSrc(bs):
 
 def ChangeContent(bs):
     jscontent = bs.find(id="js_content")
-    jscontent.attrs["style"]=""
+    if jscontent:
+        jscontent.attrs["style"]=""
+    else:
+        print("----- the article may have been deleted -----")
 
 # article class
 class Article():
@@ -142,7 +151,8 @@ def GetArticleList(jsondir):
                 idx = artidx
                 title = app_msg_ext_info["title"]
                 art = Article(url,pubdate,idx,title)
-                ArtList.append(art)
+                if len(url) > 3:  # skip articles whose url is incomplete
+                    ArtList.append(art)
                 print(len(ArtList),pubdate, idx, title)
                 if app_msg_ext_info["is_multi"] == 1:  # several articles pushed in one batch
                     artidx += 1
@@ -152,7 +162,8 @@ def GetArticleList(jsondir):
                     idx = artidx
                     title = subArt["title"]
                     art = Article(url,pubdate,idx,title)
-                    ArtList.append(art)
+                    if len(url) > 3:
+                        ArtList.append(art)
                     print(len(ArtList),pubdate, idx, title)
     return ArtList
 
@@ -160,7 +171,7 @@ def DownHtmlMain(jsonDir,saveHtmlDir):
     saveHtmlDir = jsbd["htmlDir"]
     if not os.path.exists(saveHtmlDir):
         os.makedirs(saveHtmlDir)
-    saveImgDir = os.path.join(saveHtmlDir, "images")
+    saveImgDir = saveHtmlDir + "/images"
     if not os.path.exists(saveImgDir):
         os.makedirs(saveImgDir)
     ArtList = GetArticleList(jsonDir)
@@ -171,7 +182,7 @@ def DownHtmlMain(jsonDir,saveHtmlDir):
         idx += 1
         artname = art.pubdate + "_" + str(art.idx)
         arthtmlname = artname + ".html"
-        arthtmlsavepath = os.path.join(saveHtmlDir,arthtmlname)
+        arthtmlsavepath = saveHtmlDir + "/" + arthtmlname
         print(idx,"of",totalCount,artname,art.title)
         # skip files that already exist, so an interrupted run can resume
         if os.path.exists(arthtmlsavepath):
@@ -179,6 +190,7 @@ def DownHtmlMain(jsonDir,saveHtmlDir):
             continue
         arthtmlstr = DownLoadHtml(art.url)
         arthtmlstr = ChangeImgSrc(arthtmlstr,saveImgDir,artname)
+        print("\r",end="")  # return the cursor so the next line starts clean after the progress bar
         SaveFile(arthtmlsavepath,arthtmlstr)
         sleep(3)  # pause 3 seconds between articles so WeChat does not block the crawler for downloading too fast
 
@@ -191,7 +203,7 @@ def PDFDir(htmldir,pdfdir):
     for f in flist:
         if (not f[-5:]==".html") or ("tmp" in f):  # convert only html files, and skip temporary ones
             continue
-        htmlpath = os.path.join(htmldir,f)
+        htmlpath = htmldir + "/" + f
         tmppath = htmlpath[:-5] + "_tmp.html"  # temporary file used for the pdf conversion
         htmlstr = ReadFile(htmlpath)
         bs = BeautifulSoup(htmlstr, "lxml")
@@ -201,8 +213,8 @@ def PDFDir(htmldir,pdfdir):
         if titleTag is not None:
             title = "_" + titleTag.get_text().replace(" ", "").replace(" ","").replace("\n","")
         ridx = htmlpath.rindex("/") + 1
-        htmlname = htmlpath[ridx:-5] + title
-        pdfpath = os.path.join(pdfdir, htmlname + ".pdf")
+        pdfname = htmlpath[ridx:-5] + title
+        pdfpath = pdfdir + "/" + pdfname + ".pdf"
 
         """
         Strip javascript etc. to cut down what has to load during the pdf conversion,
@@ -236,10 +248,14 @@ def PDFOne(htmlpath,pdfpath,skipExists=True,removehtml=True):
 
 
 """
-Before running, set the following in config.json:
+1. Setup:
+First set the following in the config.json file:
 jsonDir: the directory of dump files generated by Fiddler
 htmlDir: the directory where html is saved; the path must not contain spaces
 pdfDir: the directory where pdf is saved; the path must not contain spaces
+2. Usage:
+Run  python start.py      # download the html
+Run  python start.py pdf  # convert the downloaded html to pdf
 """
 if __name__ == "__main__":
     if len(sys.argv)==1:
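
Note on the path handling: the patch normalizes every configured directory once in GetJson (backslashes become "/", trailing slashes are stripped) and then joins segments with a plain "/". A minimal standalone sketch of that convention; the helper name and sample path are hypothetical, not part of the patch:

    # Sketch of the patch's path convention: normalize separators to "/" and
    # strip any trailing slash so later concatenation never yields "//".
    def NormalizeDir(path):
        path = path.replace("\\", "/")    # same normalization GetJson applies
        if path and path[-1] == "/":      # strip trailing slash, as GetJson does
            path = path[:-1]
        return path

    htmlDir = NormalizeDir("c:/vWeChatFiles/html/")   # hypothetical config value
    print(htmlDir + "/images")   # -> c:/vWeChatFiles/html/images, no double slash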
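
Note on verify=False: certificate verification is disabled so that requests can pass through Fiddler's HTTPS man-in-the-middle proxy, and requests.packages.urllib3.disable_warnings() silences the InsecureRequestWarning that urllib3 would otherwise emit on every such request. A minimal sketch of the pattern; the URL is hypothetical:

    import requests

    # verify=False lets the request pass Fiddler's HTTPS proxy without a
    # certificate error; disable_warnings() suppresses the resulting
    # InsecureRequestWarning noise on every call.
    requests.packages.urllib3.disable_warnings()
    r = requests.get("https://mp.weixin.qq.com/", verify=False)
    print(r.status_code)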
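
Note on the progress bar: the print call added in ChangeImgSrc uses a leading "\r" with end="" so each update redraws the same terminal line instead of scrolling, and DownHtmlMain prints a bare "\r" afterwards to reset the cursor. A minimal sketch of the idea, with a sleep standing in for the actual image downloads:

    from time import sleep

    # "\r" returns the cursor to the start of the line and end="" suppresses
    # the newline, so every print overwrites the previous bar.
    for imgindex in range(1, 11):    # hypothetical: 10 images
        print("\r down imgs " + "▇" * imgindex + " " + str(imgindex), end="")
        sleep(0.2)                   # stands in for the real DownImg call
    print("\r", end="")              # reset the cursor, as DownHtmlMain does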