From 46182be1d9e203c02a575e792179457b8d31db19 Mon Sep 17 00:00:00 2001
From: LeLe86 <251192913@qq.com>
Date: Tue, 3 Mar 2020 13:10:58 +0800
Subject: [PATCH] Fix error-prone file path handling and polish the download
 progress bar
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md   |  2 ++
 config.json |  6 +++---
 start.py    | 42 +++++++++++++++++++++++++++++-------------
 3 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 226d9ef..48a81c2 100644
--- a/README.md
+++ b/README.md
@@ -31,6 +31,8 @@ Fiddler's official site is sometimes unreachable; you can search pc.qq.com for Fiddler4 and install it from there
 The configuration is now complete. Click the square at the lower left of the window; when it shows Capturing, Fiddler is recording traffic, and clicking it again pauses capturing. Leave capturing on for now.
 ![avatar](http://img1.xiaokuake.com/p/wp-content/uploads/2019/08/2019080602082132.png)
 
+Some users may find that Fiddler captures no HTTPS requests at all; please recheck each step above carefully. For anything else that goes wrong, most Fiddler-related problems can be solved by searching Baidu.
+
 ## c. Open the article history of a WeChat official account
 ![avatar](http://img1.xiaokuake.com/p/wp-content/uploads/2019/08/2019080602060364.png)
 
diff --git a/config.json b/config.json
index 4fd562c..445cfc1 100644
--- a/config.json
+++ b/config.json
@@ -1,5 +1,5 @@
 {
-  "jsonDir": "C:/Users/kklwin10/Desktop/Dump-0103-20-14-29",
-  "htmlDir": "c:/vWeChatFiles/html/",
-  "pdfDir": "c:/vWeChatFiles/pdf/"
+  "jsonDir": "c:/Users/kklwin10/Desktop/Dump-0103-20-14-29",
+  "htmlDir": "c:/vWeChatFiles/html",
+  "pdfDir": "c:/vWeChatFiles/pdf"
 }
\ No newline at end of file
diff --git a/start.py b/start.py
index 8f34311..eb992d6 100644
--- a/start.py
+++ b/start.py
@@ -8,6 +8,7 @@ from time import sleep
 
 """
 This project is open-sourced at https://github.com/LeLe86/vWeChatCrawl
+Discussion QQ group: 703431832
 """
 
 
@@ -34,6 +35,10 @@ def GetJson():
     jstxt = ReadFile("config.json")
     jstxt = jstxt.replace("\\\\","/").replace("\\","/")  # convert backslashes to "/" so paths in the JSON parse correctly
     jsbd = json.loads(jstxt)
+    if jsbd["htmlDir"][-1] == "/":  # strip any trailing slash so later "/" joins don't double up
+        jsbd["htmlDir"] = jsbd["htmlDir"][:-1]
+    if jsbd["jsonDir"][-1] == "/":
+        jsbd["jsonDir"] = jsbd["jsonDir"][:-1]
     return jsbd
 
 
@@ -46,8 +51,8 @@ def DownLoadHtml(url):
                'Connection':'keep-alive',
                'Accept-Language':'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
                }
-
-    response = requests.get(url,headers = headers,proxies=None)
+    requests.packages.urllib3.disable_warnings()  # silence the InsecureRequestWarning caused by verify=False
+    response = requests.get(url,headers = headers,proxies=None,verify=False)
     if response.status_code == 200:
         htmltxt = response.text  # body of the returned page
         return htmltxt
@@ -63,7 +68,8 @@ def DownImg(url,savepath):
                'Connection':'keep-alive',
                'Accept-Language':'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
                }
-    r = requests.get(url,headers = headers,proxies=None)
+    requests.packages.urllib3.disable_warnings()
+    r = requests.get(url,headers = headers,proxies=None,verify=False)
     with open(savepath, 'wb') as f:
         f.write(r.content)
 
@@ -84,7 +90,7 @@ def ChangeImgSrc(htmltxt,saveimgdir,htmlname):
         if originalURL.startswith("//"):  # a url starting with // needs an http: prefix
             originalURL = "http:" + originalURL
         if len(originalURL) > 0:
-            print("down img",imgindex)
+            print("\r down imgs " + "▇" * imgindex + " " + str(imgindex),end="")  # redraw a one-line progress bar in place
             if "data-type" in img.attrs:
                 imgtype = img.attrs["data-type"]
             else:
@@ -109,7 +115,10 @@ def ChangeCssSrc(bs):
 
 def ChangeContent(bs):
     jscontent = bs.find(id="js_content")
-    jscontent.attrs["style"]=""
+    if jscontent:
+        jscontent.attrs["style"]=""
+    else:
+        print("----- the article may have been deleted -----")
 
 # article class
 class Article():
@@ -142,7 +151,8 @@ def GetArticleList(jsondir):
                 idx = artidx
                 title = app_msg_ext_info["title"]
                 art = Article(url,pubdate,idx,title)
-                ArtList.append(art)
+                if len(url) > 3:  # skip articles whose url is incomplete
+                    ArtList.append(art)
                 print(len(ArtList),pubdate, idx, title)
                 if app_msg_ext_info["is_multi"] == 1:  # several articles pushed in one batch
                     artidx += 1
@@ -152,7 +162,8 @@ def GetArticleList(jsondir):
                     idx = artidx
                     title = subArt["title"]
                     art = Article(url,pubdate,idx,title)
-                    ArtList.append(art)
+                    if len(url) > 3:
+                        ArtList.append(art)
                     print(len(ArtList),pubdate, idx, title)
     return ArtList
 
@@ -160,7 +171,7 @@ def DownHtmlMain(jsonDir,saveHtmlDir):
     saveHtmlDir = jsbd["htmlDir"]
     if not os.path.exists(saveHtmlDir):
         os.makedirs(saveHtmlDir)
-    saveImgDir = os.path.join(saveHtmlDir, "images")
+    saveImgDir = saveHtmlDir + "/images"
     if not os.path.exists(saveImgDir):
         os.makedirs(saveImgDir)
     ArtList = GetArticleList(jsonDir)
@@ -171,7 +182,7 @@ def DownHtmlMain(jsonDir,saveHtmlDir):
         idx += 1
         artname = art.pubdate + "_" + str(art.idx)
         arthtmlname = artname + ".html"
-        arthtmlsavepath = os.path.join(saveHtmlDir,arthtmlname)
+        arthtmlsavepath = saveHtmlDir + "/" + arthtmlname
         print(idx,"of",totalCount,artname,art.title)
         # skip files that already exist, so an interrupted run can resume
         if os.path.exists(arthtmlsavepath):
@@ -179,6 +190,7 @@ def DownHtmlMain(jsonDir,saveHtmlDir):
             continue
         arthtmlstr = DownLoadHtml(art.url)
         arthtmlstr = ChangeImgSrc(arthtmlstr,saveImgDir,artname)
+        print("\r",end="")  # return the cursor so the next line starts clean after the progress bar
         SaveFile(arthtmlsavepath,arthtmlstr)
         sleep(3)  # pause 3 seconds between articles so WeChat does not block the crawler for downloading too fast
 
@@ -191,7 +203,7 @@ def PDFDir(htmldir,pdfdir):
     for f in flist:
         if (not f[-5:]==".html") or ("tmp" in f):  # convert only html files, and skip temporary ones
             continue
-        htmlpath = os.path.join(htmldir,f)
+        htmlpath = htmldir + "/" + f
         tmppath = htmlpath[:-5] + "_tmp.html"  # temporary file used for the pdf conversion
         htmlstr = ReadFile(htmlpath)
         bs = BeautifulSoup(htmlstr, "lxml")
@@ -201,8 +213,8 @@ def PDFDir(htmldir,pdfdir):
         if titleTag is not None:
             title = "_" + titleTag.get_text().replace(" ", "").replace(" ","").replace("\n","")
         ridx = htmlpath.rindex("/") + 1
-        htmlname = htmlpath[ridx:-5] + title
-        pdfpath = os.path.join(pdfdir, htmlname + ".pdf")
+        pdfname = htmlpath[ridx:-5] + title
+        pdfpath = pdfdir + "/" + pdfname + ".pdf"
 
         """
         Strip javascript etc. to cut down what has to load during the pdf conversion,
@@ -236,10 +248,14 @@ def PDFOne(htmlpath,pdfpath,skipExists=True,removehtml=True):
 
 
 """
-Before running, set the following in config.json:
+1. Setup:
+First set the following in the config.json file:
 jsonDir: the directory of dump files generated by Fiddler
 htmlDir: the directory where html is saved; the path must not contain spaces
 pdfDir: the directory where pdf is saved; the path must not contain spaces
+2. Usage:
+Run  python start.py      # download the html
+Run  python start.py pdf  # convert the downloaded html to pdf
 """
 if __name__ == "__main__":
     if len(sys.argv)==1:
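
Note on the path handling: the patch normalizes every configured directory once in GetJson (backslashes become "/", trailing slashes are stripped) and then joins segments with a plain "/". A minimal standalone sketch of that convention; the helper name and sample path are hypothetical, not part of the patch:

    # Sketch of the patch's path convention: normalize separators to "/" and
    # strip any trailing slash so later concatenation never yields "//".
    def NormalizeDir(path):
        path = path.replace("\\", "/")    # same normalization GetJson applies
        if path and path[-1] == "/":      # strip trailing slash, as GetJson does
            path = path[:-1]
        return path

    htmlDir = NormalizeDir("c:/vWeChatFiles/html/")   # hypothetical config value
    print(htmlDir + "/images")   # -> c:/vWeChatFiles/html/images, no double slash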
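
Note on verify=False: certificate verification is disabled so that requests can pass through Fiddler's HTTPS man-in-the-middle proxy, and requests.packages.urllib3.disable_warnings() silences the InsecureRequestWarning that urllib3 would otherwise emit on every such request. A minimal sketch of the pattern; the URL is hypothetical:

    import requests

    # verify=False lets the request pass Fiddler's HTTPS proxy without a
    # certificate error; disable_warnings() suppresses the resulting
    # InsecureRequestWarning noise on every call.
    requests.packages.urllib3.disable_warnings()
    r = requests.get("https://mp.weixin.qq.com/", verify=False)
    print(r.status_code)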
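
Note on the progress bar: the print call added in ChangeImgSrc uses a leading "\r" with end="" so each update redraws the same terminal line instead of scrolling, and DownHtmlMain prints a bare "\r" afterwards to reset the cursor. A minimal sketch of the idea, with a sleep standing in for the actual image downloads:

    from time import sleep

    # "\r" returns the cursor to the start of the line and end="" suppresses
    # the newline, so every print overwrites the previous bar.
    for imgindex in range(1, 11):    # hypothetical: 10 images
        print("\r down imgs " + "▇" * imgindex + " " + str(imgindex), end="")
        sleep(0.2)                   # stands in for the real DownImg call
    print("\r", end="")              # reset the cursor, as DownHtmlMain does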