3 changed files with 18 additions and 83 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,4 +1,3 @@
 /html
 /pdf
 /useful
 *.zip
--- a/config.json
+++ b/config.json
@ -1,6 +1,5 @@
 {
    "jsonDir": "./mp",
-    "htmlDir": "./useful",
+    "htmlDir": "./html",
-    "pdfDir": "./pdf",
+    "pdfDir": "./pdf"
    "usefulDir": "./useful"
 }
--- a/start.py
+++ b/start.py
@ -1,12 +1,8 @@
 # coding=UTF-8
 from importlib.resources import path
 import os, sys
 import requests
 import json
 import pdfkit
 import codecs
 import subprocess
 import re
 from bs4 import BeautifulSoup
 from datetime import datetime, timedelta
 from time import sleep
@ -47,8 +43,6 @@ def GetJson():
        jsbd["htmlDir"] = jsbd["htmlDir"][:-1]
    if jsbd["jsonDir"][-1] == "/":
        jsbd["jsonDir"] = jsbd["jsonDir"][:-1]
    if jsbd["usefulDir"][-1] == "/":
        jsbd["usefulDir"] = jsbd["usefulDir"][:-1]
    return jsbd
@ -219,7 +213,7 @@ def DownHtmlMain(jsonDir, saveHtmlDir):
        print("\r", end="")
        SaveFile(arthtmlsavepath, arthtmlstr)
-        sleep(0)  # 防止下载过快被微信屏蔽，间隔3秒下载一篇
+        sleep(5)  # Throttle downloads so WeChat does not block us: wait 5 seconds between articles
 # 把一个文件夹下的html文件都转为pdf
@ -257,41 +251,21 @@ def PDFDir(htmldir, pdfdir):
 # 把一个Html文件转为pdf
 def PDFOne(htmlpath, pdfpath, skipExists=True, removehtml=True):
-    print(htmlpath, pdfpath)
+    if skipExists and os.path.exists(pdfpath):
-    options = {
+        print("pdf exists", pdfpath)
-        'page-size': 'Letter',
+        if removehtml:
-        'margin-top': '0.75in',
+            os.remove(htmlpath)
-        'margin-right': '0.75in',
+        return
-        'margin-bottom': '0.75in',
+    exepath = "wkhtmltopdf.exe"  # 把wkhtmltopdf.exe文件保存到与本py文件相同的目录下
-        'margin-left': '0.75in',
+    cmdlist = []
-        'encoding': "UTF-8",
+    cmdlist.append(" --load-error-handling ignore ")
-        'custom-header': [
+    cmdlist.append(" --page-height 200 ")  # 数字可以自己调节，也可以不加这两行
-            ('Accept-Encoding', 'gzip')
+    cmdlist.append(" --page-width 140 ")
-        ],
+    cmdlist.append(" " + htmlpath + " ")
-        # 'cookie': [
+    cmdlist.append(" " + pdfpath + " ")
-        #     ('cookie-empty-value', '""')
+    cmdstr = exepath + "".join(cmdlist)
-        #     ('cookie-name1', 'cookie-value1'),
+    print(cmdstr)
-        #     ('cookie-name2', 'cookie-value2'),
+    result = subprocess.check_call(cmdstr, shell=False)
        # ],
        'no-outline': None,
        'enable-local-file-access': None
    }
    pdfkit.from_file(htmlpath, pdfpath, options=options, verbose=True)
    # if skipExists and os.path.exists(pdfpath):
    #     print("pdf exists", pdfpath)
    #     if removehtml:
    #         os.remove(htmlpath)
    #     return
    # exepath = "wkhtmltopdf.exe"  # 把wkhtmltopdf.exe文件保存到与本py文件相同的目录下
    # cmdlist = []
    # cmdlist.append(" --load-error-handling ignore ")
    # cmdlist.append(" --page-height 200 ")  # 数字可以自己调节，也可以不加这两行
    # cmdlist.append(" --page-width 140 ")
    # cmdlist.append(" " + htmlpath + " ")
    # cmdlist.append(" " + pdfpath + " ")
    # cmdstr = exepath + "".join(cmdlist)
    # print(cmdstr)
    # result = subprocess.check_call(cmdstr, shell=False)
    # stdout,stderr = result.communicate()
    # result.wait() #等待转换完一个再转下一个
    if removehtml:
@ -309,38 +283,6 @@ def PDFOne(htmlpath, pdfpath, skipExists=True, removehtml=True):
    """
 # 搜索关键字找到对应html并归类
 def SearchDir(htmldir, savedir, keyword="养老"):
     """Copy every HTML article in *htmldir* that mentions *keyword* into *savedir*.

     Each ``*.html`` file directly inside *htmldir* (files whose name contains
     ``tmp`` are ignored) is read with ``ReadFile`` and parsed with
     BeautifulSoup using the ``lxml`` parser.  When at least one text node
     matches *keyword*, the ``<script>`` and ``<iframe>`` elements are removed
     and the remaining document is written with ``SaveFile`` to *savedir*
     under the same file name.

     Args:
         htmldir: Directory containing the downloaded HTML articles.
         savedir: Directory that receives the matching articles; created
             (including parents) when it does not exist yet.
         keyword: Regular-expression pattern searched for in the text nodes
             of each article.  Defaults to the previously hard-coded "养老".

     Returns:
         None.  The list of matching text nodes is printed for every article
         that is inspected.
     """
     # exist_ok avoids the check-then-create race of exists() + makedirs().
     os.makedirs(savedir, exist_ok=True)
     # Compile once; the same pattern is applied to every article.
     pattern = re.compile(keyword)
     for name in os.listdir(htmldir):
         # Only real article pages: skip non-HTML files and temporary copies.
         if not name.endswith(".html") or "tmp" in name:
             continue
         soup = BeautifulSoup(ReadFile(os.path.join(htmldir, name)), "lxml")
         matches = soup.find_all(text=pattern)
         print(matches)
         if not matches:
             continue
         # Strip scripts and iframes to cut down on what has to be loaded when
         # the page is later converted to PDF.  <link> (CSS) elements are kept
         # on purpose so the page layout is preserved.
         for tag in soup(["script", "iframe"]):
             tag.extract()
         SaveFile(os.path.join(savedir, name), str(soup))
 if __name__ == "__main__":
    if len(sys.argv) == 1:
        arg = None
@ -356,8 +298,3 @@ if __name__ == "__main__":
        saveHtmlDir = jsbd["htmlDir"]
        savePdfDir = jsbd["pdfDir"]
        PDFDir(saveHtmlDir, savePdfDir)
    elif arg == "search":
        jsbd = GetJson()
        htmlDir = jsbd["htmlDir"]
        saveDir = jsbd["usefulDir"]
        SearchDir(htmlDir, saveDir)