3 changed files with 18 additions and 83 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,3 @@
 /html
 /pdf
-/useful
 *.zip
--- a/config.json
+++ b/config.json
@@ -1,6 +1,5 @@
 {
    "jsonDir": "./mp",
-    "htmlDir": "./useful",
-    "pdfDir": "./pdf",
-    "usefulDir": "./useful"
+    "htmlDir": "./html",
+    "pdfDir": "./pdf"
 }
--- a/start.py
+++ b/start.py
@@ -1,12 +1,8 @@
 # coding=UTF-8
-from importlib.resources import path
 import os, sys
 import requests
 import json
-import pdfkit
-import codecs
 import subprocess
-import re
 from bs4 import BeautifulSoup
 from datetime import datetime, timedelta
 from time import sleep
@@ -47,8 +43,6 @@ def GetJson():
        jsbd["htmlDir"] = jsbd["htmlDir"][:-1]
    if jsbd["jsonDir"][-1] == "/":
        jsbd["jsonDir"] = jsbd["jsonDir"][:-1]
-    if jsbd["usefulDir"][-1] == "/":
-        jsbd["usefulDir"] = jsbd["usefulDir"][:-1]
    return jsbd


@@ -219,7 +213,7 @@ def DownHtmlMain(jsonDir, saveHtmlDir):
        print("\r", end="")
        SaveFile(arthtmlsavepath, arthtmlstr)

-        sleep(0)  # 防止下载过快被微信屏蔽，间隔3秒下载一篇
+        sleep(5)  # Throttle downloads so WeChat does not block us: wait 5 seconds between articles


 # 把一个文件夹下的html文件都转为pdf
@@ -257,41 +251,21 @@ def PDFDir(htmldir, pdfdir):

 # 把一个Html文件转为pdf
 def PDFOne(htmlpath, pdfpath, skipExists=True, removehtml=True):
-    print(htmlpath, pdfpath)
-    options = {
-        'page-size': 'Letter',
-        'margin-top': '0.75in',
-        'margin-right': '0.75in',
-        'margin-bottom': '0.75in',
-        'margin-left': '0.75in',
-        'encoding': "UTF-8",
-        'custom-header': [
-            ('Accept-Encoding', 'gzip')
-        ],
-        # 'cookie': [
-        #     ('cookie-empty-value', '""')
-        #     ('cookie-name1', 'cookie-value1'),
-        #     ('cookie-name2', 'cookie-value2'),
-        # ],
-        'no-outline': None,
-        'enable-local-file-access': None
-    }
-    pdfkit.from_file(htmlpath, pdfpath, options=options, verbose=True)
-    # if skipExists and os.path.exists(pdfpath):
-    #     print("pdf exists", pdfpath)
-    #     if removehtml:
-    #         os.remove(htmlpath)
-    #     return
-    # exepath = "wkhtmltopdf.exe"  # 把wkhtmltopdf.exe文件保存到与本py文件相同的目录下
-    # cmdlist = []
-    # cmdlist.append(" --load-error-handling ignore ")
-    # cmdlist.append(" --page-height 200 ")  # 数字可以自己调节，也可以不加这两行
-    # cmdlist.append(" --page-width 140 ")
-    # cmdlist.append(" " + htmlpath + " ")
-    # cmdlist.append(" " + pdfpath + " ")
-    # cmdstr = exepath + "".join(cmdlist)
-    # print(cmdstr)
-    # result = subprocess.check_call(cmdstr, shell=False)
+    if skipExists and os.path.exists(pdfpath):
+        print("pdf exists", pdfpath)
+        if removehtml:
+            os.remove(htmlpath)
+        return
+    exepath = "wkhtmltopdf.exe"  # 把wkhtmltopdf.exe文件保存到与本py文件相同的目录下
+    cmdlist = []
+    cmdlist.append(" --load-error-handling ignore ")
+    cmdlist.append(" --page-height 200 ")  # 数字可以自己调节，也可以不加这两行
+    cmdlist.append(" --page-width 140 ")
+    cmdlist.append(" " + htmlpath + " ")
+    cmdlist.append(" " + pdfpath + " ")
+    cmdstr = exepath + "".join(cmdlist)
+    print(cmdstr)
+    result = subprocess.check_call(cmdstr, shell=False)
    # stdout,stderr = result.communicate()
    # result.wait() #等待转换完一个再转下一个
    if removehtml:
@@ -309,38 +283,6 @@ def PDFOne(htmlpath, pdfpath, skipExists=True, removehtml=True):
    """


-
-# 搜索关键字找到对应html并归类
-def SearchDir(htmldir, savedir):
-
-    recordFile = 'execute-records'
-
-    if not os.path.exists(savedir):
-        os.makedirs(savedir)
-    flist = os.listdir(htmldir)
-    
-    for f in flist:
-        if (not f[-5:] == ".html") or ("tmp" in f):  # 不是html文件的不转换，含有tmp的不转换
-            continue
-
-        FoundFlag = False
-        htmlpath = htmldir + "/" + f
-        tmppath = htmlpath[:-5] + ".html"  # 生成临时文件，供转pdf用
-        htmlstr = ReadFile(htmlpath)
-        bs = BeautifulSoup(htmlstr, "lxml")
-        # pdf文件名中包含文章标题，但如果标题中有不能出现在文件名中的符号则会转换失败
-        keyWord = bs.find_all(text=re.compile("养老"))
-        print(keyWord)
-        if (keyWord):
-            """
-                把js等去掉，减少转PDF时的加载项，
-                注意此处去掉了css(link），如果发现pdf格式乱了可以不去掉css
-            """
-            [s.extract() for s in bs(["script", "iframe"])]
-            # , "link"
-            SaveFile(savedir + '/' +f, str(bs))
-
-
 if __name__ == "__main__":
    if len(sys.argv) == 1:
        arg = None
@@ -356,8 +298,3 @@ if __name__ == "__main__":
        saveHtmlDir = jsbd["htmlDir"]
        savePdfDir = jsbd["pdfDir"]
        PDFDir(saveHtmlDir, savePdfDir)
-    elif arg == "search":
-        jsbd = GetJson()
-        htmlDir = jsbd["htmlDir"]
-        saveDir = jsbd["usefulDir"]
-        SearchDir(htmlDir, saveDir)