def SearchDir(htmldir, savedir, keyword="养老"):
    """Copy every HTML article in *htmldir* that mentions *keyword* into *savedir*.

    Each ``*.html`` file directly inside ``htmldir`` (files whose name
    contains ``"tmp"`` are skipped) is loaded with ``ReadFile`` and parsed
    with BeautifulSoup.  When at least one text node matches ``keyword``,
    the ``<script>`` and ``<iframe>`` elements are removed from the parsed
    document and the result is written with ``SaveFile`` to ``savedir``
    under the same file name.

    Args:
        htmldir: Directory that is scanned for ``.html`` files
            (not recursive).
        savedir: Directory that receives the matching articles; it is
            created when it does not exist yet.
        keyword: Regular-expression pattern searched for in the text nodes
            of each document.  Defaults to the pattern that used to be
            hard-coded in this function.

    Note:
        If ``htmldir`` and ``savedir`` are the same directory, matching
        files are overwritten in place with their stripped version.
    """
    if not os.path.exists(savedir):
        os.makedirs(savedir)

    # Compile once; the pattern does not change between files.
    pattern = re.compile(keyword)

    for name in os.listdir(htmldir):
        # Only real article pages are searched: skip anything that is not
        # an .html file, and skip temporary files (name contains "tmp").
        if not name.endswith(".html") or "tmp" in name:
            continue

        soup = BeautifulSoup(ReadFile(os.path.join(htmldir, name)), "lxml")

        # All text nodes matching the pattern; an empty result means the
        # article is not relevant.
        matches = soup.find_all(text=pattern)
        print(matches)
        if not matches:
            continue

        # Drop scripts and iframes so the saved copy has fewer resources to
        # load.  Stylesheets (<link>) are deliberately kept so that the
        # page layout is preserved.
        for node in soup(["script", "iframe"]):
            node.extract()

        SaveFile(os.path.join(savedir, name), str(soup))
jsbd["pdfDir"] PDFDir(saveHtmlDir, savePdfDir) + elif arg == "search": + jsbd = GetJson() + htmlDir = jsbd["htmlDir"] + saveDir = jsbd["usefulDir"] + SearchDir(htmlDir, saveDir)