新增筛选固定内容功能

This commit is contained in:
rucky 2022-05-26 16:41:55 +08:00
parent d38ffdab1d
commit c4499fd076
3 changed files with 45 additions and 2 deletions

1
.gitignore vendored
View File

@ -1,3 +1,4 @@
/html
/pdf
/useful
*.zip

View File

@ -1,5 +1,6 @@
{
"jsonDir": "./mp",
"htmlDir": "./html",
"pdfDir": "./pdf"
"htmlDir": "./useful",
"pdfDir": "./pdf",
"usefulDir": "./useful"
}

View File

@ -4,7 +4,9 @@ import os, sys
import requests
import json
import pdfkit
import codecs
import subprocess
import re
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from time import sleep
@ -45,6 +47,8 @@ def GetJson():
jsbd["htmlDir"] = jsbd["htmlDir"][:-1]
if jsbd["jsonDir"][-1] == "/":
jsbd["jsonDir"] = jsbd["jsonDir"][:-1]
if jsbd["usefulDir"][-1] == "/":
jsbd["usefulDir"] = jsbd["usefulDir"][:-1]
return jsbd
@ -305,6 +309,38 @@ def PDFOne(htmlpath, pdfpath, skipExists=True, removehtml=True):
"""
# 搜索关键字找到对应html并归类
def SearchDir(htmldir, savedir, keyword="养老"):
    """Copy the HTML files in ``htmldir`` whose text matches ``keyword`` into ``savedir``.

    Every entry of ``htmldir`` whose name ends in ``.html`` and does not
    contain ``tmp`` is read with ``ReadFile`` and parsed with BeautifulSoup
    (``lxml`` parser).  If at least one text node matches ``keyword``, the
    document is written to ``savedir`` under the same file name with its
    ``<script>`` and ``<iframe>`` elements removed.

    Args:
        htmldir: Directory that is scanned (not recursively) for HTML files.
        savedir: Directory receiving the matching files; created if missing.
        keyword: Regular-expression pattern (string or compiled pattern)
            searched for in the document's text nodes.  Defaults to the
            keyword that was previously hard-coded.
    """
    if not os.path.exists(savedir):
        os.makedirs(savedir)

    # Compile once: the pattern is the same for every file.
    pattern = re.compile(keyword)

    for f in os.listdir(htmldir):
        # Only .html files are considered; names containing "tmp" are skipped.
        if not f.endswith(".html") or "tmp" in f:
            continue

        htmlstr = ReadFile(os.path.join(htmldir, f))
        bs = BeautifulSoup(htmlstr, "lxml")

        # NOTE(review): newer bs4 releases prefer ``string=`` over ``text=``;
        # confirm the installed version before switching.
        matches = bs.find_all(text=pattern)
        print(matches)
        if not matches:
            continue

        # Drop scripts and iframes to reduce what has to be loaded when the
        # page is later converted to PDF.  "link" (CSS) is deliberately left
        # in place so the page keeps its styling.
        for tag in bs(["script", "iframe"]):
            tag.extract()
        SaveFile(os.path.join(savedir, f), str(bs))
if __name__ == "__main__":
if len(sys.argv) == 1:
arg = None
@ -320,3 +356,8 @@ if __name__ == "__main__":
saveHtmlDir = jsbd["htmlDir"]
savePdfDir = jsbd["pdfDir"]
PDFDir(saveHtmlDir, savePdfDir)
elif arg == "search":
jsbd = GetJson()
htmlDir = jsbd["htmlDir"]
saveDir = jsbd["usefulDir"]
SearchDir(htmlDir, saveDir)