Add a feature to filter for fixed content
parent d38ffdab1d
commit c4499fd076

.gitignore (vendored): 1 line changed
@@ -1,3 +1,4 @@
 /html
 /pdf
+/useful
 *.zip

@@ -1,5 +1,6 @@
 {
     "jsonDir": "./mp",
-    "htmlDir": "./html",
-    "pdfDir": "./pdf"
+    "htmlDir": "./useful",
+    "pdfDir": "./pdf",
+    "usefulDir": "./useful"
 }
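
For context, a minimal sketch of how the updated configuration above would be loaded and normalized on the Python side. This is illustrative only, not the repository's GetJson(); the file name "config.json" is an assumption, since the commit view does not show which file this hunk belongs to.

import json

# Sketch only: load the configuration shown above and normalize the directory entries.
# "config.json" is an assumed file name; the diff does not show it.
def load_config(path="config.json"):
    with open(path, "r", encoding="utf-8") as fp:
        jsbd = json.load(fp)
    # start.py strips one trailing "/" per entry; rstrip gives the same result
    # for the values used in this commit.
    for key in ("jsonDir", "htmlDir", "pdfDir", "usefulDir"):
        jsbd[key] = jsbd[key].rstrip("/")
    return jsbd
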
start.py: 41 lines changed
@@ -4,7 +4,9 @@ import os, sys
 import requests
 import json
 import pdfkit
+import codecs
 import subprocess
+import re
 from bs4 import BeautifulSoup
 from datetime import datetime, timedelta
 from time import sleep
@@ -45,6 +47,8 @@ def GetJson():
         jsbd["htmlDir"] = jsbd["htmlDir"][:-1]
     if jsbd["jsonDir"][-1] == "/":
         jsbd["jsonDir"] = jsbd["jsonDir"][:-1]
+    if jsbd["usefulDir"][-1] == "/":
+        jsbd["usefulDir"] = jsbd["usefulDir"][:-1]
     return jsbd

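
One caveat worth noting: a configuration written before this commit has no "usefulDir" key, so the new jsbd["usefulDir"] lookup would raise KeyError. A defensive variant is sketched below; the "./useful" fallback mirrors the value added to the configuration above and is otherwise an assumption, not code from the repository.

# Illustrative only: tolerate configs that predate the "usefulDir" key.
usefulDir = jsbd.get("usefulDir", "./useful")
if usefulDir.endswith("/"):
    usefulDir = usefulDir[:-1]
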
@@ -305,6 +309,38 @@ def PDFOne(htmlpath, pdfpath, skipExists=True, removehtml=True):
     """


+
+# Search the HTML files for the keyword and collect the matching ones
+def SearchDir(htmldir, savedir):
+
+    recordFile = 'execute-records'
+
+    if not os.path.exists(savedir):
+        os.makedirs(savedir)
+    flist = os.listdir(htmldir)
+
+    for f in flist:
+        if (not f[-5:] == ".html") or ("tmp" in f):  # skip files that are not HTML and files containing "tmp"
+            continue
+
+        FoundFlag = False
+        htmlpath = htmldir + "/" + f
+        tmppath = htmlpath[:-5] + ".html"  # temporary file used for the PDF conversion
+        htmlstr = ReadFile(htmlpath)
+        bs = BeautifulSoup(htmlstr, "lxml")
+        # The PDF file name contains the article title; conversion fails if the title has characters that cannot appear in a file name
+        keyWord = bs.find_all(text=re.compile("养老"))
+        print(keyWord)
+        if (keyWord):
+            """
+            Strip JS and similar elements to reduce what gets loaded during the PDF conversion.
+            Note that CSS (link) is removed here; if the PDF layout turns out broken, you can skip removing the CSS.
+            """
+            [s.extract() for s in bs(["script", "iframe"])]
+            # , "link"
+            SaveFile(savedir + '/' + f, str(bs))
+
+
 if __name__ == "__main__":
     if len(sys.argv) == 1:
         arg = None
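
The core of the new SearchDir() is a BeautifulSoup keyword match followed by tag stripping. A self-contained sketch of that technique is given below; the helper name filter_html is hypothetical and not part of the repository.

import re
from bs4 import BeautifulSoup

# Sketch of the filtering technique used by SearchDir(); filter_html is a
# made-up helper, not code from start.py.
def filter_html(html_text, pattern="养老"):
    """Return cleaned HTML if the keyword appears in the page text, else None."""
    bs = BeautifulSoup(html_text, "lxml")
    if not bs.find_all(string=re.compile(pattern)):
        return None
    # Drop script/iframe tags so the later PDF conversion has less to load.
    for tag in bs(["script", "iframe"]):
        tag.extract()
    return str(bs)
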
@ -320,3 +356,8 @@ if __name__ == "__main__":
|
|||||||
saveHtmlDir = jsbd["htmlDir"]
|
saveHtmlDir = jsbd["htmlDir"]
|
||||||
savePdfDir = jsbd["pdfDir"]
|
savePdfDir = jsbd["pdfDir"]
|
||||||
PDFDir(saveHtmlDir, savePdfDir)
|
PDFDir(saveHtmlDir, savePdfDir)
|
||||||
|
elif arg == "search":
|
||||||
|
jsbd = GetJson()
|
||||||
|
htmlDir = jsbd["htmlDir"]
|
||||||
|
saveDir = jsbd["usefulDir"]
|
||||||
|
SearchDir(htmlDir, saveDir)
|
||||||
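
Based on the argument handling above, the new mode is selected by passing "search" on the command line; assuming start.py is run directly with the Python interpreter, the invocation would look like:

python start.py search

which reads htmlDir and usefulDir from the configuration and hands them to SearchDir().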