Compare commits
No commits in common. "master" and "3e5e027025016cbb19eb47ce981fefec6cef2e75" have entirely different histories.
master...3e5e027025
.gitignore (vendored): 1 line changed
@@ -1,4 +1,3 @@
 /html
 /pdf
-/useful
 *.zip
@@ -1,6 +1,5 @@
 {
     "jsonDir": "./mp",
-    "htmlDir": "./useful",
-    "pdfDir": "./pdf",
-    "usefulDir": "./useful"
+    "htmlDir": "./html",
+    "pdfDir": "./pdf"
 }
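For context, this JSON is presumably the config that GetJson() in start.py parses (see the hunks below). A minimal sketch of such a loader, assuming the file name config.json, which is not visible in this compare view:

import json

def load_config(path="config.json"):  # file name is an assumption; it is not shown in this diff
    # parse the config shown above; yields keys like "jsonDir", "htmlDir", "pdfDir"
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)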
start.py: 95 lines changed
@@ -1,12 +1,8 @@
 # coding=UTF-8
-from importlib.resources import path
 import os, sys
 import requests
 import json
-import pdfkit
-import codecs
 import subprocess
-import re
 from bs4 import BeautifulSoup
 from datetime import datetime, timedelta
 from time import sleep
@@ -47,8 +43,6 @@ def GetJson():
         jsbd["htmlDir"] = jsbd["htmlDir"][:-1]
     if jsbd["jsonDir"][-1] == "/":
         jsbd["jsonDir"] = jsbd["jsonDir"][:-1]
-    if jsbd["usefulDir"][-1] == "/":
-        jsbd["usefulDir"] = jsbd["usefulDir"][:-1]
     return jsbd
 
 
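The repeated trailing-slash checks in GetJson() could be collapsed. A sketch of an equivalent normalization for the keys the new config keeps, assuming every value is a path string:

def normalize_dirs(jsbd, keys=("jsonDir", "htmlDir", "pdfDir")):
    # str.rstrip("/") drops any number of trailing slashes,
    # which subsumes the single-character [:-1] checks above
    for k in keys:
        jsbd[k] = jsbd[k].rstrip("/")
    return jsbd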
@@ -219,7 +213,7 @@ def DownHtmlMain(jsonDir, saveHtmlDir):
             print("\r", end="")
         SaveFile(arthtmlsavepath, arthtmlstr)
 
-        sleep(0)  # wait a few seconds between articles so WeChat does not block over-fast downloads
+        sleep(5)  # wait a few seconds between articles so WeChat does not block over-fast downloads
 
 
 # Convert all html files in a folder to pdf
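Raising the delay from 0 to 5 seconds is a fixed-interval throttle. If a perfectly regular gap ever proves too predictable, a jittered delay is a common alternative; a sketch (the 5-second base is taken from the diff, the jitter width is an assumption):

import random
from time import sleep

def polite_sleep(base=5.0, jitter=2.0):
    # sleep base +/- jitter seconds so the request spacing is not perfectly regular
    sleep(base + random.uniform(-jitter, jitter))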
@@ -257,41 +251,21 @@ def PDFDir(htmldir, pdfdir):
 
 # Convert a single Html file to pdf
 def PDFOne(htmlpath, pdfpath, skipExists=True, removehtml=True):
-    print(htmlpath, pdfpath)
-    options = {
-        'page-size': 'Letter',
-        'margin-top': '0.75in',
-        'margin-right': '0.75in',
-        'margin-bottom': '0.75in',
-        'margin-left': '0.75in',
-        'encoding': "UTF-8",
-        'custom-header': [
-            ('Accept-Encoding', 'gzip')
-        ],
-        # 'cookie': [
-        #     ('cookie-empty-value', '""')
-        #     ('cookie-name1', 'cookie-value1'),
-        #     ('cookie-name2', 'cookie-value2'),
-        # ],
-        'no-outline': None,
-        'enable-local-file-access': None
-    }
-    pdfkit.from_file(htmlpath, pdfpath, options=options, verbose=True)
-    # if skipExists and os.path.exists(pdfpath):
-    #     print("pdf exists", pdfpath)
-    #     if removehtml:
-    #         os.remove(htmlpath)
-    #     return
-    # exepath = "wkhtmltopdf.exe"  # keep wkhtmltopdf.exe in the same directory as this py file
-    # cmdlist = []
-    # cmdlist.append(" --load-error-handling ignore ")
-    # cmdlist.append(" --page-height 200 ")  # the numbers can be tuned, or these two lines dropped
-    # cmdlist.append(" --page-width 140 ")
-    # cmdlist.append(" " + htmlpath + " ")
-    # cmdlist.append(" " + pdfpath + " ")
-    # cmdstr = exepath + "".join(cmdlist)
-    # print(cmdstr)
-    # result = subprocess.check_call(cmdstr, shell=False)
+    if skipExists and os.path.exists(pdfpath):
+        print("pdf exists", pdfpath)
+        if removehtml:
+            os.remove(htmlpath)
+        return
+    exepath = "wkhtmltopdf.exe"  # keep wkhtmltopdf.exe in the same directory as this py file
+    cmdlist = []
+    cmdlist.append(" --load-error-handling ignore ")
+    cmdlist.append(" --page-height 200 ")  # the numbers can be tuned, or these two lines dropped
+    cmdlist.append(" --page-width 140 ")
+    cmdlist.append(" " + htmlpath + " ")
+    cmdlist.append(" " + pdfpath + " ")
+    cmdstr = exepath + "".join(cmdlist)
+    print(cmdstr)
+    result = subprocess.check_call(cmdstr, shell=False)
     # stdout,stderr = result.communicate()
     # result.wait()  # wait for one conversion to finish before starting the next
     if removehtml:
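One caveat in the new version: subprocess.check_call(cmdstr, shell=False) is handed a single string. That happens to work on Windows, where the string is passed straight to CreateProcess, but it raises FileNotFoundError on POSIX, and paths containing spaces break the hand-built command line on either platform. A sketch of the same call with an argument list, which avoids both problems (flags copied from the diff; the executable location is an assumption):

import subprocess

def pdf_one(htmlpath, pdfpath, exepath="wkhtmltopdf.exe"):
    cmd = [
        exepath,                            # assumed to sit next to the script or on PATH
        "--load-error-handling", "ignore",
        "--page-height", "200",             # tune or drop, as the original comment says
        "--page-width", "140",
        htmlpath,
        pdfpath,
    ]
    subprocess.check_call(cmd)  # list argv: no shell quoting, spaces in paths are safe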
@@ -309,38 +283,6 @@ def PDFOne(htmlpath, pdfpath, skipExists=True, removehtml=True):
     """
 
 
-
-# Search for a keyword, find the matching html files, and collect them
-def SearchDir(htmldir, savedir):
-
-    recordFile = 'execute-records'
-
-    if not os.path.exists(savedir):
-        os.makedirs(savedir)
-    flist = os.listdir(htmldir)
-
-    for f in flist:
-        if (not f[-5:] == ".html") or ("tmp" in f):  # skip files that are not html and files containing "tmp"
-            continue
-
-        FoundFlag = False
-        htmlpath = htmldir + "/" + f
-        tmppath = htmlpath[:-5] + ".html"  # temporary file used for the pdf conversion
-        htmlstr = ReadFile(htmlpath)
-        bs = BeautifulSoup(htmlstr, "lxml")
-        # the pdf file name contains the article title; if the title has characters that cannot appear in a file name, the conversion fails
-        keyWord = bs.find_all(text=re.compile("养老"))
-        print(keyWord)
-        if (keyWord):
-            """
-            Strip js and similar tags to reduce what must load during the pdf conversion.
-            Note that css (link) is stripped here too; if the pdf layout comes out broken, keep the css.
-            """
-            [s.extract() for s in bs(["script", "iframe"])]
-            # , "link"
-            SaveFile(savedir + '/' + f, str(bs))
-
-
 if __name__ == "__main__":
     if len(sys.argv) == 1:
         arg = None
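The deleted SearchDir() filtered saved articles by keyword with BeautifulSoup. For reference, a condensed sketch of that matching step; note that recent bs4 releases prefer string= over the deprecated text= argument used in the removed code:

import re
from bs4 import BeautifulSoup

def matches_keyword(htmlstr, pattern="养老"):
    bs = BeautifulSoup(htmlstr, "lxml")
    # find_all(string=...) tests every text node against the regex
    return bool(bs.find_all(string=re.compile(pattern)))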
@@ -356,8 +298,3 @@ if __name__ == "__main__":
         saveHtmlDir = jsbd["htmlDir"]
         savePdfDir = jsbd["pdfDir"]
         PDFDir(saveHtmlDir, savePdfDir)
-    elif arg == "search":
-        jsbd = GetJson()
-        htmlDir = jsbd["htmlDir"]
-        saveDir = jsbd["usefulDir"]
-        SearchDir(htmlDir, saveDir)