Compare commits

..

No commits in common. "master" and "3e5e027025016cbb19eb47ce981fefec6cef2e75" have entirely different histories.

3 changed files with 18 additions and 83 deletions

1
.gitignore vendored
View File

@ -1,4 +1,3 @@
/html /html
/pdf /pdf
/useful
*.zip *.zip

View File

@ -1,6 +1,5 @@
{ {
"jsonDir": "./mp", "jsonDir": "./mp",
"htmlDir": "./useful", "htmlDir": "./html",
"pdfDir": "./pdf", "pdfDir": "./pdf"
"usefulDir": "./useful"
} }

View File

@ -1,12 +1,8 @@
# coding=UTF-8 # coding=UTF-8
from importlib.resources import path
import os, sys import os, sys
import requests import requests
import json import json
import pdfkit
import codecs
import subprocess import subprocess
import re
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from datetime import datetime, timedelta from datetime import datetime, timedelta
from time import sleep from time import sleep
@ -47,8 +43,6 @@ def GetJson():
jsbd["htmlDir"] = jsbd["htmlDir"][:-1] jsbd["htmlDir"] = jsbd["htmlDir"][:-1]
if jsbd["jsonDir"][-1] == "/": if jsbd["jsonDir"][-1] == "/":
jsbd["jsonDir"] = jsbd["jsonDir"][:-1] jsbd["jsonDir"] = jsbd["jsonDir"][:-1]
if jsbd["usefulDir"][-1] == "/":
jsbd["usefulDir"] = jsbd["usefulDir"][:-1]
return jsbd return jsbd
@ -219,7 +213,7 @@ def DownHtmlMain(jsonDir, saveHtmlDir):
print("\r", end="") print("\r", end="")
SaveFile(arthtmlsavepath, arthtmlstr) SaveFile(arthtmlsavepath, arthtmlstr)
sleep(0) # 防止下载过快被微信屏蔽间隔3秒下载一篇 sleep(5) # 防止下载过快被微信屏蔽间隔3秒下载一篇
# 把一个文件夹下的html文件都转为pdf # 把一个文件夹下的html文件都转为pdf
@ -257,41 +251,21 @@ def PDFDir(htmldir, pdfdir):
# 把一个Html文件转为pdf # 把一个Html文件转为pdf
def PDFOne(htmlpath, pdfpath, skipExists=True, removehtml=True): def PDFOne(htmlpath, pdfpath, skipExists=True, removehtml=True):
print(htmlpath, pdfpath) if skipExists and os.path.exists(pdfpath):
options = { print("pdf exists", pdfpath)
'page-size': 'Letter', if removehtml:
'margin-top': '0.75in', os.remove(htmlpath)
'margin-right': '0.75in', return
'margin-bottom': '0.75in', exepath = "wkhtmltopdf.exe" # 把wkhtmltopdf.exe文件保存到与本py文件相同的目录下
'margin-left': '0.75in', cmdlist = []
'encoding': "UTF-8", cmdlist.append(" --load-error-handling ignore ")
'custom-header': [ cmdlist.append(" --page-height 200 ") # 数字可以自己调节,也可以不加这两行
('Accept-Encoding', 'gzip') cmdlist.append(" --page-width 140 ")
], cmdlist.append(" " + htmlpath + " ")
# 'cookie': [ cmdlist.append(" " + pdfpath + " ")
# ('cookie-empty-value', '""') cmdstr = exepath + "".join(cmdlist)
# ('cookie-name1', 'cookie-value1'), print(cmdstr)
# ('cookie-name2', 'cookie-value2'), result = subprocess.check_call(cmdstr, shell=False)
# ],
'no-outline': None,
'enable-local-file-access': None
}
pdfkit.from_file(htmlpath, pdfpath, options=options, verbose=True)
# if skipExists and os.path.exists(pdfpath):
# print("pdf exists", pdfpath)
# if removehtml:
# os.remove(htmlpath)
# return
# exepath = "wkhtmltopdf.exe" # 把wkhtmltopdf.exe文件保存到与本py文件相同的目录下
# cmdlist = []
# cmdlist.append(" --load-error-handling ignore ")
# cmdlist.append(" --page-height 200 ") # 数字可以自己调节,也可以不加这两行
# cmdlist.append(" --page-width 140 ")
# cmdlist.append(" " + htmlpath + " ")
# cmdlist.append(" " + pdfpath + " ")
# cmdstr = exepath + "".join(cmdlist)
# print(cmdstr)
# result = subprocess.check_call(cmdstr, shell=False)
# stdout,stderr = result.communicate() # stdout,stderr = result.communicate()
# result.wait() #等待转换完一个再转下一个 # result.wait() #等待转换完一个再转下一个
if removehtml: if removehtml:
@ -309,38 +283,6 @@ def PDFOne(htmlpath, pdfpath, skipExists=True, removehtml=True):
""" """
# Search the downloaded HTML files for a keyword and collect the matching ones.
def SearchDir(htmldir, savedir, keyword="养老"):
    """Copy every HTML file in *htmldir* whose text matches *keyword* into *savedir*.

    Each candidate file is read with ``ReadFile`` and parsed with
    BeautifulSoup (``lxml`` parser). When at least one text node matches,
    the ``<script>`` and ``<iframe>`` elements are removed from the parsed
    document and the result is passed to ``SaveFile`` under the same file
    name inside *savedir*.

    Args:
        htmldir: Directory that is scanned (non-recursively) for ``.html`` files.
        savedir: Directory that receives the matching files; created if missing.
        keyword: Regular-expression pattern (string or compiled pattern)
            searched for in the text nodes of each document. Defaults to the
            keyword that used to be hard-coded.

    Returns:
        None.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists().
    os.makedirs(savedir, exist_ok=True)
    # Compile once instead of on every loop iteration.
    pattern = re.compile(keyword)
    for name in os.listdir(htmldir):
        # Skip anything that is not an HTML file, and temporary files ("tmp" in the name).
        if not name.endswith(".html") or "tmp" in name:
            continue
        htmlstr = ReadFile(htmldir + "/" + name)
        bs = BeautifulSoup(htmlstr, "lxml")
        # `string=` is the current name of the deprecated `text=` argument.
        matches = bs.find_all(string=pattern)
        print(matches)
        if not matches:
            continue
        # Strip scripts and iframes to reduce what has to be loaded when the
        # page is later converted to PDF. Stylesheets ("link") are kept on
        # purpose so the layout is not broken.
        for tag in bs(["script", "iframe"]):
            tag.extract()
        # NOTE(review): saving only matching files is assumed from the
        # function's purpose — confirm against the original indentation.
        SaveFile(savedir + "/" + name, str(bs))
if __name__ == "__main__": if __name__ == "__main__":
if len(sys.argv) == 1: if len(sys.argv) == 1:
arg = None arg = None
@ -356,8 +298,3 @@ if __name__ == "__main__":
saveHtmlDir = jsbd["htmlDir"] saveHtmlDir = jsbd["htmlDir"]
savePdfDir = jsbd["pdfDir"] savePdfDir = jsbd["pdfDir"]
PDFDir(saveHtmlDir, savePdfDir) PDFDir(saveHtmlDir, savePdfDir)
elif arg == "search":
jsbd = GetJson()
htmlDir = jsbd["htmlDir"]
saveDir = jsbd["usefulDir"]
SearchDir(htmlDir, saveDir)