Update start.py

LeLe86 2022-05-09 13:15:51 +08:00 committed by GitHub
parent 004992a250
commit 65019e7d89

start.py

@@ -1,9 +1,9 @@
import os, sys
import requests
import json
import subprocess
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from time import sleep
"""
@@ -11,104 +11,111 @@ from time import sleep
Discussion QQ group: 703431832, join passphrase: 不止技术流
"""

# Save text to a file
def SaveFile(fpath, fileContent):
    with open(fpath, 'w', encoding='utf-8') as f:
        f.write(fileContent)

# Read a file
def ReadFile(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        all_the_text = f.read()
    return all_the_text

# Convert a timestamp to a date string
def Timestamp2Datetime(stampstr):
    dt = datetime.utcfromtimestamp(stampstr)
    dt = dt + timedelta(hours=8)  # shift from UTC to UTC+8 (Beijing time)
    newtimestr = dt.strftime("%Y%m%d_%H%M%S")
    return newtimestr
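
Note that datetime.utcfromtimestamp is deprecated in recent Python releases. A minimal timezone-aware sketch of the same conversion (the helper name below is ours, not the project's):

from datetime import datetime, timezone, timedelta

def timestamp_to_cst_str(stamp):
    # interpret the epoch seconds directly in UTC+8 instead of shifting by hand
    dt = datetime.fromtimestamp(stamp, tz=timezone(timedelta(hours=8)))
    return dt.strftime("%Y%m%d_%H%M%S")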

# Initialize the environment from config.json
def GetJson():
    jstxt = ReadFile("config.json")
    jstxt = jstxt.replace("\\\\", "/").replace("\\", "/")  # turn backslashes into forward slashes so Windows paths don't break the JSON parse
    jsbd = json.loads(jstxt)
    if jsbd["htmlDir"][-1] == "/":
        jsbd["htmlDir"] = jsbd["htmlDir"][:-1]
    if jsbd["jsonDir"][-1] == "/":
        jsbd["jsonDir"] = jsbd["jsonDir"][:-1]
    return jsbd
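
The keys this script reads from config.json are htmlDir, jsonDir, and (for the pdf step) pdfDir. A hypothetical example with made-up paths; trailing slashes are tolerated, as the code above shows:

{
    "htmlDir": "D:/wechat/html",
    "jsonDir": "D:/wechat/json",
    "pdfDir": "D:/wechat/pdf"
}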

# Download the web page at url
def DownLoadHtml(url):
    # build request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Connection': 'keep-alive',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
    }
    session = requests.Session()
    session.trust_env = False
    response = session.get(url, headers=headers)
    if response.status_code == 200:
        htmltxt = response.text  # the page body
        return htmltxt
    else:
        return None

# Download an image and save it locally
def DownImg(url, savepath):
    # build request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Connection': 'keep-alive',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
    }
    session = requests.Session()
    session.trust_env = False
    response = session.get(url, headers=headers)
    with open(savepath, 'wb') as f:
        f.write(response.content)

# Rewrite the img src attributes so images display from the local copies
def ChangeImgSrc(htmltxt, saveimgdir, htmlname):
    bs = BeautifulSoup(htmltxt, "lxml")  # build a BeautifulSoup object from the page source; the second argument is the parser, fixed to lxml
    imgList = bs.findAll("img")
    imgindex = 0
    for img in imgList:
        imgindex += 1
        originalURL = ""  # the real image url
        if "data-src" in img.attrs:  # some <img> tags carry no data-src
            originalURL = img.attrs['data-src']
        elif "src" in img.attrs:  # fall back to src when present
            originalURL = img.attrs['src']
        else:
            originalURL = ""
        if originalURL.startswith("//"):  # protocol-relative urls need an explicit scheme
            originalURL = "http:" + originalURL
        if len(originalURL) > 20:
            print("\r down imgs " + str(imgindex), end="")
            if "data-type" in img.attrs:
                imgtype = img.attrs["data-type"]
            else:
                imgtype = "png"
            imgname = htmlname + "_" + str(imgindex) + "." + imgtype  # an image name such as 1.png
            imgsavepath = os.path.join(saveimgdir, imgname)  # where the image is saved
            DownImg(originalURL, imgsavepath)
            img.attrs["src"] = "images/" + imgname  # relative path used inside the page
        else:
            img.attrs["src"] = ""
    ChangeCssSrc(bs)  # fix the <link> tags
    ChangeContent(bs)  # clear js_content's style so the article body displays
    allscript = bs.findAll("script")
    for script in allscript:
        if "src" in script.attrs:  # remote js that fails to load makes the saved page very slow to open
            script["src"] = ""
    return str(bs)  # serialize the BeautifulSoup object back to a string for saving
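
A quick sketch of how these pieces compose, with a made-up article URL and output names (DownLoadHtml and SaveFile are defined earlier in this file):

html = DownLoadHtml("https://mp.weixin.qq.com/s/XXXXXX")  # hypothetical article url
if html is not None:  # DownLoadHtml returns None on a non-200 response
    fixed = ChangeImgSrc(html, "html/images", "20220509_131551_1")
    SaveFile("html/20220509_131551_1.html", fixed)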

def ChangeCssSrc(bs):
    linkList = bs.findAll("link")
@@ -118,111 +125,116 @@ def ChangeCssSrc(bs):
            newhref = "http:" + href
            link.attrs["href"] = newhref

def ChangeContent(bs):
    jscontent = bs.find(id="js_content")
    if jscontent:
        jscontent.attrs["style"] = ""
    else:
        print("----- the article may have been deleted -----")

# Article class
class Article():
    def __init__(self, url, pubdate, idx, title):
        self.url = url
        self.pubdate = pubdate
        self.idx = idx
        self.title = title

# Extract article urls and metadata from the json files saved with fiddler
def GetArticleList(jsondir):
    filelist = os.listdir(jsondir)
    ArtList = []
    for file in filelist:
        try:
            filepath = os.path.join(jsondir, file)
            filetxt = ReadFile(filepath)
            jsbody = json.loads(filetxt)
            general_msg_list = jsbody["general_msg_list"]
            jsbd2 = json.loads(general_msg_list)
            list = jsbd2["list"]
            for item in list:  # one item may contain several articles
                artidx = 1  # note: this index only exists to make saving the html convenient; it does not correspond to the real publishing slot (headline, second, third, ...)
                comm_msg_info = item["comm_msg_info"]
                pubstamp = comm_msg_info["datetime"]
                pubdate = Timestamp2Datetime(pubstamp)
                if comm_msg_info["type"] == 49:  # 49 is the ordinary image-and-text type; other types are not handled yet
                    app_msg_ext_info = item["app_msg_ext_info"]
                    url = app_msg_ext_info["content_url"]  # article link
                    idx = artidx
                    title = app_msg_ext_info["title"]
                    art = Article(url, pubdate, idx, title)
                    if len(url) > 3:  # skip incomplete urls
                        ArtList.append(art)
                        print(len(ArtList), pubdate, idx, title)
                    if app_msg_ext_info["is_multi"] == 1:  # several articles published in one push
                        multi_app_msg_item_list = app_msg_ext_info["multi_app_msg_item_list"]
                        for subArt in multi_app_msg_item_list:
                            artidx += 1  # give every sub-article its own index so filenames don't collide
                            url = subArt["content_url"]
                            idx = artidx
                            title = subArt["title"]
                            art = Article(url, pubdate, idx, title)
                            if len(url) > 3:
                                ArtList.append(art)
                                print(len(ArtList), pubdate, idx, title)
        except Exception:
            print("skipped, safe to ignore:", file)
    return ArtList
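
For orientation: inside each captured json file, general_msg_list is itself a JSON-encoded string, which is why the code decodes twice. Once decoded, it looks roughly like this abridged, hypothetical sketch; real captures carry many more fields, and only the keys the code reads are shown:

{
    "list": [
        {
            "comm_msg_info": {"datetime": 1652073351, "type": 49},
            "app_msg_ext_info": {
                "content_url": "http://mp.weixin.qq.com/s?...",
                "title": "...",
                "is_multi": 1,
                "multi_app_msg_item_list": [
                    {"content_url": "http://mp.weixin.qq.com/s?...", "title": "..."}
                ]
            }
        }
    ]
}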

def DownHtmlMain(jsonDir, saveHtmlDir):
    if not os.path.exists(saveHtmlDir):
        os.makedirs(saveHtmlDir)
    saveImgDir = saveHtmlDir + "/images"
    if not os.path.exists(saveImgDir):
        os.makedirs(saveImgDir)
    ArtList = GetArticleList(jsonDir)
    ArtList.sort(key=lambda x: x.pubdate, reverse=True)  # sort by date, newest first
    totalCount = len(ArtList)
    idx = 0
    for art in ArtList:
        idx += 1
        artname = art.pubdate + "_" + str(art.idx)
        arthtmlname = artname + ".html"
        arthtmlsavepath = saveHtmlDir + "/" + arthtmlname
        print(idx, "of", totalCount, artname, art.title)
        # skip articles that are already saved, so an interrupted run can resume
        if os.path.exists(arthtmlsavepath):
            print("exists", arthtmlsavepath)
            continue
        arthtmlstr = DownLoadHtml(art.url)
        if arthtmlstr is None:  # DownLoadHtml returns None on a non-200 response
            continue
        arthtmlstr = ChangeImgSrc(arthtmlstr, saveImgDir, artname)
        print("\r", end="")
        SaveFile(arthtmlsavepath, arthtmlstr)
        sleep(3)  # pause 3 seconds per article so WeChat does not block the crawl

# Convert every html file in a folder to pdf
def PDFDir(htmldir, pdfdir):
    if not os.path.exists(pdfdir):
        os.makedirs(pdfdir)
    flist = os.listdir(htmldir)
    for f in flist:
        if (not f[-5:] == ".html") or ("tmp" in f):  # skip non-html files and tmp files
            continue
        htmlpath = htmldir + "/" + f
        tmppath = htmlpath[:-5] + "_tmp.html"  # temporary file fed to the pdf converter
        htmlstr = ReadFile(htmlpath)
        bs = BeautifulSoup(htmlstr, "lxml")
        title = ""
        # the pdf filename includes the article title, but characters that are illegal in filenames make the conversion fail
        titleTag = bs.find(id="activity-name")
        if titleTag is not None:
            title = "_" + titleTag.get_text().replace(" ", "").replace("\n", "").replace("|", "").replace(":", "")
        ridx = htmlpath.rindex("/") + 1
        pdfname = htmlpath[ridx:-5] + title
        pdfpath = pdfdir + "/" + pdfname + ".pdf"
        """
        Strip js etc. so less has to load during the pdf conversion
@@ -230,24 +242,25 @@ def PDFDir(htmldir, pdfdir):
        """
        [s.extract() for s in bs(["script", "iframe", "link"])]
        SaveFile(tmppath, str(bs))
        try:
            PDFOne(tmppath, pdfpath)
        except Exception:
            print("pdf conversion failed, probably special characters in the title:", f)

# Convert one html file to pdf
def PDFOne(htmlpath, pdfpath, skipExists=True, removehtml=True):
    if skipExists and os.path.exists(pdfpath):
        print("pdf exists", pdfpath)
        if removehtml:
            os.remove(htmlpath)
        return
    exepath = "wkhtmltopdf.exe"  # put wkhtmltopdf.exe in the same directory as this py file
    cmdlist = []
    cmdlist.append(" --load-error-handling ignore ")
    cmdlist.append(" --page-height 200 ")  # tune the two page numbers to taste, or drop both lines
    cmdlist.append(" --page-width 140 ")
    cmdlist.append(" " + htmlpath + " ")
    cmdlist.append(" " + pdfpath + " ")
    cmdstr = exepath + "".join(cmdlist)
    print(cmdstr)
@@ -257,7 +270,6 @@ def PDFOne(htmlpath, pdfpath, skipExists=True, removehtml=True):
    if removehtml:
        os.remove(htmlpath)
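
The hunk above elides the lines that actually execute cmdstr; the script imports subprocess for this step. Purely as an illustration, and not the project's actual code, such a call could look like:

subprocess.call(cmdstr, shell=True)  # hypothetical sketch of the elided conversion step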
""" """
1.设置 1.设置
先去config.json文件中设置 先去config.json文件中设置
@ -268,18 +280,20 @@ def PDFOne(htmlpath,pdfpath,skipExists=True,removehtml=True):
运行 python start.py #开始下载html 运行 python start.py #开始下载html
运行 python start.py pdf #把下载的html转pdf 运行 python start.py pdf #把下载的html转pdf
""" """

if __name__ == "__main__":
    if len(sys.argv) == 1:
        arg = None
    else:
        arg = sys.argv[1]
    if arg is None or arg == "html":
        jsbd = GetJson()
        saveHtmlDir = jsbd["htmlDir"]
        jsdir = jsbd["jsonDir"]
        DownHtmlMain(jsdir, saveHtmlDir)
    elif arg == "pdf":
        jsbd = GetJson()
        saveHtmlDir = jsbd["htmlDir"]
        savePdfDir = jsbd["pdfDir"]
        PDFDir(saveHtmlDir, savePdfDir)