Update start.py

This commit is contained in:
LeLe86 2022-05-09 13:15:51 +08:00 committed by GitHub
parent 004992a250
commit 65019e7d89
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -11,17 +11,20 @@ from time import sleep
讨论QQ群 703431832 加群暗号:不止技术流 讨论QQ群 703431832 加群暗号:不止技术流
""" """
# Save text to a file
def SaveFile(fpath, fileContent):
    """Write ``fileContent`` to ``fpath`` as UTF-8 text.

    The file is opened in ``'w'`` mode, so an existing file at ``fpath`` is
    truncated and replaced.

    Args:
        fpath: Path of the file to create or overwrite.
        fileContent: Text to write.
    """
    with open(fpath, 'w', encoding='utf-8') as f:
        f.write(fileContent)
# Read a text file
def ReadFile(filepath):
    """Return the entire content of ``filepath`` decoded as UTF-8 text.

    Args:
        filepath: Path of the file to read.

    Returns:
        The full file content as a single string.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return f.read()
# 时间戳转日期 # 时间戳转日期
def Timestamp2Datetime(stampstr): def Timestamp2Datetime(stampstr):
dt = datetime.utcfromtimestamp(stampstr) dt = datetime.utcfromtimestamp(stampstr)
@ -29,6 +32,7 @@ def Timestamp2Datetime(stampstr):
newtimestr = dt.strftime("%Y%m%d_%H%M%S") newtimestr = dt.strftime("%Y%m%d_%H%M%S")
return newtimestr return newtimestr
# 初始化环境 # 初始化环境
def GetJson(): def GetJson():
jstxt = ReadFile("config.json") jstxt = ReadFile("config.json")
@ -59,6 +63,7 @@ def DownLoadHtml(url):
else: else:
return None return None
# 将图片从远程下载保存到本地 # 将图片从远程下载保存到本地
def DownImg(url, savepath): def DownImg(url, savepath):
# 构造请求头 # 构造请求头
@ -74,6 +79,7 @@ def DownImg(url,savepath):
with open(savepath, 'wb') as f: with open(savepath, 'wb') as f:
f.write(response.content) f.write(response.content)
# 修改网页中图片的src使图片能正常显示 # 修改网页中图片的src使图片能正常显示
def ChangeImgSrc(htmltxt, saveimgdir, htmlname): def ChangeImgSrc(htmltxt, saveimgdir, htmlname):
bs = BeautifulSoup(htmltxt, "lxml") # 由网页源代码生成BeautifulSoup对象第二个参数固定为lxml bs = BeautifulSoup(htmltxt, "lxml") # 由网页源代码生成BeautifulSoup对象第二个参数固定为lxml
@ -110,6 +116,7 @@ def ChangeImgSrc(htmltxt,saveimgdir,htmlname):
script["src"] = "" script["src"] = ""
return str(bs) # 将BeautifulSoup对象再转换为字符串用于保存 return str(bs) # 将BeautifulSoup对象再转换为字符串用于保存
def ChangeCssSrc(bs): def ChangeCssSrc(bs):
linkList = bs.findAll("link") linkList = bs.findAll("link")
for link in linkList: for link in linkList:
@ -118,6 +125,7 @@ def ChangeCssSrc(bs):
newhref = "http:" + href newhref = "http:" + href
link.attrs["href"] = newhref link.attrs["href"] = newhref
def ChangeContent(bs): def ChangeContent(bs):
jscontent = bs.find(id="js_content") jscontent = bs.find(id="js_content")
if jscontent: if jscontent:
@ -125,6 +133,7 @@ def ChangeContent(bs):
else: else:
print("-----可能文章被删了-----") print("-----可能文章被删了-----")
# 文章类 # 文章类
class Article(): class Article():
def __init__(self, url, pubdate, idx, title): def __init__(self, url, pubdate, idx, title):
@ -133,6 +142,7 @@ class Article():
self.idx = idx self.idx = idx
self.title = title self.title = title
# 从fiddler保存的json文件中提取文章url等信息 # 从fiddler保存的json文件中提取文章url等信息
def GetArticleList(jsondir): def GetArticleList(jsondir):
filelist = os.listdir(jsondir) filelist = os.listdir(jsondir)
@ -175,6 +185,7 @@ def GetArticleList(jsondir):
print("跳过,可不用管", file) print("跳过,可不用管", file)
return ArtList return ArtList
def DownHtmlMain(jsonDir, saveHtmlDir): def DownHtmlMain(jsonDir, saveHtmlDir):
saveHtmlDir = jsbd["htmlDir"] saveHtmlDir = jsbd["htmlDir"]
if not os.path.exists(saveHtmlDir): if not os.path.exists(saveHtmlDir):
@ -203,6 +214,7 @@ def DownHtmlMain(jsonDir,saveHtmlDir):
sleep(3) # 防止下载过快被微信屏蔽间隔3秒下载一篇 sleep(3) # 防止下载过快被微信屏蔽间隔3秒下载一篇
# 把一个文件夹下的html文件都转为pdf # 把一个文件夹下的html文件都转为pdf
def PDFDir(htmldir, pdfdir): def PDFDir(htmldir, pdfdir):
if not os.path.exists(pdfdir): if not os.path.exists(pdfdir):
@ -219,7 +231,7 @@ def PDFDir(htmldir,pdfdir):
# pdf文件名中包含文章标题但如果标题中有不能出现在文件名中的符号则会转换失败 # pdf文件名中包含文章标题但如果标题中有不能出现在文件名中的符号则会转换失败
titleTag = bs.find(id="activity-name") titleTag = bs.find(id="activity-name")
if titleTag is not None: if titleTag is not None:
title = "_" + titleTag.get_text().replace(" ", "").replace(" ","").replace("\n","") title = "_" + titleTag.get_text().replace(" ", "").replace(" ", "").replace("\n", "").replace("|", "").replace(":", "")
ridx = htmlpath.rindex("/") + 1 ridx = htmlpath.rindex("/") + 1
pdfname = htmlpath[ridx:-5] + title pdfname = htmlpath[ridx:-5] + title
pdfpath = pdfdir + "/" + pdfname + ".pdf" pdfpath = pdfdir + "/" + pdfname + ".pdf"
@ -235,6 +247,7 @@ def PDFDir(htmldir,pdfdir):
except: except:
print("转pdf失败可能是因为标题中有特殊字符", f) print("转pdf失败可能是因为标题中有特殊字符", f)
# 把一个Html文件转为pdf # 把一个Html文件转为pdf
def PDFOne(htmlpath, pdfpath, skipExists=True, removehtml=True): def PDFOne(htmlpath, pdfpath, skipExists=True, removehtml=True):
if skipExists and os.path.exists(pdfpath): if skipExists and os.path.exists(pdfpath):
@ -257,7 +270,6 @@ def PDFOne(htmlpath,pdfpath,skipExists=True,removehtml=True):
if removehtml: if removehtml:
os.remove(htmlpath) os.remove(htmlpath)
""" """
1.设置 1.设置
先去config.json文件中设置 先去config.json文件中设置
@ -268,6 +280,8 @@ def PDFOne(htmlpath,pdfpath,skipExists=True,removehtml=True):
运行 python start.py #开始下载html 运行 python start.py #开始下载html
运行 python start.py pdf #把下载的html转pdf 运行 python start.py pdf #把下载的html转pdf
""" """
if __name__ == "__main__": if __name__ == "__main__":
if len(sys.argv) == 1: if len(sys.argv) == 1:
arg = None arg = None