Update start.py
This commit is contained in:
parent
004992a250
commit
65019e7d89
18
start.py
18
start.py
@ -11,17 +11,20 @@ from time import sleep
|
||||
讨论QQ群 703431832 加群暗号:不止技术流
|
||||
"""
|
||||
|
||||
|
||||
# 保存文件
|
||||
def SaveFile(fpath, fileContent):
    """Write text to a file, replacing any previous content.

    Args:
        fpath: Path of the file to create or truncate.
        fileContent: String to write; it is encoded as UTF-8.
    """
    with open(fpath, mode='w', encoding='utf-8') as out_file:
        out_file.write(fileContent)
|
||||
|
||||
|
||||
# 读取文件
|
||||
def ReadFile(filepath):
    """Return the entire content of a UTF-8 text file as one string.

    Args:
        filepath: Path of the file to read.

    Returns:
        The decoded text, without a leading byte-order mark if the file
        had one.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # 'utf-8-sig' decodes plain UTF-8 exactly like 'utf-8' but also drops a
    # leading BOM, which some editors prepend when saving. A stray '\ufeff'
    # at the start of the text would otherwise leak into whatever parses it.
    with open(filepath, 'r', encoding='utf-8-sig') as f:
        return f.read()
|
||||
|
||||
|
||||
# 时间戳转日期
|
||||
def Timestamp2Datetime(stampstr):
|
||||
dt = datetime.utcfromtimestamp(stampstr)
|
||||
@ -29,6 +32,7 @@ def Timestamp2Datetime(stampstr):
|
||||
newtimestr = dt.strftime("%Y%m%d_%H%M%S")
|
||||
return newtimestr
|
||||
|
||||
|
||||
# 初始化环境
|
||||
def GetJson():
|
||||
jstxt = ReadFile("config.json")
|
||||
@ -59,6 +63,7 @@ def DownLoadHtml(url):
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
# 将图片从远程下载保存到本地
|
||||
def DownImg(url, savepath):
|
||||
# 构造请求头
|
||||
@ -74,6 +79,7 @@ def DownImg(url,savepath):
|
||||
with open(savepath, 'wb') as f:
|
||||
f.write(response.content)
|
||||
|
||||
|
||||
# 修改网页中图片的src,使图片能正常显示
|
||||
def ChangeImgSrc(htmltxt, saveimgdir, htmlname):
|
||||
bs = BeautifulSoup(htmltxt, "lxml") # 由网页源代码生成BeautifulSoup对象,第二个参数固定为lxml
|
||||
@ -110,6 +116,7 @@ def ChangeImgSrc(htmltxt,saveimgdir,htmlname):
|
||||
script["src"] = ""
|
||||
return str(bs) # 将BeautifulSoup对象再转换为字符串,用于保存
|
||||
|
||||
|
||||
def ChangeCssSrc(bs):
|
||||
linkList = bs.findAll("link")
|
||||
for link in linkList:
|
||||
@ -118,6 +125,7 @@ def ChangeCssSrc(bs):
|
||||
newhref = "http:" + href
|
||||
link.attrs["href"] = newhref
|
||||
|
||||
|
||||
def ChangeContent(bs):
|
||||
jscontent = bs.find(id="js_content")
|
||||
if jscontent:
|
||||
@ -125,6 +133,7 @@ def ChangeContent(bs):
|
||||
else:
|
||||
print("-----可能文章被删了-----")
|
||||
|
||||
|
||||
# 文章类
|
||||
class Article():
|
||||
def __init__(self, url, pubdate, idx, title):
|
||||
@ -133,6 +142,7 @@ class Article():
|
||||
self.idx = idx
|
||||
self.title = title
|
||||
|
||||
|
||||
# 从fiddler保存的json文件中提取文章url等信息
|
||||
def GetArticleList(jsondir):
|
||||
filelist = os.listdir(jsondir)
|
||||
@ -175,6 +185,7 @@ def GetArticleList(jsondir):
|
||||
print("跳过,可不用管", file)
|
||||
return ArtList
|
||||
|
||||
|
||||
def DownHtmlMain(jsonDir, saveHtmlDir):
|
||||
saveHtmlDir = jsbd["htmlDir"]
|
||||
if not os.path.exists(saveHtmlDir):
|
||||
@ -203,6 +214,7 @@ def DownHtmlMain(jsonDir,saveHtmlDir):
|
||||
|
||||
sleep(3) # 防止下载过快被微信屏蔽,间隔3秒下载一篇
|
||||
|
||||
|
||||
# 把一个文件夹下的html文件都转为pdf
|
||||
def PDFDir(htmldir, pdfdir):
|
||||
if not os.path.exists(pdfdir):
|
||||
@ -219,7 +231,7 @@ def PDFDir(htmldir,pdfdir):
|
||||
# pdf文件名中包含文章标题,但如果标题中有不能出现在文件名中的符号则会转换失败
|
||||
titleTag = bs.find(id="activity-name")
|
||||
if titleTag is not None:
|
||||
title = "_" + titleTag.get_text().replace(" ", "").replace(" ","").replace("\n","")
|
||||
title = "_" + titleTag.get_text().replace(" ", "").replace(" ", "").replace("\n", "").replace("|", "").replace(":", "")
|
||||
ridx = htmlpath.rindex("/") + 1
|
||||
pdfname = htmlpath[ridx:-5] + title
|
||||
pdfpath = pdfdir + "/" + pdfname + ".pdf"
|
||||
@ -235,6 +247,7 @@ def PDFDir(htmldir,pdfdir):
|
||||
except:
|
||||
print("转pdf失败,可能是因为标题中有特殊字符", f)
|
||||
|
||||
|
||||
# 把一个Html文件转为pdf
|
||||
def PDFOne(htmlpath, pdfpath, skipExists=True, removehtml=True):
|
||||
if skipExists and os.path.exists(pdfpath):
|
||||
@ -257,7 +270,6 @@ def PDFOne(htmlpath,pdfpath,skipExists=True,removehtml=True):
|
||||
if removehtml:
|
||||
os.remove(htmlpath)
|
||||
|
||||
|
||||
"""
|
||||
1.设置:
|
||||
先去config.json文件中设置
|
||||
@ -268,6 +280,8 @@ def PDFOne(htmlpath,pdfpath,skipExists=True,removehtml=True):
|
||||
运行 python start.py #开始下载html
|
||||
运行 python start.py pdf #把下载的html转pdf
|
||||
"""
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) == 1:
|
||||
arg = None
|
||||
|
||||
Loading…
Reference in New Issue
Block a user