Update start.py
This commit is contained in:
parent
004992a250
commit
65019e7d89
18
start.py
18
start.py
@ -11,17 +11,20 @@ from time import sleep
|
|||||||
讨论QQ群 703431832 加群暗号:不止技术流
|
讨论QQ群 703431832 加群暗号:不止技术流
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
# 保存文件
|
# 保存文件
|
||||||
def SaveFile(fpath, fileContent):
|
def SaveFile(fpath, fileContent):
|
||||||
with open(fpath, 'w', encoding='utf-8') as f:
|
with open(fpath, 'w', encoding='utf-8') as f:
|
||||||
f.write(fileContent)
|
f.write(fileContent)
|
||||||
|
|
||||||
|
|
||||||
# 读取文件
|
# 读取文件
|
||||||
def ReadFile(filepath):
|
def ReadFile(filepath):
|
||||||
with open(filepath, 'r', encoding='utf-8') as f:
|
with open(filepath, 'r', encoding='utf-8') as f:
|
||||||
all_the_text = f.read()
|
all_the_text = f.read()
|
||||||
return all_the_text
|
return all_the_text
|
||||||
|
|
||||||
|
|
||||||
# 时间戳转日期
|
# 时间戳转日期
|
||||||
def Timestamp2Datetime(stampstr):
|
def Timestamp2Datetime(stampstr):
|
||||||
dt = datetime.utcfromtimestamp(stampstr)
|
dt = datetime.utcfromtimestamp(stampstr)
|
||||||
@ -29,6 +32,7 @@ def Timestamp2Datetime(stampstr):
|
|||||||
newtimestr = dt.strftime("%Y%m%d_%H%M%S")
|
newtimestr = dt.strftime("%Y%m%d_%H%M%S")
|
||||||
return newtimestr
|
return newtimestr
|
||||||
|
|
||||||
|
|
||||||
# 初始化环境
|
# 初始化环境
|
||||||
def GetJson():
|
def GetJson():
|
||||||
jstxt = ReadFile("config.json")
|
jstxt = ReadFile("config.json")
|
||||||
@ -59,6 +63,7 @@ def DownLoadHtml(url):
|
|||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
# 将图片从远程下载保存到本地
|
# 将图片从远程下载保存到本地
|
||||||
def DownImg(url, savepath):
|
def DownImg(url, savepath):
|
||||||
# 构造请求头
|
# 构造请求头
|
||||||
@ -74,6 +79,7 @@ def DownImg(url,savepath):
|
|||||||
with open(savepath, 'wb') as f:
|
with open(savepath, 'wb') as f:
|
||||||
f.write(response.content)
|
f.write(response.content)
|
||||||
|
|
||||||
|
|
||||||
# 修改网页中图片的src,使图片能正常显示
|
# 修改网页中图片的src,使图片能正常显示
|
||||||
def ChangeImgSrc(htmltxt, saveimgdir, htmlname):
|
def ChangeImgSrc(htmltxt, saveimgdir, htmlname):
|
||||||
bs = BeautifulSoup(htmltxt, "lxml") # 由网页源代码生成BeautifulSoup对象,第二个参数固定为lxml
|
bs = BeautifulSoup(htmltxt, "lxml") # 由网页源代码生成BeautifulSoup对象,第二个参数固定为lxml
|
||||||
@ -110,6 +116,7 @@ def ChangeImgSrc(htmltxt,saveimgdir,htmlname):
|
|||||||
script["src"] = ""
|
script["src"] = ""
|
||||||
return str(bs) # 将BeautifulSoup对象再转换为字符串,用于保存
|
return str(bs) # 将BeautifulSoup对象再转换为字符串,用于保存
|
||||||
|
|
||||||
|
|
||||||
def ChangeCssSrc(bs):
|
def ChangeCssSrc(bs):
|
||||||
linkList = bs.findAll("link")
|
linkList = bs.findAll("link")
|
||||||
for link in linkList:
|
for link in linkList:
|
||||||
@ -118,6 +125,7 @@ def ChangeCssSrc(bs):
|
|||||||
newhref = "http:" + href
|
newhref = "http:" + href
|
||||||
link.attrs["href"] = newhref
|
link.attrs["href"] = newhref
|
||||||
|
|
||||||
|
|
||||||
def ChangeContent(bs):
|
def ChangeContent(bs):
|
||||||
jscontent = bs.find(id="js_content")
|
jscontent = bs.find(id="js_content")
|
||||||
if jscontent:
|
if jscontent:
|
||||||
@ -125,6 +133,7 @@ def ChangeContent(bs):
|
|||||||
else:
|
else:
|
||||||
print("-----可能文章被删了-----")
|
print("-----可能文章被删了-----")
|
||||||
|
|
||||||
|
|
||||||
# 文章类
|
# 文章类
|
||||||
class Article():
|
class Article():
|
||||||
def __init__(self, url, pubdate, idx, title):
|
def __init__(self, url, pubdate, idx, title):
|
||||||
@ -133,6 +142,7 @@ class Article():
|
|||||||
self.idx = idx
|
self.idx = idx
|
||||||
self.title = title
|
self.title = title
|
||||||
|
|
||||||
|
|
||||||
# 从fiddler保存的json文件中提取文章url等信息
|
# 从fiddler保存的json文件中提取文章url等信息
|
||||||
def GetArticleList(jsondir):
|
def GetArticleList(jsondir):
|
||||||
filelist = os.listdir(jsondir)
|
filelist = os.listdir(jsondir)
|
||||||
@ -175,6 +185,7 @@ def GetArticleList(jsondir):
|
|||||||
print("跳过,可不用管", file)
|
print("跳过,可不用管", file)
|
||||||
return ArtList
|
return ArtList
|
||||||
|
|
||||||
|
|
||||||
def DownHtmlMain(jsonDir, saveHtmlDir):
|
def DownHtmlMain(jsonDir, saveHtmlDir):
|
||||||
saveHtmlDir = jsbd["htmlDir"]
|
saveHtmlDir = jsbd["htmlDir"]
|
||||||
if not os.path.exists(saveHtmlDir):
|
if not os.path.exists(saveHtmlDir):
|
||||||
@ -203,6 +214,7 @@ def DownHtmlMain(jsonDir,saveHtmlDir):
|
|||||||
|
|
||||||
sleep(3) # 防止下载过快被微信屏蔽,间隔3秒下载一篇
|
sleep(3) # 防止下载过快被微信屏蔽,间隔3秒下载一篇
|
||||||
|
|
||||||
|
|
||||||
# 把一个文件夹下的html文件都转为pdf
|
# 把一个文件夹下的html文件都转为pdf
|
||||||
def PDFDir(htmldir, pdfdir):
|
def PDFDir(htmldir, pdfdir):
|
||||||
if not os.path.exists(pdfdir):
|
if not os.path.exists(pdfdir):
|
||||||
@ -219,7 +231,7 @@ def PDFDir(htmldir,pdfdir):
|
|||||||
# pdf文件名中包含文章标题,但如果标题中有不能出现在文件名中的符号则会转换失败
|
# pdf文件名中包含文章标题,但如果标题中有不能出现在文件名中的符号则会转换失败
|
||||||
titleTag = bs.find(id="activity-name")
|
titleTag = bs.find(id="activity-name")
|
||||||
if titleTag is not None:
|
if titleTag is not None:
|
||||||
title = "_" + titleTag.get_text().replace(" ", "").replace(" ","").replace("\n","")
|
title = "_" + titleTag.get_text().replace(" ", "").replace(" ", "").replace("\n", "").replace("|", "").replace(":", "")
|
||||||
ridx = htmlpath.rindex("/") + 1
|
ridx = htmlpath.rindex("/") + 1
|
||||||
pdfname = htmlpath[ridx:-5] + title
|
pdfname = htmlpath[ridx:-5] + title
|
||||||
pdfpath = pdfdir + "/" + pdfname + ".pdf"
|
pdfpath = pdfdir + "/" + pdfname + ".pdf"
|
||||||
@ -235,6 +247,7 @@ def PDFDir(htmldir,pdfdir):
|
|||||||
except:
|
except:
|
||||||
print("转pdf失败,可能是因为标题中有特殊字符", f)
|
print("转pdf失败,可能是因为标题中有特殊字符", f)
|
||||||
|
|
||||||
|
|
||||||
# 把一个Html文件转为pdf
|
# 把一个Html文件转为pdf
|
||||||
def PDFOne(htmlpath, pdfpath, skipExists=True, removehtml=True):
|
def PDFOne(htmlpath, pdfpath, skipExists=True, removehtml=True):
|
||||||
if skipExists and os.path.exists(pdfpath):
|
if skipExists and os.path.exists(pdfpath):
|
||||||
@ -257,7 +270,6 @@ def PDFOne(htmlpath,pdfpath,skipExists=True,removehtml=True):
|
|||||||
if removehtml:
|
if removehtml:
|
||||||
os.remove(htmlpath)
|
os.remove(htmlpath)
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
1.设置:
|
1.设置:
|
||||||
先去config.json文件中设置
|
先去config.json文件中设置
|
||||||
@ -268,6 +280,8 @@ def PDFOne(htmlpath,pdfpath,skipExists=True,removehtml=True):
|
|||||||
运行 python start.py #开始下载html
|
运行 python start.py #开始下载html
|
||||||
运行 python start.py pdf #把下载的html转pdf
|
运行 python start.py pdf #把下载的html转pdf
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
if len(sys.argv) == 1:
|
if len(sys.argv) == 1:
|
||||||
arg = None
|
arg = None
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user