Update start.py
This commit is contained in:
parent
004992a250
commit
65019e7d89
204
start.py
204
start.py
@ -1,9 +1,9 @@
|
|||||||
import os,sys
|
import os, sys
|
||||||
import requests
|
import requests
|
||||||
import json
|
import json
|
||||||
import subprocess
|
import subprocess
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from datetime import datetime,timedelta
|
from datetime import datetime, timedelta
|
||||||
from time import sleep
|
from time import sleep
|
||||||
|
|
||||||
"""
|
"""
|
||||||
@ -11,62 +11,67 @@ from time import sleep
|
|||||||
讨论QQ群 703431832 加群暗号:不止技术流
|
讨论QQ群 703431832 加群暗号:不止技术流
|
||||||
"""
|
"""
|
||||||
|
|
||||||
#保存文件
|
|
||||||
def SaveFile(fpath, fileContent):
    """Write ``fileContent`` to ``fpath`` as UTF-8 text.

    The file is opened in ``'w'`` mode, so existing content is replaced.
    """
    with open(fpath, mode='w', encoding='utf-8') as handle:
        handle.write(fileContent)
|
||||||
|
|
||||||
#读取文件
|
|
||||||
|
# 读取文件
|
||||||
def ReadFile(filepath):
    """Return the whole text content of ``filepath``.

    The file is decoded as UTF-8. The ``utf-8-sig`` codec is used so that a
    leading byte-order mark (written by some Windows editors) is dropped
    instead of ending up in the returned string, where it would make
    ``json.loads`` reject the text. Files without a BOM decode exactly as
    they do with plain ``utf-8``.
    """
    with open(filepath, 'r', encoding='utf-8-sig') as f:
        return f.read()
|
||||||
|
|
||||||
#时间戳转日期
|
|
||||||
|
# 时间戳转日期
|
||||||
def Timestamp2Datetime(stampstr):
    """Format a Unix timestamp as ``YYYYmmdd_HHMMSS`` at a fixed UTC+8 offset.

    Args:
        stampstr: Seconds since the epoch, as a number.

    Returns:
        The formatted time string, e.g. ``"19700101_080000"`` for ``0``.
    """
    # datetime.utcfromtimestamp() is deprecated since Python 3.12; building an
    # aware datetime at +08:00 yields the same wall-clock digits as the old
    # "UTC value plus eight hours" computation.
    from datetime import timezone

    utc_plus_8 = timezone(timedelta(hours=8))
    return datetime.fromtimestamp(stampstr, tz=utc_plus_8).strftime("%Y%m%d_%H%M%S")
|
||||||
|
|
||||||
#初始化环境
|
|
||||||
|
# 初始化环境
|
||||||
def GetJson():
    """Load ``config.json`` and normalise its directory settings.

    The raw text has every backslash turned into a forward slash before it is
    parsed, so Windows-style paths in the config do not break JSON parsing.
    One trailing ``/`` is then removed from ``htmlDir`` and ``jsonDir``.

    Returns:
        The parsed configuration dict.

    Raises:
        KeyError: If ``htmlDir`` or ``jsonDir`` is missing from the config.
    """
    jstxt = ReadFile("config.json")
    # Collapse doubled backslashes first so "\\\\" becomes one "/", not two.
    jstxt = jstxt.replace("\\\\", "/").replace("\\", "/")
    jsbd = json.loads(jstxt)
    for key in ("htmlDir", "jsonDir"):
        # endswith() rather than [-1] so an empty value does not raise IndexError.
        if jsbd[key].endswith("/"):
            jsbd[key] = jsbd[key][:-1]
    return jsbd
|
||||||
|
|
||||||
|
|
||||||
#下载url网页
|
# 下载url网页
|
||||||
def DownLoadHtml(url):
    """Fetch ``url`` and return the page text.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as text when the status code is 200, otherwise
        ``None``.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the server does not answer within the timeout.
    """
    # Browser-like request headers.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Connection': 'keep-alive',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
    }
    # The context manager closes the session's connections when done.
    with requests.Session() as session:
        # Ignore proxy settings taken from environment variables.
        session.trust_env = False
        # Without a timeout a stalled server would block the crawl forever.
        response = session.get(url, headers=headers, timeout=30)
        if response.status_code == 200:
            return response.text
        return None
|
||||||
|
|
||||||
#将图片从远程下载保存到本地
|
|
||||||
def DownImg(url,savepath):
|
# 将图片从远程下载保存到本地
|
||||||
#构造请求头
|
def DownImg(url, savepath):
|
||||||
|
# 构造请求头
|
||||||
headers = {
|
headers = {
|
||||||
'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
|
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
|
||||||
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||||
'Connection':'keep-alive',
|
'Connection': 'keep-alive',
|
||||||
'Accept-Language':'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
|
'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
|
||||||
}
|
}
|
||||||
session = requests.Session()
|
session = requests.Session()
|
||||||
session.trust_env = False
|
session.trust_env = False
|
||||||
@ -74,41 +79,43 @@ def DownImg(url,savepath):
|
|||||||
with open(savepath, 'wb') as f:
|
with open(savepath, 'wb') as f:
|
||||||
f.write(response.content)
|
f.write(response.content)
|
||||||
|
|
||||||
#修改网页中图片的src,使图片能正常显示
|
|
||||||
def ChangeImgSrc(htmltxt, saveimgdir, htmlname):
    """Download the page's images and rewrite the page to use the local copies.

    Each ``<img>`` whose URL is longer than 20 characters is downloaded into
    ``saveimgdir`` and its ``src`` is set to ``images/<name>``; other images
    get an empty ``src``. ``<link>`` tags and the ``js_content`` element are
    adjusted through ``ChangeCssSrc`` and ``ChangeContent``, and every
    ``<script>`` that has a ``src`` gets it blanked.

    Args:
        htmltxt: HTML source of the article.
        saveimgdir: Directory the image files are written to.
        htmlname: Prefix for image file names (``<htmlname>_<n>.<type>``).

    Returns:
        The modified document serialised back to a string.
    """
    bs = BeautifulSoup(htmltxt, "lxml")
    # find_all() is the current name of the deprecated findAll() alias.
    for imgindex, img in enumerate(bs.find_all("img"), start=1):
        # Prefer data-src; fall back to src; some <img> tags have neither.
        originalURL = img.attrs.get("data-src", img.attrs.get("src", ""))
        if originalURL.startswith("//"):
            # Protocol-relative URL: add the scheme.
            originalURL = "http:" + originalURL
        if len(originalURL) <= 20:
            img.attrs["src"] = ""
            continue
        print("\r down imgs " + "▇" * imgindex + " " + str(imgindex), end="")
        imgtype = img.attrs.get("data-type", "png")
        imgname = htmlname + "_" + str(imgindex) + "." + imgtype
        imgsavepath = os.path.join(saveimgdir, imgname)
        DownImg(originalURL, imgsavepath)
        # Path of the image relative to the saved html file.
        img.attrs["src"] = "images/" + imgname
    ChangeCssSrc(bs)  # fix up <link> tags
    ChangeContent(bs)  # reset the js_content style so the body is displayed
    for script in bs.find_all("script"):
        # Remote scripts that fail to load make the saved page open slowly.
        if "src" in script.attrs:
            script["src"] = ""
    return str(bs)
|
||||||
|
|
||||||
|
|
||||||
def ChangeCssSrc(bs):
|
def ChangeCssSrc(bs):
|
||||||
linkList = bs.findAll("link")
|
linkList = bs.findAll("link")
|
||||||
@ -118,111 +125,116 @@ def ChangeCssSrc(bs):
|
|||||||
newhref = "http:" + href
|
newhref = "http:" + href
|
||||||
link.attrs["href"] = newhref
|
link.attrs["href"] = newhref
|
||||||
|
|
||||||
|
|
||||||
def ChangeContent(bs):
    """Blank the inline style of the ``js_content`` element in ``bs``.

    When no such element is found, a notice is printed instead and the
    document is left untouched.
    """
    jscontent = bs.find(id="js_content")
    if not jscontent:
        print("-----可能文章被删了-----")
        return
    jscontent.attrs["style"] = ""
|
||||||
|
|
||||||
#文章类
|
|
||||||
|
# 文章类
|
||||||
class Article:
    """Plain record describing one article to download."""

    def __init__(self, url, pubdate, idx, title):
        # url: article link; pubdate: formatted publish time;
        # idx: number used in the saved file name; title: article title.
        self.url, self.pubdate = url, pubdate
        self.idx, self.title = idx, title
|
||||||
|
|
||||||
#从fiddler保存的json文件中提取文章url等信息
|
|
||||||
|
# 从fiddler保存的json文件中提取文章url等信息
|
||||||
def _append_article(ArtList, entry, pubdate, idx):
    """Append an ``Article`` built from ``entry`` unless its url is too short."""
    url = entry["content_url"]
    title = entry["title"]
    if len(url) > 3:  # skip incomplete urls
        ArtList.append(Article(url, pubdate, idx, title))
        print(len(ArtList), pubdate, idx, title)


def GetArticleList(jsondir):
    """Collect article url/date/title records from the json files in ``jsondir``.

    Each file is expected to hold a ``general_msg_list`` string that is itself
    JSON with a ``list`` of messages. Only messages whose
    ``comm_msg_info["type"]`` is 49 are used. The head article of a message
    gets idx 1 and the entries of ``multi_app_msg_item_list`` get 2, 3, ...
    so every article of one message maps to a distinct file name. The idx is
    only a file-naming aid, not the article's real position in the post.

    Args:
        jsondir: Directory containing the saved json files.

    Returns:
        A list of ``Article`` objects. A file that cannot be processed is
        reported and skipped; articles gathered before the failure are kept.
    """
    ArtList = []
    for filename in os.listdir(jsondir):
        try:
            filepath = os.path.join(jsondir, filename)
            jsbody = json.loads(ReadFile(filepath))
            # general_msg_list is a JSON document embedded as a string.
            msg_items = json.loads(jsbody["general_msg_list"])["list"]
            for item in msg_items:  # one item may carry several articles
                comm_msg_info = item["comm_msg_info"]
                pubdate = Timestamp2Datetime(comm_msg_info["datetime"])
                if comm_msg_info["type"] != 49:  # other message types are ignored
                    continue
                app_msg_ext_info = item["app_msg_ext_info"]
                _append_article(ArtList, app_msg_ext_info, pubdate, 1)
                if app_msg_ext_info["is_multi"] == 1:  # several articles in one post
                    sub_articles = app_msg_ext_info["multi_app_msg_item_list"]
                    # Number each sub-article separately; sharing one idx made
                    # them collide on the same saved file name.
                    for artidx, subArt in enumerate(sub_articles, start=2):
                        _append_article(ArtList, subArt, pubdate, artidx)
        except Exception:
            # Not a bare except, so Ctrl-C still interrupts the run.
            print("跳过,可不用管", filename)
    return ArtList
|
||||||
|
|
||||||
def DownHtmlMain(jsonDir, saveHtmlDir):
    """Download every article listed in ``jsonDir`` into ``saveHtmlDir``.

    Articles are processed newest first. A page is saved as
    ``<pubdate>_<idx>.html`` with its images under ``images/``. Pages whose
    file already exists are skipped, so an interrupted run can be resumed.

    Args:
        jsonDir: Directory with the saved json files.
        saveHtmlDir: Directory the html files are written to; created if
            missing.
    """
    # Use the saveHtmlDir argument as given; it is no longer overwritten
    # from a module-level global.
    saveImgDir = saveHtmlDir + "/images"
    # Creates saveHtmlDir as well when it does not exist yet.
    os.makedirs(saveImgDir, exist_ok=True)
    ArtList = GetArticleList(jsonDir)
    ArtList.sort(key=lambda x: x.pubdate, reverse=True)  # newest first
    totalCount = len(ArtList)
    for idx, art in enumerate(ArtList, start=1):
        artname = art.pubdate + "_" + str(art.idx)
        arthtmlsavepath = saveHtmlDir + "/" + artname + ".html"
        print(idx, "of", totalCount, artname, art.title)
        if os.path.exists(arthtmlsavepath):
            print("exists", arthtmlsavepath)
            continue
        arthtmlstr = DownLoadHtml(art.url)
        if arthtmlstr is None:
            # DownLoadHtml returns None on a non-200 answer; skip the article
            # instead of handing None to the HTML parser.
            print("download failed, skipped", art.url)
            sleep(3)
            continue
        arthtmlstr = ChangeImgSrc(arthtmlstr, saveImgDir, artname)
        print("\r", end="")
        SaveFile(arthtmlsavepath, arthtmlstr)
        sleep(3)  # pause between articles to avoid being blocked for crawling too fast
|
||||||
|
|
||||||
#把一个文件夹下的html文件都转为pdf
|
|
||||||
def PDFDir(htmldir,pdfdir):
|
# 把一个文件夹下的html文件都转为pdf
|
||||||
|
def PDFDir(htmldir, pdfdir):
|
||||||
if not os.path.exists(pdfdir):
|
if not os.path.exists(pdfdir):
|
||||||
os.makedirs(pdfdir)
|
os.makedirs(pdfdir)
|
||||||
flist = os.listdir(htmldir)
|
flist = os.listdir(htmldir)
|
||||||
for f in flist:
|
for f in flist:
|
||||||
if (not f[-5:]==".html") or ("tmp" in f): #不是html文件的不转换,含有tmp的不转换
|
if (not f[-5:] == ".html") or ("tmp" in f): # 不是html文件的不转换,含有tmp的不转换
|
||||||
continue
|
continue
|
||||||
htmlpath = htmldir+"/"+f
|
htmlpath = htmldir + "/" + f
|
||||||
tmppath = htmlpath[:-5] + "_tmp.html"#生成临时文件,供转pdf用
|
tmppath = htmlpath[:-5] + "_tmp.html" # 生成临时文件,供转pdf用
|
||||||
htmlstr = ReadFile(htmlpath)
|
htmlstr = ReadFile(htmlpath)
|
||||||
bs = BeautifulSoup(htmlstr, "lxml")
|
bs = BeautifulSoup(htmlstr, "lxml")
|
||||||
title = ""
|
title = ""
|
||||||
# pdf文件名中包含文章标题,但如果标题中有不能出现在文件名中的符号则会转换失败
|
# pdf文件名中包含文章标题,但如果标题中有不能出现在文件名中的符号则会转换失败
|
||||||
titleTag = bs.find(id="activity-name")
|
titleTag = bs.find(id="activity-name")
|
||||||
if titleTag is not None:
|
if titleTag is not None:
|
||||||
title = "_" + titleTag.get_text().replace(" ", "").replace(" ","").replace("\n","")
|
title = "_" + titleTag.get_text().replace(" ", "").replace(" ", "").replace("\n", "").replace("|", "").replace(":", "")
|
||||||
ridx = htmlpath.rindex("/") + 1
|
ridx = htmlpath.rindex("/") + 1
|
||||||
pdfname = htmlpath[ridx:-5] + title
|
pdfname = htmlpath[ridx:-5] + title
|
||||||
pdfpath = pdfdir+"/"+ pdfname + ".pdf"
|
pdfpath = pdfdir + "/" + pdfname + ".pdf"
|
||||||
|
|
||||||
"""
|
"""
|
||||||
把js等去掉,减少转PDF时的加载项,
|
把js等去掉,减少转PDF时的加载项,
|
||||||
@ -231,23 +243,24 @@ def PDFDir(htmldir,pdfdir):
|
|||||||
[s.extract() for s in bs(["script", "iframe", "link"])]
|
[s.extract() for s in bs(["script", "iframe", "link"])]
|
||||||
SaveFile(tmppath, str(bs))
|
SaveFile(tmppath, str(bs))
|
||||||
try:
|
try:
|
||||||
PDFOne(tmppath,pdfpath)
|
PDFOne(tmppath, pdfpath)
|
||||||
except:
|
except:
|
||||||
print("转pdf失败,可能是因为标题中有特殊字符",f)
|
print("转pdf失败,可能是因为标题中有特殊字符", f)
|
||||||
|
|
||||||
#把一个Html文件转为pdf
|
|
||||||
def PDFOne(htmlpath,pdfpath,skipExists=True,removehtml=True):
|
# 把一个Html文件转为pdf
|
||||||
|
def PDFOne(htmlpath, pdfpath, skipExists=True, removehtml=True):
|
||||||
if skipExists and os.path.exists(pdfpath):
|
if skipExists and os.path.exists(pdfpath):
|
||||||
print("pdf exists",pdfpath)
|
print("pdf exists", pdfpath)
|
||||||
if removehtml:
|
if removehtml:
|
||||||
os.remove(htmlpath)
|
os.remove(htmlpath)
|
||||||
return
|
return
|
||||||
exepath = "wkhtmltopdf.exe"#把wkhtmltopdf.exe文件保存到与本py文件相同的目录下
|
exepath = "wkhtmltopdf.exe" # 把wkhtmltopdf.exe文件保存到与本py文件相同的目录下
|
||||||
cmdlist =[]
|
cmdlist = []
|
||||||
cmdlist.append(" --load-error-handling ignore ")
|
cmdlist.append(" --load-error-handling ignore ")
|
||||||
cmdlist.append(" --page-height 200 ") #数字可以自己调节,也可以不加这两行
|
cmdlist.append(" --page-height 200 ") # 数字可以自己调节,也可以不加这两行
|
||||||
cmdlist.append(" --page-width 140 ")
|
cmdlist.append(" --page-width 140 ")
|
||||||
cmdlist.append(" " + htmlpath +" ")
|
cmdlist.append(" " + htmlpath + " ")
|
||||||
cmdlist.append(" " + pdfpath + " ")
|
cmdlist.append(" " + pdfpath + " ")
|
||||||
cmdstr = exepath + "".join(cmdlist)
|
cmdstr = exepath + "".join(cmdlist)
|
||||||
print(cmdstr)
|
print(cmdstr)
|
||||||
@ -257,7 +270,6 @@ def PDFOne(htmlpath,pdfpath,skipExists=True,removehtml=True):
|
|||||||
if removehtml:
|
if removehtml:
|
||||||
os.remove(htmlpath)
|
os.remove(htmlpath)
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
1.设置:
|
1.设置:
|
||||||
先去config.json文件中设置
|
先去config.json文件中设置
|
||||||
@ -268,18 +280,20 @@ def PDFOne(htmlpath,pdfpath,skipExists=True,removehtml=True):
|
|||||||
运行 python start.py #开始下载html
|
运行 python start.py #开始下载html
|
||||||
运行 python start.py pdf #把下载的html转pdf
|
运行 python start.py pdf #把下载的html转pdf
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # The first command-line argument selects the mode; none means "html".
    arg = None if len(sys.argv) == 1 else sys.argv[1]
    if arg in (None, "html"):
        # Download the articles as html files.
        jsbd = GetJson()
        saveHtmlDir = jsbd["htmlDir"]
        jsdir = jsbd["jsonDir"]
        DownHtmlMain(jsdir, saveHtmlDir)
    elif arg == "pdf":
        # Convert the downloaded html files to pdf.
        jsbd = GetJson()
        saveHtmlDir = jsbd["htmlDir"]
        savePdfDir = jsbd["pdfDir"]
        PDFDir(saveHtmlDir, savePdfDir)
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user