Update start.py

2022-05-09 13:15:51 +08:00
parent 004992a250
commit 65019e7d89
1 changed file with 120 additions and 106 deletions
--- a/start.py
+++ b/start.py
@@ -11,17 +11,20 @@ from time import sleep
 讨论QQ群 703431832 加群暗号:不止技术流
 """
 # 保存文件
 def SaveFile(fpath, fileContent):
    """Write text to a file as UTF-8, replacing any existing content.

    Args:
        fpath: Path of the file to create or overwrite.
        fileContent: The string to write.
    """
    with open(fpath, mode='w', encoding='utf-8') as out:
        out.write(fileContent)
 # 读取文件
 def ReadFile(filepath):
    """Return the full contents of a file decoded as UTF-8 text.

    Args:
        filepath: Path of the file to read.

    Returns:
        The file's contents as a single string.
    """
    with open(filepath, mode='r', encoding='utf-8') as src:
        return src.read()
 # 时间戳转日期
 def Timestamp2Datetime(stampstr):
    dt = datetime.utcfromtimestamp(stampstr)
@@ -29,6 +32,7 @@ def Timestamp2Datetime(stampstr):
    newtimestr = dt.strftime("%Y%m%d_%H%M%S")
    return newtimestr
 # 初始化环境
 def GetJson():
    jstxt = ReadFile("config.json")
@@ -59,6 +63,7 @@ def DownLoadHtml(url):
    else:
        return None
 # 将图片从远程下载保存到本地
 def DownImg(url, savepath):
    # 构造请求头
@@ -74,6 +79,7 @@ def DownImg(url,savepath):
    with open(savepath, 'wb') as f:
        f.write(response.content)
 # 修改网页中图片的src，使图片能正常显示
 def ChangeImgSrc(htmltxt, saveimgdir, htmlname):
    bs = BeautifulSoup(htmltxt, "lxml")  # 由网页源代码生成BeautifulSoup对象，第二个参数固定为lxml
@@ -110,6 +116,7 @@ def ChangeImgSrc(htmltxt,saveimgdir,htmlname):
            script["src"] = ""
    return str(bs)  # 将BeautifulSoup对象再转换为字符串，用于保存
 def ChangeCssSrc(bs):
    linkList = bs.findAll("link")
    for link in linkList:
@@ -118,6 +125,7 @@ def ChangeCssSrc(bs):
            newhref = "http:" + href
            link.attrs["href"] = newhref
 def ChangeContent(bs):
    jscontent = bs.find(id="js_content")
    if jscontent:
@@ -125,6 +133,7 @@ def ChangeContent(bs):
    else:
        print("-----可能文章被删了-----")
 # 文章类
 class Article():
    def __init__(self, url, pubdate, idx, title):
@@ -133,6 +142,7 @@ class Article():
        self.idx = idx
        self.title = title
 # 从fiddler保存的json文件中提取文章url等信息
 def GetArticleList(jsondir):
    filelist = os.listdir(jsondir)
@@ -175,6 +185,7 @@ def GetArticleList(jsondir):
            print("跳过，可不用管", file)
    return ArtList
 def DownHtmlMain(jsonDir, saveHtmlDir):
    saveHtmlDir = jsbd["htmlDir"]
    if not os.path.exists(saveHtmlDir):
@@ -203,6 +214,7 @@ def DownHtmlMain(jsonDir,saveHtmlDir):
        sleep(3)  # 防止下载过快被微信屏蔽，间隔3秒下载一篇
 # 把一个文件夹下的html文件都转为pdf
 def PDFDir(htmldir, pdfdir):
    if not os.path.exists(pdfdir):
@@ -219,7 +231,7 @@ def PDFDir(htmldir,pdfdir):
        # pdf文件名中包含文章标题，但如果标题中有不能出现在文件名中的符号则会转换失败
        titleTag = bs.find(id="activity-name")
        if titleTag is not None:
-            title = "_" + titleTag.get_text().replace(" ", "").replace("  ","").replace("\n","")
+            title = "_" + titleTag.get_text().replace(" ", "").replace("  ", "").replace("\n", "").replace("|", "").replace(":", "")
        ridx = htmlpath.rindex("/") + 1
        pdfname = htmlpath[ridx:-5] + title
        pdfpath = pdfdir + "/" + pdfname + ".pdf"
@@ -235,6 +247,7 @@ def PDFDir(htmldir,pdfdir):
        except:
            print("转pdf失败，可能是因为标题中有特殊字符", f)
 # 把一个Html文件转为pdf
 def PDFOne(htmlpath, pdfpath, skipExists=True, removehtml=True):
    if skipExists and os.path.exists(pdfpath):
@@ -257,7 +270,6 @@ def PDFOne(htmlpath,pdfpath,skipExists=True,removehtml=True):
    if removehtml:
        os.remove(htmlpath)
    """
        1.设置：
            先去config.json文件中设置
@@ -268,6 +280,8 @@ def PDFOne(htmlpath,pdfpath,skipExists=True,removehtml=True):
            运行 python start.py      #开始下载html  
            运行 python start.py pdf  #把下载的html转pdf 
    """
 if __name__ == "__main__":
    if len(sys.argv) == 1:
        arg = None