diff --git a/config.json b/config.json
new file mode 100644
index 0000000..54eedf6
--- /dev/null
+++ b/config.json
@@ -0,0 +1,5 @@
+{
+    "jsonDir": "C:/vWeChatFiles/rawlist/Dump-0805-15-00-45",
+    "htmlDir": "c:/vWeChatFiles/html/",
+    "pdfDir": "c:/vWeChatFiles/pdf/"
+}
\ No newline at end of file
diff --git a/version0.1.py b/version0.1.py
deleted file mode 100644
index 7a4484f..0000000
--- a/version0.1.py
+++ /dev/null
@@ -1,155 +0,0 @@
-import os, sys
-import requests
-import json
-from bs4 import BeautifulSoup
-from datetime import datetime, timedelta
-from time import sleep
-
-# Save text content to a file
-def SaveFile(fpath, fileContent):
-    with open(fpath, 'w', encoding='utf-8') as f:
-        f.write(fileContent)
-
-# Read a text file
-def ReadFile(filepath):
-    with open(filepath, 'r', encoding='utf-8') as f:
-        all_the_text = f.read()
-    return all_the_text
-
-# Convert a unix timestamp to a date string (UTC+8)
-def Timestamp2Datetime(stampstr):
-    dt = datetime.utcfromtimestamp(stampstr)
-    dt = dt + timedelta(hours=8)
-    newtimestr = dt.strftime("%Y%m%d_%H%M%S")
-    return newtimestr
-
-# Download the html page at the given url
-def DownLoadHtml(url):
-    # build the request headers
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-        'Connection': 'keep-alive',
-        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
-    }
-
-    response = requests.get(url, headers=headers)
-    if response.status_code == 200:
-        htmltxt = response.text  # body of the returned page
-        return htmltxt
-    else:
-        return None
-
-# Download an image from a remote url and save it locally
-def DownImg(url, savepath):
-    r = requests.get(url)
-    with open(savepath, 'wb') as f:
-        f.write(r.content)
-
-# Rewrite the src of each image in the page so the images display correctly
-def ChangeImgSrc(htmltxt, saveimgdir, htmlname):
-    bs = BeautifulSoup(htmltxt, "lxml")  # build a BeautifulSoup object from the page source; the second argument is fixed to "lxml"
-    imgList = bs.findAll("img")
-    imgindex = 0
-    for img in imgList:
-        imgindex += 1
-        originalURL = ""  # real url of the image
-        if "data-src" in img.attrs:  # some <img> tags carry the real url in data-src, others in src
-            originalURL = img.attrs["data-src"]
-        elif "src" in img.attrs:
-            originalURL = img.attrs["src"]
-        if len(originalURL) > 0:
-            print("down img", imgindex)
-            if "data-type" in img.attrs:
-                imgtype = img.attrs["data-type"]
-            else:
-                imgtype = "png"
-            imgname = htmlname + "_" + str(imgindex) + "." + imgtype  # image file name such as xxx_1.png
-            imgsavepath = os.path.join(saveimgdir, imgname)  # local path where the image is saved
-            DownImg(originalURL, imgsavepath)
-            img.attrs["src"] = "images/" + imgname  # relative path of the image inside the page
-        else:
-            img.attrs["src"] = ""
-    ChangeCssSrc(bs)  # fix the <link> tags
-    return str(bs)  # convert the BeautifulSoup object back to a string for saving
-
-def ChangeCssSrc(bs):
-    linkList = bs.findAll("link")
-    for link in linkList:
-        href = link.attrs["href"]
-        if href.startswith("//"):
-            newhref = "http:" + href
-            link.attrs["href"] = newhref
-
-# Article class
-class Article():
-    def __init__(self, url, pubdate, idx, title):
-        self.url = url
-        self.pubdate = pubdate
-        self.idx = idx
-        self.title = title
-
-# Extract article urls and other info from the json files saved by Fiddler
-def GetArticleList(jsondir):
-    filelist = os.listdir(jsondir)
-    ArtList = []
-    for file in filelist:
-        filepath = os.path.join(jsondir, file)
-        filetxt = ReadFile(filepath)
-        jsbody = json.loads(filetxt)
-        general_msg_list = jsbody["general_msg_list"]
-        jsbd2 = json.loads(general_msg_list)
-        list = jsbd2["list"]
-        for item in list:  # one item may contain several articles
-            artidx = 1  # note: this index only makes naming the saved html files convenient; it does not match the real position in the push (headline, second, third, ...)
-            comm_msg_info = item["comm_msg_info"]
-            app_msg_ext_info = item["app_msg_ext_info"]
-            pubstamp = comm_msg_info["datetime"]
-            pubdate = Timestamp2Datetime(pubstamp)
-            if comm_msg_info["type"] == 49:  # type 49 is a normal image-and-text article; other types are ignored for now
-                url = app_msg_ext_info["content_url"]  # article link
-                idx = artidx
-                title = app_msg_ext_info["title"]
-                art = Article(url, pubdate, idx, title)
-                ArtList.append(art)
-                print(len(ArtList), pubdate, idx, title)
-                if app_msg_ext_info["is_multi"] == 1:  # several articles published in one push
-                    artidx += 1
-                    multi_app_msg_item_list = app_msg_ext_info["multi_app_msg_item_list"]
-                    for subArt in multi_app_msg_item_list:
-                        url = subArt["content_url"]
-                        idx = artidx
-                        title = subArt["title"]
-                        art = Article(url, pubdate, idx, title)
-                        ArtList.append(art)
-                        print(len(ArtList), pubdate, idx, title)
-    return ArtList
-
-
-if __name__ == "__main__":
-    dir = "C:/vWeChatFiles/rawlist/Dump-0805-15-00-45"  # change to your own folder path
-    saveHtmlDir = "c:/vWeChatFiles/html/"  # change to your own save directory; create it if it does not exist
-    saveImgDir = "c:/vWeChatFiles/html/images/"  # change to your own save directory; create it if it does not exist
-    ArtList = GetArticleList(dir)
-    ArtList.sort(key=lambda x: x.pubdate, reverse=True)  # sort by date, newest first
-    totalCount = len(ArtList)
-    idx = 0
-    for art in ArtList:
-        idx += 1
-        artname = art.pubdate + "_" + str(art.idx)
-        arthtmlname = artname + ".html"
-        arthtmlsavepath = os.path.join(saveHtmlDir, arthtmlname)
-        print(idx, "of", totalCount, artname, art.title)
-        # skip articles that already exist locally, so an interrupted run can be resumed
-        if os.path.exists(arthtmlsavepath):
-            print("exists", arthtmlsavepath)
-            continue
-        arthtmlstr = DownLoadHtml(art.url)
-        arthtmlstr = ChangeImgSrc(arthtmlstr, saveImgDir, artname)
-        SaveFile(arthtmlsavepath, arthtmlstr)
-
-        sleep(3)  # wait 3 seconds between articles so WeChat does not block the downloads
\ No newline at end of file
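
Note: the deleted version0.1.py hard-coded its directories in the __main__ block, while the newly added config.json carries the same kind of paths (jsonDir, htmlDir, pdfDir). A minimal sketch of how a follow-up script might read them, assuming config.json sits next to the script; the load_config helper below is hypothetical and not part of this commit:

import json

# Hypothetical helper: load the directories from the config.json added in this commit,
# replacing the paths that version0.1.py used to hard-code.
def load_config(path="config.json"):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

if __name__ == "__main__":
    cfg = load_config()
    print(cfg["jsonDir"], cfg["htmlDir"], cfg["pdfDir"])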