Update start.py
修正了若干常见问题
This commit is contained in:
parent
0d52e92361
commit
ac3473fda1
88
start.py
88
start.py
@ -8,8 +8,7 @@ from time import sleep
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
本项目开源地址 https://github.com/LeLe86/vWeChatCrawl
|
本项目开源地址 https://github.com/LeLe86/vWeChatCrawl
|
||||||
讨论QQ群 703431832
|
讨论QQ群 703431832 加群暗号:不止技术流
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
#保存文件
|
#保存文件
|
||||||
@ -50,9 +49,10 @@ def DownLoadHtml(url):
|
|||||||
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||||
'Connection':'keep-alive',
|
'Connection':'keep-alive',
|
||||||
'Accept-Language':'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
|
'Accept-Language':'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
|
||||||
}
|
}
|
||||||
requests.packages.urllib3.disable_warnings()
|
session = requests.Session()
|
||||||
response = requests.get(url,headers = headers,proxies=None,verify=False)
|
session.trust_env = False
|
||||||
|
response = session.get(url,headers = headers)
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
htmltxt = response.text #返回的网页正文
|
htmltxt = response.text #返回的网页正文
|
||||||
return htmltxt
|
return htmltxt
|
||||||
@ -67,11 +67,12 @@ def DownImg(url,savepath):
|
|||||||
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||||
'Connection':'keep-alive',
|
'Connection':'keep-alive',
|
||||||
'Accept-Language':'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
|
'Accept-Language':'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
|
||||||
}
|
}
|
||||||
requests.packages.urllib3.disable_warnings()
|
session = requests.Session()
|
||||||
r = requests.get(url,headers = headers,proxies=None,verify=False)
|
session.trust_env = False
|
||||||
|
response = session.get(url, headers=headers)
|
||||||
with open(savepath, 'wb') as f:
|
with open(savepath, 'wb') as f:
|
||||||
f.write(r.content)
|
f.write(response.content)
|
||||||
|
|
||||||
#修改网页中图片的src,使图片能正常显示
|
#修改网页中图片的src,使图片能正常显示
|
||||||
def ChangeImgSrc(htmltxt,saveimgdir,htmlname):
|
def ChangeImgSrc(htmltxt,saveimgdir,htmlname):
|
||||||
@ -89,8 +90,8 @@ def ChangeImgSrc(htmltxt,saveimgdir,htmlname):
|
|||||||
originalURL = ""
|
originalURL = ""
|
||||||
if originalURL.startswith("//"):#如果url以//开头,则需要添加http:
|
if originalURL.startswith("//"):#如果url以//开头,则需要添加http:
|
||||||
originalURL = "http:" + originalURL
|
originalURL = "http:" + originalURL
|
||||||
if len(originalURL) > 0:
|
if len(originalURL) > 20:
|
||||||
print("\r down imgs " + "▇" * imgindex +" " + str(imgindex),end="")
|
print("\r down imgs " + "▇" * imgindex +" " + str(imgindex),end="")
|
||||||
if "data-type" in img.attrs:
|
if "data-type" in img.attrs:
|
||||||
imgtype = img.attrs["data-type"]
|
imgtype = img.attrs["data-type"]
|
||||||
else:
|
else:
|
||||||
@ -133,38 +134,41 @@ def GetArticleList(jsondir):
|
|||||||
filelist = os.listdir(jsondir)
|
filelist = os.listdir(jsondir)
|
||||||
ArtList = []
|
ArtList = []
|
||||||
for file in filelist:
|
for file in filelist:
|
||||||
filepath = os.path.join(jsondir,file)
|
try:
|
||||||
filetxt = ReadFile(filepath)
|
filepath = os.path.join(jsondir,file)
|
||||||
jsbody = json.loads(filetxt)
|
filetxt = ReadFile(filepath)
|
||||||
general_msg_list = jsbody["general_msg_list"]
|
jsbody = json.loads(filetxt)
|
||||||
jsbd2= json.loads(general_msg_list)
|
general_msg_list = jsbody["general_msg_list"]
|
||||||
list = jsbd2["list"]
|
jsbd2= json.loads(general_msg_list)
|
||||||
for item in list: #一个item里可能有多篇文章
|
list = jsbd2["list"]
|
||||||
artidx = 1 #请注意这里的编号只是为了保存html方便,并不对应于真实的文章发文位置(比如头条、次条、3条)
|
for item in list: #一个item里可能有多篇文章
|
||||||
comm_msg_info = item["comm_msg_info"]
|
artidx = 1 #请注意这里的编号只是为了保存html方便,并不对应于真实的文章发文位置(比如头条、次条、3条)
|
||||||
|
comm_msg_info = item["comm_msg_info"]
|
||||||
pubstamp = comm_msg_info["datetime"]
|
|
||||||
pubdate = Timestamp2Datetime(pubstamp)
|
pubstamp = comm_msg_info["datetime"]
|
||||||
if comm_msg_info["type"] == 49: #49为普通图文类型,还有其他类型,暂不考虑
|
pubdate = Timestamp2Datetime(pubstamp)
|
||||||
app_msg_ext_info = item["app_msg_ext_info"]
|
if comm_msg_info["type"] == 49: #49为普通图文类型,还有其他类型,暂不考虑
|
||||||
url = app_msg_ext_info["content_url"] #文章链接
|
app_msg_ext_info = item["app_msg_ext_info"]
|
||||||
idx = artidx
|
url = app_msg_ext_info["content_url"] #文章链接
|
||||||
title = app_msg_ext_info["title"]
|
idx = artidx
|
||||||
art = Article(url,pubdate,idx,title)
|
title = app_msg_ext_info["title"]
|
||||||
if len(url)>3:#url不完整则跳过
|
art = Article(url,pubdate,idx,title)
|
||||||
ArtList.append(art)
|
if len(url)>3:#url不完整则跳过
|
||||||
print(len(ArtList),pubdate, idx, title)
|
|
||||||
if app_msg_ext_info["is_multi"] == 1: # 一次发多篇
|
|
||||||
artidx += 1
|
|
||||||
multi_app_msg_item_list = app_msg_ext_info["multi_app_msg_item_list"]
|
|
||||||
for subArt in multi_app_msg_item_list:
|
|
||||||
url =subArt["content_url"]
|
|
||||||
idx =artidx
|
|
||||||
title = subArt["title"]
|
|
||||||
art = Article(url,pubdate,idx,title)
|
|
||||||
if len(url)>3:
|
|
||||||
ArtList.append(art)
|
ArtList.append(art)
|
||||||
print(len(ArtList),pubdate, idx, title)
|
print(len(ArtList),pubdate, idx, title)
|
||||||
|
if app_msg_ext_info["is_multi"] == 1: # 一次发多篇
|
||||||
|
artidx += 1
|
||||||
|
multi_app_msg_item_list = app_msg_ext_info["multi_app_msg_item_list"]
|
||||||
|
for subArt in multi_app_msg_item_list:
|
||||||
|
url =subArt["content_url"]
|
||||||
|
idx =artidx
|
||||||
|
title = subArt["title"]
|
||||||
|
art = Article(url,pubdate,idx,title)
|
||||||
|
if len(url)>3:
|
||||||
|
ArtList.append(art)
|
||||||
|
print(len(ArtList),pubdate, idx, title)
|
||||||
|
except:
|
||||||
|
print("跳过,可不用管",file)
|
||||||
return ArtList
|
return ArtList
|
||||||
|
|
||||||
def DownHtmlMain(jsonDir,saveHtmlDir):
|
def DownHtmlMain(jsonDir,saveHtmlDir):
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user