Update start.py
修正了若干常见问题
This commit is contained in:
parent
0d52e92361
commit
ac3473fda1
20
start.py
20
start.py
@@ -8,8 +8,7 @@ from time import sleep
|
||||
|
||||
"""
|
||||
本项目开源地址 https://github.com/LeLe86/vWeChatCrawl
|
||||
讨论QQ群 703431832
|
||||
|
||||
讨论QQ群 703431832 加群暗号:不止技术流
|
||||
"""
|
||||
|
||||
#保存文件
|
||||
@@ -51,8 +50,9 @@ def DownLoadHtml(url):
|
||||
'Connection':'keep-alive',
|
||||
'Accept-Language':'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
|
||||
}
|
||||
requests.packages.urllib3.disable_warnings()
|
||||
response = requests.get(url,headers = headers,proxies=None,verify=False)
|
||||
session = requests.Session()
|
||||
session.trust_env = False
|
||||
response = session.get(url,headers = headers)
|
||||
if response.status_code == 200:
|
||||
htmltxt = response.text #返回的网页正文
|
||||
return htmltxt
|
||||
@@ -68,10 +68,11 @@ def DownImg(url,savepath):
|
||||
'Connection':'keep-alive',
|
||||
'Accept-Language':'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
|
||||
}
|
||||
requests.packages.urllib3.disable_warnings()
|
||||
r = requests.get(url,headers = headers,proxies=None,verify=False)
|
||||
session = requests.Session()
|
||||
session.trust_env = False
|
||||
response = session.get(url, headers=headers)
|
||||
with open(savepath, 'wb') as f:
|
||||
f.write(r.content)
|
||||
f.write(response.content)
|
||||
|
||||
#修改网页中图片的src,使图片能正常显示
|
||||
def ChangeImgSrc(htmltxt,saveimgdir,htmlname):
|
||||
@@ -89,7 +90,7 @@ def ChangeImgSrc(htmltxt,saveimgdir,htmlname):
|
||||
originalURL = ""
|
||||
if originalURL.startswith("//"):#如果url以//开头,则需要添加http:
|
||||
originalURL = "http:" + originalURL
|
||||
if len(originalURL) > 0:
|
||||
if len(originalURL) > 20:
|
||||
print("\r down imgs " + "▇" * imgindex +" " + str(imgindex),end="")
|
||||
if "data-type" in img.attrs:
|
||||
imgtype = img.attrs["data-type"]
|
||||
@@ -133,6 +134,7 @@ def GetArticleList(jsondir):
|
||||
filelist = os.listdir(jsondir)
|
||||
ArtList = []
|
||||
for file in filelist:
|
||||
try:
|
||||
filepath = os.path.join(jsondir,file)
|
||||
filetxt = ReadFile(filepath)
|
||||
jsbody = json.loads(filetxt)
|
||||
@@ -165,6 +167,8 @@ def GetArticleList(jsondir):
|
||||
if len(url)>3:
|
||||
ArtList.append(art)
|
||||
print(len(ArtList),pubdate, idx, title)
|
||||
except:
|
||||
print("跳过,可不用管",file)
|
||||
return ArtList
|
||||
|
||||
def DownHtmlMain(jsonDir,saveHtmlDir):
|
||||
|
||||
Loading…
Reference in New Issue
Block a user