新增筛选固定内容功能
This commit is contained in:
		
							parent
							
								
									d38ffdab1d
								
							
						
					
					
						commit
						c4499fd076
					
				
							
								
								
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @ -1,3 +1,4 @@ | ||||
| /html | ||||
| /pdf | ||||
| /useful | ||||
| *.zip | ||||
| @ -1,5 +1,6 @@ | ||||
| { | ||||
|     "jsonDir": "./mp", | ||||
|     "htmlDir": "./html", | ||||
|     "pdfDir": "./pdf" | ||||
|     "htmlDir": "./useful", | ||||
|     "pdfDir": "./pdf", | ||||
|     "usefulDir": "./useful" | ||||
| } | ||||
							
								
								
									
										41
									
								
								start.py
									
									
									
									
									
								
							
							
						
						
									
										41
									
								
								start.py
									
									
									
									
									
								
							| @ -4,7 +4,9 @@ import os, sys | ||||
| import requests | ||||
| import json | ||||
| import pdfkit | ||||
| import codecs | ||||
| import subprocess | ||||
| import re | ||||
| from bs4 import BeautifulSoup | ||||
| from datetime import datetime, timedelta | ||||
| from time import sleep | ||||
| @ -45,6 +47,8 @@ def GetJson(): | ||||
|         jsbd["htmlDir"] = jsbd["htmlDir"][:-1] | ||||
|     if jsbd["jsonDir"][-1] == "/": | ||||
|         jsbd["jsonDir"] = jsbd["jsonDir"][:-1] | ||||
|     if jsbd["usefulDir"][-1] == "/": | ||||
|         jsbd["usefulDir"] = jsbd["usefulDir"][:-1] | ||||
|     return jsbd | ||||
| 
 | ||||
| 
 | ||||
| @ -305,6 +309,38 @@ def PDFOne(htmlpath, pdfpath, skipExists=True, removehtml=True): | ||||
|     """ | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
# Search the saved html files for a keyword and collect the matching ones
def SearchDir(htmldir, savedir, keyword="养老"):
    """
    Copy every html file in htmldir whose text contains keyword into savedir.

    Each "*.html" file directly under htmldir (files with "tmp" in their
    name are skipped) is parsed with BeautifulSoup. When at least one text
    node contains the keyword, the <script> and <iframe> elements are
    removed and the document is written to savedir under the same file name.

    Args:
        htmldir: directory holding the html files to scan (no trailing "/").
        savedir: directory receiving the matching files; created if missing.
        keyword: literal text to look for; defaults to the previously
            hard-coded keyword, so existing two-argument calls are unchanged.

    Returns:
        The list of file names that matched and were saved.

    NOTE(review): ReadFile and SaveFile are helpers defined elsewhere in
    this file; presumably they read/write text by path - confirm.
    """
    if not os.path.exists(savedir):
        os.makedirs(savedir)

    # Escape so the keyword is matched literally, and compile it only once
    # instead of once per file.
    pattern = re.compile(re.escape(keyword))
    saved = []

    for f in os.listdir(htmldir):
        # Skip anything that is not an html file, and temporary files
        if not f.endswith(".html") or "tmp" in f:
            continue

        htmlstr = ReadFile(htmldir + "/" + f)
        bs = BeautifulSoup(htmlstr, "lxml")

        # The search runs before scripts/iframes are stripped, so text
        # inside those elements also counts as a match.
        matches = bs.find_all(text=pattern)
        print(matches)
        if not matches:
            continue

        # Drop scripts and iframes from the saved copy. Stylesheets
        # ("link") are deliberately kept so the page layout is preserved.
        for s in bs(["script", "iframe"]):
            s.extract()
        SaveFile(savedir + "/" + f, str(bs))
        saved.append(f)

    return saved
| 
 | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
|     if len(sys.argv) == 1: | ||||
|         arg = None | ||||
| @ -320,3 +356,8 @@ if __name__ == "__main__": | ||||
|         saveHtmlDir = jsbd["htmlDir"] | ||||
|         savePdfDir = jsbd["pdfDir"] | ||||
|         PDFDir(saveHtmlDir, savePdfDir) | ||||
|     elif arg == "search": | ||||
|         jsbd = GetJson() | ||||
|         htmlDir = jsbd["htmlDir"] | ||||
|         saveDir = jsbd["usefulDir"] | ||||
|         SearchDir(htmlDir, saveDir) | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 rucky
						rucky