search.py
import re
from bs4 import BeautifulSoup
import json
from pathlib import Path
from collections import deque
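# Requires the beautifulsoup4 package (pip install beautifulsoup4)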
# Set root to the directory containing your HTML files
root = Path(".")
# Scan one HTML file and build an index record for it
def scanfile(path: Path, content: bytes) -> dict:
    htmlcontent = BeautifulSoup(content, 'html.parser')
    # Concatenate the text of all p, h2, h3 and h4 elements, keeping only
    # CJK characters, digits and ASCII letters as the searchable text
    textlist = "".join(
        map(lambda p: re.sub("[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a]", "", p.get_text()),
            htmlcontent.find_all(name=['p', 'h2', 'h3', 'h4'])))
    # Use the <title> tag as the title, falling back to the file name
    title = htmlcontent.find(name="title")
    title = title.get_text() if title else path.stem
    return {
        "title": title,
        "path": str(path.relative_to(root)),
        "text": textlist
    }
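# For illustration (hypothetical file): a docs/intro.html whose <title> is
# "Intro" and whose only paragraph reads "Hello, 世界!" would yield
#   {"title": "Intro", "path": "docs/intro.html", "text": "Hello世界"}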
if __name__ == "__main__":
    j = []
    target: deque[Path] = deque([root])
    # Walk the directory tree and index every .html file
    while target:
        file = target.pop()
        if file.is_dir():
            target.extend(file.iterdir())
        elif file.is_file() and file.suffix == ".html":
            j.append(scanfile(file, file.read_bytes()))
    # Write the scan results followed by search.js into searcher.js;
    # the HTML pages are expected to include searcher.js
    with open("./ref/searcher.js", "w", encoding='utf-8') as output:
        with open("./ref/search.js", "r", encoding='utf-8') as src:
            # json.dumps twice: the inner call serializes the records, the
            # outer call yields a correctly escaped JS string literal (a quote
            # or backslash in a title would otherwise break the generated file)
            output.write("let SearchResult = " + json.dumps(json.dumps(j)) + ";\n")
            output.write(src.read())
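# The generated searcher.js therefore begins with a line of the form
#   let SearchResult = "[{\"title\": ..., \"path\": ..., \"text\": ...}]";
# followed by the contents of search.js, which presumably parses
# SearchResult (e.g. with JSON.parse) and implements the in-page search.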