-
Notifications
You must be signed in to change notification settings - Fork 1
/
common.py
38 lines (36 loc) · 1.16 KB
/
common.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from bs4 import BeautifulSoup
# Settings shared by every Elasticsearch index created by this project.
index_settings = {
    "index": {
        "number_of_shards": 1,
        "analysis": {
            "analyzer": {
                # The phrase suggester relies on both the 'trigram' and the
                # 'reverse' analyzers (see mappings/hugo.json).
                "trigram": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase", "shingle"],
                },
                "reverse": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase", "reverse"],
                },
            },
            "filter": {
                # Custom 'shingle' filter referenced by the 'trigram' analyzer
                # above; emits shingles of 2 to 3 tokens.
                "shingle": {
                    "type": "shingle",
                    "min_shingle_size": 2,
                    "max_shingle_size": 3,
                },
            },
        },
    },
}
def html2text(html):
    """
    Strip the markup from the given HTML and return the remaining text.

    The input is parsed with BeautifulSoup's "html.parser" backend, and every
    string node found in the resulting tree is concatenated, with no
    separator, into a single string.
    """
    soup = BeautifulSoup(html, features="html.parser")
    # string=True matches the string nodes themselves rather than tags.
    text_nodes = soup.find_all(string=True)
    return ''.join(text_nodes)