-
-
Notifications
You must be signed in to change notification settings - Fork 13
/
cleanup.py
103 lines (97 loc) · 2.31 KB
/
cleanup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import yyjson
from glob import glob
import esm
import fasttext
# Lower-case gambling- and adult-content keywords/phrases. Every entry is
# entered into the esm index built right after this list, and documents are
# scored by how many of these substrings they contain. Order is preserved
# as-is; some entries overlap (e.g. "porn" / "video porn"), which affects the
# hit count, so do not de-duplicate without re-tuning the threshold.
banned_word = [
    "xxx",
    "lotto",
    "poker",
    "porn",
    "bokep",
    "judi",
    "sange",
    "togel",
    "memek",
    "kontol",
    "dientot",
    "qiuqiu",
    "betting",
    "taruhan",
    "cock",
    "toket",
    "video jav",
    "video porn",
    "pussy",
    "domino",
    "blowjob",
    "cerita seks",
    "cerita sex",
    "teen sex",
    "milf",
    "doggy style",
    "squirt",
    "fuck",
    "hentai",
    "cumshot",
    "rape",
    "colmek",
    "coli",
    "masturb",
    "tetek",
    "entot",
    "diperkosa",
    "bispak",
    "nyepong",
    "sepong",
    "jablay",
    "ngewe",
    "jilbab hot",
    "lonte",
    "fortamen",
    "vimax",
    "vig power",
    "ceritasex",
    "cerita dewasa",
    "sabung ayam",
    "agen bola",
    "bugil",
]
# Multi-keyword matcher over `banned_word`. Despite the variable name, the
# entries are content keywords, not domain names.
banned_domain_index = esm.Index()
for keyword in banned_word:
    banned_domain_index.enter(keyword)
# fix() is called exactly once, after every keyword has been entered and
# before the index is queried anywhere in this script.
banned_domain_index.fix()
# Maps line-feed and carriage-return characters to a single space each.
_LINE_BREAKS_TO_SPACES = str.maketrans({"\n": " ", "\r": " "})


def preprocess_for_fasttext(x):
    """Flatten *x* onto one line and return a bounded window of it.

    Line-feed and carriage-return characters are each replaced by a space.
    The result is cut to its first 4000 characters, and the last 1500
    characters of that prefix are returned. For inputs of 4000 characters or
    more this is the slice ``[2500:4000]``; shorter inputs yield their final
    1500 characters (or the whole string if shorter than that).
    """
    single_line = x.translate(_LINE_BREAKS_TO_SPACES)
    prefix = single_line[:4000]
    return prefix[-1500:]
# fastText classifier consulted for documents the cheap heuristics flag; the
# loop below keeps a flagged document only when its first predicted label is
# "__label__sfw".
model = fasttext.load_model("nsfw_filter.bin")

# URL substrings that mark a document as a search/listing/tag page or as
# hosted on a spam-prone TLD.
# NOTE(review): "s=" matches any URL containing that substring (e.g.
# "?news=1"), not only a "?s=" query parameter - confirm this is intended.
SPAM_URL_MARKERS = (
    "/search/",
    "/page/",
    "?q=",
    "?search=",
    "s=",
    ".space",
    ".icu",
    "/tag/",
)
# Body-text phrases treated as SEO boilerplate (bad content).
SEO_TEXT_MARKERS = ("search result", "sites directory")

# Filter every "*.json-clean" file (one JSON object per line, with "url" and
# "text" keys) into "<name>-nobet-porn-agc", appending the kept lines.
#
# NOTE(review): this file's indentation was lost in the copy this block was
# reconstructed from. The nesting below is the only one in which `continue`
# does not make the classifier branch unreachable - confirm against upstream.
for file in glob("*.json-clean"):
    print(f"-> Processing {file}")
    # Context managers close both handles even if a record fails to parse;
    # previously the output handle was never closed. The output encoding is
    # pinned to UTF-8 to match the explicit UTF-8 decode of the input.
    with open(file + "-nobet-porn-agc", "a", encoding="utf-8") as out:
        with open(file, "rb") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) are not JSON.
                    continue
                doc = yyjson.loads(line)
                url = doc["url"].lower()
                text = doc["text"].lower()
                # Decode once and reuse for both matching and output.
                raw = line.decode("utf-8")
                # Keywords are matched against the whole lower-cased JSON
                # line (URL and text together), not just the text field.
                qline = banned_domain_index.query(raw.lower())

                flagged = (
                    len(qline) > 5  # more than five keyword hits
                    or any(marker in url for marker in SPAM_URL_MARKERS)
                    or any(phrase in text for phrase in SEO_TEXT_MARKERS)
                )
                if not flagged:
                    # Clean by the cheap heuristics: keep without classifying.
                    out.write(raw + "\n")
                    continue

                # Flagged documents with no keyword hit at all (spam URL or
                # SEO text only) are dropped. The rest get a second opinion
                # from the classifier and are kept only if labelled SFW.
                if qline:
                    nsfw = model.predict(preprocess_for_fasttext(text))
                    if nsfw[0][0] == "__label__sfw":
                        out.write(raw + "\n")