-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdaniel_light.py
126 lines (111 loc) · 3.6 KB
/
daniel_light.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import json
import re
# Load the French stop-word list once at module load; filter_KW() uses
# it to discard uninformative tokens.
# NOTE(review): opened with the platform default encoding — presumably
# the JSON file is UTF-8; confirm and pass encoding="utf-8" if so.
with open("stopwords-fr.json") as f:
    set_stop = set(json.load(f))
def filter_KW(liste, stopwords=None):
    """Filter a collection of candidate keywords.

    Drops stop words (case-insensitively), single-character tokens, and
    tokens containing no Latin letter.  The no-letter rule also covers
    purely numeric tokens: the original per-digit length test
    (``len(numbers[0])==len(x)``) was dead code, since ``numbers``
    holds single characters and any all-digit token has no letter.

    Args:
        liste: iterable of candidate keyword strings.
        stopwords: optional set of stop words; defaults to the
            module-level ``set_stop`` loaded from stopwords-fr.json.

    Returns:
        list of surviving keywords.  Order is unspecified because the
        candidates are deduplicated through a set.
    """
    if stopwords is None:
        stopwords = set_stop
    out = []
    # Deduplicate and drop exact stop-word matches in one set operation.
    candidats = set(liste) - stopwords
    for x in candidats:
        if x.lower() in stopwords:  # case-insensitive stop-word check
            continue
        if len(x) < 2:
            continue
        if not re.findall("[A-Za-z]", x):  # no Latin letter at all
            continue
        out.append(x)
    return out
def tokenize(chaine):
    """Split a string into word tokens.

    Punctuation common in French press text (period, apostrophe,
    question mark, guillemets, double quote) is replaced by spaces
    before splitting, so ``"l'homme"`` yields ``["l", "homme"]``.

    Args:
        chaine: input string.

    Returns:
        list of non-empty tokens.
    """
    # Raw string + character class: the original used "\." inside a
    # non-raw string (invalid escape -> SyntaxWarning on Python 3.12+)
    # and an alternation where a class suffices.
    chaine = re.sub(r"[.'?»«\"]", " ", chaine)
    # str.split() with no separator already collapses whitespace runs,
    # so the original explicit " {2,}" pass is unnecessary.
    return chaine.split()
def is_subsequence(needle, haystack):
    """Return True if needle occurs as a contiguous run inside haystack.

    Compares list slices, so elements are matched with ``==``.
    An empty needle is trivially contained.
    """
    size = len(needle)
    for start in range(len(haystack) - size + 1):
        if haystack[start:start + size] == needle:
            return True
    return False
def get_ngrams(inter, word_titre, word_body):
    """Extend the keyword set with "_"-joined multi-word expressions.

    Scans the title tokens; when a selected keyword is followed by
    other selected keywords, the collected run is joined with "_" and
    kept if the same lowercased word sequence also occurs contiguously
    in the body.

    Args:
        inter: keywords shared by title and body (already filtered).
        word_titre: tokenized title, order preserved.
        word_body: tokenized body, order preserved.

    Returns:
        list: the original keywords plus any accepted n-grams.
    """
    selected = [x for x in inter]
    haystack=[x.lower() for x in word_body]
    for i, word in enumerate(word_titre):
        if word in inter or word.lower() in inter:
            n_gram = [word]
            last = i
            # Collect every later title word that is also a keyword.
            # NOTE(review): j indexes the slice word_titre[i+1:], so
            # "last = j" is an offset relative to i+1, not a position in
            # word_titre; comparing it to i below (last!=i) and slicing
            # with n_gram[:last+2] both look buggy — see the author's
            # TODO.  Collected words also need not be adjacent.
            for j, w in enumerate(word_titre[i+1:]):
                if w in inter:
                    last = j
                    n_gram.append(w)
            # Keep only runs of 2 to 4 words that grew past the start.
            if len(n_gram)>1 and len(n_gram)<5 and last!=i:
                needle = [x.lower() for x in n_gram]
                # Accept the n-gram only if it appears verbatim (case-
                # insensitively) in the body.
                if is_subsequence(needle, haystack):
                    selected.append("_".join(n_gram[:last+2]))
    ## TODO: fix the n-gram extraction (original note: "régler le
    ## problème des n-grammes"); debugging scaffold kept below:
    ## if "gilets" in haystack and "jaunes in haystack":
    ## if "gilets_jaunes" not in selected:
    ## print(selected)
    ## x = haystack.index("gilets")
    ## print(haystack[x:x+10])
    ## print(word_titre[word_titre.index("gilets"):])
    return selected
def get_KW(titre, text):
    """Extract keyword candidates shared by an article title and body.

    Short titles (< 100 chars) are padded with the article's first 200
    characters (the lead, "chapeau") so enough candidate words are
    available; that prefix is then excluded from the body.

    Args:
        titre: article title.
        text: full article text.

    Returns:
        list of keywords (and "_"-joined n-grams) from get_ngrams().
    """
    if len(titre) < 100:
        chapeau, body = text[:200], text[200:]
    else:
        chapeau, body = "", text
    mots_titre = set(tokenize(titre) + tokenize(chapeau))
    mots_corps = set(tokenize(body))
    candidats = filter_KW(mots_titre & mots_corps)
    return get_ngrams(candidats, tokenize(titre), tokenize(body))
# Load the corpus: a list of article dicts with at least the keys
# "titre", "texte" and "date".
with open("corpus_final/corpus.json") as f:
    L = json.load(f)
# "index"   maps keyword -> list of document ids;
# "index_I" maps document id -> list of keywords (inverse index).
dic = {"index":{}, "index_I":{}}
for cpt, info_text in enumerate(L):
    titre = info_text["titre"]
    text = info_text["texte"]
    date = info_text["date"]
    # Numeric components of the date string, in textual order.
    elems_date = re.findall("[0-9]{1,4}", date)
    # A two-digit leading component is taken to mean the date starts
    # with the year in short form; it is moved to the end.
    # NOTE(review): raises IndexError when the date has fewer than
    # three numeric parts (or none at all) — confirm corpus format.
    if len(elems_date[0])==2:
        L[cpt]["date"] = "%s-%s-%s"%(elems_date[1], elems_date[2], elems_date[0])
    liste_KW = get_KW(titre, text)
    for KW in liste_KW:
        # Merge case variants: if the lowercase form is already
        # indexed, fold this occurrence into it.  Order-dependent —
        # only collapses when the lowercase variant was seen first.
        if KW.lower() in dic["index"]:
            KW = KW.lower()
        dic["index"].setdefault(KW, [])
        dic["index"][KW].append(cpt)
        dic["index_I"].setdefault(cpt, [])
        dic["index_I"][cpt].append(KW)
print("On a %i Documents"%(cpt+1))
# Rank keywords by document frequency: each entry is [count, keyword],
# so plain reverse sorting orders by count (ties broken by keyword).
l_test = [[len(ids), kw] for kw, ids in dic["index"].items()]
# Keywords appearing in at least two documents, most frequent first.
S = sorted((paire for paire in l_test if paire[0] > 1), reverse=True)
# Same ranking restricted to n-grams ("_"-joined expressions).
S2 = sorted((paire for paire in l_test if "_" in paire[1]), reverse=True)
top10 = [paire[1] for paire in S[:10]]
top20 = S[:20]
print("Top20 :",top20)
topGrams = S2[:10]
print("Top grams :",topGrams)
# Build the CSV header row: fixed metadata columns, then one column
# per top-10 keyword, then a catch-all column for the rest.
entetes = ["date", "journal", "titre"]
colonnes = entetes + top10 + ["autres mots-clés"]
out_csv = [";".join(colonnes)]
# Emit one CSV line per indexed document: metadata columns, an "X"
# marker column per top keyword, then the remaining keywords.
for ID_txt, liste_KW in dic["index_I"].items():
    info_txt = L[ID_txt]
    # len(top10) marker cells, not a hard-coded 10: with fewer than 10
    # frequent keywords the hard-coded width misaligned the rows with
    # the header.  The trailing list collects non-top keywords.
    ligne = [info_txt[cle] for cle in entetes]+[""]*len(top10)+[[]]
    for kw in liste_KW:
        if kw in top10:
            ligne[top10.index(kw)+len(entetes)]="X"
        else:
            ligne[-1].append(kw)
    # Truncate over-long titles (column index 2 = "titre") so the
    # sheet stays readable.
    if len(ligne[2])>100:
        ligne[2] = ligne[2][:100]+"..."
    ligne[-1]=" , ".join(ligne[-1])
    # Field values must not contain the separator character.
    ligne = [re.sub(";", " ", x) for x in ligne]
    out_csv.append(";".join(ligne))
# Explicit utf-8: the corpus is French text and the platform default
# encoding may not be able to represent it.
with open("out_complet.csv", "w", encoding="utf-8") as w:
    w.write("\n".join(out_csv))