-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreproc.py
43 lines (31 loc) · 1.07 KB
/
preproc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import nltk
# Fetch the NLTK data this script relies on before it is used below:
# the 'words' corpus (read via words.words()) and a tagger model
# (presumably the one pos_tag loads by default -- confirm the resource
# name against the installed NLTK version).
nltk.download('words')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import words
from nltk.tag import pos_tag
import json
# Collect every five-letter entry of the NLTK word list (a set, so repeated
# entries collapse into one).
five_letter_words = {w for w in words.words() if len(w) == 5}
# pos_tag treats its argument as one token sequence, and the tagger uses the
# neighbouring tokens as context, so the tags depend on token order. Iterating
# a set of strings yields a different order from one interpreter run to the
# next (hash randomisation), which made the filtered dictionary vary between
# runs. Tagging a sorted list keeps the result reproducible.
pos = pos_tag(sorted(five_letter_words))
# Map each word to its part-of-speech tag.
pos_by_word = dict(pos)
# Drop words tagged as proper nouns; the substring test rejects any tag that
# contains 'NNP' (e.g. both NNP and NNPS).
filtered_dictionary = {w for w in five_letter_words if 'NNP' not in pos_by_word[w]}
# Read the corpus frequency table; each non-blank line is expected to hold a
# word followed by an integer count, separated by whitespace.
# The context manager closes the file as soon as it has been read (the
# original left the handle open for the lifetime of the process).
# NOTE(review): the file is opened with the platform default encoding, as
# before -- confirm whether an explicit encoding should be given.
with open('wiki_corpus_frequency.txt') as frequencies_file:
    frequencies_raw = frequencies_file.read()
frequencies = frequencies_raw.splitlines()
# Every dictionary word starts at 0 so that words absent from the corpus
# still have a frequency entry.
frequency_by_word = dict.fromkeys(filtered_dictionary, 0)
for line in frequencies:
    # split() with no argument tolerates tabs and repeated spaces, where
    # split(' ') would produce extra empty fields and break the unpacking.
    fields = line.split()
    if not fields:
        # Blank line: nothing to parse.
        continue
    # A line with any other number of fields than two still raises
    # ValueError, so a malformed table is not silently accepted.
    word, frequency_text = fields
    if word in filtered_dictionary:
        frequency_by_word[word] = int(frequency_text)
# Order the dictionary by descending frequency. The word itself is the
# secondary key: many words share a frequency (e.g. 0 for words missing from
# the corpus), and without a tie-breaker their order -- and therefore which
# of them make the 5000-word cut -- followed arbitrary set iteration order.
sorted_by_freq = sorted(
    filtered_dictionary,
    key=lambda w: (-frequency_by_word[w], w),
)
top_5k = sorted_by_freq[:5000]
# One {"word", "frequency"} record per kept word, most frequent first.
serialized = [
    {"word": w, "frequency": frequency_by_word[w]}
    for w in top_5k
]
# The context manager flushes and closes the output file even if
# serialisation fails part-way (the original never closed it).
with open('top_5k_words.json', 'w') as out_file:
    json.dump(serialized, out_file)