utils.py

import os
import re
import numpy as np
from nltk.tokenize import WhitespaceTokenizer


def load_glove():
    """
    This function loads the GloVe embeddings file.
    ----------
    Returns:
    - word_2_vec (dict): A mapping from each word to its embedding vector.
    """
    word_2_vec = {}
    print("Loading GloVe")
    glove_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                              "data/glove.6B.50d-relativized.txt")
    with open(glove_path) as file:
        for line in file:
            parts = line.split()
            # The first token on each line is the word; the rest are its values.
            word_embedding = np.array(parts[1:], dtype=np.float32)
            word_2_vec[parts[0]] = word_embedding
    print("Finished loading GloVe")
    return word_2_vec
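
# Note (assumption about the embedding file's layout): each line of
# data/glove.6B.50d-relativized.txt is expected to hold a word followed by its
# 50 embedding values, i.e. "<word> v1 v2 ... v50", so word_2_vec["<word>"]
# becomes a NumPy array of shape (50,).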


def read_file(data):
    """
    This function reads the data file.
    ----------
    Parameters:
    - data (str): The path to the data file (train.txt or dev.txt).
    Returns:
    - data_file (list): The raw lines of the data file.
    """
    data_file = []
    with open(data, "r") as file:
        for line in file:
            data_file.append(line)
    return data_file


def split_line(sentence):
    """
    This function splits a line of the data into features and targets.
    ----------
    Parameters:
    - sentence (str): The line of the data list to be split.
    Returns:
    - features (list): The tokenized sentence (the features).
    - targets (str): The label of the sentence (the target).
    """
    line = sentence.split("\t")
    targets = line[0]
    features = tokenizer(line[1])
    return features, targets


def clean_sentence(sentence):
    """
    This function cleans the sentence by lowercasing it and replacing all
    non-word symbols with spaces.
    ----------
    Parameters:
    - sentence (str): The sentence to be cleaned.
    Returns:
    - sentence (str): The cleaned sentence.
    """
    sentence = sentence.lower()
    sentence = re.sub(r'[^\w]', ' ', sentence)
    # Collapse the runs of whitespace introduced by the substitution above.
    return " ".join(sentence.split())


def tokenizer(sentence):
    """
    This function cleans and tokenizes the sentence.
    ----------
    Parameters:
    - sentence (str): The sentence to be tokenized.
    Returns:
    - The list of tokens in the cleaned sentence.
    """
    sentence = clean_sentence(sentence)
    return WhitespaceTokenizer().tokenize(sentence)
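
# Illustrative example (not in the original module): tokenizer("Hello, world!")
# would return ["hello", "world"] after cleaning and whitespace tokenization.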


def word_to_vec(word, word_2_vec):
    """
    This function vectorizes the word using the word_2_vec embeddings.
    ----------
    Parameters:
    - word (str): The word to be vectorized.
    - word_2_vec (dict): The GloVe embeddings used to look up the word.
    Returns:
    - embed (np.ndarray): The embedding vector of the word.
    """
    if word in word_2_vec:
        embed = word_2_vec[word]
    else:
        # Out-of-vocabulary words get a random vector (see random_vectorize).
        embed = random_vectorize(word, word_2_vec)
    return embed


def random_vectorize(word, word_2_vec, word_vector_size=50):
    """
    This function handles the case where the word is not in the GloVe file by
    assigning it a random vector.
    ----------
    Parameters:
    - word (str): The word to be vectorized.
    - word_2_vec (dict): The GloVe embeddings to store the new vector in.
    - word_vector_size (int): The size of the word vector (default 50).
    Returns:
    - The randomly generated vector for the word.
    """
    # Cache the random vector so the same word maps to the same vector later.
    word_2_vec[word] = np.random.uniform(0.0, 1.0, word_vector_size)
    return word_2_vec[word]
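

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): how these helpers are expected to compose.
# The data path below is an assumption; the file is assumed to contain one
# "<label>\t<sentence>" pair per line, which is what split_line() expects.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    word_2_vec = load_glove()
    data = read_file("data/train.txt")  # hypothetical path
    features, targets = split_line(data[0])
    # Map every token to its embedding; unseen tokens get a random vector.
    vectors = [word_to_vec(token, word_2_vec) for token in features]
    print(targets, len(vectors), vectors[0].shape if vectors else None)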