-
Notifications
You must be signed in to change notification settings - Fork 0
/
statistics.py
83 lines (63 loc) · 2.08 KB
/
statistics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from collections import Counter
import sys
import re
def compute_sentences(filepath):
    """Return the number of lines in the UTF-8 text file at *filepath*.

    Every line yielded by the file iterator is counted as one sentence,
    including blank lines and a final line without a trailing newline.
    NOTE(review): this presumes the file stores one sentence per line --
    confirm against the dataset format.
    """
    with open(filepath, 'r', encoding='utf-8') as in_file:
        # Stream the file so memory stays flat regardless of its size.
        return sum(1 for _ in in_file)
def compute_words(filepath):
    """Return the number of whitespace-separated words in a UTF-8 text file.

    Words are maximal runs of non-whitespace characters on each line, so
    blank lines contribute zero words and repeated spaces between two
    words do not create phantom empty "words".

    Args:
        filepath: Path of the text file to read.

    Returns:
        The total word count over all lines, as an int.
    """
    with open(filepath, 'r', encoding='utf-8') as in_file:
        # split() without an argument collapses runs of whitespace and
        # drops the trailing newline; split(" ") would count "" between
        # consecutive spaces and count a bare "\n" line as one word.
        return sum(len(line.split()) for line in in_file)
def compute_chars(filepath):
    """Return the total number of characters in the UTF-8 text file.

    The count is taken over decoded characters (not bytes) and includes
    spaces and the newline that terminates each line.
    """
    with open(filepath, 'r', encoding='utf-8') as in_file:
        line_lengths = (len(text_line) for text_line in in_file)
        return sum(line_lengths)
def compute_chars_dict(filepath):
    """Return a Counter mapping each character in the file to its frequency.

    Every decoded character is tallied, including spaces and newline
    characters. An empty file yields an empty Counter.
    """
    frequencies = Counter()
    with open(filepath, 'r', encoding='utf-8') as in_file:
        for text_line in in_file:
            # Iterating a str yields its characters, so update() tallies
            # them straight into the running totals.
            frequencies.update(text_line)
    return frequencies
def compute_chars_diacritics(filepath):
    """Count characters that belong to the diacritic-candidate set.

    A character is counted when it is one of the lowercase letters listed
    in the character class below: the base letters a, i, s, t and their
    accented forms. Uppercase letters are not matched by the pattern.
    """
    candidate_pattern = re.compile('[aăâiîsștț]')
    total = 0
    with open(filepath, 'r', encoding='utf-8') as in_file:
        for text_line in in_file:
            # findall returns one entry per matching character.
            total += len(candidate_pattern.findall(text_line))
    return total
def compute_chars_no_spaces(filepath):
    """Return the character count of the file, ignoring spaces and newlines.

    Only the space character and the newline character are excluded;
    other whitespace such as tabs is still counted.
    """
    # Deletion table that drops exactly " " and "\n" in a single pass.
    drop_table = str.maketrans("", "", " \n")
    total = 0
    with open(filepath, 'r', encoding='utf-8') as in_file:
        for text_line in in_file:
            total += len(text_line.translate(drop_table))
    return total
def compute_statistics(filepath):
    """Print a statistics report for the text file at *filepath*.

    The report starts with the file's name (the part of *filepath* after
    the last "/"), followed by the sentence, word and character counts,
    the per-character frequency table, the number of distinct characters
    and a separator line. Nothing is returned.
    """
    print("Statistics for", filepath.split("/")[-1])

    # Collect every figure first; each helper re-reads the file itself.
    sentence_total = compute_sentences(filepath)
    word_total = compute_words(filepath)
    char_total = compute_chars(filepath)
    non_space_total = compute_chars_no_spaces(filepath)
    char_histogram = compute_chars_dict(filepath)
    diacritic_total = compute_chars_diacritics(filepath)

    # Integer figures share one format: thousands separated by commas.
    numeric_rows = (
        ("Sentences =", sentence_total),
        ("Words =", word_total),
        ("Total chars =", char_total),
        ("Chars w/o spaces =", non_space_total),
        ("Char with possible diacritics =", diacritic_total),
    )
    for label, value in numeric_rows:
        print(label, format(value, ",d"))

    print("Chars dict =", char_histogram)
    print("Unique chars =", len(char_histogram))
    print("--------------")
if __name__ == "__main__":
    # Report on each dataset split in turn: train, dev, then test.
    for split_path in (
        "dataset/split/train.txt",
        "dataset/split/dev.txt",
        "dataset/split/test.txt",
    ):
        compute_statistics(split_path)