-
Notifications
You must be signed in to change notification settings - Fork 1
/
tokenizer.py
56 lines (47 loc) · 1.85 KB
/
tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import javalang
import sys
def camel_case_split(str):
    """Split a camelCase identifier into its component words.

    A new word starts at each lowercase-to-uppercase transition, so
    "camelCase" -> ["camel", "Case"] and "getHTTPCode" -> ["get", "HTTPCode"]
    (a run of capitals stays in one word). Case is preserved; callers
    lower-case the pieces themselves.

    Parameters:
        str: the identifier to split (name kept for call compatibility,
             even though it shadows the builtin).

    Returns:
        A list of word strings; empty list for an empty input.
    """
    # Guard: the original indexed str[0] unconditionally and raised
    # IndexError on an empty identifier.
    if not str:
        return []
    words = [[str[0]]]
    for c in str[1:]:
        # lowercase followed by uppercase marks a word boundary
        if words[-1][-1].islower() and c.isupper():
            words.append([c])
        else:
            words[-1].append(c)
    return [''.join(word) for word in words]
with open('expressions.txt') as f:
    content = f.readlines()
# Lines keep their trailing '\n'; the tokenizer treats it as whitespace,
# so stripping is unnecessary (the original left it in place too).

for expression in content:
    # Tokenize one Java expression per input line.
    tokens = list(javalang.tokenizer.tokenize(expression))
    parts = []
    for token in tokens:
        # isinstance replaces the fragile str(type(...)) == "<class ...>"
        # text comparison; behavior is identical for Identifier tokens.
        if isinstance(token, javalang.tokenizer.Identifier):
            # Identifiers are split on camelCase boundaries and lower-cased.
            parts.extend(word.lower() for word in camel_case_split(token.value))
        else:
            parts.append(token.value)
    # Each emitted token gets its own line; the original appended "\n"
    # after every part, so the output ends with a newline (or is empty
    # when the expression produced no tokens).
    output = "".join(part + "\n" for part in parts)
    print(output)
# for i in range(0,len(content)):
# tokens = list(javalang.tokenizer.tokenize(content[i]))
# if(len(tokens)==0):
# output = output + "empty"+"\n\n"
# for j in range(0,len(tokens)):
# if j!=len(tokens)-1:
# output = output + tokens[j].value+"\n" #+ str(type(tokens[j]))+"\n"
# else:
# output = output + tokens[j].value+"\n\n"