# main.py
# Importing Relevant Libraries
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
# Defining URLs
cam_url = "https://dictionary.cambridge.org/dictionary/english/"
fd_url = "https://fastdic.com/word/"
trs_url = "https://www.thesaurus.com/browse/"
# Opening File of Words
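# Note: words_list.txt is assumed to hold one word per line, e.g.
#   abandon
#   benevolent
#   candid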
with open("words_list.txt", "r") as file:
    # Words List Preprocessing: strip the trailing newline and lowercase each word
    words = []
    for word in file:
        words.append(word.strip().lower())
# Defining a Pattern to Find Synonyms in Thesaurus
exp = re.compile("^e1ccqdb60.*")
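# Caveat: "e1ccqdb60" (and "css-1kg1yv8 eh475bn0" used below) are generated CSS class
# names observed on thesaurus.com; they change over time, so these selectors may need updating.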
# Creating Lists to Hold All Items
def_lst = []
ex_lst = []
ws_lst = []
# Scraping Websites
for word in words:
"""""
In this loop we use each word as an input and extract all data of it.
First Step:
* Adding word to the words list.
* Creating 3 urls of 3 websites for word.
Second Step:
* Getting and parsing data from thesaurus website.
* Creating an empty string for storing word's synonyms.
* Finding a list of the most important synonyms of the word.
* Adding synonms to the empty string we've created before.
* Adding the synonyms string to the defenitions list.
Third Step:
* Getting and parsing data from fastdic website.
* Creating an empty list for storing word's synonyms.
* Adding synonms to the empty list we've created before.
* Changing the list of synonims to a string variable.
Forth Step:
* Getting and parsing data from cambridge dictionary website.
* Creating an integer varibale with the initial value of 0 for storing numbers of examples founded for the word.
* Storing each block's data (one defenition and one or more examples) in two variables named defenition and examples.
* Adding defenition and examples variables defined before to the main defenition and example list.
"""
    # First Step
    ws_lst += [word]
    url1 = trs_url + word
    url2 = fd_url + word
    url3 = cam_url + word
    # Second Step
    results1 = requests.get(url1, headers = {"User-Agent" : "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"})
    doc1 = BeautifulSoup(results1.text, "html.parser")
    grid = doc1.find_all(class_ = exp)
    syns = grid[0].find_all(class_ = "css-1kg1yv8 eh475bn0")
    # Join the synonym texts with "| " between them (list.index() would misbehave on duplicate synonyms)
    syns_str = "| ".join(syn.text for syn in syns)
    def_lst += [syns_str]
    # Third Step
    results2 = requests.get(url2, headers = {"User-Agent" : "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"})
    doc2 = BeautifulSoup(results2.text, "html.parser")
    farsi_meanings_lst = []
    farsi_meanings = doc2.find("ul", attrs = {"class" : "result"})
    farsi_meanings = farsi_meanings.find_all("span", class_ = "")
    for farsi_meaning in farsi_meanings:
        # Clean up newlines, tabs and zero-width non-joiners in the Farsi text
        farsi_meaning = farsi_meaning.text
        farsi_meaning = farsi_meaning.replace("\n", "")
        farsi_meaning = farsi_meaning.replace("\t", "")
        farsi_meaning = farsi_meaning.replace("\u200c", " ")
        farsi_meanings_lst.append(farsi_meaning)
    # Join the cleaned meanings with " | " between them
    farsi_meanings_str = " | ".join(farsi_meanings_lst)
    ex_lst += [farsi_meanings_str]
    # Fourth Step
    results3 = requests.get(url3, headers = {"User-Agent" : "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"})
    doc3 = BeautifulSoup(results3.text, "html.parser")
    ex_num = 0
    blocks = doc3.find_all(class_ = "def-block ddef_block")
    for block in blocks:
        # Each block holds one definition and zero or more usage examples
        definition = ((block.find(class_ = "def ddef_d db")).text)[:-2]
        examples = [example.text for example in block.find_all(class_ = "eg deg")]
        if len(examples) == 0:
            examples = [""]
        # Pad the definition column with blanks so it stays aligned with the examples column
        def_lst = def_lst + [definition] + (len(examples) - 1) * [""]
        ex_lst += examples
        ex_num += len(examples)
    # Pad the word column and add a blank separator row after each word
    ws_lst += (ex_num + 1) * [""]
    def_lst += [""]
    ex_lst += [""]
# Creating a Pandas Dataframe
index = []
index_counter = 1
# Number only the rows that actually start a new word; padding rows keep a blank index
for i in ws_lst:
    if len(i) > 0:
        index += [index_counter]
        index_counter += 1
    else:
        index += [""]
df = pd.DataFrame(list(zip(ws_lst, def_lst, ex_lst)), columns =["Word", "Def.", "Exp."])
df.index = index
# Saving the Dataframe as an Excel File
df.to_excel("Dictionary.xlsx")
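# Usage note (assumed environment): requires beautifulsoup4, requests, pandas, and an
# Excel writer engine such as openpyxl (or xlsxwriter) for DataFrame.to_excel(); run the
# script from the directory that contains words_list.txt.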