# mobilePhoneSpecsScrapper.py
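"""Scrape phone specifications from PhoneArena (https://www.phonearena.com/phones/).

The script runs in two passes: it first collects the URL of every phone's spec
page into <prefix>-Phoneurls.json, then visits each URL and writes the
quick-spec fields (release date, display, camera, hardware, storage, battery,
OS) to <prefix>-PhoneSpecs.json.
"""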
import json
import os
import sys

import requests
from bs4 import BeautifulSoup


class Phonearena:
    def __init__(self):
        self.phones = []
        self.features = ["Brand", "Model Name", "Model Image"]
        self.temp1 = []
        self.phones_brands = []
        self.url = "https://www.phonearena.com/phones/"  # PhoneArena phone listing URL.
        # Prefix used to name the output files.
        self.new_folder_name = "GSMArenaDataset"
        # Absolute path prefix for the dataset files, rooted in the working directory.
        self.absolute_path = os.path.join(os.getcwd(), self.new_folder_name)

    def crawl_html_page(self, sub_url):
        """Fetch sub_url and return the parsed HTML as a BeautifulSoup tree."""
        try:
            page = requests.get(sub_url)
            # Parse the HTML returned by the requested URL.
            soup = BeautifulSoup(page.text, "html.parser")
            return soup
        # requests signals network failures with RequestException (not the
        # builtin ConnectionError), so catch that and exit with a clear message.
        except requests.exceptions.RequestException:
            print("Please check your network connection and re-run the script.")
            sys.exit(1)

    def crawl_phone_urls(self):
        """Collect the URL of every phone's spec page from the listing pages."""
        phones_urls = []
        for i in range(1, 238):  # PhoneArena currently lists 237 pages of phones.
            print(self.url + "page/" + str(i))
            soup = self.crawl_html_page(self.url + "page/" + str(i))
            # Each phone on a listing page sits inside a "stream-item" div.
            table = soup.find_all("div", {"class": "stream-item"})
            table_a = [k.find("a") for k in table]
            for a in table_a:
                phones_urls.append(a["href"])
        return phones_urls
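
    # Note: the hrefs scraped above are passed to requests.get() unchanged, so
    # they are assumed to already be absolute URLs (hypothetical example:
    # https://www.phonearena.com/phones/Some-Phone_id1234).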

    def crawl_phones_models_specification(self, li):
        """Scrape the quick-spec block of every phone page URL in li."""
        phone_data = {}
        for link in li:
            print(link)
            try:
                soup = self.crawl_html_page(link)
                model = soup.find(class_="page__section page__section_quickSpecs")
                model_name = model.find("header").h1.text
                model_img_html = model.find(class_="head-image")
                model_img = model_img_html.find("img")["data-src"]
                specs_html = model.find(
                    class_="phone__section phone__section_widget_quickSpecs"
                )
                release_date = specs_html.find(class_="calendar")
                release_date = release_date.find(class_="title").p.text
                display = specs_html.find(class_="display")
                display = display.find(class_="title").p.text
                camera = specs_html.find(class_="camera")
                camera = camera.find(class_="title").p.text
                hardware = specs_html.find(class_="hardware")
                hardware = hardware.find(class_="title").p.text
                storage = specs_html.find(class_="storage")
                storage = storage.find(class_="title").p.text
                battery = specs_html.find(class_="battery")
                battery = battery.find(class_="title").p.text
                # os_name avoids shadowing the imported os module.
                os_name = specs_html.find(class_="os")
                os_name = os_name.find(class_="title").p.text
                phone_data[model_name] = {
                    "image": model_img,
                    "release_date": release_date,
                    "display": display,
                    "camera": camera,
                    "hardware": hardware,
                    "storage": storage,
                    "battery": battery,
                    "os": os_name,
                }
                # Rewrite the JSON file after every phone so progress survives a crash.
                with open(self.absolute_path + "-PhoneSpecs.json", "w+") as of:
                    json.dump(phone_data, of)
            except Exception as error:
                # Skip pages whose markup doesn't match the expected structure.
                print(f"Exception happened : {error}")
                continue
        return phone_data


if __name__ == "__main__":
    obj = Phonearena()
    try:
        # Step 1: Scrape the links to all the individual phone spec pages and
        # save them so this pass never needs to run again.
        phone_urls = obj.crawl_phone_urls()
        with open(obj.absolute_path + "-Phoneurls.json", "w") as of:
            json.dump(phone_urls, of)
        # Step 2: Reload the saved links and scrape each phone's specification.
        with open(obj.absolute_path + "-Phoneurls.json", "r") as inp:
            temp = json.load(inp)
        phone_specs = obj.crawl_phones_models_specification(temp)
    except KeyboardInterrupt:
        print("Scraping stopped by keyboard interrupt.")