# AnandMarathaListWebScrap.py
import requests
from bs4 import BeautifulSoup
import simplejson
anandMarathaUrl = "https://www.anandmaratha.com"
fileName = "AnandMaratha.json"
# Get the session cookie for authenticated requests. A logged-in session
# could capture PHPSESSID automatically, e.g.:
# session = requests.Session()
# session.post(anandMarathaUrl)
# cookie = session.cookies.get_dict()
# print(cookie)
sessionId = input("Enter Cookie Id : ")
cookie = {'PHPSESSID': sessionId}
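# A minimal sanity check could fail fast on an expired cookie before the full
# crawl. This probe and the "login" redirect test are assumptions about the
# site's behaviour, not part of the original script:
# probe = requests.get(anandMarathaUrl, cookies=cookie)
# if "login" in probe.url.lower():
#     raise SystemExit("Session cookie looks expired; log in again.")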
baseUrl = "https://www.anandmaratha.com/search_listing.php"
page = requests.post(baseUrl, cookies=cookie)
soup = BeautifulSoup(page.text, 'html.parser')
# The first <b> tag appears to read "<current> of <total>"; the text after
# "of" is used as the number of result pages.
count = soup.find_all("b")[0].text.split("of")[1].replace(' ', '')
print("Total Pages : " + count)
table = soup.find("table", {"class": "clearfix"})
headers = [header.getText(strip=True) for header in table.find_all("th")]
# The first two cells of each data row carry a thumbnail and a profile link
# with no matching <th>, so two synthetic column names are prepended.
headers.insert(0, "Images")
headers.insert(1, "DetailsUrl")
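# headers now looks something like (illustrative; real labels come from the site):
# ["Images", "DetailsUrl", "Reg.No", "Name", ...]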
results = []
print(baseUrl)
for i in range(int(count)):
    baseUrl = "https://www.anandmaratha.com/search_listing.php?&page=" + \
        str(i + 1) + "&val="
    print("Downloading... : " + baseUrl)
    page = requests.post(baseUrl, cookies=cookie)
    soup = BeautifulSoup(page.text, 'html.parser')
    table = soup.find("table", {"class": "clearfix"})
    for row in table.find_all("tr"):
        tempRow = {}
        # `col` (not `i`) so the page counter above is not shadowed
        for col, cell in enumerate(row.find_all("td")):
            if len(cell.find_all("a")) == 0:
                # Plain text cell: key it by the matching header
                tempRow.update({headers[col]: cell.getText().replace(
                    '\n', '').replace(' ', '')})
            else:
                if len(cell.find_all("img")) > 0:
                    tempRow.update(
                        {headers[0]: cell.find_all("img")[0]["src"]})
                if len(cell.find_all("a")) > 0:
                    tempRow.update(
                        {headers[1]: anandMarathaUrl + "/" + cell.find_all("a")[0]["href"]})
        if tempRow != {}:
            results.append(tempRow)
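# Each entry in results is a dict keyed by the header names, e.g. (illustrative):
# {"Images": "images/thumb.jpg",
#  "DetailsUrl": "https://www.anandmaratha.com/...",
#  "Name": "..."}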
# Optional debug dump of the raw listing rows:
# file = open("list.json", "w")
# simplejson.dump(results, file)
# file.close()

# Get details of each record
detailsResult = []
# Drop the last row, presumably the table's pager/footer row rather than a record
results.pop()
resultLength = len(results)
print("Total number of matches for the search criteria : " + str(resultLength))
try:
    # Truncate any previous output file
    file = open(fileName, "w")
    file.close()
    for i, res in enumerate(results, 1):
        url = res.get("DetailsUrl")
        print("Download result " + str(i) + " out of " +
              str(resultLength) + ": Url : " + str(url))
        if url is not None:
            page = requests.post(url, cookies=cookie)
            soup = BeautifulSoup(page.text, 'html.parser')
            div = soup.find_all("div", {"class": "about-us-content"})
            imageDiv = div[0]
            detailsDiv = div[1]
            imagePath = imageDiv.find_all("img")[0]["src"]
            result = {"Image": anandMarathaUrl + "/" + imagePath}
            # The first <p> under the image carries the registration number
            # before the ':' separator
            registrationNo = imageDiv.find_all("p")[0]
            result.update({"Id": registrationNo.getText().split(":")[
                0].replace('\n', '').replace(' ', '')})
            # Each <h5> holds a "Label : Value" pair
            for row in detailsDiv.find_all("h5"):
                header = row.getText().split(":")[0].replace(
                    '\n', '').replace(' ', '')
                data = row.getText().split(":")[1].replace(
                    '\n', '').replace(' ', '')
                result.update({header: data})
            detailsResult.append(result)
            # Optional: flush partial results every 20 records
            # if len(detailsResult) >= 20:
            #     file = open("AnandMaratha_" + str(i) + ".json", "w")
            #     simplejson.dump(detailsResult, file)
            #     file.close()
            #     detailsResult = []
    # Write the complete result set
    file = open("AnandMaratha_Complete.json", "w")
    simplejson.dump(detailsResult, file)
    file.close()
except Exception as e:
    print(e)
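# Usage (assuming Python 3 with requests, beautifulsoup4 and simplejson installed):
#   python AnandMarathaListWebScrap.py
# When prompted, paste the PHPSESSID value from a logged-in browser session
# (e.g. from the browser's developer tools, under cookies for anandmaratha.com).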