forked from M-Shamshair/Amazon-Scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAmazon_Scraper.py
248 lines (230 loc) · 10.3 KB
/
Amazon_Scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
#Note: This script includes auto bypass method for verfication account if it logins from a new area.
# email: [email protected]
"""
Place user text file in same directory
Usage: python amazon_scraper.py
after run program input user text file which contain account like >> email:password
enter output txt file name after execution
"""
import mechanize
from bs4 import BeautifulSoup
from BeautifulSoup import BeautifulSoup
from optparse import OptionParser
from time import sleep
import datetime
import os
import re
import sys
order_no_id = open('order_ids.txt', 'w+')
not_login = open('not_login.txt','w+')
result = open ('result.txt', 'w+')
def banner():
print '======================================================'
print '| Amazon Multi-Account Order Scraper |'
print '======================================================'
print '| Purpose: To keep the client order history records |'
print '| Date: 24-Feb-2015 |'
print '| Author: Ali Ahmer aka King Ali |'
print '| email: [email protected] |'
print '| Note: Don,t use for illegal purpose |'
print '======================================================'
def account_info(address_book_info):
address_book = open('billing_info.txt', 'w+')
soup = BeautifulSoup(address_book_info)
print 'Getting Billing Informations...\r\n'
for order in soup.findAll('ul', {'class': 'displayAddressUL'}):
for items in order.findAll('li', {'class': 'displayAddressLI displayAddressAddressLine1'}):
for add1 in items:
address_book.write(''.join(add1+'\r\n'))
for items in order.findAll('li', {'class': 'displayAddressLI displayAddressAddressLine2'}):
for add2 in items:
address_book.write(''.join(add2+'\r\n'))
for items in order.findAll('li', {'class': 'displayAddressLI displayAddressCityStateOrRegionPostalCode'}):
for s_r_p in items:
address_book.write(''.join(s_r_p+'\r\n'))
for items in order.findAll('li', {'class': 'displayAddressLI displayAddressCountryName'}):
for c_n in items:
address_book.write(''.join(c_n+'\r\n'))
for items in order.findAll('li', {'class': 'displayAddressLI displayAddressPhoneNumber'}):
for p_n in items:
address_book.write(''.join(p_n+'\r\n'))
address_book.close()
result.write('\r\nBilling Information: \r\n')
result.write('===================\r\n')
cleaning_format('billing_info.txt')
result.write('-----------------------------------------------------\r\n')
def orders_no_details_checking(html):
product_details = open('product_details.txt', 'w+')
buyer_details = open('buyer_details.txt', 'w+')
date_details = open ('date_details.txt', 'w+')
soup = BeautifulSoup(html)
print 'Geting Product Details...\r\n'
for order in soup.findAll('div', {'class': 'a-fixed-left-grid-col a-col-right'}):
for items in order.findAll('div', {'class': 'a-row'}):
for title in items.findAll('a', {'class': 'a-link-normal'}):
for title_text in title:
product_details.write(''.join(title_text))
for description in items.findAll('span', {'class': 'a-size-small'}):
for description_text in description:
product_details.write(''.join(description_text))
for sold_by in items.findAll('span', {'class': 'a-size-small a-color-secondary'}):
for sold_by_text in sold_by:
product_details.write(''.join(sold_by_text))
for price in items.findAll('span', {'class': 'a-size-small a-color-price'}):
for price_text in price:
product_details.write(''.join(price_text))
for condition in items.findAll('span', {'class': 'a-color-secondary'}):
for condition_text in condition:
product_details.write(''.join(condition_text))
print 'Getting Buyer Details....\r\n'
for order in soup.findAll('div', {'class': 'displayAddressDiv'}):
for items in order.findAll('ul', {'class': 'displayAddressUL'}):
for name in items.findAll('li', {'class': 'displayAddressLI displayAddressFullName'}):
for name_text in name:
buyer_details.write(''.join(name_text+'\r\n'))
for address in items.findAll('li', {'class': 'displayAddressLI displayAddressAddressLine1'}):
for address_text in address:
buyer_details.write(''.join(address_text+'\r\n'))
for city_state_postal_code in items.findAll('li', {'class': 'displayAddressLI displayAddressCityStateOrRegionPostalCode'}):
for city_state_postal_code_text in city_state_postal_code:
buyer_details.write(''.join(city_state_postal_code_text+'\r\n'))
for country in items.findAll('li', {'class': 'displayAddressLI displayAddressCountryName'}):
for country_text in country:
buyer_details.write(''.join(country_text+'\r\n'))
print 'Getting date information...\r\n'
for order in soup.findAll('div', {'class': 'a-row a-spacing-none'}):
for items in order.findAll('span', {'class': 'order-date-invoice-item'}):
for date_text in items:
date_details.write(''.join(str(date_text)))
product_details.close()
buyer_details.close()
date_details.close()
def cleaning_format(txt_file):
with open(txt_file, 'r+') as fff:
for line in fff:
cleanedLine = line.strip()
if cleanedLine: #is not empty
result.write(cleanedLine+'\r\n')
fff.close()
def Getting_orderID(htmls):
print 'Tracking Order No.s...\r\n'
soup = BeautifulSoup(htmls)
for order in soup.findAll("div", {"class": "a-row a-size-mini"}):
for item in order.findAll("span", {"class": "a-color-secondary value"}):
for idd in item:
order_no_id.write(''.join(idd))
##########################Main_Code_Starting_from_here######################################
if __name__ == '__main__':
banner()
sleep(1)
print '\r\n\r\n'
account_file = raw_input('Enter account file e.g. emails.txt: ')
try:
f1 = open(account_file).readlines()
except:
print '(!) Error in opening file. Please check file name with extension correctly.'
sys.exit(1)
x1 = str(len(f1))
print 'File Contains '+x1+' Entries.\r\n'
firstyear = raw_input('Enter Starting Year to start e.g. 2009: ')
with open(account_file, 'r+') as ff:
for words in ff:
print 'checking...'+words+'\r\n'
result.write('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\r\n')
result.write('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\r\n')
result.write(words+':::::::::::::::::::::::::: \r\n')
m = re.search(':', words)
usr = words[:m.start()]
pwd = words[m.end():]
pwd = pwd.strip()
br = mechanize.Browser()
br.set_handle_robots(False)
br.addheaders = [("User-agent", "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36"),('Accept', 'text/html, application/xml, */*')]
print 'Attempt to Logging in...'
resp = br.open("https://www.amazon.com/ap/signin?openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0&pageId=mas_dev_portal2&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.mode=checkid_setup&openid.assoc_handle=mas_dev_portal&openid.return_to=https%3A%2F%2Fdeveloper.amazon.com%2Fap_login.html&language=en_US&openid.pape.max_auth_age=1")
resp.set_data(re.sub('<!DOCTYPE(.*)>', '', resp.get_data()))
br.set_response(resp)
br.select_form(nr=0)
br["email"] = usr
br["password"] = pwd
logged_in = br.submit()
xx = logged_in.read()
error_str1 = "Important Message!"
error_str2 = "There was a problem with your request"
if error_str1 in xx:
print error_str1
print '\r\n'+'Please Check Account email and password...!\r\n'
not_login.write(words+'\r\n')
print 'The account which is not login will save in not_login.txt in same directory. \r\n'
if error_str2 in logged_in.read():
print error_str2
print '\r\n'+'Please Check Account email and password...!\r\n'
not_login.write(words+'\r\n')
print 'The account which is not login will save in not_login.txt in same directory. \r\n'
else:
print "Successfully Login...!\r\n"
sleep(2)
billing_details = br.open("https://www.amazon.com/gp/css/account/address/view.html?ie=UTF8&ref_=ya_manage_address_book_t1")
billing_details_info = billing_details.read()
account_info(billing_details_info)
sleep(2)
for year in range(int(firstyear), datetime.datetime.now().year):
orders_html = br.open("https://www.amazon.com/gp/css/history/orders/view.html?orderFilter=year-%s&startAtIndex=1000" % year)
print 'Getting Order No.s of '+str(year)+'\r\n'
orders_details = orders_html.read()
Getting_orderID(orders_details)
order_no_id.close()
orderss = open ('orders.txt', 'w+')
with open('order_ids.txt', 'r+') as fff:
for line in fff:
cleanedLine = line.strip()
if cleanedLine: #is not empty
orderss.write(cleanedLine+'\r\n')
orderss.close()
fff.close()
f11 = open('orders.txt').readlines()
x1 = str(len(f11))
print 'Total Orders is: '+x1+'\r\n'
result.write('Total Orders: '+x1+' \r\n')
print 'Getting Order Details....\r\n'
ff22 = open('orders.txt', 'r+')
sleep(2)
for order_details in ff22:
print 'Getting Details of Order No. '+str(order_details)+'\r\n'
order_link = br.open("https://www.amazon.com/gp/your-account/order-details/ref=oh_aui_or_o00_?ie=UTF8&orderID=%s" % order_details)
order_data = order_link.read()
orders_no_details_checking(order_data)
result.write('\r\nProduct Details: \r\n')
result.write('================\r\n')
cleaning_format('product_details.txt')
result.write('-----------------------------------------------------\r\n')
result.write('\r\nBuyer Details: \r\n')
result.write('==============\r\n')
cleaning_format('buyer_details.txt')
result.write('-----------------------------------------------------\r\n')
result.write('\r\nDate and Order No.: \r\n')
result.write('===================\r\n')
cleaning_format('date_details.txt')
result.write('-----------------------------------------------------\r\n')
ff22.close()
print 'Order History of All accounts have been saved in result.txt file...!\r\n'
print 'Check not_login.txt for invalid login Attempts. \r\n'
try:
result.close()
order_no_id.close()
not_login.close()
except:
pass
try:
os.remove('billing_info.txt')
os.remove('product_details.txt')
os.remove('buyer_details.txt')
os.remove('date_details.txt')
except:
pass
try:
os.remove('order_ids.txt')
os.remove('orders.txt')
except:
pass
print './ done'