-
Notifications
You must be signed in to change notification settings - Fork 1
/
brands_scrapper.py
41 lines (35 loc) · 1.16 KB
/
brands_scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import requests
from bs4 import BeautifulSoup
import time
import os
# Index keys to crawl: every lowercase letter plus '*'.
# NOTE(review): '*' presumably selects brands that do not start with a
# letter -- confirm against the target site.
alphabets = [*"abcdefghijklmnopqrstuvwxyz", "*"]

# Base search URL, taken from the environment (KeyError at start-up if the
# variable is unset). The "&f=" / "&pg=" suffixes appended below mean it is
# expected to already contain a query string.
BASE_URL = os.environ["brand_data_site"]

# Seconds to wait for the server before giving up; without a timeout
# requests would block forever on a stalled connection.
_REQUEST_TIMEOUT = 30


def _fetch_page(url):
    """Download *url* and return the response body parsed with html.parser.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection failures or timeout.
    """
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    # Fail loudly instead of scraping an error page as if it were results.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def _brand_names(page):
    """Return the brand names found on a parsed results *page*.

    A name is the ``title`` attribute of the first ``<a>`` inside each
    ``<h2>`` of the ``leftCellContent`` div. Pages without that div, and
    headings without a titled link, contribute nothing.
    """
    brand_table = page.find('div', class_='leftCellContent')
    if brand_table is None:
        return []
    names = []
    for heading in brand_table.find_all('h2'):
        anchor = heading.find('a')
        title = anchor.get('title') if anchor is not None else None
        if title is not None:
            names.append(title)
    return names


def _write_names(out, names):
    """Append each name to *out* followed by a comma and echo it to stdout."""
    # NOTE(review): names are written raw, so a brand containing a comma
    # will split into two fields when the file is read back as CSV.
    for name in names:
        out.write(f"{name},")
        print(name)


for alphabet in alphabets:
    # Append mode keeps earlier letters (and earlier runs); the context
    # manager closes the file even if a request or parse step raises.
    with open("food_brands.csv", "a") as f:
        first_page = _fetch_page(f"{BASE_URL}&f={alphabet}")
        _write_names(f, _brand_names(first_page))

        # The number of links in the paging div is used as the page count;
        # a missing div is treated as "no further pages".
        pages_div = first_page.find('div', class_='searchResultsPaging')
        no_of_pages = len(pages_div.find_all('a')) if pages_div is not None else 0

        # NOTE(review): pg starts at 1, presumably because the request above
        # already returned pg=0 -- confirm that the link count matches the
        # highest valid pg index on the site.
        for i in range(1, no_of_pages):
            page = _fetch_page(f"{BASE_URL}&pg={i}&f={alphabet}")
            _write_names(f, _brand_names(page))

    # Pause between letters to avoid hammering the site.
    time.sleep(2)