# -*- coding: utf-8 -*-
"""
Created on Thu Jun 18 09:40:47 2020
Use modules taken from the original Kaggle dataset to scrape recipes
from the epicurious website:
https://www.kaggle.com/hugodarwood/epirecipes?select=recipe.py
Here I do not search for all recipes in one go, but search for specific
recipe categories (e.g. vegan, paleo, etc.). The purpose is potentially
two-fold: 1.) I want to see if I get more recipes than by searching
without a search term; and 2.) I want to be able to assign new categories
if I think there are some missing from the recipes I currently have.
@author: sbuer
"""
# Package for scraping recipes from many popular websites, for details see
# https://github.com/sbuergers/recipe-scrapers/blob/master/recipe_scrapers/epicurious.py
from recipe_scrapers import scrape_me
# Get HTML from website
import requests
# Regular expressions
import re
# Input / output
import pickle
import json
# Data handling
import pandas as pd
# Check execution time
import time

# Main function - gets all recipe links for a specific search term from
# epicurious and returns them as a list of unique links
def scrape_epi_links(search_term):
    # URL of epicurious search for newest recipes:
    # 'https://www.epicurious.com/search/vegan?content=recipe&sort=newest'
    url_start = r'https://www.epicurious.com/search/'
    url_end = r'?content=recipe&sort=newest'
    initial_search_url = url_start + search_term + url_end

    # After the first page the url also includes the page number as follows:
    # https://www.epicurious.com/search?content=recipe&page=2&sort=newest

    # Scrape search url and get HTML text
    page = requests.get(initial_search_url)
    html_text = page.content.decode('utf-8')

    # Find recipe urls and collect unique recipe links in a list
    # Example: href="/recipes/food/views/spring-chicken-dinner-salad"
    re_rec = r"\/recipes\/food\/views\/(\w+|\-)+"
    recipe_links = list(set([x.group() for x in re.finditer(re_rec, html_text)]))

    # Go through additional recipes by increasing the page number in the url
    start_time = time.time()
    pagenum = 2
    while True:
        #for i in range(0,10): # try with for-loop first for testing
        # progress
        if pagenum % 10 == 0:
            print("Page #", pagenum, "Number of recipes scraped = ", len(recipe_links))
        # get next recipe page in HTML
        search_url = url_start + search_term + "?content=recipe&page={}&sort=newest".format(pagenum)
        page = requests.get(search_url)
        # stop looking when max page number is reached (request no longer succeeds)
        if page:
            html_text = page.content.decode('utf-8')
            pagenum += 1
            # collect recipe links and append to list
            more_links = list(set([x.group() for x in re.finditer(re_rec, html_text)]))
            recipe_links += more_links
        else:
            print("Reached bottom of page")
            break
    print("--- %s seconds ---" % (time.time() - start_time))

    # Make sure recipe links are truly unique (should already be)
    recipe_links = list(set(recipe_links))
    return recipe_links
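
# A minimal sketch of how the function could be looped over several
# categories. The category names below are illustrative assumptions (only
# vegan and paleo are mentioned above), and the loop is kept commented out
# so the single-term test below still runs on its own:
# links_by_category = {}
# for term in ['vegan', 'paleo', 'vegetarian', 'gluten-free']:
#     links_by_category[term] = scrape_epi_links(term)
#     print(term, ':', len(links_by_category[term]), 'links')
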
# # Load recipe links from pickle file
# with open('epi_recipe_links', 'rb') as io:
# # read the data as binary data stream
# recipe_links = pickle.load(io)
# Scrape recipe links by category (saving to file is commented out at the bottom)
# Try out for one search term to test
search_term = r'Vegan'
recipe_links = scrape_epi_links(search_term)
# Do I have all of these also in the non-directed search list of recipe links?
# Load in previously scraped data
with open('epi_recipes_detailed', 'r') as io:
    data = json.load(io)
df = pd.DataFrame(data)

# Check how many recipes I already had collected
old_links = df['url'].values
new_links = [l[len('/recipes/food/views/'):] for l in recipe_links]
matches = [n in old_links for n in new_links]
print('We already had', sum(matches), 'out of', len(recipe_links), 'recipe links.')
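
# A small follow-up sketch (assumes the set of unmatched links is small
# enough to print directly) to inspect the links that were not in the
# earlier, non-directed scrape:
additional = [n for n, m in zip(new_links, matches) if not m]
print('Links not found in the earlier scrape:', additional)
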
# It's probably safe to say that the few additional ones are simply new.
# So we can conclude that searching for specific epicurious categories does
# not yield recipe links that we did not get with a simple unspecified search.
# The question remains: why does epicurious claim to have 300k+ recipes?
# I can only find fewer than 40k when searching their website...
# Save recipe links to pickle file (binary append mode)
#with open('epi_recipe_links_by_category', 'ab') as io:
#    pickle.dump(recipe_links, io)
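
# Alternatively, a simple sketch for saving the links as JSON instead of
# pickle (the file name is an assumption, chosen to mirror the pickle file
# name above); kept commented out like the pickle version:
# with open('epi_recipe_links_by_category.json', 'w') as io:
#     json.dump(recipe_links, io)
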
# eof