-
Notifications
You must be signed in to change notification settings - Fork 0
/
grubhub_scrape.py
129 lines (109 loc) · 5.44 KB
/
grubhub_scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time, os, platform, json, sys
# Restaurant page that the script scrapes when run.
GH_URL = 'https://www.grubhub.com/restaurant/example_restaurant'
# NOTE(review): MENU is not referenced by the code visible in this file;
# presumably it is the id of the target menu record -- confirm before relying on it.
MENU = '1599675614973x652726785232916400'

# The chromedriver binary is looked up under ./assets relative to the current
# working directory, with the lower-cased OS name as its suffix.
dirpath = os.getcwd()
chromepath = '{}/assets/chromedriver_{}'.format(dirpath, platform.system().lower())
def _tag_text(tag):
    """Return the text of a BeautifulSoup tag, or '' when the lookup found nothing."""
    return tag.text if tag is not None else ''


def _parse_selection_limit(rule_text):
    """Return the first whole number in *rule_text*, or 0 when it contains none."""
    for token in rule_text.split():
        if token.isdigit():
            return int(token)
    # 0 is the value this module already uses for "Choose as many as you like".
    return 0


def _parse_choice(text):
    """Split 'Name + $1.50' into (name, price); price is None when no price is listed."""
    if ' + ' not in text:
        return text, None
    # Split on the LAST ' + ' so names that themselves contain ' + ' stay intact.
    name, _, price_text = text.rpartition(' + ')
    try:
        return name, float(price_text.replace('$', ''))
    except ValueError:
        # The trailing part is not a price, so the ' + ' belongs to the name.
        return text, None


def get_item(browser, id, item_price):
    """Given an element id, open a menu item and scrape all of its options.

    Args:
        browser: Selenium WebDriver with the restaurant page loaded.
        id: DOM id of the menu item element to click (the name shadows the
            builtin but is kept so existing callers are unaffected).
        item_price: top-level price of the item, used to detect size modifiers.

    Returns:
        A list with one dict per ``menuItemModal-options`` block, holding:
        ``modifiername_text``, ``required_boolean``,
        ``numberallowedselections_number`` (0 when the rule is "Choose as many
        as you like" or no number could be parsed), optionally
        ``modifiersize_boolean`` (only present when True), and
        ``modifiermenuitems_list_custom_menuitem`` (list of
        ``{'name_text', 'price_number'}`` dicts).
    """
    # The find_element_by_* helpers were removed in Selenium 4.3; the generic
    # find_element with the "id" locator strategy works on old and new releases.
    button = browser.find_element("id", id)
    # Click through JavaScript rather than a native click.
    browser.execute_script("arguments[0].click();", button)
    # Give the item modal time to render before reading the page source.
    time.sleep(1)
    html = BeautifulSoup(browser.page_source, 'html.parser')

    scraped_options = []
    for option in html.find_all('div', class_='menuItemModal-options'):
        single_option = dict()
        single_option['modifiername_text'] = _tag_text(
            option.find(class_='menuItemModal-choice-name'))

        # Instructions look like "<Required|...> - <rule>."; the rule part may be absent.
        instructions = _tag_text(option.find(class_='menuItemModal-choice-instructions'))
        parts = instructions.replace('.', '').split(' - ')
        required = parts[0] == 'Required'
        rule = parts[1] if len(parts) > 1 else ''

        single_option['required_boolean'] = required
        if not required and rule == "Choose as many as you like":
            single_option['numberallowedselections_number'] = 0
        else:
            single_option['numberallowedselections_number'] = _parse_selection_limit(rule)

        parsed_choices = []
        choices = option.find_all('span', class_='menuItemModal-choice-option-description')
        for position, choice in enumerate(choices):
            name, price = _parse_choice(choice.text)
            if price is None:
                parsed_choices.append({'name_text': name, 'price_number': 0})
                continue
            parsed_choices.append({'name_text': name, 'price_number': price})
            # Inferring that if the first choice's price equals the top-level
            # item price, then this must be a size-based modifier.
            if position == 0 and price == item_price:
                single_option['modifiersize_boolean'] = True

        single_option['modifiermenuitems_list_custom_menuitem'] = parsed_choices
        scraped_options.append(single_option)
    return scraped_options
def _build_menu(browser, cats, menu_id, category):
    """Turn the parsed menu sections into the {title: [item, ...]} dictionary."""
    cat_titles = [cat.find('h3', class_='menuSection-title').text for cat in cats]
    print(cat_titles)
    cat_items = [[itm.text for itm in cat.find_all('a', class_='menuItem-name')] for cat in cats]
    prices = [[p.text for p in cat.find_all('span', class_='menuItem-displayPrice')] for cat in cats]
    descriptions = [[itm.text for itm in cat.find_all('p', class_='menuItemNew-description--truncate')] for cat in cats]
    images = [[itm['src'] for itm in cat.find_all('img', class_='u-width-full u-height-full menuItemNew-imageMagazine-img')] for cat in cats]
    print(images)
    ids = [[div.get('id') for div in cat.find_all('div', class_='menuItem-inner')] for cat in cats]

    full_menu = {}
    for ind, title in enumerate(cat_titles):
        if category is not None and title != category:
            continue
        all_items = []
        for ind2, itm_name in enumerate(cat_items[ind]):
            item = {
                'name_text': itm_name,
                'price_number': float(prices[ind][ind2].replace('$', '')),
                'description_text': descriptions[ind][ind2],
                'menu_custom_menu': menu_id,
            }
            # NOTE(review): names, prices, descriptions and images are matched
            # purely by position; if an item lacks one of these elements the
            # lists may shift out of alignment -- verify against the live page.
            try:
                item['image_image'] = images[ind][ind2]
            except IndexError:
                # Fewer images than items in this section: leave the key out.
                pass
            item['menuitemmodifiers_list_custom_menuitemmodifiers'] = get_item(browser, ids[ind][ind2], item['price_number'])
            all_items.append(item)
        full_menu[title] = all_items
    return full_menu


def _write_menu(full_menu, output_path):
    """Write the scraped menu as indented JSON, creating the target directory if needed."""
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(full_menu, f, indent=4)


def scrape_menu(url, menu_id, category='Dinner Entrees',
                output_path='data/data_cd_dinner2.json', max_attempts=3):
    """Given a valid grubhub url, scrape the menu of a restaurant.

    Args:
        url: restaurant page to load.
        menu_id: value stored under ``menu_custom_menu`` on every scraped item.
        category: title of the menu section to scrape; ``None`` scrapes every
            section. Defaults to the section the script originally targeted.
        output_path: JSON file the result is written to.
        max_attempts: how many times the page is loaded while the menu
            container is missing from the page source.

    Returns:
        A dict mapping section title to a list of item dicts, or ``None`` when
        the menu container never appeared or the menu has no sections.
    """
    print('Running...')
    for _ in range(max_attempts):
        chrome_options = Options()
        # To disable headless mode (for debugging or troubleshooting), comment out the following line:
        chrome_options.add_argument("--headless")
        browser = webdriver.Chrome(options=chrome_options, executable_path=chromepath)
        try:
            browser.get(url)
            # Fixed wait so the page can render before the source is read.
            time.sleep(7)
            html = BeautifulSoup(browser.page_source, 'html.parser')
            # find() yields None when the container is absent, whereas
            # find_all(...)[0] raised IndexError and skipped the retry below.
            menu = html.find("div", {"class": "menuSectionsContainer"})
            if menu is None:
                print('menu fail')
                continue  # the finally clause quits this browser before the retry

            # Categories
            cats = menu.find_all('ghs-restaurant-menu-section')
            if not cats:
                print('cats fail')
                return None

            full_menu = _build_menu(browser, cats, menu_id, category)
        finally:
            # Always shut the driver down so no chromedriver process is left behind.
            browser.quit()

        _write_menu(full_menu, output_path)
        print('[Finished]')
        return full_menu
    return None
# example link: 'https://www.grubhub.com/restaurant/insomnia-cookies-76-pearl-st-new-york/295836'
if __name__ == '__main__':
    # Only scrape when executed as a script, so importing this module does not
    # launch a browser.
    # NOTE(review): the literal 3 is passed as menu_id although a MENU constant
    # is defined at the top of the file -- confirm which value is intended.
    scrape_menu(GH_URL, 3)