-
Notifications
You must be signed in to change notification settings - Fork 1
/
Yelp_explore.py
63 lines (38 loc) · 1.55 KB
/
Yelp_explore.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# coding: utf-8
# In[46]:
import pandas as pd
import numpy as np
import os
# In[9]:
filename = 'yelp_business.csv'
filepath = os.path.join('~','homework','yelp-dataset',filename)
business_data_df = pd.read_csv(filepath)
print(f"Businesses in dataset: {business_data_df['business_id'].nunique()}")
business_data_df.head(5)
# In[39]:
# Extract Categories
food_industry_check = lambda x: 'Food' in x or 'Restaurants' in x or 'Bars' in x
categories = []
restaurant_categories = []
other_categories = []
for business in business_data_df['categories']:
these_categories = business.split(';')
if food_industry_check:
for category in these_categories:
restaurant_categories.append(category)
else:
for category in these_categories:
other_categories.append(category)
for category in these_categories:
categories.append(category)
categories_series = pd.Series(pd.Series(categories).unique())
restaurant_categories_series = pd.Series(pd.Series(restaurant_categories).unique())
other_categories = pd.Series(pd.Series(other_categories).unique())
# In[42]:
restaurant_categories_series.count()
# In[47]:
n_businesses = business_data_df['business_id'].loc[business_data_df['categories'].map(food_industry_check)].count()
print(f'Food Service Industry Businesses: {n_businesses}\n{np.round(100*n_businesses/business_data_df["business_id"].nunique())}% of dataset')
# In[49]:
restaurants = business_data_df.loc[business_data_df['categories'].map(food_industry_check)]
restaurants.to_csv('restaurants.csv',index=False)