-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
97 lines (81 loc) · 3.53 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# Module setup for the Yelp -> DynamoDB scraper: imports, credentials, the
# HTTP endpoint/headers used for the search requests, and the DynamoDB table.
#
# NOTE: this file originally began with the notebook shell magic
# "!pip install requests_aws4auth". That line is a SyntaxError when the file
# is run as a plain .py script, so install the dependency from a shell:
#     pip install requests_aws4auth
import boto3
import datetime
import requests
import json, os
from datetime import datetime
# from elasticsearch import Elasticsearch
from botocore.exceptions import ClientError
from decimal import *
from time import sleep
# from opensearchpy import OpenSearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth

# Secrets come from the environment so they never have to be committed.
# The empty-string fallback keeps the original placeholder behaviour when the
# variables are not set.
aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID", "")
aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY", "")
# NOTE(review): region/service are still empty placeholders and are only fed
# to AWS4Auth below; the DynamoDB resource uses "us-west-2" -- confirm intent.
region = ""
service = ""
# host = ""

# HTTP settings read by the request loop further down. The query parameters
# and response fields used there match Yelp Fusion's business search, which
# authenticates with a bearer API key -- TODO confirm endpoint and key source.
yelp_api_key = os.environ.get("YELP_API_KEY", "")
ENDPOINT = "https://api.yelp.com/v3/businesses/search"
HEADERS = {"Authorization": f"Bearer {yelp_api_key}"}

dynamodb = boto3.resource(
    service_name="dynamodb",
    # Empty placeholders are passed as None so boto3 falls back to its
    # default credential chain instead of signing with blank keys.
    aws_access_key_id=aws_access_key_id or None,
    aws_secret_access_key=aws_secret_access_key or None,
    region_name="us-west-2",
)
table = dynamodb.Table('Yelp-restaurant')
auth = AWS4Auth(aws_access_key_id, aws_secret_access_key, region, service)

# Keep this import AFTER `from datetime import datetime`: it rebinds the name
# `datetime` to the module, which the item-building code relies on when it
# calls `datetime.datetime.now()`.
import datetime
# dynamodb = boto3.resource('dynamodb', region_name='us-west-2')
# table = dynamodb.Table('Yelp-restaurants')

# Category aliases queried one after another by the request loop.
cuisine_types = ['chinese', 'indian', 'mexican']
def save_data_to_json(index_data, restaurant_data,
                      index_path='restaurant_index.json',
                      data_path='restaurant_data.json'):
    """Write the two payloads to disk as 4-space-indented JSON.

    Args:
        index_data: JSON-serialisable object written to ``index_path``.
        restaurant_data: JSON-serialisable object written to ``data_path``.
        index_path: Destination for ``index_data``. Defaults to the file name
            that used to be hard-coded, so two-argument calls are unchanged.
        data_path: Destination for ``restaurant_data``; same default rule.

    Raises:
        TypeError: If a payload contains a value ``json`` cannot serialise.
        OSError: If a destination file cannot be opened for writing.
    """
    # The index file is written first, then the data file. An explicit
    # encoding keeps the output independent of the platform default.
    for path, payload in ((index_path, index_data), (data_path, restaurant_data)):
        with open(path, 'w', encoding='utf-8') as out_file:
            json.dump(payload, out_file, indent=4)
# Paging for the search requests: every request asks for PAGE_SIZE results
# and PAGES_PER_CUISINE consecutive pages are fetched for each cuisine.
PAGE_SIZE = 50
PAGES_PER_CUISINE = 1
# Seconds to wait for an HTTP response before giving up on that page.
REQUEST_TIMEOUT = 10


def _to_decimal(value):
    """Return *value* as a Decimal, or None when no value was supplied.

    boto3's DynamoDB serializer does not accept Python floats, so numbers are
    converted via str() to keep the digits exactly as they appear in the JSON.
    None is passed through so a record with a null coordinate or rating is
    stored with a null attribute instead of raising decimal.InvalidOperation.
    """
    if value is None:
        return None
    return Decimal(str(value))


def _build_item(biz):
    """Translate one entry of the response's 'businesses' list into a table item.

    Raises KeyError, IndexError or TypeError when the record lacks one of the
    fields read below (for example an empty 'categories' list).
    """
    location = biz['location']
    coordinates = biz['coordinates']
    return {
        'businessId': biz['id'],
        'name': biz['name'],
        # Only the first category alias of the business is stored.
        'category': biz['categories'][0]['alias'],
        'address': location['address1'],
        'city': location['city'],
        'zipcode': location['zip_code'],
        'latitude': _to_decimal(coordinates['latitude']),
        'longitude': _to_decimal(coordinates['longitude']),
        'reviewCount': biz['review_count'],
        'rating': _to_decimal(biz['rating']),
        'phone': biz['phone'],
        'url': str(biz['url']),
        # `datetime` must be bound to the module here (datetime.datetime is
        # the class); the timestamp is naive local time.
        'insertedAtTimestamp': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }


# Fetch every configured page for every cuisine and store the businesses in
# DynamoDB. ENDPOINT, HEADERS, cuisine_types and table must already be
# defined at module level when this loop runs.
for cuisine_type in cuisine_types:
    for page in range(PAGES_PER_CUISINE):
        # Start at offset 0: incrementing before the first request skipped
        # the first PAGE_SIZE results of every cuisine.
        offset = page * PAGE_SIZE
        PARAMETERS = {
            'term': 'restaurant',
            'location': 'New York',
            'radius': 40000,
            'categories': cuisine_type,
            'limit': PAGE_SIZE,
            'offset': offset,
            'sort_by': 'best_match',
        }
        try:
            response = requests.get(
                url=ENDPOINT,
                params=PARAMETERS,
                headers=HEADERS,
                timeout=REQUEST_TIMEOUT,  # never hang forever on a dead connection
            )
        except requests.RequestException as e:
            print(f"Request for {cuisine_type} (offset {offset}) failed: {e}")
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve data. Status code: {response.status_code}")
            # Without this, the write below would hit an unbound name on the
            # first cuisine or re-insert the previous cuisine's results.
            continue

        business_data = response.json()
        # NOTE: opened in 'w' mode on every page, so the file only ever holds
        # the most recent response.
        with open('restaurant_data.json', 'w') as datafile:
            json.dump(business_data, datafile, indent=4)

        try:
            with table.batch_writer() as batch:
                for biz in business_data.get('businesses', []):
                    try:
                        item = _build_item(biz)
                    except (KeyError, IndexError, TypeError) as e:
                        # One malformed record should not abort the whole run.
                        print(f"Skipping malformed business record: {e!r}")
                        continue
                    try:
                        batch.put_item(Item=item)
                    except ClientError as e:
                        print(e.response['Error']['Code'])
        except ClientError as e:
            # The batch writer flushes its remaining buffered items when the
            # `with` block exits; report a failure there the same way.
            print(e.response['Error']['Code'])