Skip to content

Commit eb811bb

Browse files
author
vlad
committed
Add version, add fine docs, allow many queries
1 parent 00ad55d commit eb811bb

File tree

4 files changed

+86
-22
lines changed

4 files changed

+86
-22
lines changed

outscraper/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
from outscraper.api_client import *
1+
from outscraper.api_client import *

outscraper/api_client.py

Lines changed: 75 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@
22
import json
33
from time import sleep
44

5+
from .utils import as_list
6+
7+
8+
VERSION = '0.0.7'
9+
510

611
class ApiClient(object):
712
"""OutScraper ApiClient - Python SDK that allows extracting data from Google services via OutScraper API.
@@ -15,28 +20,34 @@ class ApiClient(object):
1520
"""
1621

1722
_api_url = 'https://api.app.outscraper.com'
18-
_api_key = None
23+
_api_headers = {}
1924

2025
_max_ttl = 60 * 10
2126

2227
def __init__(self, api_key):
23-
self._api_key = api_key
24-
25-
def get_request_archive(self, request_id):
26-
"""Fetch request data from archive
27-
Parameters:
28-
request_id (int): unique id for the request provided by ['id']
29-
Returns:
30-
dict: result from the archive
31-
"""
28+
self._api_headers = {
29+
'X-API-KEY': api_key,
30+
'client': f'Python SDK {VERSION}'
31+
}
32+
33+
def get_request_archive(self, request_id: str):
34+
'''
35+
Fetch request data from archive
36+
37+
Parameters:
38+
request_id (str): unique id for the request provided by ['id']
39+
40+
Returns:
41+
dict: result from the archive
42+
'''
3243
response = requests.get(f'{self._api_url}/requests/{request_id}')
3344

3445
if 199 < response.status_code < 300:
3546
return response.json()
3647

3748
raise Exception(f'Response status code: {response.status_code}')
3849

39-
def _wait_request_archive(self, request_id, requests_pause):
50+
def _wait_request_archive(self, request_id: str, requests_pause: int):
4051
ttl = self._max_ttl / requests_pause
4152

4253
while ttl > 0:
@@ -49,47 +60,91 @@ def _wait_request_archive(self, request_id, requests_pause):
4960

5061
raise Exception('Timeout exceeded')
5162

52-
def google_search(self, query, language='en', region='us'):
63+
def google_search(self, query, language='en', region=None):
64+
'''
65+
Get data from Google search
66+
67+
Parameters:
68+
query (list): parameter defines the query or queries you want to search on Google. Using a list allows multiple queries to be sent in one request and saves on network latency time.
69+
language (str): parameter specifies the language to use for Google. Available values: "en", "de", "es", "es-419", "fr", "hr", "it", "nl", "pl", "pt-BR", "pt-PT", "vi", "tr", "ru", "ar", "th", "ko", "zh-CN", "zh-TW", "ja", "ach", "af", "ak", "ig", "az", "ban", "ceb", "xx-bork", "bs", "br", "ca", "cs", "sn", "co", "cy", "da", "yo", "et", "xx-elmer", "eo", "eu", "ee", "tl", "fil", "fo", "fy", "gaa", "ga", "gd", "gl", "gn", "xx-hacker", "ht", "ha", "haw", "bem", "rn", "id", "ia", "xh", "zu", "is", "jw", "rw", "sw", "tlh", "kg", "mfe", "kri", "la", "lv", "to", "lt", "ln", "loz", "lua", "lg", "hu", "mg", "mt", "mi", "ms", "pcm", "no", "nso", "ny", "nn", "uz", "oc", "om", "xx-pirate", "ro", "rm", "qu", "nyn", "crs", "sq", "sk", "sl", "so", "st", "sr-ME", "sr-Latn", "su", "fi", "sv", "tn", "tum", "tk", "tw", "wo", "el", "be", "bg", "ky", "kk", "mk", "mn", "sr", "tt", "tg", "uk", "ka", "hy", "yi", "iw", "ug", "ur", "ps", "sd", "fa", "ckb", "ti", "am", "ne", "mr", "hi", "bn", "pa", "gu", "or", "ta", "te", "kn", "ml", "si", "lo", "my", "km", "chr".
70+
region (str): parameter specifies the country to use for Google. Available values: "AF", "AL", "DZ", "AS", "AD", "AO", "AI", "AG", "AR", "AM", "AU", "AT", "AZ", "BS", "BH", "BD", "BY", "BE", "BZ", "BJ", "BT", "BO", "BA", "BW", "BR", "VG", "BN", "BG", "BF", "BI", "KH", "CM", "CA", "CV", "CF", "TD", "CL", "CN", "CO", "CG", "CD", "CK", "CR", "CI", "HR", "CU", "CY", "CZ", "DK", "DJ", "DM", "DO", "EC", "EG", "SV", "EE", "ET", "FJ", "FI", "FR", "GA", "GM", "GE", "DE", "GH", "GI", "GR", "GL", "GT", "GG", "GY", "HT", "HN", "HK", "HU", "IS", "IN", "ID", "IQ", "IE", "IM", "IL", "IT", "JM", "JP", "JE", "JO", "KZ", "KE", "KI", "KW", "KG", "LA", "LV", "LB", "LS", "LY", "LI", "LT", "LU", "MG", "MW", "MY", "MV", "ML", "MT", "MU", "MX", "FM", "MD", "MN", "ME", "MS", "MA", "MZ", "MM", "NA", "NR", "NP", "NL", "NZ", "NI", "NE", "NG", "NU", "MK", "NO", "OM", "PK", "PS", "PA", "PG", "PY", "PE", "PH", "PN", "PL", "PT", "PR", "QA", "RO", "RU", "RW", "WS", "SM", "ST", "SA", "SN", "RS", "SC", "SL", "SG", "SK", "SI", "SB", "SO", "ZA", "KR", "ES", "LK", "SH", "VC", "SR", "SE", "CH", "TW", "TJ", "TZ", "TH", "TL", "TG", "TO", "TT", "TN", "TR", "TM", "VI", "UG", "UA", "AE", "GB", "US", "UY", "UZ", "VU", "VE", "VN", "ZM", "ZW".
71+
72+
Returns:
73+
dict: json result
74+
'''
5375
response = requests.get(f'{self._api_url}/search', params={
54-
'query': query,
76+
'query': as_list(query),
5577
'language': language,
5678
'region': region,
57-
}, headers={'X-API-KEY': self._api_key})
79+
}, headers=self._api_headers)
5880

5981
if 199 < response.status_code < 300:
6082
sleep(10)
6183
return self._wait_request_archive(response.json()['id'], 2)
6284

6385
raise Exception(f'Response status code: {response.status_code}')
6486

65-
def google_maps_search(self, query, language='en', region='us', limit=400, extract_contacts=False, coordinates=None):
87+
def google_maps_search(self, query, language='en', region=None, limit=400, extract_contacts=False, coordinates=None, drop_duplicates=False):
88+
'''
89+
Get data from Google Maps
90+
91+
Parameters:
92+
query (list): parameter defines the query or queries you want to search on Google Maps. Using a list allows multiple queries to be sent in one request and saves on network latency time.
93+
language (str): parameter specifies the language to use for Google. Available values: "en", "de", "es", "es-419", "fr", "hr", "it", "nl", "pl", "pt-BR", "pt-PT", "vi", "tr", "ru", "ar", "th", "ko", "zh-CN", "zh-TW", "ja", "ach", "af", "ak", "ig", "az", "ban", "ceb", "xx-bork", "bs", "br", "ca", "cs", "sn", "co", "cy", "da", "yo", "et", "xx-elmer", "eo", "eu", "ee", "tl", "fil", "fo", "fy", "gaa", "ga", "gd", "gl", "gn", "xx-hacker", "ht", "ha", "haw", "bem", "rn", "id", "ia", "xh", "zu", "is", "jw", "rw", "sw", "tlh", "kg", "mfe", "kri", "la", "lv", "to", "lt", "ln", "loz", "lua", "lg", "hu", "mg", "mt", "mi", "ms", "pcm", "no", "nso", "ny", "nn", "uz", "oc", "om", "xx-pirate", "ro", "rm", "qu", "nyn", "crs", "sq", "sk", "sl", "so", "st", "sr-ME", "sr-Latn", "su", "fi", "sv", "tn", "tum", "tk", "tw", "wo", "el", "be", "bg", "ky", "kk", "mk", "mn", "sr", "tt", "tg", "uk", "ka", "hy", "yi", "iw", "ug", "ur", "ps", "sd", "fa", "ckb", "ti", "am", "ne", "mr", "hi", "bn", "pa", "gu", "or", "ta", "te", "kn", "ml", "si", "lo", "my", "km", "chr".
94+
region (str): parameter specifies the country to use for Google. Available values: "AF", "AL", "DZ", "AS", "AD", "AO", "AI", "AG", "AR", "AM", "AU", "AT", "AZ", "BS", "BH", "BD", "BY", "BE", "BZ", "BJ", "BT", "BO", "BA", "BW", "BR", "VG", "BN", "BG", "BF", "BI", "KH", "CM", "CA", "CV", "CF", "TD", "CL", "CN", "CO", "CG", "CD", "CK", "CR", "CI", "HR", "CU", "CY", "CZ", "DK", "DJ", "DM", "DO", "EC", "EG", "SV", "EE", "ET", "FJ", "FI", "FR", "GA", "GM", "GE", "DE", "GH", "GI", "GR", "GL", "GT", "GG", "GY", "HT", "HN", "HK", "HU", "IS", "IN", "ID", "IQ", "IE", "IM", "IL", "IT", "JM", "JP", "JE", "JO", "KZ", "KE", "KI", "KW", "KG", "LA", "LV", "LB", "LS", "LY", "LI", "LT", "LU", "MG", "MW", "MY", "MV", "ML", "MT", "MU", "MX", "FM", "MD", "MN", "ME", "MS", "MA", "MZ", "MM", "NA", "NR", "NP", "NL", "NZ", "NI", "NE", "NG", "NU", "MK", "NO", "OM", "PK", "PS", "PA", "PG", "PY", "PE", "PH", "PN", "PL", "PT", "PR", "QA", "RO", "RU", "RW", "WS", "SM", "ST", "SA", "SN", "RS", "SC", "SL", "SG", "SK", "SI", "SB", "SO", "ZA", "KR", "ES", "LK", "SH", "VC", "SR", "SE", "CH", "TW", "TJ", "TZ", "TH", "TL", "TG", "TO", "TT", "TN", "TR", "TM", "VI", "UG", "UA", "AE", "GB", "US", "UY", "UZ", "VU", "VE", "VN", "ZM", "ZW".
95+
limit (int): parameter specifies the limit of organizations to take from one query search. Usually, there are no more than 400 organizations per one query search on Google Maps. Use more precise categories (asian restaurant, italian restaurant, etc.) to overcome this limitation.
96+
extract_contacts (bool): parameter specifies whether the bot will scrape additional data (emails, social links, site keywords…) from companies’ websites. It increases the time of the extraction.
97+
coordinates (str): parameter defines the coordinates to use along with the query. Example: "@41.3954381,2.1628662,15.1z".
98+
drop_duplicates (bool): parameter specifies whether the bot will drop the same organizations from different queries. Using the parameter combines results from each query inside one big array.
99+
100+
Returns:
101+
dict: json result
102+
'''
66103
response = requests.get(f'{self._api_url}/maps/search', params={
67104
'query': query,
68105
'coordinates': coordinates,
69106
'language': language,
70107
'region': region,
71108
'organizationsPerQueryLimit': limit,
72109
'extractContacts': extract_contacts,
73-
}, headers={'X-API-KEY': self._api_key})
110+
'dropDuplicates': drop_duplicates,
111+
}, headers=self._api_headers)
74112

75113
if 199 < response.status_code < 300:
76114
sleep(15)
77115
return self._wait_request_archive(response.json()['id'], 5)
78116

79117
raise Exception(f'Response status code: {response.status_code}')
80118

81-
def google_maps_business_reviews(self, query, language='en', region='us', limit=100, cutoff=None, coordinates=None, sort='most_relevant', cutoff_rating=None):
119+
def google_maps_business_reviews(self, query, language='en', region=None, limit=100, cutoff=None, coordinates=None, sort='most_relevant', cutoff_rating=None, organizations_per_query_limit=1):
120+
'''
121+
Get reviews from Google Maps
122+
123+
Parameters:
124+
query (list): parameter defines the query or queries you want to search on Google Maps. Using a list allows multiple queries to be sent in one request and saves on network latency time.
125+
language (str): parameter specifies the language to use for Google. Available values: "en", "de", "es", "es-419", "fr", "hr", "it", "nl", "pl", "pt-BR", "pt-PT", "vi", "tr", "ru", "ar", "th", "ko", "zh-CN", "zh-TW", "ja", "ach", "af", "ak", "ig", "az", "ban", "ceb", "xx-bork", "bs", "br", "ca", "cs", "sn", "co", "cy", "da", "yo", "et", "xx-elmer", "eo", "eu", "ee", "tl", "fil", "fo", "fy", "gaa", "ga", "gd", "gl", "gn", "xx-hacker", "ht", "ha", "haw", "bem", "rn", "id", "ia", "xh", "zu", "is", "jw", "rw", "sw", "tlh", "kg", "mfe", "kri", "la", "lv", "to", "lt", "ln", "loz", "lua", "lg", "hu", "mg", "mt", "mi", "ms", "pcm", "no", "nso", "ny", "nn", "uz", "oc", "om", "xx-pirate", "ro", "rm", "qu", "nyn", "crs", "sq", "sk", "sl", "so", "st", "sr-ME", "sr-Latn", "su", "fi", "sv", "tn", "tum", "tk", "tw", "wo", "el", "be", "bg", "ky", "kk", "mk", "mn", "sr", "tt", "tg", "uk", "ka", "hy", "yi", "iw", "ug", "ur", "ps", "sd", "fa", "ckb", "ti", "am", "ne", "mr", "hi", "bn", "pa", "gu", "or", "ta", "te", "kn", "ml", "si", "lo", "my", "km", "chr".
126+
region (str): parameter specifies the country to use for Google. Available values: "AF", "AL", "DZ", "AS", "AD", "AO", "AI", "AG", "AR", "AM", "AU", "AT", "AZ", "BS", "BH", "BD", "BY", "BE", "BZ", "BJ", "BT", "BO", "BA", "BW", "BR", "VG", "BN", "BG", "BF", "BI", "KH", "CM", "CA", "CV", "CF", "TD", "CL", "CN", "CO", "CG", "CD", "CK", "CR", "CI", "HR", "CU", "CY", "CZ", "DK", "DJ", "DM", "DO", "EC", "EG", "SV", "EE", "ET", "FJ", "FI", "FR", "GA", "GM", "GE", "DE", "GH", "GI", "GR", "GL", "GT", "GG", "GY", "HT", "HN", "HK", "HU", "IS", "IN", "ID", "IQ", "IE", "IM", "IL", "IT", "JM", "JP", "JE", "JO", "KZ", "KE", "KI", "KW", "KG", "LA", "LV", "LB", "LS", "LY", "LI", "LT", "LU", "MG", "MW", "MY", "MV", "ML", "MT", "MU", "MX", "FM", "MD", "MN", "ME", "MS", "MA", "MZ", "MM", "NA", "NR", "NP", "NL", "NZ", "NI", "NE", "NG", "NU", "MK", "NO", "OM", "PK", "PS", "PA", "PG", "PY", "PE", "PH", "PN", "PL", "PT", "PR", "QA", "RO", "RU", "RW", "WS", "SM", "ST", "SA", "SN", "RS", "SC", "SL", "SG", "SK", "SI", "SB", "SO", "ZA", "KR", "ES", "LK", "SH", "VC", "SR", "SE", "CH", "TW", "TJ", "TZ", "TH", "TL", "TG", "TO", "TT", "TN", "TR", "TM", "VI", "UG", "UA", "AE", "GB", "US", "UY", "UZ", "VU", "VE", "VN", "ZM", "ZW".
127+
limit (int): parameter specifies the limit of reviews to extract from one organization.
128+
cutoff (int): parameter specifies the maximum timestamp value for reviews. Using the cutoff parameter overwrites sort parameter to newest.
129+
coordinates (str): parameter defines the coordinates to use along with the query. Example: "@41.3954381,2.1628662,15.1z".
130+
sort (str): parameter specifies one of the sorting types. Available values: "most_relevant", "newest", "highest_rating", "lowest_rating".
131+
cutoff_rating (int): parameter specifies the maximum (for lowest_rating sorting) or minimum (for highest_rating sorting) rating for reviews. Using the cutoffRating requires sorting to be set to "lowest_rating" or "highest_rating".
132+
organizations_per_query_limit (int): parameter specifies the limit of organizations to take from one query search.
133+
134+
Returns:
135+
dict: json result
136+
'''
82137
response = requests.get(f'{self._api_url}/maps/reviews', params={
83-
'query': query,
138+
'query': as_list(query),
84139
'coordinates': coordinates,
85140
'language': language,
86141
'region': region,
87-
'limit': 1,
88142
'cutoff': cutoff,
89143
'cutoffRating': cutoff_rating,
144+
'organizationsPerQueryLimit': organizations_per_query_limit,
90145
'reviewsPerOrganizationLimit': limit,
91146
'sort': sort,
92-
}, headers={'X-API-KEY': self._api_key})
147+
}, headers=self._api_headers)
93148

94149
if 199 < response.status_code < 300:
95150
sleep(30)

outscraper/utils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
2+
def as_list(value):
3+
if isinstance(value, list):
4+
return value
5+
else:
6+
return [value]

setup.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
11
from setuptools import setup
22

33

4+
from outscraper import VERSION
5+
6+
47
def readme():
58
with open('README.rst') as f:
69
return f.read()
710

811

912
setup(
1013
name='google-services-api',
11-
version='0.0.6',
14+
version=VERSION,
1215
description='Google services extractor by OutScraper API',
1316
long_description=readme(),
1417
classifiers = ['Programming Language :: Python',

0 commit comments

Comments
 (0)