import os
import json
from bs4 import BeautifulSoup as BS

# Append-only log of files that could not be read or parsed.
log = open('log', 'a+')


def readFile(fp):
    # If the file is zero length the url fetcher failed.
    if os.stat(fp).st_size == 0:
        raise Exception("[ERROR] File zero length: Does the url still exist?"
                        " Consider checking the following file.", fp)
    with open(fp, 'r', encoding='utf-8') as f:
        lines = f.read()
    return lines


def handleException(*args, **kwargs):
    if not os.path.exists('./DEBUG'):
        os.makedirs('./DEBUG')
    # 'lines' and 'soup' are bound here so they are available in the
    # pdb session started below.
    lines = kwargs['lines']
    soup = kwargs['soup']
    try:
        with open('./DEBUG/html', 'w+') as f:
            f.write(soup.prettify())
        print(kwargs['e'])
    except Exception:
        # soup may be None when the page never parsed; skip the dump.
        pass
    for key in kwargs['d']:
        if key == '_reval':
            for _rkey in kwargs['d']['_reval']:
                print('{0}: {1}'.format(_rkey, kwargs['d']['_reval'][_rkey]))
            continue
        print('{0}: {1}'.format(key, kwargs['d'][key]))
    print(kwargs['e'])
    print(kwargs['t'])
    print('''
#############
The html file has been written out to './DEBUG/html'.
Scroll up to see the printed product info. If the _type is 'json' then
'soup' and 'lines' may not exist. Otherwise, this means the scraper was
unable to find the name or price.
Pdb is an inline debugger, so all Python one-liners apply.
    (Pdb) kwargs.keys()
    dict_keys(['t', 'd', 'soup', 'e', 'lines'])
t = traceback. By following the line numbers in the traceback
    we can figure out where it failed.
d = a dictionary filled with product info
e = the error
lines = what was being parsed at the time of the error
type: 'soup' and/or 'lines' to view the html
type: dir() to see what there is to work with
type: 'continue' to ignore the error
type: 'l' to see where we are in the code
type: 'h' to see pdb commands
type: 'args' or 'kwargs' to see what info is available
    related to the item in question
If 'soup' and/or 'lines' is empty, the page doesn't exist
or failed to download.
#############
''')
    import pdb; pdb.set_trace()
    return


def returnJsonFromBulkSupplements(fp, leadsplit, endsplit):
    # The product JSON is embedded in the page HTML; cut it out between the
    # leadsplit and endsplit markers, then parse it.
    try:
        lines = readFile(fp)
    except Exception as e:
        # e.args may hold non-strings (e.g. an errno), so stringify them.
        log.write('\n'.join([str(a) for a in e.args] + ['']))
        return None
    _json = lines.split(leadsplit)[1].split(endsplit)[0]
    _json = json.loads(_json)
    return _json
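

# A minimal sketch of the lead/end split technique above, with hypothetical
# markers (not necessarily the ones the real config uses):
#
#   lines = '<script>var product = {"price": "9.95"};</script>'
#   raw = lines.split('var product = ')[1].split(';</script>')[0]
#   json.loads(raw)['price']  # -> '9.95'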


def returnJsonFromJsonDumpFile(fp):
    try:
        _json = readFile(fp)
    except Exception:
        # Re-raise with the original traceback intact.
        raise
    _json = json.loads(_json)
    return _json


def returnItemInfoFromJson(_json, *args, **kwargs):
    '''
    print(_json.keys())
    dict_keys(['details', 'success'])
    print(_json['details'].keys())
    dict_keys(['sku', 'thumb', 'purchasable', 'baseThumb', 'unformattedPrice',
               'unformattedRrp', 'upc', 'priceLabel', 'price', 'base',
               'instock', 'baseImage', 'image', 'rrp'])
    '''
    fp = kwargs['fp']
    name = None
    price = None
    if _json is None:
        return
    # The name is derived from the file name of the thumbnail image.
    name = _json['details']['thumb'].split('/')[-1].split('.')[0].split('__')[0]
    price = _json['details']['price']
    check_name_price(fp, name, price)
    return name, price
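

# Example of the thumbnail-based name extraction above, on a hypothetical URL:
#   'https://cdn.example.com/media/ashwagandha__2.jpg'
#   .split('/')[-1]  -> 'ashwagandha__2.jpg'
#   .split('.')[0]   -> 'ashwagandha__2'
#   .split('__')[0]  -> 'ashwagandha'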


def check_error(e, fp, *args, **kwargs):
    # Placeholder hook for per-error handling; currently a no-op.
    # import pdb; pdb.set_trace()
    return


def check_name_price(fp, name, price):
    # Returns (ok, error): ok is True only when both name and price were found.
    try:
        if (name is None) and (price is None):
            error = '[ERROR] Unable to find name and price: {0}'.format(fp)
            return False, error
        elif name is None:
            error = '[ERROR] NAME MISSING: {0}'.format(fp)
            return False, error
        elif price is None:
            error = '[ERROR] PRICE MISSING: {0}'.format(fp)
            return False, error
        else:
            #print('Parsed', fp, '\n', name, price)
            return True, None
    except Exception as e:
        # import pdb; pdb.set_trace()
        return False, e
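

# check_name_price returns an (ok, error) pair, e.g. (hypothetical path):
#   ok, err = check_name_price('page.html', None, '9.95')
#   # -> (False, '[ERROR] NAME MISSING: page.html')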


def returnItemInfoFromSite(fp, *args, **kwargs):
    '''
    Parse a downloaded product page and return the scraped name/price info.

    Most retailers are handled generically via _reval, a per-retailer dict of
    extraction expressions that are eval'd here (so they can reference local
    variables such as soup and lines). A few retailers are special-cased
    before the generic path.
    '''
    retailer = kwargs['retailer']
    _reval = kwargs['_reval']
    _multi = kwargs['_dict']['multi_size']
    _type = _reval[retailer]['_type']
    handle_exceptions = kwargs['handle_exceptions']
    total_units_list = kwargs['_dict']['total_units'].split(',')
    name = None
    price = None
    units = None
    s_unit_of_measure = None
    prices = None
    s_product_name = None
    soup = None
    lines = None
    return_dict = {'scraped_units': '',  # str split
                   's_unit_of_measure': '',
                   'smulti_prices': '',
                   's_product_name': '',
                   'price': '',
                   'leadsplit': '',
                   'endsplit': '',
                   '_type': '',
                   'scraped_name': '',
                   'scraped_price': '',
                   }
    try:
        if retailer == 'bulksupplements':
            # One page lists several child products; collect them all.
            items = []
            leadsplit = _reval[retailer]['leadsplit']
            endsplit = _reval[retailer]['endsplit']
            _json = returnJsonFromBulkSupplements(fp, leadsplit, endsplit)
            for item in _json['attributes']['134']['options']:
                for _id in _json['childProducts'].keys():
                    if _id in item['products'][0]:
                        name = item['label']
                        price = _json['childProducts'][_id]
                        product_id = _id  # not used
                        check_name_price(fp, name, price)
                        items.append({"scraped_name": name,
                                      "scraped_price": price})
            return items
        if retailer in ('hardrhino', 'nootropicsdepot'):
            _json = returnJsonFromJsonDumpFile(fp)
            s_product_name, price = returnItemInfoFromJson(_json, fp=fp)
            _r, _e = check_name_price(fp, s_product_name, price)
            if not _r:
                raise Exception(_e)
            return_dict.update({"scraped_name": s_product_name,
                                "scraped_price": price})
            return return_dict
        if _type == 'bs4':
            lines = readFile(fp)
            soup = BS(lines, 'html5lib')
            if 'True' in _multi:
                # Multi-size product: scrape parallel lists of units,
                # units of measure, and prices.
                units = eval(_reval[retailer]['units'])
                s_unit_of_measure = eval(_reval[retailer]['unit_of_measure'])
                prices = eval(_reval[retailer]['prices'])
                if len(prices) != len(total_units_list):
                    if len(units) != len(prices):
                        raise Exception("[ERROR] Total units listed are not"
                                        " equal to the prices scraped")
                s_product_name = eval(_reval[retailer]['product_name'])
                return_dict.update({"scraped_units": units,
                                    "s_unit_of_measure": s_unit_of_measure,
                                    "smulti_prices": prices})
                return return_dict
            else:
                s_product_name = eval(_reval[retailer]['product_name'])
                price = eval(_reval[retailer]['price'])
                _r, _e = check_name_price(fp, s_product_name, price)
                if not _r:
                    raise Exception(_e)
                return_dict.update({"scraped_name": s_product_name,
                                    "scraped_price": price})
                return return_dict
        check_name_price(fp, s_product_name, price)
    except Exception as e:
        import traceback
        t = traceback.format_exc()
        d = {'fp': fp,
             'retailer': retailer,
             '_multi': _multi,
             '_type': _type,
             'name': name,
             'price': price,
             'units': units,
             's_unit_of_measure': s_unit_of_measure,
             'prices': prices,
             's_product_name': s_product_name,
             '_reval': _reval[retailer],
             }
        if handle_exceptions:
            handleException(None, e=e, t=t, d=d, soup=soup, lines=lines)
            return_dict.update({"scraped_name": s_product_name,
                                "scraped_price": price})
            return return_dict
    return_dict.update({"scraped_name": s_product_name,
                        "scraped_price": price})
    return return_dict
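

# A minimal usage sketch, assuming a locally saved product page. The retailer
# name, file path, and extraction expressions below are hypothetical examples
# of what a _reval config might look like, not real config from this project.
if __name__ == '__main__':
    _reval = {'exampleshop': {
        '_type': 'bs4',
        # Expressions are eval'd inside returnItemInfoFromSite, where 'soup'
        # is the BeautifulSoup parse of the downloaded page.
        'product_name': "soup.find('h1').get_text(strip=True)",
        'price': "soup.find('span', {'class': 'price'}).get_text(strip=True)",
    }}
    _dict = {'multi_size': 'False', 'total_units': '1'}
    info = returnItemInfoFromSite('example_page.html',
                                  retailer='exampleshop',
                                  _reval=_reval,
                                  _dict=_dict,
                                  handle_exceptions=False)
    print(info)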