-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparser.py
35 lines (25 loc) · 870 Bytes
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import requests
import bs4
from fake_useragent import UserAgent
import re
def get_raw_html(url: str, timeout: float = 10.0) -> str:
    """Download *url* and return the response body as text.

    The request is sent with a ``User-Agent`` header taken from
    ``UserAgent().random``. The HTTP status code is not inspected: whatever
    body the server returns is handed back to the caller.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as exposed by ``Response.text``.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` (including timeouts) when the request fails.
    """
    headers = {
        'User-Agent': UserAgent().random,
    }
    # Without an explicit timeout, requests waits indefinitely on a server
    # that accepts the connection but never answers.
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text
def get_parsed_data(url: str) -> dict:
    """Fetch the page at *url* and return the result of parsing its HTML.

    This chains ``get_raw_html`` (download) and ``parse_html`` (extraction);
    the returned dict is whatever ``parse_html`` produces for the page.
    """
    return parse_html(get_raw_html(url))
def search_for_class(bs_tree, classname: str) -> list:
    """Collect the text of every tag whose class matches *classname*.

    *classname* is compiled as a regular expression and passed to
    ``bs_tree.find_all`` as the ``class_`` filter. Tags whose text is the
    empty string are skipped; newline characters are stripped from the text
    of the remaining tags.

    Note that the emptiness check happens before newlines are removed, so a
    tag whose text consists only of newlines contributes an empty string.

    Returns:
        The cleaned texts, in the order ``find_all`` yielded the tags.
    """
    pattern = re.compile(classname)
    return [
        node.text.replace('\n', '')
        for node in bs_tree.find_all(class_=pattern)
        if node.text != ''
    ]
def parse_html(raw_html: str) -> dict:
    """Extract a price -> name mapping from an HTML document.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend. Texts
    of tags whose class matches ``name`` and texts of tags whose class
    matches ``price`` are collected separately and paired positionally.

    Returns:
        A dict keyed by price text with the corresponding name text as the
        value. Pairing stops at the shorter of the two lists.

    NOTE(review): because prices are the keys, entries that share the same
    price text overwrite each other and only the last one is kept.
    """
    soup = bs4.BeautifulSoup(raw_html, 'html.parser')
    names = search_for_class(soup, r'name')
    prices = search_for_class(soup, r'price')
    return {price: name for price, name in zip(prices, names)}