-
Notifications
You must be signed in to change notification settings - Fork 0
/
russian wine guide pars.py
158 lines (101 loc) · 4.54 KB
/
russian wine guide pars.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import requests
from bs4 import BeautifulSoup
import pandas as pd
import regex
import datetime
import os
import time
#%%
def get_wines_list(url):
    """Fetch a rating page and return the anchor tags listed on it.

    The anchors are read from the fifth ``<noscript>`` element of the page
    (index 4).

    Args:
        url: Address of the rating page to download.

    Returns:
        The ``<a>`` tags found inside that ``<noscript>`` element, or an
        empty list when the response status is not 200 or the page has fewer
        than five ``<noscript>`` elements.
    """
    response = requests.get(url)
    if response.status_code != 200:
        # Nothing to parse; hand back an empty list so callers can still
        # take its length and iterate over it.
        return []
    soup = BeautifulSoup(response.text)
    noscript_blocks = soup.findAll('noscript')
    # NOTE(review): index 4 is tied to the current page layout -- confirm.
    if len(noscript_blocks) <= 4:
        return []
    return noscript_blocks[4].findAll('a')
def get_first_element(l):
    """Return the first item of ``l``, or '' when ``l`` is empty.

    Args:
        l: Any sized, indexable sequence (list, tuple, str, ...).

    Returns:
        ``l[0]`` if the sequence has at least one item, otherwise the empty
        string.
    """
    return l[0] if len(l) > 0 else ''
def get_property(property_name, properties):
    """Look up a value in a flat label/value sequence of elements.

    ``properties`` is scanned in order; the first element whose ``.text``
    equals ``property_name`` is treated as the label, and the ``.text`` of
    the element right after it is returned as the value.

    Args:
        property_name: Label text to search for (exact match).
        properties: Sequence of objects exposing a ``.text`` attribute.

    Returns:
        The text of the element following the matching label, or '' when the
        label is absent or is the last element (so it has no value after it).
    """
    for i, prop in enumerate(properties):
        if prop.text == property_name:
            value_index = i + 1
            # A label in the final position has no value element; report it
            # as missing instead of raising IndexError.
            if value_index < len(properties):
                return properties[value_index].text
            return ''
    return ''
def _node_text(node):
    """Return ``node.text``, or '' when the element was not found (None)."""
    return node.text if node is not None else ''


def get_wine(url, wine_type):
    """Download a single wine page and extract its attributes.

    Args:
        url: Address of the wine page; stored unchanged under ``'url'``.
        wine_type: Label stored unchanged under ``'wine_type'``.

    Returns:
        A dict with the keys ``name``, ``wine_type``, ``barcode``, ``brand``,
        ``company``, ``research_year``, ``vintage``, ``sugar``, ``color``,
        ``price``, ``price_for``, ``rating_gost``, ``rating`` and ``url``,
        or ``None`` when the response status is not 200. A field whose
        element is missing from the page is returned as ''.
    """
    response = requests.get(url)
    if response.status_code != 200:
        return None
    page = BeautifulSoup(response.text)

    name = _node_text(page.find('p', class_='product-subtitle'))

    price_per_value = _node_text(page.find('span', class_="info-price"))
    # Leading run of digits, tolerating whitespace in front of it. The former
    # pattern `\d*` matched the empty string at position 0 whenever the text
    # did not start with a digit, so the price came back empty.
    price = get_first_element(regex.findall(r'^\s*(\d+)', price_per_value))
    # Volume part such as '/0,75': the slash and the comma are both removed.
    # NOTE(review): ',' is dropped while '.' is kept, so '0,75' becomes
    # '075' -- confirm this is the intended format.
    price_for = get_first_element(regex.findall(r'\/0[,.]\d*', price_per_value))
    price_for = price_for.replace('/', '').replace(',', '')

    brand = _node_text(page.find('h1', class_="h1 product-title")).strip()

    # The properties block is a flat run of <p> tags read as label/value
    # pairs by get_property().
    properties_block = page.find('div', class_="properties")
    properties = properties_block.findAll('p') if properties_block is not None else []
    company = get_property('Производитель', properties)
    research_year = get_property('Год исследования', properties)
    barcode = get_property('Штрихкод', properties)
    vintage = get_property('Год урожая', properties)
    sugar = get_property('Сахар', properties)
    color = get_property('Цвет', properties)
    rating_gost = get_property('Оценка по ГОСТ 32051-2013', properties)

    # The rating is the text of the first <span> inside the star-rating div.
    rating_block = page.find('div', class_='starrating readonly d-inline-flex flex-row-reverse')
    rating_spans = rating_block.findAll('span') if rating_block is not None else []
    rating = rating_spans[0].text if rating_spans else ''

    return {
        'name': name,
        'wine_type': wine_type,
        'barcode': barcode,
        'brand': brand,
        'company': company,
        'research_year': research_year,
        'vintage': vintage,
        'sugar': sugar,
        'color': color,
        'price': price,
        'price_for': price_for,
        'rating_gost': rating_gost,
        'rating': rating,
        'url': url,
    }
def parse_wine_from_wine_list(winelist, wine_type):
    """Scrape every wine referenced by ``winelist`` into one DataFrame.

    Progress and the URL being fetched are printed for each item. After items
    49, 99, 149, ... the function sleeps for 5 seconds before continuing.

    Args:
        winelist: Sized iterable of items supporting ``item['href']``; each
            href is appended to ``'https://rskrf.ru/'`` to build the page URL.
        wine_type: Label passed through to ``get_wine`` and printed in the
            progress message.

    Returns:
        A DataFrame with one row per successfully parsed wine (index
        0..n-1), or ``None`` when no wine was collected.
    """
    total = str(len(winelist))
    # Rows are gathered in a plain list and turned into a frame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []
    for position, wine_page in enumerate(winelist, start=1):
        print(wine_type + '. Обрабатываем ' + str(position) + ' из ' + total)
        url = 'https://rskrf.ru/' + wine_page['href']
        print(url)
        new_wine = get_wine(url, wine_type)
        # get_wine returns None for a failed request; skip it rather than
        # inserting an empty row.
        if new_wine is not None:
            rows.append(new_wine)
        # Same cadence as a counter that starts at 1 and is incremented
        # before the check.
        if (position + 1) % 50 == 0:
            time.sleep(5)
            print('Timeout 5!')
    if not rows:
        return None
    return pd.DataFrame(rows)
#%%
# Rating pages to scrape, each paired with the wine-type label stored in the
# 'wine_type' column. Pages are fetched in the order listed.
wine_sources = [
    ('https://rskrf.ru/ratings/napitki/alkogolnye/krasnoe-vino/', 'Красное'),
    ('https://rskrf.ru/ratings/napitki/alkogolnye/beloe-vino/', 'Белое'),
    ('https://rskrf.ru/ratings/napitki/alkogolnye/rozovoe-vino/', 'Розовое'),
    ('https://rskrf.ru/ratings/napitki/alkogolnye/likyernoe-vino/', 'Ликерное'),
    ('https://rskrf.ru/ratings/napitki/alkogolnye/igristoe/', 'Игристое'),
]
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
# ignore_index is deliberately not set, so each page's rows keep their own
# 0..n-1 index exactly as the append-based version produced. pd.concat also
# drops None entries, i.e. pages that yielded no wines.
wine_df = pd.concat(
    [parse_wine_from_wine_list(get_wines_list(url), wine_type)
     for url, wine_type in wine_sources]
)
#%%%
# Use '.' as the decimal separator in the GOST rating column.
wine_df['rating_gost'] = wine_df['rating_gost'].str.replace(',','.')
#%%%
# Semicolon-separated output; the (per-page) index is written as the first column.
wine_df.to_csv('wines_df_2021.csv',sep=';')
#%%