parser_searcher_part_1.py
import csv
import re
import sys
import requests
from bs4 import BeautifulSoup as BS
from selenium import webdriver
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0',
}
GOOGLE_URL = "https://www.google.com/search?q=Стоматология&start="
MAIL_URL = "https://go.mail.ru/search?fm=1&q=Стоматология&sf="
BING_URL = "https://www.bing.com/search?q=Стоматология&sp=-1&pq=cтоматология&sc=8&qs=n&sk=&first="
YANDEX_URL = "https://yandex.com/search/?clid=2186621&text=Стоматология&p="
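# The trailing query parameter of each URL above receives a paging value appended by
# the corresponding parser below: a result offset for Google and mail.ru (0, 10, 20, ...),
# a page index for Yandex (0, 1, 2, ...) and a result index for Bing (1, 11, 21, ...).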
ERROR_404 = 404
ERROR_429 = 429
def html_get(url):
    """Download the page at the given URL and return its HTML as a string.
    Arguments:
        url {str} -- search query link
    Returns:
        The page HTML, or the HTTP status code / None if an error occurred
    """
    try:
        r = requests.get(url, headers=HEADERS)
        # Status codes 200 and 404 are both acceptable here:
        # 404 is needed to detect the end of a search engine's result pages.
        assert (r.status_code == 200 or r.status_code == 404), """
        !!! Too many requests from your computer !!!
        !!! Use a VPN or try another network !!!"""
        return r.text
    except AssertionError as error:
        print(error)
        return r.status_code
    except Exception:
        print('Check your internet connection or try using a VPN connection')
        return None
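# Minimal usage sketch for html_get (not executed here): fetch the first Google results
# page and bail out on a connection or rate-limit problem before parsing, e.g.
#   html = html_get(GOOGLE_URL + '0')
#   if html is None or html in (ERROR_404, ERROR_429):
#       ...back off or switch networks...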
# def mail_html_get(url=MAIL_URL):
#     """Return the HTML of a mail.ru results page.
#     mail.ru builds its pages dynamically, so the full HTML is only available
#     after the page has been rendered in a browser.
#     For this function to work, download the webdriver for Mozilla Firefox
#     and add it to the /Scripts folder in the root folder of your Python
#     installation (or of your virtualenv).
#     Keyword Arguments:
#         url {str} -- [search query link to mail.ru] (default: {MAIL_URL})
#     Returns:
#         generated_html -- page layout as a string containing html code
#     """
#     try:
#         options = webdriver.FirefoxOptions()  # Options for our browser
#         options.add_argument('--headless')  # Hide the browser window
#         browser = webdriver.Firefox(options=options)
#         browser.get(url)  # open the url in the browser
#         generated_html = browser.page_source  # copy the JS-generated html
#         browser.quit()  # close the browser
#         return generated_html
#     except Exception:  # raised when there is no internet connection
#         print('Check your internet connection and try again')
#         sys.exit()  # exit the program: nothing can be fetched without a connection
def mail_ru_parse() -> list:
"""
This function "pulls out" only the data we need, such as
Anchor, URL, Snippet from mail.ru page
The function goes through the pages of the search engine mail.ru and
if the page returned error code 404 means the previous page was the last
and you can complete the search for items
Only this function uses selenium
Returns:
list -- Processed Results
"""
options = webdriver.FirefoxOptions() # Options for our browser
options.add_argument('--headless') # Hides browser display
    browser = webdriver.Firefox(options=options)  # 'options' replaces the deprecated 'firefox_options' keyword
datas = [] # Page parsing results
page = 0
while True:
        # Keep requesting pages while valid result cards are found;
        # once a page comes back without any, the loop ends.
worked_url = MAIL_URL
worked_url += str(page * 10) # Working URL with page number
try:
browser.get(worked_url) # open our url in browser
html = browser.page_source # copy generated with js html
        except Exception:  # raised when there is no internet connection
            print('Check your internet connection or try using a VPN connection')
            browser.quit()  # close the selenium browser
            return datas  # return whatever was collected before the failure
        print('\rYou have finished %3d%%' % (page * 100 // 65), end='',
              flush=True)  # print parsing progress (assumes roughly 65 result pages)
soup = BS(html, 'html.parser') # Tree of our page in BS4 format
# list of our cards that the search engine found
results = soup.find_all('li', class_='result__li')
if not results:
break
        # Pull the fields we need out of each result card
for item in results:
try:
datas.append(
{
'Searcher': 'mail.ru',
'Url': item.find('a', class_='light-link').get('href'),
'Anchor': item.find('a', class_='light-link').get_text(),
'Snippet': item.find('div', class_='SnippetResult-result').find('span').get_text(),
}
)
            # If any element was not found, the result is not valid; skip it
except Exception:
pass
page += 1
    # Report that parsing has finished
    print('\rYou have finished %3d%%' % 100, end='', flush=True)
    print('\n', page, 'pages were parsed')
browser.quit() # close selenium browser
return datas # return all valid data that we could find
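# Each parser returns a list of dicts with the same four keys; a record looks roughly
# like this (hypothetical values):
#   {'Searcher': 'mail.ru', 'Url': 'https://example.com/clinic',
#    'Anchor': 'Стоматология, пример', 'Snippet': 'Пример описания клиники...'}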
def google_com_parse() -> list:
"""
This function "pulls out" only the data we need, such as
Anchor, URL, Snippet from google.com page
The function goes through the pages of the search engine google.com and
if the page returned error code 404 means the previous page was the last
and you can complete the search for items.
This function uses module requests
Returns:
list -- Processed Results
"""
datas = [] # Page parsing results
page = 0
while True:
        # Keep requesting pages while valid result cards are found;
        # once a page comes back without any, the loop ends.
worked_url = GOOGLE_URL
worked_url += str(page * 10) # Working URL with page number
html = html_get(worked_url) # HTML markup
        # If html_get failed (no connection) or returned an error code (404 / 429),
        # stop and return what has been collected so far
        if html is None or html in (ERROR_404, ERROR_429):
            return datas
soup = BS(html, 'html.parser') # Tree of our page in BS4 format
        print('\rYou have finished %3d%%' % (page * 100 // 28), end='', flush=True)  # print parsing progress (assumes roughly 28 result pages)
# list of our cards that the search engine found
results = soup.find_all('div', class_='g')
        # If nothing was found on this page, it is the last one, so exit the loop
if not results:
break
for item in results:
try:
snippet = item.find('div', class_='s').find(
'span', class_='st').get_text()
                # clean the snippet with a regular expression
clean_snippet = re.sub('\xa0', '', snippet)
datas.append(
{
'Searcher': 'google.com',
'Url': item.find('div', class_='r').find('a').get('href'),
'Anchor': item.find('h3', class_='LC20lb DKV0Md').get_text(),
'Snippet': clean_snippet,
}
)
            # If any element was not found, the result is not valid; skip it
except Exception:
pass
page += 1
print('\rYou have finished %3d%%' % 100, end='', flush=True)
    print('\n', page, 'pages were parsed')
return datas
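# Note: the CSS classes used above ('g', 'r', 's', 'st', 'LC20lb DKV0Md') match Google's
# markup at the time this script was written; Google changes its markup periodically,
# so the selectors may need updating before the parser finds anything.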
def yandex_com_parse() -> list:
"""
This function collects search results from the Yandex.com search engine.
! Do not run this function very often, because Yandex actively catches such requests
Returns:
data[list] -- data collected
"""
datas = []
page = 0
while True:
        # Keep requesting pages while valid result cards are found;
        # once a page comes back without any, the loop ends.
worked_url = YANDEX_URL
worked_url += str(page)
html = html_get(worked_url)
        if html is None or html in (ERROR_404, ERROR_429):
            return datas
soup = BS(html, 'html.parser')
        print('\rYou have finished %3d%%' % (page * 100 // 25), end='', flush=True)  # print parsing progress (assumes roughly 25 result pages)
results = soup.find_all('li', class_='serp-item')
if not results:
break
for item in results:
try:
url = item.find('h2', class_='organic__title-wrapper').find('a', class_='link').get('href')
                if re.search('yabs.yandex', url):  # links through yabs.yandex are ads, so skip them
continue
anchor = item.find('div', class_='organic__url-text').get_text()
clean_anchor = re.sub('\xa0', '', anchor) # clean anchors from garbage
snippet = item.find('span', class_='extended-text__full').get_text()
clean_snippet = re.sub('\xa0', '', snippet) # clean snippet from garbage
datas.append(
{
'Searcher': 'yandex.com',
'Url': url,
                        'Anchor': clean_anchor.replace('Скрыть', '').strip(),  # drop the trailing 'Скрыть' ("Hide") label
                        'Snippet': clean_snippet.replace('Скрыть', '').strip(),  # drop the trailing 'Скрыть' ("Hide") label
}
)
            # If any element was not found, the result is not valid; skip it
except Exception:
continue
page += 1
print('\rYou have finished %3d%%' % 100, end='', flush=True)
    print('\n', page, 'pages were parsed')
return datas
def bing_com_parse() -> list:
"""
This function collects search results from the bing.com search engine.
Returns:
list -- [description]
"""
datas = []
num_page = 0 # Page number
while True:
        # bing.com normally shows at most 100 result pages with 10 results each,
        # so at most 1000 results are collected.
        # Keep requesting pages while valid result cards are found;
        # once the page offset reaches the limit, the loop ends.
if num_page == 0:
page = 1
else:
page = ((num_page - 1) * 10 + 11)
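        # Worked example of the offset formula above:
        #   num_page 0 -> first=1, num_page 1 -> first=11, num_page 2 -> first=21, ...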
if page >= 1000:
break
worked_url = BING_URL
worked_url += str(page)
html = html_get(worked_url)
        if html is None or html in (ERROR_404, ERROR_429):
            return datas
soup = BS(html, 'html.parser')
        print('\rYou have finished %3d%%' % (num_page * 100 // 100), end='', flush=True)  # print parsing progress (Bing shows at most ~100 pages)
results = soup.find_all('li', class_='b_algo')
# If there are no elements on this page, then we proceed to the next
if not results:
num_page += 1
continue
        for item in results:
            try:
                url = item.find('h2').find('a').get('href')
                anchor = item.find('h2').find('a').get_text()
                clean_anchor = re.sub('\xa0', '', anchor)  # clean the anchor with a regular expression
                snippet = item.find('div', class_='b_caption').find('p').get_text()
                clean_snippet = re.sub('\xa0', '', snippet)  # clean the snippet with a regular expression
                datas.append(
                    {
                        'Searcher': 'bing.com',
                        'Url': url,
                        'Anchor': clean_anchor.strip(),
                        'Snippet': clean_snippet.strip(),
                    }
                )
            # If any element was not found, the result is not valid; skip it
            except Exception:
                continue
num_page += 1
print('\rYou have finished %3d%%' % 100, end='', flush=True)
    print('\n', num_page, 'pages were parsed')
return datas
def search_stomatology():
"""
This function creates a file, collects data and writes it to a file.
Search_Stomatology.csv encoded in UTF-16.
Columns in the file are separated by a comma
Columns: ['Searcher', 'Url', 'Anchor', 'Snippet']
"""
with open('Search_Stomatology.csv', mode='w', encoding='utf-16', newline='') as csv_file:
fieldnames = ['Searcher', 'Url', 'Anchor', 'Snippet'] # Column Names
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter=',')  # writer that outputs the data rows to the file
        writer.writeheader()  # write the header row
try:
print('Start parsing Mail.ru')
for item in mail_ru_parse():
writer.writerow(item)
print('Mail.ru -- OK\n')
except Exception:
print('Mail.ru -- Error\n')
try:
print('Start parsing Google.com')
for item in google_com_parse():
writer.writerow(item)
print('Google.com -- OK\n')
except Exception as e:
print(e)
print('Google.com -- Error\n')
try:
print('Start parsing Yandex.com')
for item in yandex_com_parse():
writer.writerow(item)
print('Yandex.com -- OK\n')
except Exception as e:
print(e)
print('Yandex.com -- Error\n')
try:
print('Start parsing Bing.com')
for item in bing_com_parse():
writer.writerow(item)
print('Bing.com -- OK\n')
except Exception as e:
print(e)
print('Bing.com -- Error\n')
if __name__ == "__main__":
search_stomatology()
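# Running the script directly (python parser_searcher_part_1.py) writes
# Search_Stomatology.csv to the current working directory, one row per collected result.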