-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
99 lines (82 loc) · 3.33 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# Press Shift+F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
import os.path
from bs4 import BeautifulSoup
import requests, lxml.html, json
# HTTP request headers passed to requests.get() by crawl_hospital_names().
headers = {
    "User-Agent": "My User Agent 1.0",
    # "From" is a second, optional request header field.
    "From": "[email protected]",
}
def crawl_hospital_names():
    """Download the hospital list of every province and append it to a file.

    Reads one province identifier per line from ``provinces.txt``, fetches
    ``https://www.haodf.com/yiyuan/<province>/list.htm`` for each of them and
    appends the text of every matching hospital link to ``<province>.txt``
    in the current working directory, one name per line.

    Raises:
        requests.RequestException: if a request fails or times out.
    """
    with open("provinces.txt", "r") as provinces:
        for line in provinces:
            province = line.strip()
            if not province:
                # A blank line would request ".../yiyuan//list.htm" and
                # create an output file named ".txt"; skip it.
                continue
            response = requests.get(
                'https://www.haodf.com/yiyuan/' + province + '/list.htm',
                headers=headers,
                # Without a timeout a stalled server blocks the crawl forever.
                timeout=30,
            )
            root_node = lxml.html.fromstring(response.text)
            nodes = root_node.xpath(
                "//div[contains(@class, 'm_ctt_green')]/ul/li/a")
            with open(province + ".txt", "a") as province_file:
                print(province)
                for node in nodes:
                    # ``text`` is None when the anchor has no leading text
                    # (e.g. it starts with a child element); concatenating
                    # None with "\n" would raise TypeError.
                    if node.text is None:
                        continue
                    province_file.write(node.text + "\n")
# Maps each province identifier to the list of hospital names loaded for it.
hospitals_dict = {}


def load_hospitals():
    """Fill ``hospitals_dict`` from the per-province files in ``Hospitals``.

    Reads one province identifier per line from ``provinces.txt`` and, for
    each of them, loads ``Hospitals/<province>.txt`` (one hospital name per
    line, surrounding whitespace stripped) into
    ``hospitals_dict[<province>]``. Blank lines in ``provinces.txt`` are
    ignored. Finally prints the current working directory.

    Raises:
        OSError: if ``provinces.txt`` or a province file cannot be opened.
    """
    with open("provinces.txt", "r") as provinces:
        for line in provinces:
            province = line.strip()
            if not province:
                # A blank line would try to open "Hospitals/.txt".
                continue
            # Build the path instead of chdir-ing into the directory: the
            # process working directory is then never changed, so it cannot
            # be left pointing at "Hospitals" when a file fails to open.
            path = os.path.join("Hospitals", province + ".txt")
            with open(path, "r") as hospital_file:
                hospitals_dict[province] = [
                    name.strip() for name in hospital_file
                ]
    print("Current working directory: {0}".format(os.getcwd()))
# Request headers carrying a mobile-browser User-Agent string; passed to
# requests.get() by crawl_hospital_url().
headers2 = {
    "User-Agent": (
        "Mozilla/5.0 (Linux; Android 10; HD1913) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/89.0.4389.105 Mobile Safari/537.36 "
        "EdgA/46.1.2.5140"
    ),
}
def crawl_hospital_url(name):
    """Search Baidu for *name* and return the link shown on the official hit.

    Scans the search results in page order and picks the first one whose
    title contains '官方' ("official").

    Args:
        name: Hospital name used as the search keyword.

    Returns:
        The text of that result's ``.c-showurl`` element, or ``None`` when
        no result title contains '官方'.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    response = requests.get(
        'https://www.baidu.com/s',
        # Let requests build the query string so that characters such as
        # '&', '#' or spaces inside the name are percent-encoded instead of
        # corrupting the query.
        params={'tn': 'baidu', 'wd': name},
        headers=headers2,
        # Without a timeout a stalled server blocks the crawl forever.
        timeout=30,
    )
    print("status_code: ", response.status_code)
    soup = BeautifulSoup(response.text, 'lxml')
    for result in soup.select('.result.c-container.new-pmd'):
        link_node = result.select_one('.c-showurl')
        title_node = result.select_one('.t')
        if title_node is None or link_node is None:
            # Results lacking a title or a displayed link cannot be judged.
            continue
        displayed_link = link_node.text
        if '官方' in title_node.text:
            print(name + ": " + displayed_link)
            return displayed_link
    return None
# crawl_hospital_url('中日医院')
def get_all_hospital_urls():
    """Look up a website for every loaded hospital and save the results.

    For each province in ``hospitals_dict`` every hospital name is passed to
    ``crawl_hospital_url``. Found links are appended to
    ``website_<province>.txt`` and the names without a link to
    ``failed_<province>.txt``, one entry per line, via ``write_file``.
    """
    for province, hospitals in hospitals_dict.items():
        ok_urls = []
        nok_names = []
        print(province + ": ", len(hospitals))
        for hospital in hospitals:
            url = crawl_hospital_url(hospital)
            if url:
                ok_urls.append(url + "\n")
            else:
                nok_names.append(hospital + "\n")
                print('Cannot find url for: ' + hospital)
        # The counts must be interpolated with "%"; passing them as extra
        # print() arguments printed the literal "%d" placeholders.
        print(province + ": success:%d, failed=%d"
              % (len(ok_urls), len(nok_names)))
        write_file("website_" + province + '.txt', ok_urls)
        write_file("failed_" + province + '.txt', nok_names)
def write_file(file_name, lines):
    """Append the concatenation of *lines* to the file *file_name*.

    Args:
        file_name: Path of the file to append to; created if missing.
        lines: Iterable of strings, written back to back with no separator
            added (callers supply their own newlines).

    Raises:
        OSError: if the file cannot be opened or written.
        TypeError: if *lines* contains a non-string item.
    """
    # The context manager closes the file even when the write raises.
    with open(file_name, 'a') as out:
        out.write("".join(lines))
# Script entry point: executes only when the file is run directly, not when
# it is imported as a module.
if __name__ == "__main__":
    # Echo the module name to show that the script started.
    print(__name__)
    # The crawl steps below are currently disabled; uncomment to run them.
    # load_hospitals()
    # get_all_hospital_urls()
# See PyCharm help at https://www.jetbrains.com/help/pycharm/