-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathobtain_information_from_web.py
203 lines (173 loc) · 7.11 KB
/
obtain_information_from_web.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
#__anthor__==Mingchen Li, GSU
from bs4 import BeautifulSoup
import requests
from urllib import request
"""
# this script is used to obtain---
--province
--city 市
-- the hospital in this city
----the tel of this hospital
-----the address of this hospital
----the department (科室)
----doctor,etc
----good department (擅长科室)
the output file is all_information_end.txt, I obtained 1771 hospital's information
"""
def provience_w():
fw=open("provience_and_web.txt","w",encoding="utf-8")
html = requests.get('https://dxy.com/health/hospital/location/620000')
# print(html.text)
soup = BeautifulSoup(html.content,"html.parser")
# print(soup.text)
province_website=soup.find_all("ul",class_="nav-ul clearfix")
web=[]
province_name=[]
for p in (province_website[0].find_all("li")):
web.append("https://dxy.com"+(p.find_all("a")[0]["href"]))
province_name.append(p.get_text())
PW=dict(zip(province_name,web))
for key, value in PW.items():
fw.write(key+"\t"+value+"\n")
# return PW
def read_proviecne_web():
pro=[]
web=[]
with open("provience_and_web.txt","r",encoding="utf-8") as fr:
for line in fr.readlines():
line=line.strip().split("\t")
pro.append(line[0])
web.append(line[1])
dic_=dict(zip(pro,web))
return dic_
def city_hospivcal():
fw=open("procinece_city_hospital_web.txt","w",encoding="utf-8")
PW=read_proviecne_web()
for key, value in PW.items():
provience_name=key
provience_web=value
# print(provience_web)
# provience_web=request.urlopen(provience_web)
html = request.urlopen(provience_web)
# print(html.text)
soup = BeautifulSoup(html, "html.parser")
all_citys=soup.find_all("ul", class_="content-region j-hospital-region ")[0].find_all("li",class_="clearfix")
for ci in all_citys:
city=ci.find_all("h4")[0].get_text().strip()
hospitals=ci.find_all("ul")
# print(hospitals)
for hospital in hospitals:
print(hospital)
hospital_name_webs=hospital.find_all("a")
for hnw in hospital_name_webs:
name =provience_name.strip()+"|"+city+"|"+hnw.get_text()
web="https://dxy.com"+hnw["href"]
fw.write(name+"|"+web+"\n")
def obtain_department(id):
"""
this script is used to obtain the deparment name
:param id: id is the hopital id
:return:
"""
all_ds = ""
for i in range(5):
i = i + 1
URL = "https://dxy.com/view/i/hospital/section/list?page_index=%s&items_per_page=20&id=%s" % (str(i),str(id))
html = request.urlopen(URL)
soup = BeautifulSoup(html, "html.parser")
if "error" in str(soup):
continue
else:
dic_ = eval(soup.text)
all_departments = (dic_["data"]["items"])
for ds in all_departments:
# print(ds)
all_ds=all_ds+(ds["name"] + "&" +str(ds["hospital_id"])+"&" + str(ds["section_id"])+"&"+str(ds["doctor_number"]))+"**"
return all_ds
def hospital_information():
fw=open("all_information_1.txt","w",encoding="utf-8")
i=0
with open("procinece_city_hospital_web.txt","r",encoding="utf-8") as fr:
for line in fr.readlines():
i=i+1
print(i)
L=line.strip().split("|")
hospital_web=L[-1].strip()
fw.write(L[0]+"|"+L[1]+"|"+L[2]+"|")
# print(hospital_web)
html = request.urlopen(hospital_web)
# try:
# html = request.urlopen(hospital_web)
# except urllib.error.URLError as e:
# print(e)
# print(html)
hospital_content = BeautifulSoup(html, "html.parser")
# grade such as 三级甲等
grade=hospital_content.find_all("span",class_="grade")[0].get_text().strip().replace("(","").replace(")","")
fw.write(grade+"|")
#ensure 医保定点
ensure=hospital_content.find_all("span",class_="ensure")[0].get_text().strip()
fw.write(ensure + "|")
# site 地址
site= hospital_content.find_all("span", class_="hospital-basic__info")[0].get_text().strip()
fw.write(site+"|")
# tel 电话
tel = hospital_content.find_all("span", class_="hospital-basic__info")[1].get_text().strip()
fw.write(str(tel) + "|")
# print(tel)
# print(site)
if "擅长专长" in str(hospital_content):
good_department = hospital_content.find_all("div", class_="hospital-detail-intro")[0].get_text().strip()
# print(good_department
else:
good_department="擅长专长未登录"
fw.write(good_department+"|")
hospital_id=hospital_web.split("/")[-1]
# print(hospital_id)
# 科&1666&11&11 hospital_id section_id, doctor_number
department_name=obtain_department(hospital_id)
fw.write(department_name+"\n")
fw.flush()
def obtain_doctor_information(hospital_id,section_id):
all_ds = ""
for i in range(5):
i = i + 1
# print(i)
URL = "https://dxy.com/view/i/section/doctor/list?hospital_id=%s§ion_id=%s&page_index=%s&items_per_page=6" % (str(hospital_id),str(section_id),str(i))
# print(URL)
html = request.urlopen(URL)
soup = BeautifulSoup(html, "html.parser")
if "error" in str(soup):
continue
else:
dic_ = eval(soup.text)
all_doctors = (dic_["data"]["items"])
for doctor in all_doctors:
doctor_name = doctor["name"]
jobtile = doctor["jobtitle_type_name"]
all_ds = all_ds + (doctor_name + "-NJ-" + jobtile) + "$"
return all_ds
def department_information():
fw=open("all_information_end_4.txt","w",encoding="utf-8")
with open("all_information_4.txt","r",encoding="utf-8") as fr:
i=0
for line in fr.readlines():
i=i+1
print(i)
line=line.strip().split("|")
fw.write(line[0]+"|"+line[1]+"|"+line[2]+"|"+line[3]+"|"+line[4]+"|"+line[5]+"|"+line[6]+"|"+line[7]+"|")
department=line[8]
departments=department.split("**")[:-1]
for de in departments:
de=de.split("&")
department_name=de[0]
fw.write(department_name+"-DD-")
hospital_id=de[1]
section_id=de[2]
doctors=obtain_doctor_information(hospital_id,section_id)
fw.write(doctors+"&")
fw.write("\n")
fw.flush()
if __name__=="__main__":
# hospital_information()
department_information()