# ADA-ERP-BS.py
# Scrapes organization listings, state by state, from
# https://professional.diabetes.org/erp_list and writes the results to a
# pickle file and a CSV.

import requests
from bs4 import BeautifulSoup
import re
import time
import pickle
import csv
def get_pages(soup):
    '''Return the set of links to any subsequent result pages, or None if
    the page has no pagination block.'''
    base = 'https://professional.diabetes.org'
    try:
        page_links = soup.find('ul', {'class': 'pagination'}).find_all('a')
        return {base + a['href'] for a in page_links}
    except AttributeError:
        # find() returned None: all results fit on a single page.
        return None
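

# A minimal, self-contained sketch of what get_pages() returns, using a
# hypothetical pagination snippet (the site's real markup may differ).
# Not called by the script; kept purely for illustration.
def _demo_get_pages():
    html = '''
    <ul class="pagination">
      <li><a href="/erp_list?field_erp_state_value=NY&amp;page=1">2</a></li>
    </ul>'''
    soup = BeautifulSoup(html, 'html5lib')
    # -> {'https://professional.diabetes.org/erp_list?field_erp_state_value=NY&page=1'}
    return get_pages(soup)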
def get_org_dicts(soup):
    '''Turn any organizations listed on the page into dictionaries.'''
    orgs = soup.find_all('div', {'class': 'col col-sm-4'})
    org_dicts = []
    # Each line of a listing reads "Key: value"; the text up to the colon
    # is the key, the remainder is the value.
    pattern = re.compile(r'(.*?):(.*)')
    for o in orgs:
        org_dict = {}
        for m in o.find_all('div'):
            match = re.search(pattern, m.text)
            if match is None:
                # Skip lines that do not follow the "Key: value" format.
                continue
            title, value = (g.strip() for g in match.groups())
            org_dict[title] = value
        org_dicts.append(org_dict)
    return org_dicts
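

# A minimal sketch of the "Key: value" parsing that get_org_dicts() does,
# on a hypothetical listing block (the field names here are invented for
# illustration; the live site's fields may differ). Not called anywhere.
def _demo_get_org_dicts():
    html = '''
    <div class="col col-sm-4">
      <div>Program Name: Example Diabetes Center</div>
      <div>City: Albany</div>
    </div>'''
    soup = BeautifulSoup(html, 'html5lib')
    # -> [{'Program Name': 'Example Diabetes Center', 'City': 'Albany'}]
    return get_org_dicts(soup)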
if __name__ == "__main__":
    # Get the list of state codes from the <select> element on a sample URL.
    init = 'https://professional.diabetes.org/erp_list?field_erp_state_value=NY'
    res = requests.get(init)
    soup = BeautifulSoup(res.text, 'html5lib')
    options = soup.find(
        'select', {'id': 'edit-field-erp-state-value'}).find_all('option')
    states = [x['value'] for x in options]

    # Iterate through the per-state URLs.
    all_dicts = []
    for s in states:
        print(s)
        state_link = ('https://professional.diabetes.org/erp_list'
                      '?field_erp_state_value={}'.format(s))
        res = requests.get(state_link)
        soup = BeautifulSoup(res.text, 'html5lib')
        # Collect the listings on the first page, checkpointing as we go.
        all_dicts.extend(get_org_dicts(soup))
        with open('all-dicts.pkl', 'wb') as f:
            pickle.dump(all_dicts, f)
        # Cycle through any subsequent pages of results for this state.
        pages = get_pages(soup)
        if pages is not None:
            for p in pages:
                res = requests.get(p)
                soup = BeautifulSoup(res.text, 'html5lib')
                all_dicts.extend(get_org_dicts(soup))
                time.sleep(1)
            with open('all-dicts.pkl', 'wb') as f:
                pickle.dump(all_dicts, f)
        time.sleep(1)
    # Dump everything to CSV. Listings do not all share the same fields, so
    # build the header from the union of keys across all dicts; taking only
    # the first dict's keys would make DictWriter raise ValueError on rows
    # that carry extra fields.
    fieldnames = sorted({k for d in all_dicts for k in d})
    with open('erp.csv', 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_dicts)
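
# A quick sketch of reading the outputs back in a separate session, assuming
# the script above has completed a full run:
#
#     import pickle, csv
#     with open('all-dicts.pkl', 'rb') as f:
#         all_dicts = pickle.load(f)
#     with open('erp.csv', newline='') as f:
#         rows = list(csv.DictReader(f))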