scrape_stsci_coIs.py
"""Scrape the STScI address-info pages for JWST Cycle 1 proposals and
count the proposals that include Goddard (GSFC) investigators."""
from bs4 import BeautifulSoup
import requests
specials = [1433, 1424]  # A few proposals with odd numbering.
regular = list(range(1549, 2722))  # Range chosen to avoid the commissioning proposals.
fullindex = specials.copy()
fullindex.extend(regular)
exclude = [1602, 1628, 1629, 1630, 1631, 1632, 2586]  # Commissioning proposals with odd numbers.
newlist = [x for x in fullindex if x not in exclude]
N_gsfc_users = 0
N_proposals_w_gsfc = 0
URLs_w_gsfc = []
outfilename = "scrape_stsci_JWSTcy1.txt"  # URLs of proposals with GSFC investigators.
wgetfilename = "wget_pdfs_JWSTcy1.txt"    # wget commands to fetch those proposal PDFs.
outfile = open(outfilename, "w")
wgetfile = open(wgetfilename, "w")
for ii in newlist:
    print("DEBUGGING, trying", ii)
    # Query the STScI address-info service for this proposal's investigator list (XML).
    url = 'https://www.stsci.edu/cgi-bin/get-address-info?id=' + str(ii) + '&markupFormat=xml&observatory=JWST'
    xml_data = requests.get(url).content
    soup = BeautifulSoup(xml_data, 'lxml')
    thisone = 0
    for entry in soup.find_all('institution'):
        if 'Goddard' in entry.text or 'GSFC' in entry.text:
            N_gsfc_users += 1
            thisone += 1
    if thisone:
        N_proposals_w_gsfc += 1
        URLs_w_gsfc.append(url)
        outfile.write(url + "\n")
        print(" HAS GSFC")
        wgetfile.write("wget https://www.stsci.edu/jwst/phase2-public/" + str(ii) + ".pdf\n")
print("Finished! There were", N_proposals_w_gsfc, "proposals with GSFC investigators")
print(" and there were", N_gsfc_users, "GSFC proposers. See outfile", outfilename, "for each URL.")
outfile.close()
wgetfile.close()
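
The wget commands written to wget_pdfs_JWSTcy1.txt can be run as a shell script, or the same PDFs can be fetched directly from Python with requests, which the script already depends on. A minimal sketch of that alternative, assuming the scrape output file produced above and a local pdfs/ directory (both names are illustrative, not part of the original script):

# Sketch: download the flagged proposal PDFs with requests instead of wget.
# Assumes scrape_stsci_JWSTcy1.txt exists (written by the script above) and
# that the phase2-public PDF URL pattern used above still applies.
import os
import re
import requests

os.makedirs("pdfs", exist_ok=True)
with open("scrape_stsci_JWSTcy1.txt") as f:
    for line in f:
        match = re.search(r"id=(\d+)", line)  # proposal ID from the saved URL
        if not match:
            continue
        propid = match.group(1)
        pdf_url = "https://www.stsci.edu/jwst/phase2-public/" + propid + ".pdf"
        r = requests.get(pdf_url)
        if r.ok:
            with open(os.path.join("pdfs", propid + ".pdf"), "wb") as out:
                out.write(r.content)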