Clarkson_Camp.py
__author__ = 'Shrinivas'

from bs4 import BeautifulSoup
import csv
import re
import urllib.request

# Set up the CSV output file and write the header row.
with open('names.csv', 'w', newline='') as csvfile:
    fieldnames = ['Full Name', 'University', 'Department', 'EducationSource']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

# Fetch and parse the main faculty listing page.
with urllib.request.urlopen('http://www.clarkson.edu/camp/faculty/index.html') as response:
    html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    # print(soup.prettify())

# Each <tbody> on the listing page holds a group of faculty entries.
faculty = soup.findAll('tbody')
for tbody in faculty:
    # Loop through each faculty member and follow the link to their profile page.
    facultyURLs = tbody.findAll('a')
    for a in facultyURLs:
        name = a.text
        # Relative links need the faculty directory prefix; absolute links are used as-is.
        if "http" not in a['href']:
            facultyURL = "http://www.clarkson.edu/camp/faculty/" + a['href']
        else:
            facultyURL = a['href']
        # Only follow links that point at an HTML profile page.
        if "html" in facultyURL:
            with urllib.request.urlopen(facultyURL) as response:
                html = response.read()
                facultySoup = BeautifulSoup(html, 'html.parser')
            # Was having trouble putting it in a csv file, switched to printing instead.
            nameAndUni = facultySoup.title.text
            # Match text nodes mentioning "Ph.D" (dot escaped so it matches literally).
            phdSource = facultySoup.findAll(text=re.compile(r'Ph\.D'))
            for i in phdSource:
                print(nameAndUni + ', ' + name + ', ' + 'Camp, ' + i)
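
# A minimal sketch (not part of the original script) of how the results could be
# written to names.csv instead of printed, since the comment above notes the CSV
# step was giving trouble. It assumes the header written at the top of the script
# and reopens the file in append mode because the earlier 'with' block has already
# closed it; the helper name append_row and the fixed 'Clarkson University' /
# 'Camp' values are illustrative assumptions, not something taken from the page.
def append_row(full_name, education_source):
    # Reopen names.csv in append mode so the header row written earlier is preserved.
    with open('names.csv', 'a', newline='') as csvfile:
        writer = csv.DictWriter(
            csvfile,
            fieldnames=['Full Name', 'University', 'Department', 'EducationSource'])
        writer.writerow({
            'Full Name': full_name,
            'University': 'Clarkson University',
            'Department': 'Camp',
            'EducationSource': education_source,
        })

# Example usage in the inner loop above, in place of (or alongside) the print call:
#     append_row(name, i)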