-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathClarkson_MAE.py
60 lines (49 loc) · 2.34 KB
/
Clarkson_MAE.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
__author__ = 'Shrinivas'
from bs4 import BeautifulSoup
import csv
import re
# Create (or truncate) names.csv and write only its header row; no data rows
# are written here, and the file is closed again when the `with` block ends.
# newline='' is what the csv module expects of files it writes to: the writer
# emits its own '\r\n' row terminators, and without it platforms that
# translate newlines end up with a blank line after every row.
with open('names.csv', 'w', newline='') as csvfile:
    fieldnames = ['Full Name', 'University', 'Department', 'EducationSource']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
#setting up initial access to main faculty page
import urllib.request
# Scrape the Mechanical & Aeronautical Engineering faculty index: follow every
# faculty link found in a <tbody>, and print one line per text node on the
# faculty member's page that mentions "Ph.D".
from urllib.parse import urljoin

maeIndexURL = 'http://www.clarkson.edu/mae/faculty.html'
with urllib.request.urlopen(maeIndexURL) as response:
    html = response.read()
# NOTE(review): no parser is named, so bs4 uses whichever parser is installed
# and results can differ between machines -- confirm which parser the page
# was developed against before pinning one.
soup = BeautifulSoup(html)

# Match the literal text "Ph.D". The dot is escaped so it no longer matches
# an arbitrary character; compiled once because it is reused for every page.
phdPattern = re.compile(r'Ph\.D')

# finding all of the individual faculty
faculty = soup.find_all('tbody')
for tbody in faculty:
    # looping through each of the faculty and going to their individual
    # information pages
    facultyURLs = tbody.find_all('a', attrs={'class': 'headerLinksBlue'})
    for link in facultyURLs:
        # Resolve the href against the index page. For a plain relative href
        # this gives the same URL as prefixing "http://www.clarkson.edu/mae/",
        # and absolute or root-relative hrefs are resolved correctly too.
        facultyURL = urljoin(maeIndexURL, link['href'])
        with urllib.request.urlopen(facultyURL) as response:
            html = response.read()
        facultySoup = BeautifulSoup(html)
        # Rows are printed rather than written to names.csv (the original
        # author had trouble with the CSV output and switched to print).
        # The page <title> text is used as the name/university field.
        nameAndUni = facultySoup.title.text
        phdSource = facultySoup.find_all(text=phdPattern)
        for phdText in phdSource:
            print(nameAndUni + ', ' + 'Mechanical & Aeronautical Engineering, ' + phdText)
# Scrape the Biology faculty index: follow every faculty link found in a
# <tbody>, and print one line per text node on the faculty member's page that
# mentions "Ph.D".
from urllib.parse import urljoin

biologyIndexURL = 'http://www.clarkson.edu/biology/faculty_pages/index.html'
with urllib.request.urlopen(biologyIndexURL) as response:
    html = response.read()
# NOTE(review): no parser is named, so bs4 uses whichever parser is installed
# and results can differ between machines -- confirm which parser the page
# was developed against before pinning one.
soup = BeautifulSoup(html)

# Match the literal text "Ph.D". The dot is escaped so it no longer matches
# an arbitrary character; compiled once because it is reused for every page.
phdPattern = re.compile(r'Ph\.D')

# finding all of the individual faculty
faculty = soup.find_all('tbody')
for tbody in faculty:
    # looping through each of the faculty and going to their individual
    # information pages
    facultyURLs = tbody.find_all('a', attrs={'class': 'headerLinksBlue'})
    for link in facultyURLs:
        # Resolve the href against the index page. For a plain relative href
        # this gives the same URL as prefixing
        # "http://www.clarkson.edu/biology/faculty_pages/", and absolute or
        # root-relative hrefs are resolved correctly too.
        facultyURL = urljoin(biologyIndexURL, link['href'])
        with urllib.request.urlopen(facultyURL) as response:
            html = response.read()
        facultySoup = BeautifulSoup(html)
        # Rows are printed rather than written to names.csv (the original
        # author had trouble with the CSV output and switched to print).
        # The page <title> text is used as the name/university field.
        nameAndUni = facultySoup.title.text
        phdSource = facultySoup.find_all(text=phdPattern)
        for phdText in phdSource:
            # Department label fixed: these pages come from the Biology
            # listing, but the line previously carried the Mechanical &
            # Aeronautical Engineering label copied from the MAE section.
            print(nameAndUni + ', ' + 'Biology, ' + phdText)