-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathUrl.py
66 lines (57 loc) · 2.3 KB
/
Url.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import requests
from bs4 import BeautifulSoup
import sys
class URL():
    """Scrape a web page and tally the taxi forms embedded in it.

    Attributes:
        url_name: The address passed to the constructor.
        forms_dict: Maps each form name (the first three dash-separated
            parts of a form id) to how many times it occurs on the page.
    """

    # Seconds to wait on the server before giving up, so that a stalled
    # connection cannot hang the scraper indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self, url):
        """Fetch ``url`` and count the forms found on it.

        Args:
            url: Address of the page to inspect. An empty value skips the
                request entirely and leaves ``forms_dict`` empty.
        """
        self.url_name = url
        self.forms_dict = dict()
        self.__all_forms_list = self.__get_all_forms()
        self.__get_form_num(self.__all_forms_list)

    def __get_html(self):
        """Return the parsed HTML of the page.

        Returns:
            A ``BeautifulSoup`` document, or ``None`` when no URL was given.

        On a connection failure or timeout a message is printed and the
        process exits.
        """
        if not self.url_name:
            return None
        try:
            raw_html = requests.get(
                self.url_name, timeout=self.REQUEST_TIMEOUT).text
        except (requests.exceptions.ConnectionError,
                requests.exceptions.Timeout):
            print('Issues connecting to your internet. Check your wifi.')
            sys.exit()
        return BeautifulSoup(raw_html, "html.parser")

    def __get_all_forms(self):
        """Extract the id of every visible taxi form on the page.

        Returns:
            A list with one form id per taxi form package, taken from the
            last ``<code>`` element that is a direct child of a ``<div>``
            inside the package. Packages without such an element, or whose
            element has no ``id``, are skipped.
        """
        parsed_html = self.__get_html()
        if parsed_html is None:
            # Nothing was fetched (empty URL), so there are no forms.
            return []

        total_forms_list = []
        # find all taxi form packages
        packages = parsed_html.find_all('div', id='taxi-form-packages')
        for item in packages:
            # remove any forms that are hidden within class:
            # default-form-clone and aside tag
            if item.find_parents("aside", class_="floating-cta-wrapper--fixed"):
                continue
            if item.find_parents("div", class_="default-form-clone"):
                continue
            code_tags = item.select("div > code")
            if not code_tags:
                # A package without a <code> child carries no form id.
                continue
            form_id = code_tags[-1].get('id')
            if form_id:
                total_forms_list.append(form_id)
        return total_forms_list

    def __get_form_num(self, all_forms_list):
        """Count how often each form name occurs.

        The form name is the first three dash-separated parts of a form id
        (program grouping and id); ids with fewer parts are used whole.

        Args:
            all_forms_list: Iterable of form id strings.

        Returns:
            ``self.forms_dict``, updated in place with the new counts.
        """
        for form in all_forms_list:
            # extract program grouping and id
            form_name = "-".join(form.split("-")[:3])
            self.forms_dict[form_name] = self.forms_dict.get(form_name, 0) + 1
        return self.forms_dict

    def __repr__(self):
        """Return the URL and its form counts as a two-line summary."""
        return ('URL: {}\n'
                'Forms used on page: {}'.format(self.url_name, self.forms_dict))
if __name__ == '__main__':
    # Manual smoke test: scrape a live page that is expected to contain
    # forms and print the summary produced by URL.__repr__.
    page_address = 'https://onlinemba.ucdavis.edu/'
    scraped_page = URL(page_address)
    print(scraped_page)