-
Notifications
You must be signed in to change notification settings - Fork 0
/
tests.py
295 lines (245 loc) · 13.4 KB
/
tests.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
import unittest
from scraper import *
import pandas as pd
from bs4 import BeautifulSoup
import random
from dotenv import load_dotenv
load_dotenv()
class TestClass(unittest.TestCase):
@classmethod
def setUpClass(cls):
# create a BeautifulSoup object that contains a single tr element of a syllabus
with open('testing/html/row_with_date.html', encoding="utf8") as fp:
soup = BeautifulSoup(fp, 'html.parser')
cls.row_with_date = soup.tr
# create a BeautifulSoup object that contains a single tr element of a syllabus
with open('testing/html/row_without_date.html', encoding="utf8") as fp:
soup = BeautifulSoup(fp, 'html.parser')
cls.row_without_date = soup.tr
# create a BeautifulSoup object that contains all the HTML elements on CS344's syllabus page
with open('testing/html/OPERATING SYSTEMS I (CS_344_400_W2021).html', encoding="utf8") as fp:
cls.cs344_page = BeautifulSoup(fp, 'html.parser')
# create a BeautifulSoup object that contains a table with two tr elements of a syllabus
# one row shouldn't make it after processing
with open('testing/html/extraneous.html', encoding="utf8") as fp:
cls.two_row_table = BeautifulSoup(fp, 'html.parser')
# create a dummy syllabus df
tasks = ['task #1', 'task #2', 'task #3']
dates = ['Fri Dec 3, 2021', 'Sat Dec 4, 2021', 'Sun Dec 5, 2021']
times = ['11:59pm', '1:00pm', '6:00pm']
cls.df = create_df(tasks, dates, times)
def test_url_validator(self):
# invalid URL should return False
valid_url = is_valid_url('not_a_url')
self.assertFalse(valid_url)
def test_url_validator2(self):
# valid URL that doesn't point to a Canvas syllabus page should return False
valid_url = is_valid_url('https://www.google.com/')
self.assertFalse(valid_url)
def test_url_validator3(self):
# valid URL that points to a Canvas home page (not syllabus) should return False
valid_url = is_valid_url(
'https://canvas.oregonstate.edu/courses/1849691')
self.assertFalse(valid_url)
def test_url_validator4(self):
# valid Canvas syllabus URL should return True
valid_url = is_valid_url(
'https://canvas.oregonstate.edu/courses/1849691/assignments/syllabus')
self.assertTrue(valid_url)
def test_url_validator5(self):
# valid (non-OSU) Canvas syllabus URL should return True
valid_url = is_valid_url(
'https://canvas.northwestern.edu/courses/7060/assignments/syllabus')
self.assertTrue(valid_url)
def test_is_canvas_page(self):
# an empty page should return None
self.assertFalse(is_canvas_page(None))
def test_is_canvas_page2(self):
# 'https://github.com/jasminjohal/canvas-instructure-syllabus-scraper/tree/master/base' is technically
# a valid URL because it contains 'canvas' and 'syllabus' but # is_canvas_page should return False
# since it is not actually a Canvas page
with open('testing/html/github.html', encoding="utf8") as fp:
html = BeautifulSoup(fp, 'html.parser')
self.assertFalse(is_canvas_page(html))
def test_is_canvas_page3(self):
# valid Canvas page should return True
self.assertTrue(is_canvas_page(TestClass.cs344_page))
def test_course_name(self):
# get_course_name should return the correct course name
course_name = get_course_name(TestClass.cs344_page)
self.assertEqual(
course_name, 'OPERATING SYSTEMS I (CS_344_400_W2021)')
def test_remove_extraneous_rows(self):
# BeautifulSoup object should not contain tr with class special_date,
# span with class screenreader-only or i with class icon-assignment
# but should still contain all other elements
html = TestClass.two_row_table
remove_extraneous_rows(html)
self.assertEqual(
len(html.find_all("tr", {"class": "special_date"})), 0)
self.assertEqual(
len(html.find_all("span", {"class": "screenreader_only"})), 0)
self.assertEqual(
len(html.find_all("i", {"role": "presentation"})), 0)
def test_get_syllabus_rows(self):
# two_row_table has 2 tr but 1 is extraneous so should be removed by get_syllabus_rows
html = TestClass.two_row_table
get_syllabus_rows(html)
self.assertEqual(len(html), 1)
def test_contains_date(self):
# contains_date should return True if the tr has a class in the format "events_2020_09_27"
row = TestClass.row_with_date
self.assertTrue(contains_date(row))
def test_contains_date2(self):
# contains_date should return False if the tr does not have a class in the format "events_2020_09_27"
row = TestClass.row_without_date
self.assertFalse(contains_date(row))
def test_extract_date_with_date(self):
# extract_date should pull the date from an HTML class and convert it
# to a string in the form '%a %b %#d, %Y'
task_date = extract_date(TestClass.row_with_date)
# allow for os date format discrepancies
self.assertTrue(
task_date == 'Fri Dec 3, 2021' or task_date == 'Fri Dec 03, 2021')
def test_extract_date_without_date(self):
# extract_date should return '' if there is no date class
task_date = extract_date(TestClass.row_without_date)
self.assertEqual(task_date, '')
def test_extract_task_name(self):
# extract_task_name should pull the task name from a td in a row
task_name = extract_task_name(TestClass.row_with_date)
self.assertEqual(task_name, '5.7 - Discussion: Project Showcase')
def test_extract_time(self):
# extract_time should pull the raw time from a td in a row
task_time = extract_time(TestClass.row_with_date)
self.assertEqual(task_time, 'due by 11:59pm')
def test_create_df_column_names(self):
# create_df should have built a df with the correct column names
self.assertCountEqual(TestClass.df.columns.values, [
'Tasks', 'Dates', 'Times'])
def test_create_df_size(self):
# create_df should have built a df with the correct number of rows and columns
num_rows = TestClass.df.shape[0]
num_columns = TestClass.df.shape[1]
self.assertEqual(num_rows, 3)
self.assertEqual(num_columns, 3)
def test_convert_syllabus_to_df(self):
# TODO: replace this function call
html = TestClass.two_row_table
syllabus = get_syllabus_rows(html)
syllabus_df = convert_syllabus_to_df(syllabus)
# df should have 3 columns with 1 row
self.assertEqual(syllabus_df.shape[0], 1) # df has 1 row
self.assertEqual(syllabus_df['Tasks'].iat[0],
'1.3 - Sprint 1 Learning Quiz')
self.assertEqual(syllabus_df['Dates'].iat[0],
'Mon Jan 10, 2022')
self.assertEqual(syllabus_df['Times'].iat[0],
'due by 11:59pm')
def test_rename_columns_todoist(self):
# rename_columns should appropriately change name of df columns
df = TestClass.df.copy() # don't overwrite class df
df = rename_columns(df, 'todoist')
self.assertCountEqual(df.columns.values, ['CONTENT', 'DATE', 'Times'])
def test_rename_columns_asana(self):
# rename_columns should appropriately change name of df columns depending on task management system
df = TestClass.df.copy()
df = rename_columns(df, 'asana')
self.assertCountEqual(df.columns.values, [
'Name', 'Due Date', 'Description'])
def test_add_columns_todoist(self):
# add_columns should appropriately add colums to df depending on task management system
df = TestClass.df.copy()
add_columns(df, 'todoist')
self.assertCountEqual(df.columns.values, [
'Tasks', 'Dates', 'Times', 'TYPE', 'PRIORITY', 'INDENT', 'AUTHOR', 'RESPONSIBLE', 'DATE_LANG', 'TIMEZONE'])
def test_add_columns_asana(self):
# add_columns should appropriately add colums to df depending on task management system
df = TestClass.df.copy()
add_columns(df, 'asana')
self.assertCountEqual(df.columns.values, [
'Tasks', 'Dates', 'Times', 'Assignee', 'Collaborators', 'Start Date', 'Type', 'Section/Column'])
def test_format_date_column_todoist(self):
# format_date_column should appropriately modify the date column depending on task management system
df = TestClass.df.copy()
df = df.rename(columns={'Dates': 'DATE'})
format_date_column(df, 'todoist')
# allow for os date format discrepancies (Todoist will be fine with either format)
first_value_in_date_column = df['DATE'].iat[0]
self.assertTrue(
first_value_in_date_column == 'Fri Dec 3, 2021 @ 11:59pm' or first_value_in_date_column == 'Fri Dec 03, 2021 @ 11:59pm')
def test_format_date_column_asana(self):
# format_date_column should appropriately modify the date column depending on task management system
df = TestClass.df.copy()
df = df.rename(columns={'Dates': 'Due Date'})
format_date_column(df, 'asana')
first_value_in_date_column = df['Due Date'].iat[0]
self.assertEqual(first_value_in_date_column, '12/03/2021')
def reorder_columns_helper(self, tms, columns_with_correct_order):
# shuffle columns out of order, create a new df with incorrect column order,
# use reorder_columns to return a new df with column order reverted to initial state
num_columns = len(columns_with_correct_order)
columns_with_incorrect_order = random.sample(
columns_with_correct_order, num_columns)
data = [[0 for _ in range(num_columns)]]
incorrect_col_order_df = pd.DataFrame(
data, columns=columns_with_incorrect_order)
correct_col_order_df = reorder_columns(
incorrect_col_order_df, tms)
return correct_col_order_df
def test_reorder_columns_todoist(self):
# test whether reorder_columns works to revert column order to initial correct state for todoist
columns_with_correct_order = ['TYPE', 'CONTENT', 'PRIORITY', 'INDENT', 'AUTHOR',
'RESPONSIBLE', 'DATE', 'DATE_LANG', 'TIMEZONE']
df = self.reorder_columns_helper('todoist', columns_with_correct_order)
self.assertEqual(list(df.columns), columns_with_correct_order)
def test_reorder_columns_asana(self):
# test whether reorder_columns works to revert column order to initial correct state for asana
columns_with_correct_order = ['Name', 'Description', 'Assignee', 'Collaborators',
'Due Date', 'Start Date', 'Type', 'Section/Column']
df = self.reorder_columns_helper('asana', columns_with_correct_order)
self.assertEqual(list(df.columns), columns_with_correct_order)
def test_process_df_for_tms_todoist(self):
# checck that process_df_for_tms returns a new df with the required columns for todoist
df = TestClass.df.copy() # don't overwrite class df
self.assertEqual(list(df.columns), ['Tasks', 'Dates', 'Times'])
processed_df = process_df_for_tms(df, 'todoist')
columns_with_correct_order = ['TYPE', 'CONTENT', 'PRIORITY', 'INDENT', 'AUTHOR',
'RESPONSIBLE', 'DATE', 'DATE_LANG', 'TIMEZONE']
self.assertEqual(list(processed_df.columns),
columns_with_correct_order)
def test_process_df_for_tms_asana(self):
# checck that process_df_for_tms returns a new df with the required columns for asana
df = TestClass.df.copy() # don't overwrite class df
self.assertEqual(list(df.columns),
['Tasks', 'Dates', 'Times'])
processed_df = process_df_for_tms(df, 'asana')
columns_with_correct_order = ['Name', 'Description', 'Assignee', 'Collaborators',
'Due Date', 'Start Date', 'Type', 'Section/Column']
self.assertEqual(list(processed_df.columns),
columns_with_correct_order)
def test_setup_driver(self):
# ensure driver is properly setup by visiting a URL and comparing it to driver's current_url property
driver = setup_driver()
URL = "https://www.google.com/"
try:
driver.get(URL)
visited_url = driver.current_url
finally:
driver.quit()
self.assertTrue(URL in visited_url)
def test_scraper(self):
# every Canvas syllabus page has a div#syllabusContainer that contains the syllabus content
html = get_soupified_html(
'https://canvas.oregonstate.edu/courses/1792599/assignments/syllabus')
if html:
syllabus_container = html.find(
'div', {'id': 'syllabusContainer'})
self.assertIsNotNone(syllabus_container)
def test_scraper_timeout(self):
# get_soupified_html should return None if the URL doesn't contain Canvas syllabus content
html = get_soupified_html(
'https://github.com/jasminjohal/canvas-instructure-syllabus-scraper/tree/master/base')
self.assertIsNone(html)
if __name__ == '__main__':
unittest.main() # add verbosity=2 argument for more detail