-
Notifications
You must be signed in to change notification settings - Fork 1
/
subject_moderation.py
156 lines (131 loc) · 5.69 KB
/
subject_moderation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
"""Get CPXXXX assessment items from StudyFinder websites.
Manually set column C "Check"
=IF(AND(ISBLANK(H2), ISBLANK(M2), ISBLANK(R2), ISBLANK(W2)), 0, 1)
"""
from openpyxl import Workbook
from openpyxl.styles import Font
from playwright.sync_api import sync_playwright
YEAR = 2024
OUTPUT_EXCEL_FILENAME = "output/subject_assessment.xlsx"
def main():
"""Load CP assessment details from the web and save them to an Excel file."""
subject_to_items = {}
year_to_get = YEAR
# subjects = ['CP1401', 'CP1402', 'CP5638'] # Short list just for testing
subjects = load_subjects()
print("Processing assessment items for", subjects)
for subject in subjects:
subject_name, assessment_text = extract_assessment(year_to_get, subject)
items = extract_items(assessment_text)
subject_to_items[subject] = [subject_name, items]
# print(subject_to_items)
write_spreadsheet(subject_to_items)
def extract_assessment(year, subject_code):
"""Download and extract one subject's assessment details from JCU CSDB website."""
url = f"https://apps.jcu.edu.au/subjectsearch/#/subject/{year}/{subject_code}"
print("Getting", subject_code)
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
page = browser.new_page()
page.goto(url)
# Wait for the page to load completely
page.wait_for_selector("h3:has-text('Subject Assessment')")
# Get the subject name
title = page.query_selector("h2").text_content()
subject_name = title.split(" - ")[1]
# Locate the <h3> element and then find the following <ul>
assessment_content = page.query_selector("h3:has-text('Subject Assessment') + ul")
if assessment_content:
assessment_text = assessment_content.inner_text()
else:
assessment_text = "No assessment information found."
browser.close()
return subject_name, assessment_text
def load_subjects():
"""Load all subject codes from text file."""
file_in = open("data/all_subjects.txt")
subjects = [line.strip() for line in file_in]
file_in.close()
return subjects
def extract_items(block):
"""Extract assessment items from text block as list of tuples."""
items = []
for line in block.split('\n'):
parts = line.split(' - ')
try:
mode = parts[0]
weight = int(parts[1].strip('(').strip('%)'))
group = parts[2]
items.append((mode, weight, group))
except ValueError:
print(f"Fixing invalid int: {line}")
# This is almost surely a "-" in the assessment item name (only one for CP5638 at time of testing)
mode = f"{parts[0]} - {parts[1]}"
del parts[1]
weight = int(parts[1].strip('(').strip('%)'))
group = parts[2]
items.append((mode, weight, group))
except IndexError:
print(f"ERROR with: {line}")
return items
def write_spreadsheet(subject_to_items):
wb = Workbook()
ws = wb.active
ws.title = "Subject Assessment"
# Write header titles, named to support Excel data table where column headers must be unique
headers = ["Code", "Subject Title", "Check",
"Item1", "Mode1", "Weight1", "Group1", "Moderation1",
"Item2", "Mode2", "Weight2", "Group2", "Moderation2",
"Item3", "Mode3", "Weight3", "Group3", "Moderation3",
"Item4", "Mode4", "Weight4", "Group4", "Moderation4"]
# Write headers to the first row
for col_number, header in enumerate(headers, start=1):
cell = ws.cell(row=1, column=col_number, value=header)
cell.font = Font(bold=True)
# Write per-subject assessment items
row = 2
for subject_code, items in subject_to_items.items():
subject_name, assessment_items = items
col = 1
ws.cell(row=row, column=col, value=subject_code)
col += 1
ws.cell(row=row, column=col, value=subject_name)
col += 3
for item in assessment_items:
for element in item:
ws.cell(row=row, column=col, value=element)
col += 1
col += 2 # Leave room for comment and a gap between assessment items
row += 1
wb.save(OUTPUT_EXCEL_FILENAME)
def write_spreadsheet_2(all_subject_details):
# each value contains (a list of items, prerequisite string)
workbook = openpyxl.load_workbook(filename=OUTPUT_EXCEL_FILENAME)
sheet = workbook['Assessment-Mapping']
row = 12 # first row for assessment items
column = 2
for subject, items in all_subject_details.items():
# sheet.cell(row=row, column=column, value=subject)
prerequisite = items[1]
items = items[0] # effectively rename as (assessment) items, without prerequisite
item_row = row
for item_number, item in enumerate(items):
name, weight = item
try:
weight = int(weight)
except ValueError:
pass
sheet.cell(row=item_row + item_number, column=column, value=name)
sheet.cell(row=item_row + item_number, column=column + 3, value=weight)
sheet.cell(row=24, column=column, value=prerequisite)
column += 4 # distance to next subject (4 pieces of data per assessment)
# row += 7 # move down to write next year's items
# column = 2
workbook.save(filename=f"output/temp.xlsx")
def run_tests():
block = """Performance/Practice/Product > Software development/creation - (60%) - Individual
Performance/Practice/Product > Practical assessment/practical skills demonstration - (40%) - Individual"""
items = extract_items(block)
print(items)
# run_tests()
main()