Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update xlsx parsing #89

Merged
merged 4 commits into from
Oct 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion API/gimvicurnik/database/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ class LunchSchedule(Base):

id: Mapped[intpk]
date: Mapped[date_] = mapped_column(index=True)
time: Mapped[time_]
time: Mapped[time_ | None]

class_id: Mapped[class_fk] = mapped_column(index=True)
class_: Mapped[Class] = relationship()
Expand Down
107 changes: 10 additions & 97 deletions API/gimvicurnik/updaters/eclassroom.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,10 +270,6 @@ def parse_document(self, document: DocumentInfo, stream: BytesIO, effective: dat
match (document.type, document.extension):
case (DocumentType.SUBSTITUTIONS, "pdf"):
self._parse_substitutions_pdf(stream, effective)
case (DocumentType.LUNCH_SCHEDULE, "pdf"):
self._parse_lunch_schedule_pdf(stream, effective)
case (DocumentType.SUBSTITUTIONS, "xlsx"):
self._parse_substitutions_xlsx(stream, effective)
case (DocumentType.LUNCH_SCHEDULE, "xlsx"):
self._parse_lunch_schedule_xlsx(stream, effective)
case (DocumentType.SUBSTITUTIONS, _):
Expand Down Expand Up @@ -647,103 +643,15 @@ def _parse_substitutions_pdf(self, stream: BytesIO, effective: date) -> None:
if substitutions:
self.session.execute(insert(Substitution), substitutions)

def _parse_substitutions_xlsx(self, stream: BytesIO, effective: date) -> None:
    """Accept a substitutions xlsx document without processing it.

    Both the stream and the effective date are ignored; nothing is read
    from the document and nothing is stored.
    """
    # Intentionally a no-op: this document format is currently not useful.
    return None

def _parse_lunch_schedule_pdf(self, stream: BytesIO, effective: date) -> None:
    """Parse the lunch schedule pdf document and store it for the given date.

    Every table extracted from the stream is scanned row by row. A usable
    row has five cells, of which the first is the time, the second the notes,
    the third the classes and the fifth the location (the fourth is not read).
    One schedule entry is produced for every class named in a row.

    Existing lunch schedule entries for ``effective`` are always deleted;
    new entries are inserted only when at least one was parsed.

    Args:
        stream: The PDF document content.
        effective: The date the parsed schedule applies to.

    Raises:
        ValueError: If a time cell does not match ``%H:%M`` after cleanup.
    """

    schedule = []

    # Extract all tables from a PDF stream
    tables = with_span(op="extract")(extract_tables)(stream)

    for table in tables:
        # Skip tables without any cells, they cannot be indexed below
        if not table or not table[0]:
            continue

        # Skip instructions
        if not table[0][0] or "Dijaki prihajate v jedilnico" in table[0][0]:
            continue

        for index, row in enumerate(table):
            # Skip rows without any cells
            if not row:
                continue

            # Handle incorrectly connected cells: a four-cell row whose first
            # cell holds both the time and the notes is split into five cells.
            # After the first successful split the row has five cells, so the
            # second separator is only tried when the first one did not match.
            for separator in ("\n", " "):
                if row[0] and separator in row[0] and len(row) == 4:
                    row[0:1] = row[0].split(separator, 1)

            # Skip the header
            if row[0] and "ura" in row[0]:
                continue

            # Skip empty rows
            if len(row) != 5 or not row[0]:
                continue

            # Skip invalid time formats
            if "odj." in row[0]:
                continue

            # Handle multiple times in the same cell
            times = row[0].split("\n", 1)
            if len(times) == 2:
                row[0] = times[0]

                # The second time belongs to the following row; when this is
                # the last row there is nowhere to move it, so it is dropped
                # instead of failing with an IndexError
                if index + 1 < len(table) and table[index + 1]:
                    table[index + 1][0] = times[1]

            # Handle incorrectly connected cells
            if row[1] is None and len(row[0].split(" ", 1)) == 2:
                row[0], row[1] = row[0].split(" ", 1)

            # Parse time format
            time_text = re.sub("cca|do", "", row[0]).replace(".", ":").strip()
            lunch_time = datetime.strptime(time_text, "%H:%M").time()

            # Get notes, classes and location if they are specified
            notes = row[1].strip() if row[1] else None
            classes = re.sub("[().]", "", row[2]).split(",") if row[2] else []
            location = row[4].strip() if row[4] else None

            # Handle special format for multiple classes: a single entry that
            # matches the pattern is expanded to the classes A-F of that digit
            if len(classes) == 1 and isinstance(classes[0], str):
                if search := re.search(r"(\d)\.? ?[lL]?(?:\.|$)", classes[0]):
                    class_letters = ["A", "B", "C", "D", "E", "F"]
                    classes = [search.group(1) + class_ for class_ in class_letters]

            for class_ in classes:
                class_name = class_.strip()
                if not class_name:
                    continue

                class_id = get_or_create(self.session, model=Class, name=class_name)[0].id

                schedule.append(
                    {
                        "class_id": class_id,
                        "date": effective,
                        "time": lunch_time,
                        "location": location,
                        "notes": notes,
                    }
                )

    # Store schedule to a database
    self.session.query(LunchSchedule).filter(LunchSchedule.date == effective).delete()

    # Do not execute an insert without any rows to insert
    if schedule:
        self.session.execute(insert(LunchSchedule), schedule)

def _parse_lunch_schedule_xlsx(self, stream: BytesIO, effective: date) -> None:
"""
Parse the lunch schedule xlsx document.

Columns should be:
Columns:
- Time (Ura)
- Notes (Opombe/Prilagoditev)
- Class (Razred)
* number of students (stevilo dijakov) [ignored]
- Number of students (Stevilo dijakov)
- Location (Prostor)
"""

Expand All @@ -754,24 +662,29 @@ def _parse_lunch_schedule_xlsx(self, stream: BytesIO, effective: date) -> None:

# Parse lunch schedule
for ws in wb:
if ws.title != "kosilo":
continue

for wr in ws.iter_rows(min_row=3, max_col=5):
if not wr[2].value:
if not wr[3].value:
break

# Check for correct cell value type
if typing.TYPE_CHECKING:
assert isinstance(wr[0].value, datetime)
assert isinstance(wr[1].value, str)
assert isinstance(wr[2].value, str)
assert isinstance(wr[3].value, int)
assert isinstance(wr[4].value, str)

if "raz" in wr[2].value:
# Ignore rows that do not contain a class name
if not wr[2].value or "raz" in wr[2].value:
continue

schedule: dict[str, Any] = {}

# Time in format H:M
schedule["time"] = wr[0].value
schedule["time"] = wr[0].value if wr[0].value else None

schedule["notes"] = wr[1].value.strip() if wr[1].value else None

Expand Down
73 changes: 0 additions & 73 deletions API/gimvicurnik/updaters/menu.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
from .base import BaseMultiUpdater, DocumentInfo
from ..database import DocumentType, LunchMenu, SnackMenu
from ..errors import MenuApiError, MenuDateError, MenuFormatError
from ..utils.pdf import extract_tables
from ..utils.sentry import with_span

if typing.TYPE_CHECKING:
Expand Down Expand Up @@ -121,10 +120,6 @@ def parse_document(self, document: DocumentInfo, stream: BytesIO, effective: dat
span.set_tag("document.format", document.extension)

match (document.type, document.extension):
case (DocumentType.SNACK_MENU, "pdf"):
self._parse_snack_menu_pdf(stream, effective)
case (DocumentType.LUNCH_MENU, "pdf"):
self._parse_lunch_menu_pdf(stream, effective)
case (DocumentType.SNACK_MENU, "xlsx"):
self._parse_snack_menu_xlsx(stream, effective)
case (DocumentType.LUNCH_MENU, "xlsx"):
Expand All @@ -136,41 +131,6 @@ def parse_document(self, document: DocumentInfo, stream: BytesIO, effective: dat
case _:
raise KeyError("Unknown document type for menu")

def _parse_snack_menu_pdf(self, stream: BytesIO, effective: datetime.date) -> None:
    """Read snack menus from a PDF document and add them to the session.

    Rows are taken from all extracted tables in order. Each accepted row is
    stored as the menu of one day: the first accepted row is dated
    ``effective`` and every further accepted row one day later than the
    previous one. A menu already stored for a date is updated, otherwise a
    new ``SnackMenu`` is created.
    """

    # Pull every table out of the PDF stream
    extract = with_span(op="extract")(extract_tables)
    tables = extract(stream)

    # Number of rows accepted so far, used as the offset in days
    offset = 0

    for cells in (row for table in tables for row in table):
        normal = cells[1]

        # Ignore rows whose second cell is empty or contains "NV in N"
        if not normal or "NV in N" in normal:
            continue

        menu_date = effective + datetime.timedelta(days=offset)
        offset += 1

        values = {
            "date": menu_date,
            "normal": normal,
            "poultry": cells[2],
            "vegetarian": cells[3],
            "fruitvegetable": cells[4],
        }

        # Reuse the menu stored for this date or start a new one
        stored = self.session.query(SnackMenu).filter(SnackMenu.date == menu_date).first()
        model = stored or SnackMenu()

        for attribute, value in values.items():
            setattr(model, attribute, value)

        self.session.add(model)

def _parse_snack_menu_xlsx(self, stream: BytesIO, effective: datetime.date) -> None:
"""Parse the snack menu XLSX document."""

Expand Down Expand Up @@ -241,39 +201,6 @@ def _parse_snack_menu_xlsx(self, stream: BytesIO, effective: datetime.date) -> N

wb.close()

def _parse_lunch_menu_pdf(self, stream: BytesIO, effective: datetime.date) -> None:
    """Read lunch menus from a PDF document and add them to the session.

    Rows are taken from all extracted tables in order. Each accepted row is
    stored as the menu of one day: the first accepted row is dated
    ``effective`` and every further accepted row one day later than the
    previous one. A menu already stored for a date is updated, otherwise a
    new ``LunchMenu`` is created.
    """

    # Pull every table out of the PDF stream
    extract = with_span(op="extract")(extract_tables)
    tables = extract(stream)

    # Number of rows accepted so far, used as the offset in days
    offset = 0

    for cells in (row for table in tables for row in table):
        normal = cells[1]

        # Ignore rows whose second cell is empty or contains "N KOSILO"
        if not normal or "N KOSILO" in normal:
            continue

        menu_date = effective + datetime.timedelta(days=offset)
        offset += 1

        values = {
            "date": menu_date,
            "normal": normal,
            "vegetarian": cells[2],
        }

        # Reuse the menu stored for this date or start a new one
        stored = self.session.query(LunchMenu).filter(LunchMenu.date == menu_date).first()
        model = stored or LunchMenu()

        for attribute, value in values.items():
            setattr(model, attribute, value)

        self.session.add(model)

def _parse_lunch_menu_xlsx(self, stream: BytesIO, effective: datetime.date) -> None:
"""Parse the lunch menu XLSX document."""

Expand Down
12 changes: 9 additions & 3 deletions website/src/components/menus/MenuDisplay.vue
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,15 @@
class="grey--text pb-2">
<h2 class="font-weight-regular pb-2">Razpored kosila</h2>
<p v-for="currentLunchSchedule in currentLunchSchedules" :key="currentLunchSchedule.time">
Ura: {{ currentLunchSchedule.time }}<br />
Prostor: {{ currentLunchSchedule.location }}<br />
Opombe: {{ currentLunchSchedule.notes }}<br />
<span v-if="currentLunchSchedule.time">
Ura: {{ currentLunchSchedule.time }}<br />
</span>
<span v-if="currentLunchSchedule.location">
Prostor: {{ currentLunchSchedule.location }}<br />
</span>
<span v-if="currentLunchSchedule.notes">
Opombe: {{ currentLunchSchedule.notes }}<br />
</span>
</p>
</v-card-text>
</v-card>
Expand Down
Loading