Skip to content

Commit

Permalink
fix: use outer join for census data (#1688)
Browse files Browse the repository at this point in the history
previous inner join resulted in missing records.
  • Loading branch information
PsypherPunk authored Dec 20, 2024
1 parent f01782a commit 73b17ce
Show file tree
Hide file tree
Showing 2 changed files with 113 additions and 8 deletions.
30 changes: 22 additions & 8 deletions data-pipeline/src/pipeline/pre_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,16 @@ def prepare_cdc_data(cdc_file_path, current_year):

# noinspection PyTypeChecker
def prepare_census_data(workforce_census_path, pupil_census_path):
"""
Prepare workforce- and pupil-census data.
Note: either source may have orgs. present which the other lacks.
In either case, all rows must be retained in the resulting, merged
data.
:param workforce_census_path: readable source for workforce census
:param pupil_census_path: readable source for pupil census
"""
school_workforce_census = pd.read_excel(
workforce_census_path,
header=5,
Expand Down Expand Up @@ -79,13 +89,18 @@ def prepare_census_data(workforce_census_path, pupil_census_path):
else:
school_pupil_census["Pupil Dual Registrations"] = 0

census = school_pupil_census.join(
school_workforce_census,
on="URN",
how="inner",
rsuffix="_pupil",
lsuffix="_workforce",
).rename(columns=config.census_column_map)
census = (
school_pupil_census.join(
school_workforce_census,
on="URN",
how="outer",
rsuffix="_pupil",
lsuffix="_workforce",
)
.rename(columns=config.census_column_map)
.reset_index()
.set_index("URN")
)

census["Number of pupils"] = (
census["Number of pupils"] + census["Pupil Dual Registrations"]
Expand Down Expand Up @@ -1091,7 +1106,6 @@ def build_maintained_school_data(
ks2,
ks4,
):

maintained_schools_list = pd.read_csv(
maintained_schools_data_path,
encoding="unicode-escape",
Expand Down
91 changes: 91 additions & 0 deletions data-pipeline/tests/unit/pre_processing/test_census.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import io

import pandas as pd

from pipeline.pre_processing import prepare_census_data


def test_census_data_has_correct_output_columns(prepared_census_data: pd.DataFrame):
assert list(prepared_census_data.columns) == [
Expand Down Expand Up @@ -46,3 +50,90 @@ def test_total_nursery_computed_correctly(prepared_census_data: pd.DataFrame):

def test_total_sixth_form_computed_correctly(prepared_census_data: pd.DataFrame):
    """
    The prepared census row for URN 100150 should report a sixth-form
    total of 40.

    :param prepared_census_data: prepared census frame, looked up by URN
    """
    school = prepared_census_data.loc[100150]
    sixth_form_total = school["TotalPupilsSixthForm"]

    assert sixth_form_total == 40


def test_census_data_pupil_merge(
    workforce_census_data: pd.DataFrame,
    pupil_census_data: pd.DataFrame,
):
    """
    Missing rows from the pupil-census data should not result in
    missing rows from the final, merged dataset.

    :param workforce_census_data: workforce-census frame with a `URN` column
    :param pupil_census_data: pupil-census frame with a `URN` column
    """
    # Remove one organisation from the pupil side only.
    pupil_census_data = pupil_census_data[pupil_census_data["URN"] != 100153]
    pupil_csv = io.StringIO(pupil_census_data.to_csv())

    # Build the workbook in memory; the context manager guarantees the
    # writer is closed (and the buffer populated) even if `to_excel` raises.
    workforce_xlsx = io.BytesIO()
    with pd.ExcelWriter(workforce_xlsx) as writer:
        workforce_census_data.to_excel(
            writer, startrow=5, sheet_name="Schools 2022", index=False
        )
    # Rewind so the buffer can be read from the start.
    workforce_xlsx.seek(0)

    census = prepare_census_data(workforce_xlsx, pupil_csv)

    # Confirm the inputs differ as intended before checking the merge.
    assert sorted(pupil_census_data["URN"]) == [100150, 100152]
    assert sorted(workforce_census_data["URN"]) == [100150, 100152, 100153]
    # The URN dropped from the pupil data must still be in the result.
    assert sorted(census.index) == [100150, 100152, 100153]


def test_census_data_workforce_merge(
    workforce_census_data: pd.DataFrame,
    pupil_census_data: pd.DataFrame,
):
    """
    Missing rows from the workforce-census data should not result in
    missing rows from the final, merged dataset.

    :param workforce_census_data: workforce-census frame with a `URN` column
    :param pupil_census_data: pupil-census frame with a `URN` column
    """
    pupil_csv = io.StringIO(pupil_census_data.to_csv())

    # Remove one organisation from the workforce side only.
    workforce_census_data = workforce_census_data[
        workforce_census_data["URN"] != 100153
    ]

    # Build the workbook in memory; the context manager guarantees the
    # writer is closed (and the buffer populated) even if `to_excel` raises.
    workforce_xlsx = io.BytesIO()
    with pd.ExcelWriter(workforce_xlsx) as writer:
        workforce_census_data.to_excel(
            writer, startrow=5, sheet_name="Schools 2022", index=False
        )
    # Rewind so the buffer can be read from the start.
    workforce_xlsx.seek(0)

    census = prepare_census_data(workforce_xlsx, pupil_csv)

    # Confirm the inputs differ as intended before checking the merge.
    assert sorted(pupil_census_data["URN"]) == [100150, 100152, 100153]
    assert sorted(workforce_census_data["URN"]) == [100150, 100152]
    # The URN dropped from the workforce data must still be in the result.
    assert sorted(census.index) == [100150, 100152, 100153]


def test_census_data_merge(
    workforce_census_data: pd.DataFrame,
    pupil_census_data: pd.DataFrame,
):
    """
    Missing rows from either census dataset should not result in
    missing rows from the final, merged dataset.

    :param workforce_census_data: workforce-census frame with a `URN` column
    :param pupil_census_data: pupil-census frame with a `URN` column
    """
    # Remove a different organisation from each side so that each source
    # holds a URN the other lacks.
    pupil_census_data = pupil_census_data[pupil_census_data["URN"] != 100153]
    pupil_csv = io.StringIO(pupil_census_data.to_csv())

    workforce_census_data = workforce_census_data[
        workforce_census_data["URN"] != 100152
    ]

    # Build the workbook in memory; the context manager guarantees the
    # writer is closed (and the buffer populated) even if `to_excel` raises.
    workforce_xlsx = io.BytesIO()
    with pd.ExcelWriter(workforce_xlsx) as writer:
        workforce_census_data.to_excel(
            writer, startrow=5, sheet_name="Schools 2022", index=False
        )
    # Rewind so the buffer can be read from the start.
    workforce_xlsx.seek(0)

    census = prepare_census_data(workforce_xlsx, pupil_csv)

    # Confirm the inputs differ as intended before checking the merge.
    assert sorted(pupil_census_data["URN"]) == [100150, 100152]
    assert sorted(workforce_census_data["URN"]) == [100150, 100153]
    # Every URN present in either source must be in the result.
    assert sorted(census.index) == [100150, 100152, 100153]

0 comments on commit 73b17ce

Please sign in to comment.