diff --git a/data-pipeline/src/pipeline/pre_processing.py b/data-pipeline/src/pipeline/pre_processing.py index 906c264ec..b1a29283b 100644 --- a/data-pipeline/src/pipeline/pre_processing.py +++ b/data-pipeline/src/pipeline/pre_processing.py @@ -44,6 +44,16 @@ def prepare_cdc_data(cdc_file_path, current_year): # noinspection PyTypeChecker def prepare_census_data(workforce_census_path, pupil_census_path): + """ + Prepare workforce- and pupil-census data. + + Note: either source may have orgs. present which the other lacks. + In either case, all rows must be retained in the resulting, merged + data. + + :param workforce_census_path: readable source for workforce census + :param pupil_census_path: readable source for pupil census + """ school_workforce_census = pd.read_excel( workforce_census_path, header=5, @@ -79,13 +89,18 @@ def prepare_census_data(workforce_census_path, pupil_census_path): else: school_pupil_census["Pupil Dual Registrations"] = 0 - census = school_pupil_census.join( - school_workforce_census, - on="URN", - how="inner", - rsuffix="_pupil", - lsuffix="_workforce", - ).rename(columns=config.census_column_map) + census = ( + school_pupil_census.join( + school_workforce_census, + on="URN", + how="outer", + rsuffix="_pupil", + lsuffix="_workforce", + ) + .rename(columns=config.census_column_map) + .reset_index() + .set_index("URN") + ) census["Number of pupils"] = ( census["Number of pupils"] + census["Pupil Dual Registrations"] @@ -1091,7 +1106,6 @@ def build_maintained_school_data( ks2, ks4, ): - maintained_schools_list = pd.read_csv( maintained_schools_data_path, encoding="unicode-escape", diff --git a/data-pipeline/tests/unit/pre_processing/test_census.py b/data-pipeline/tests/unit/pre_processing/test_census.py index 3a6473e13..b5925ae11 100644 --- a/data-pipeline/tests/unit/pre_processing/test_census.py +++ b/data-pipeline/tests/unit/pre_processing/test_census.py @@ -1,5 +1,9 @@ +import io + import pandas as pd +from pipeline.pre_processing import 
prepare_census_data + def test_census_data_has_correct_output_columns(prepared_census_data: pd.DataFrame): assert list(prepared_census_data.columns) == [ @@ -46,3 +50,90 @@ def test_total_nursery_computed_correctly(prepared_census_data: pd.DataFrame): def test_total_sixth_form_computed_correctly(prepared_census_data: pd.DataFrame): assert prepared_census_data.loc[100150]["TotalPupilsSixthForm"] == 40 + + +def test_census_data_pupil_merge( + workforce_census_data: pd.DataFrame, + pupil_census_data: pd.DataFrame, +): + """ + Missing rows from the pupil-census data should not result in + missing rows from the final, merged dataset. + """ + pupil_census_data = pupil_census_data[pupil_census_data["URN"] != 100153] + pupil_csv = io.StringIO(pupil_census_data.to_csv()) + + output = io.BytesIO() + writer = pd.ExcelWriter(output) + workforce_census_data.to_excel( + writer, startrow=5, sheet_name="Schools 2022", index=False + ) + writer.close() + output.seek(0) + workforce_xlsx = output + + census = prepare_census_data(workforce_xlsx, pupil_csv) + + assert sorted(list(pupil_census_data["URN"])) == [100150, 100152] + assert sorted(list(workforce_census_data["URN"])) == [100150, 100152, 100153] + assert sorted(list(census.index)) == [100150, 100152, 100153] + + +def test_census_data_workforce_merge( + workforce_census_data: pd.DataFrame, + pupil_census_data: pd.DataFrame, +): + """ + Missing rows from the workforce-census data should not result in + missing rows from the final, merged dataset. 
+ """ + pupil_csv = io.StringIO(pupil_census_data.to_csv()) + + output = io.BytesIO() + writer = pd.ExcelWriter(output) + workforce_census_data = workforce_census_data[ + workforce_census_data["URN"] != 100153 + ] + workforce_census_data.to_excel( + writer, startrow=5, sheet_name="Schools 2022", index=False + ) + writer.close() + output.seek(0) + workforce_xlsx = output + + census = prepare_census_data(workforce_xlsx, pupil_csv) + + assert sorted(list(pupil_census_data["URN"])) == [100150, 100152, 100153] + assert sorted(list(workforce_census_data["URN"])) == [100150, 100152] + assert sorted(list(census.index)) == [100150, 100152, 100153] + + +def test_census_data_merge( + workforce_census_data: pd.DataFrame, + pupil_census_data: pd.DataFrame, +): + """ + Missing rows from either census dataset should not result in + missing rows from the final, merged dataset. + """ + pupil_census_data = pupil_census_data[pupil_census_data["URN"] != 100153] + pupil_csv = io.StringIO(pupil_census_data.to_csv()) + + output = io.BytesIO() + writer = pd.ExcelWriter(output) + workforce_census_data = workforce_census_data[ + workforce_census_data["URN"] != 100152 + ] + workforce_census_data.to_excel( + writer, startrow=5, sheet_name="Schools 2022", index=False + ) + writer.close() + output.seek(0) + workforce_xlsx = output + + census = prepare_census_data(workforce_xlsx, pupil_csv) + + print(census) + assert sorted(list(pupil_census_data["URN"])) == [100150, 100152] + assert sorted(list(workforce_census_data["URN"])) == [100150, 100153] + assert sorted(list(census.index)) == [100150, 100152, 100153]