fix: use outer join for census data (#1688)

The previous inner join dropped organisations present in only one of the two census sources.
DFE-Digital · Dec 20, 2024 · 73b17ce · 73b17ce
1 parent f01782a
commit 73b17ce
Show file tree

Hide file tree

Showing 2 changed files with 113 additions and 8 deletions.
diff --git a/data-pipeline/src/pipeline/pre_processing.py b/data-pipeline/src/pipeline/pre_processing.py
@@ -44,6 +44,16 @@ def prepare_cdc_data(cdc_file_path, current_year):
 
 # noinspection PyTypeChecker
 def prepare_census_data(workforce_census_path, pupil_census_path):
+    """
+    Prepare workforce- and pupil-census data.
+
+    Note: either source may contain organisations which the other
+    lacks. In either case, all rows must be retained in the resulting
+    merged data.
+
+    :param workforce_census_path: readable source for workforce census
+    :param pupil_census_path: readable source for pupil census
+    """
     school_workforce_census = pd.read_excel(
         workforce_census_path,
         header=5,
@@ -79,13 +89,18 @@ def prepare_census_data(workforce_census_path, pupil_census_path):
     else:
         school_pupil_census["Pupil Dual Registrations"] = 0
 
-    census = school_pupil_census.join(
-        school_workforce_census,
-        on="URN",
-        how="inner",
-        rsuffix="_pupil",
-        lsuffix="_workforce",
-    ).rename(columns=config.census_column_map)
+    census = (
+        school_pupil_census.join(
+            school_workforce_census,
+            on="URN",
+            how="outer",
+            rsuffix="_pupil",
+            lsuffix="_workforce",
+        )
+        .rename(columns=config.census_column_map)
+        .reset_index()
+        .set_index("URN")
+    )
 
     census["Number of pupils"] = (
         census["Number of pupils"] + census["Pupil Dual Registrations"]
@@ -1091,7 +1106,6 @@ def build_maintained_school_data(
     ks2,
     ks4,
 ):
-
     maintained_schools_list = pd.read_csv(
         maintained_schools_data_path,
         encoding="unicode-escape",

diff --git a/data-pipeline/tests/unit/pre_processing/test_census.py b/data-pipeline/tests/unit/pre_processing/test_census.py
@@ -1,5 +1,9 @@
+import io
+
 import pandas as pd
 
+from pipeline.pre_processing import prepare_census_data
+
 
 def test_census_data_has_correct_output_columns(prepared_census_data: pd.DataFrame):
     assert list(prepared_census_data.columns) == [
@@ -46,3 +50,90 @@ def test_total_nursery_computed_correctly(prepared_census_data: pd.DataFrame):
 
 def test_total_sixth_form_computed_correctly(prepared_census_data: pd.DataFrame):
     assert prepared_census_data.loc[100150]["TotalPupilsSixthForm"] == 40
+
+
+def test_census_data_pupil_merge(
+    workforce_census_data: pd.DataFrame,
+    pupil_census_data: pd.DataFrame,
+):
+    """
+    Missing rows from the pupil-census data should not result in
+    missing rows from the final, merged dataset.
+    """
+    pupil_census_data = pupil_census_data[pupil_census_data["URN"] != 100153]
+    pupil_csv = io.StringIO(pupil_census_data.to_csv())
+
+    output = io.BytesIO()
+    writer = pd.ExcelWriter(output)
+    workforce_census_data.to_excel(
+        writer, startrow=5, sheet_name="Schools 2022", index=False
+    )
+    writer.close()
+    output.seek(0)
+    workforce_xlsx = output
+
+    census = prepare_census_data(workforce_xlsx, pupil_csv)
+
+    assert sorted(list(pupil_census_data["URN"])) == [100150, 100152]
+    assert sorted(list(workforce_census_data["URN"])) == [100150, 100152, 100153]
+    assert sorted(list(census.index)) == [100150, 100152, 100153]
+
+
+def test_census_data_workforce_merge(
+    workforce_census_data: pd.DataFrame,
+    pupil_census_data: pd.DataFrame,
+):
+    """
+    Missing rows from the workforce-census data should not result in
+    missing rows from the final, merged dataset.
+    """
+    pupil_csv = io.StringIO(pupil_census_data.to_csv())
+
+    output = io.BytesIO()
+    writer = pd.ExcelWriter(output)
+    workforce_census_data = workforce_census_data[
+        workforce_census_data["URN"] != 100153
+    ]
+    workforce_census_data.to_excel(
+        writer, startrow=5, sheet_name="Schools 2022", index=False
+    )
+    writer.close()
+    output.seek(0)
+    workforce_xlsx = output
+
+    census = prepare_census_data(workforce_xlsx, pupil_csv)
+
+    assert sorted(list(pupil_census_data["URN"])) == [100150, 100152, 100153]
+    assert sorted(list(workforce_census_data["URN"])) == [100150, 100152]
+    assert sorted(list(census.index)) == [100150, 100152, 100153]
+
+
+def test_census_data_merge(
+    workforce_census_data: pd.DataFrame,
+    pupil_census_data: pd.DataFrame,
+):
+    """
+    Missing rows from either census dataset should not result in
+    missing rows from the final, merged dataset.
+    """
+    pupil_census_data = pupil_census_data[pupil_census_data["URN"] != 100153]
+    pupil_csv = io.StringIO(pupil_census_data.to_csv())
+
+    output = io.BytesIO()
+    writer = pd.ExcelWriter(output)
+    workforce_census_data = workforce_census_data[
+        workforce_census_data["URN"] != 100152
+    ]
+    workforce_census_data.to_excel(
+        writer, startrow=5, sheet_name="Schools 2022", index=False
+    )
+    writer.close()
+    output.seek(0)
+    workforce_xlsx = output
+
+    census = prepare_census_data(workforce_xlsx, pupil_csv)
+
+    print(census)
+    assert sorted(list(pupil_census_data["URN"])) == [100150, 100152]
+    assert sorted(list(workforce_census_data["URN"])) == [100150, 100153]
+    assert sorted(list(census.index)) == [100150, 100152, 100153]