From 94ab676aa96a26ee5e2a818fa2bf7837c85eb0a9 Mon Sep 17 00:00:00 2001 From: Iain Dillingham Date: Mon, 17 Apr 2023 12:09:28 +0100 Subject: [PATCH 1/4] Make query consistent with report text Several tables referenced by the query contain dates in the far future (e.g. the year 9999), probably because these dates represent null values. The report text, which was copied from the notebook text, says that the figures represent event activity from 2016-01-01 to the date the report was run. We can avoid having to handle dates in the far future in downstream actions by making the query consistent with the report text, and returning event activity to the date the report was run. --- analysis/query.sql | 51 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 12 deletions(-) diff --git a/analysis/query.sql b/analysis/query.sql index 699baca..74aaacf 100644 --- a/analysis/query.sql +++ b/analysis/query.sql @@ -1,12 +1,17 @@ DECLARE @from_date DATE; SET @from_date = DATEFROMPARTS(2016, 1, 1); +DECLARE @to_date DATE; +SET @to_date = CONVERT(DATE, GETDATE()); + SELECT 'CodedEvent' AS table_name, CONVERT(DATE, ConsultationDate) AS event_date, COUNT(*) AS event_count FROM CodedEvent -WHERE CONVERT(DATE, ConsultationDate) >= @from_date +WHERE + CONVERT(DATE, ConsultationDate) >= @from_date + AND CONVERT(DATE, ConsultationDate) <= @to_date GROUP BY CONVERT(DATE, ConsultationDate) UNION ALL @@ -16,7 +21,9 @@ SELECT CONVERT(DATE, SeenDate) AS event_date, COUNT(*) AS event_count FROM Appointment -WHERE CONVERT(DATE, SeenDate) >= @from_date +WHERE + CONVERT(DATE, SeenDate) >= @from_date + AND CONVERT(DATE, SeenDate) <= @to_date GROUP BY CONVERT(DATE, SeenDate) UNION ALL @@ -26,7 +33,9 @@ SELECT Admission_Date AS event_date, COUNT(*) AS event_count FROM APCS -WHERE Admission_Date >= @from_date +WHERE + Admission_Date >= @from_date + AND Admission_Date <= @to_date GROUP BY Admission_Date UNION ALL @@ -36,7 +45,9 @@ SELECT DateOfDeath AS event_date, COUNT(*) AS event_count FROM CPNS -WHERE DateOfDeath >= @from_date +WHERE + DateOfDeath >= @from_date + AND DateOfDeath <= @to_date GROUP BY DateOfDeath UNION ALL @@ -46,7 +57,9 @@ SELECT Arrival_Date AS event_date, COUNT(*) AS event_count FROM EC -WHERE Arrival_Date >= @from_date +WHERE + Arrival_Date >= @from_date + AND Arrival_Date <= @to_date GROUP BY Arrival_Date UNION ALL @@ -56,7 +69,9 @@ SELECT Appointment_Date AS event_date, COUNT(*) AS event_count FROM OPA -WHERE Appointment_Date >= @from_date +WHERE + Appointment_Date >= @from_date + AND Appointment_Date <= @to_date GROUP BY Appointment_Date UNION ALL @@ -66,7 +81,9 @@ SELECT CONVERT(DATE, IcuAdmissionDateTime) AS event_date, COUNT(*) AS event_count FROM ICNARC -WHERE CONVERT(DATE, IcuAdmissionDateTime) >= @from_date +WHERE + CONVERT(DATE, IcuAdmissionDateTime) >= @from_date + AND CONVERT(DATE, IcuAdmissionDateTime) <= @to_date GROUP BY CONVERT(DATE, IcuAdmissionDateTime) UNION ALL @@ -76,7 +93,9 @@ SELECT dod AS event_date, COUNT(*) AS event_count FROM ONS_Deaths -WHERE dod >= @from_date +WHERE + dod >= @from_date + AND dod <= @to_date GROUP BY dod UNION ALL @@ -86,7 +105,9 @@ SELECT Earliest_Specimen_Date AS event_date, COUNT(*) AS event_count FROM SGSS_Positive -WHERE Earliest_Specimen_Date >= @from_date +WHERE + Earliest_Specimen_Date >= @from_date + AND Earliest_Specimen_Date <= @to_date GROUP BY Earliest_Specimen_Date UNION ALL @@ -96,7 +117,9 @@ SELECT Earliest_Specimen_Date AS event_date, COUNT(*) AS event_count FROM SGSS_Negative -WHERE Earliest_Specimen_Date >= @from_date +WHERE + Earliest_Specimen_Date >= @from_date + AND Earliest_Specimen_Date <= @to_date GROUP BY Earliest_Specimen_Date UNION ALL @@ -106,7 +129,9 @@ SELECT Specimen_Date AS event_date, COUNT(*) AS event_count FROM SGSS_AllTests_Positive -WHERE Specimen_Date >= @from_date +WHERE + Specimen_Date >= @from_date + AND Specimen_Date <= @to_date GROUP BY Specimen_Date UNION ALL @@ -116,7 +141,9 @@ SELECT Specimen_Date AS event_date, COUNT(*) AS event_count FROM SGSS_AllTests_Negative -WHERE Specimen_Date >= @from_date +WHERE + Specimen_Date >= @from_date + AND Specimen_Date <= @to_date GROUP BY Specimen_Date ORDER BY table_name, event_date, event_count From b3852d01700b6b7a109b5393c075aa27fdbb6417 Mon Sep 17 00:00:00 2001 From: Iain Dillingham Date: Mon, 17 Apr 2023 12:36:11 +0100 Subject: [PATCH 2/4] Move `FROM_DATE` and `TO_DATE` We move the initialisation of `FROM_DATE` and `TO_DATE` from deep within `render_report` to `config`, to make them more visible and to highlight the redundancy -- their canonical values are available within *analysis/query.sql* (former) and at runtime (latter). SQL Runner doesn't accept parametrised queries [1] -- and even if it did, we'd still have redundancy in *project.yaml* -- and the date and time an upstream SQL Runner action was executed is not accessible to downstream actions [2], so the best we can do to prevent drift between the `query` action and the `render_report` action is to highlight the redundancy. [1]: https://github.com/opensafely-core/sqlrunner/issues/72 [2]: https://github.com/opensafely-core/sqlrunner/issues/86 --- analysis/config.py | 7 +++++++ analysis/query.sql | 2 ++ analysis/render_report.py | 6 +++--- 3 files changed, 12 insertions(+), 3 deletions(-) create mode 100644 analysis/config.py diff --git a/analysis/config.py b/analysis/config.py new file mode 100644 index 0000000..184248a --- /dev/null +++ b/analysis/config.py @@ -0,0 +1,7 @@ +import datetime + + +# If you change the values of these variables, then also change their equivalents in +# analysis/query.sql. +FROM_DATE = datetime.date(2016, 1, 1) +TO_DATE = datetime.date.today() diff --git a/analysis/query.sql b/analysis/query.sql index 74aaacf..adb27fc 100644 --- a/analysis/query.sql +++ b/analysis/query.sql @@ -1,3 +1,5 @@ +-- If you change the values of these variables, +-- then also change their equivalents in analysis/config.py. DECLARE @from_date DATE; SET @from_date = DATEFROMPARTS(2016, 1, 1); diff --git a/analysis/render_report.py b/analysis/render_report.py index 464af9c..42c0f97 100644 --- a/analysis/render_report.py +++ b/analysis/render_report.py @@ -9,7 +9,7 @@ from jinja2 import Environment, FileSystemLoader, StrictUndefined -from analysis import utils +from analysis import config, utils ENVIRONMENT = Environment( @@ -24,8 +24,8 @@ def main(): rendered_report = render_report( { "tpp_epoch_date": datetime.date(2009, 1, 1), - "from_date": datetime.date(2016, 1, 1), # analysis/query.sql - "to_date": datetime.date.today(), + "from_date": config.FROM_DATE, + "to_date": config.TO_DATE, "plots": sorted((utils.OUTPUT_DIR / "plot").glob("*.png")), } ) From b4e64443828449c8349e17c32755da5a6ac6c89c Mon Sep 17 00:00:00 2001 From: Iain Dillingham Date: Mon, 17 Apr 2023 12:57:03 +0100 Subject: [PATCH 3/4] Comment `tpp_epoch_date` Because I don't know but my future self will assume that I did. --- analysis/render_report.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/analysis/render_report.py b/analysis/render_report.py index 42c0f97..2b66eaf 100644 --- a/analysis/render_report.py +++ b/analysis/render_report.py @@ -23,6 +23,12 @@ def main(): utils.makedirs(f_out.parent) rendered_report = render_report( { + # FIXME: I don't know what's special about 2009-01-01 (the name + # `tpp_epoch_date` is my best guess), so I asked on Slack. For more + # information, see: + # https://bennettoxford.slack.com/archives/C03FB777L1M/p1681721217659849 + # It's passed as a template variable so that we can format it consistently + # with other template variables. "tpp_epoch_date": datetime.date(2009, 1, 1), "from_date": config.FROM_DATE, "to_date": config.TO_DATE, From 72f30e1623e9b1990a01d06d4427532cbd7e1882 Mon Sep 17 00:00:00 2001 From: Iain Dillingham Date: Mon, 17 Apr 2023 15:32:35 +0100 Subject: [PATCH 4/4] Assert event_date is a DatetimeIndex If a column given by parse_dates cannot be represented as an array of datetimes, then the column is returned as a string; no error is raised. We often encounter such columns, but we would like to know sooner rather than later, and with a more helpful error message. --- analysis/aggregate.py | 14 +++++++++++++- tests/test_aggregate.py | 9 +++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/analysis/aggregate.py b/analysis/aggregate.py index 9781a8d..d13618e 100644 --- a/analysis/aggregate.py +++ b/analysis/aggregate.py @@ -19,7 +19,7 @@ def read(f_in): date_col = "event_date" index_cols = ["table_name", date_col] value_col = "event_count" - return ( + event_counts = ( pandas.read_csv( f_in, parse_dates=[date_col], @@ -30,6 +30,18 @@ def read(f_in): .sort_index() ) + # If a column given by parse_dates cannot be represented as an array of datetimes, + # then the column is returned as a string; no error is raised. We often encounter + # such columns, but we would like to know sooner rather than later, and with a more + # helpful error message. The duck-typing way of testing that an index is an array of + # datetimes is to call .is_all_dates. However, this property was removed in v2.0.0 + # so, for the benefit of our future self, we'll call isinstance instead. + assert isinstance( + event_counts.index.get_level_values(date_col), pandas.DatetimeIndex + ), f"The {date_col} column cannot be parsed into a DatetimeIndex" + + return event_counts + def aggregate(event_counts, offset, func): group_by, resample_by = event_counts.index.names diff --git a/tests/test_aggregate.py b/tests/test_aggregate.py index 413c20e..9ee607a 100644 --- a/tests/test_aggregate.py +++ b/tests/test_aggregate.py @@ -1,8 +1,17 @@ import pandas +import pytest from analysis import aggregate +def test_read_with_unparsable_date(tmp_path): + rows_csv = tmp_path / "rows.csv" + unparsable_date = "9999-01-01" + rows_csv.write_text(f"table_name,event_date,event_count\nAPCS,{unparsable_date},1") + with pytest.raises(AssertionError): + aggregate.read(rows_csv) + + def make_series(event_dates): index = pandas.MultiIndex.from_product( (["table_1", "table_2"], pandas.to_datetime(event_dates)),