From 94ab676aa96a26ee5e2a818fa2bf7837c85eb0a9 Mon Sep 17 00:00:00 2001
From: Iain Dillingham <iain@dillingham.me.uk>
Date: Mon, 17 Apr 2023 12:09:28 +0100
Subject: [PATCH 1/4] Make query consistent with report text

Several tables referenced by the query contain dates in the far future
(e.g. the year 9999), probably because these dates represent null
values. The report text, which was copied from the notebook text, says
that the figures represent event activity from 2016-01-01 to the date
the report was run. We can avoid having to handle dates in the far
future in downstream actions by making the query consistent with the
report text, and returning event activity to the date the report was
run.
---
 analysis/query.sql | 51 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 39 insertions(+), 12 deletions(-)

diff --git a/analysis/query.sql b/analysis/query.sql
index 699baca..74aaacf 100644
--- a/analysis/query.sql
+++ b/analysis/query.sql
@@ -1,12 +1,17 @@
 DECLARE @from_date DATE;
 SET @from_date = DATEFROMPARTS(2016, 1, 1);
 
+DECLARE @to_date DATE;
+SET @to_date = CONVERT(DATE, GETDATE());
+
 SELECT
     'CodedEvent' AS table_name,
     CONVERT(DATE, ConsultationDate) AS event_date,
     COUNT(*) AS event_count
 FROM CodedEvent
-WHERE CONVERT(DATE, ConsultationDate) >= @from_date
+WHERE
+    CONVERT(DATE, ConsultationDate) >= @from_date
+    AND CONVERT(DATE, ConsultationDate) <= @to_date
 GROUP BY CONVERT(DATE, ConsultationDate)
 
 UNION ALL
@@ -16,7 +21,9 @@ SELECT
     CONVERT(DATE, SeenDate) AS event_date,
     COUNT(*) AS event_count
 FROM Appointment
-WHERE CONVERT(DATE, SeenDate) >= @from_date
+WHERE
+    CONVERT(DATE, SeenDate) >= @from_date
+    AND CONVERT(DATE, SeenDate) <= @to_date
 GROUP BY CONVERT(DATE, SeenDate)
 
 UNION ALL
@@ -26,7 +33,9 @@ SELECT
     Admission_Date AS event_date,
     COUNT(*) AS event_count
 FROM APCS
-WHERE Admission_Date >= @from_date
+WHERE
+    Admission_Date >= @from_date
+    AND Admission_Date <= @to_date
 GROUP BY Admission_Date
 
 UNION ALL
@@ -36,7 +45,9 @@ SELECT
     DateOfDeath AS event_date,
     COUNT(*) AS event_count
 FROM CPNS
-WHERE DateOfDeath >= @from_date
+WHERE
+    DateOfDeath >= @from_date
+    AND DateOfDeath <= @to_date
 GROUP BY DateOfDeath
 
 UNION ALL
@@ -46,7 +57,9 @@ SELECT
     Arrival_Date AS event_date,
     COUNT(*) AS event_count
 FROM EC
-WHERE Arrival_Date >= @from_date
+WHERE
+    Arrival_Date >= @from_date
+    AND Arrival_Date <= @to_date
 GROUP BY Arrival_Date
 
 UNION ALL
@@ -56,7 +69,9 @@ SELECT
     Appointment_Date AS event_date,
     COUNT(*) AS event_count
 FROM OPA
-WHERE Appointment_Date >= @from_date
+WHERE
+    Appointment_Date >= @from_date
+    AND Appointment_Date <= @to_date
 GROUP BY Appointment_Date
 
 UNION ALL
@@ -66,7 +81,9 @@ SELECT
     CONVERT(DATE, IcuAdmissionDateTime) AS event_date,
     COUNT(*) AS event_count
 FROM ICNARC
-WHERE CONVERT(DATE, IcuAdmissionDateTime) >= @from_date
+WHERE
+    CONVERT(DATE, IcuAdmissionDateTime) >= @from_date
+    AND CONVERT(DATE, IcuAdmissionDateTime) <= @to_date
 GROUP BY CONVERT(DATE, IcuAdmissionDateTime)
 
 UNION ALL
@@ -76,7 +93,9 @@ SELECT
     dod AS event_date,
     COUNT(*) AS event_count
 FROM ONS_Deaths
-WHERE dod >= @from_date
+WHERE
+    dod >= @from_date
+    AND dod <= @to_date
 GROUP BY dod
 
 UNION ALL
@@ -86,7 +105,9 @@ SELECT
     Earliest_Specimen_Date AS event_date,
     COUNT(*) AS event_count
 FROM SGSS_Positive
-WHERE Earliest_Specimen_Date >= @from_date
+WHERE
+    Earliest_Specimen_Date >= @from_date
+    AND Earliest_Specimen_Date <= @to_date
 GROUP BY Earliest_Specimen_Date
 
 UNION ALL
@@ -96,7 +117,9 @@ SELECT
     Earliest_Specimen_Date AS event_date,
     COUNT(*) AS event_count
 FROM SGSS_Negative
-WHERE Earliest_Specimen_Date >= @from_date
+WHERE
+    Earliest_Specimen_Date >= @from_date
+    AND Earliest_Specimen_Date <= @to_date
 GROUP BY Earliest_Specimen_Date
 
 UNION ALL
@@ -106,7 +129,9 @@ SELECT
     Specimen_Date AS event_date,
     COUNT(*) AS event_count
 FROM SGSS_AllTests_Positive
-WHERE Specimen_Date >= @from_date
+WHERE
+    Specimen_Date >= @from_date
+    AND Specimen_Date <= @to_date
 GROUP BY Specimen_Date
 
 UNION ALL
@@ -116,7 +141,9 @@ SELECT
     Specimen_Date AS event_date,
     COUNT(*) AS event_count
 FROM SGSS_AllTests_Negative
-WHERE Specimen_Date >= @from_date
+WHERE
+    Specimen_Date >= @from_date
+    AND Specimen_Date <= @to_date
 GROUP BY Specimen_Date
 
 ORDER BY table_name, event_date, event_count

From b3852d01700b6b7a109b5393c075aa27fdbb6417 Mon Sep 17 00:00:00 2001
From: Iain Dillingham <iain@dillingham.me.uk>
Date: Mon, 17 Apr 2023 12:36:11 +0100
Subject: [PATCH 2/4] Move `FROM_DATE` and `TO_DATE`

We move the initialisation of `FROM_DATE` and `TO_DATE` from deep within
`render_report` to `config`, to make them more visible and to highlight
the redundancy -- their canonical values are available within
*analysis/query.sql* (former) and at runtime (latter). SQL Runner
doesn't accept parametrised queries [1] -- and even if it did, we'd
still have redundancy in *project.yaml* -- and the date and time an
upstream SQL Runner action was executed is not accessible to downstream
actions [2], so the best we can do to prevent drift between the `query`
action and the `render_report` action is to highlight the redundancy.

[1]: https://github.com/opensafely-core/sqlrunner/issues/72
[2]: https://github.com/opensafely-core/sqlrunner/issues/86
---
 analysis/config.py        | 7 +++++++
 analysis/query.sql        | 2 ++
 analysis/render_report.py | 6 +++---
 3 files changed, 12 insertions(+), 3 deletions(-)
 create mode 100644 analysis/config.py

diff --git a/analysis/config.py b/analysis/config.py
new file mode 100644
index 0000000..184248a
--- /dev/null
+++ b/analysis/config.py
@@ -0,0 +1,7 @@
+import datetime
+
+
+# If you change the values of these variables, then also change their equivalents in
+# analysis/query.sql.
+FROM_DATE = datetime.date(2016, 1, 1)
+TO_DATE = datetime.date.today()
diff --git a/analysis/query.sql b/analysis/query.sql
index 74aaacf..adb27fc 100644
--- a/analysis/query.sql
+++ b/analysis/query.sql
@@ -1,3 +1,5 @@
+-- If you change the values of these variables,
+-- then also change their equivalents in analysis/config.py.
 DECLARE @from_date DATE;
 SET @from_date = DATEFROMPARTS(2016, 1, 1);
 
diff --git a/analysis/render_report.py b/analysis/render_report.py
index 464af9c..42c0f97 100644
--- a/analysis/render_report.py
+++ b/analysis/render_report.py
@@ -9,7 +9,7 @@
 
 from jinja2 import Environment, FileSystemLoader, StrictUndefined
 
-from analysis import utils
+from analysis import config, utils
 
 
 ENVIRONMENT = Environment(
@@ -24,8 +24,8 @@ def main():
     rendered_report = render_report(
         {
             "tpp_epoch_date": datetime.date(2009, 1, 1),
-            "from_date": datetime.date(2016, 1, 1),  # analysis/query.sql
-            "to_date": datetime.date.today(),
+            "from_date": config.FROM_DATE,
+            "to_date": config.TO_DATE,
             "plots": sorted((utils.OUTPUT_DIR / "plot").glob("*.png")),
         }
     )

From b4e64443828449c8349e17c32755da5a6ac6c89c Mon Sep 17 00:00:00 2001
From: Iain Dillingham <iain@dillingham.me.uk>
Date: Mon, 17 Apr 2023 12:57:03 +0100
Subject: [PATCH 3/4] Comment `tpp_epoch_date`

Because I don't know but my future self will assume that I did.
---
 analysis/render_report.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/analysis/render_report.py b/analysis/render_report.py
index 42c0f97..2b66eaf 100644
--- a/analysis/render_report.py
+++ b/analysis/render_report.py
@@ -23,6 +23,12 @@ def main():
     utils.makedirs(f_out.parent)
     rendered_report = render_report(
         {
+            # FIXME: I don't know what's special about 2009-01-01 (the name
+            # `tpp_epoch_date` is my best guess), so I asked on Slack. For more
+            # information, see:
+            # https://bennettoxford.slack.com/archives/C03FB777L1M/p1681721217659849
+            # It's passed as a template variable so that we can format it consistently
+            # with other template variables.
             "tpp_epoch_date": datetime.date(2009, 1, 1),
             "from_date": config.FROM_DATE,
             "to_date": config.TO_DATE,

From 72f30e1623e9b1990a01d06d4427532cbd7e1882 Mon Sep 17 00:00:00 2001
From: Iain Dillingham <iain@dillingham.me.uk>
Date: Mon, 17 Apr 2023 15:32:35 +0100
Subject: [PATCH 4/4] Assert event_date is a DatetimeIndex

If a column given by parse_dates cannot be represented as an array of
datetimes, then the column is returned as a string; no error is raised.
We often encounter such columns, but we would like to know sooner rather
than later, and with a more helpful error message.
---
 analysis/aggregate.py   | 14 +++++++++++++-
 tests/test_aggregate.py |  9 +++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/analysis/aggregate.py b/analysis/aggregate.py
index 9781a8d..d13618e 100644
--- a/analysis/aggregate.py
+++ b/analysis/aggregate.py
@@ -19,7 +19,7 @@ def read(f_in):
     date_col = "event_date"
     index_cols = ["table_name", date_col]
     value_col = "event_count"
-    return (
+    event_counts = (
         pandas.read_csv(
             f_in,
             parse_dates=[date_col],
@@ -30,6 +30,18 @@ def read(f_in):
         .sort_index()
     )
 
+    # If a column given by parse_dates cannot be represented as an array of datetimes,
+    # then the column is returned as a string; no error is raised. We often encounter
+    # such columns, but we would like to know sooner rather than later, and with a more
+    # helpful error message. The duck-typing way of testing that an index is an array of
+    # datetimes is to call .is_all_dates. However, this property was removed in v2.0.0
+    # so, for the benefit of our future self, we'll call isinstance instead.
+    assert isinstance(
+        event_counts.index.get_level_values(date_col), pandas.DatetimeIndex
+    ), f"The {date_col} column cannot be parsed into a DatetimeIndex"
+
+    return event_counts
+
 
 def aggregate(event_counts, offset, func):
     group_by, resample_by = event_counts.index.names
diff --git a/tests/test_aggregate.py b/tests/test_aggregate.py
index 413c20e..9ee607a 100644
--- a/tests/test_aggregate.py
+++ b/tests/test_aggregate.py
@@ -1,8 +1,17 @@
 import pandas
+import pytest
 
 from analysis import aggregate
 
 
+def test_read_with_unparsable_date(tmp_path):
+    rows_csv = tmp_path / "rows.csv"
+    unparsable_date = "9999-01-01"
+    rows_csv.write_text(f"table_name,event_date,event_count\nAPCS,{unparsable_date},1")
+    with pytest.raises(AssertionError):
+        aggregate.read(rows_csv)
+
+
 def make_series(event_dates):
     index = pandas.MultiIndex.from_product(
         (["table_1", "table_2"], pandas.to_datetime(event_dates)),