Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle dates in the far future #42

Merged
merged 4 commits into from
Apr 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion analysis/aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def read(f_in):
date_col = "event_date"
index_cols = ["table_name", date_col]
value_col = "event_count"
return (
event_counts = (
pandas.read_csv(
f_in,
parse_dates=[date_col],
Expand All @@ -30,6 +30,18 @@ def read(f_in):
.sort_index()
)

# If a column given by parse_dates cannot be represented as an array of datetimes,
# then the column is returned as a string; no error is raised. We often encounter
# such columns, but we would like to know sooner rather than later, and with a more
# helpful error message. The duck-typing way of testing that an index is an array of
# datetimes is to call .is_all_dates. However, this property was removed in v2.0.0
# so, for the benefit of our future self, we'll call isinstance instead.
assert isinstance(
event_counts.index.get_level_values(date_col), pandas.DatetimeIndex
), f"The {date_col} column cannot be parsed into a DatetimeIndex"

return event_counts


def aggregate(event_counts, offset, func):
group_by, resample_by = event_counts.index.names
Expand Down
7 changes: 7 additions & 0 deletions analysis/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""Shared date-range configuration for the analysis pipeline."""
import datetime


# If you change the values of these variables, then also change their equivalents in
# analysis/query.sql.
# Inclusive start of the date range covered by the analysis.
FROM_DATE = datetime.date(2016, 1, 1)
# Inclusive end of the date range.
# NOTE(review): evaluated once at import time, so a long-running process keeps the
# date from when this module was first imported — confirm that is acceptable.
TO_DATE = datetime.date.today()
53 changes: 41 additions & 12 deletions analysis/query.sql
Original file line number Diff line number Diff line change
@@ -1,12 +1,19 @@
-- If you change the values of these variables,

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AIUI sqlrunner doesn't (yet) support parameterised SQL statements. To ensure consistency with the values in config.py, could a configure (or similar) action be run before the query action, to populate the values in this file with the ones from the configuration?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

aware this might be lily-gilding

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It could! Such an action could also use Jinja2, as the render_report action does. That said, whilst I wouldn't say the current implementation is beautiful, I also wouldn't say it needs further ornamentation.

To put it another way, I don't think the risks justify the effort. And if they did, or came close, then the effort should probably be invested in SQL Runner rather than in this study (opensafely-core/sqlrunner#72).

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that would be a much more sensible approach!

-- then also change their equivalents in analysis/config.py.
DECLARE @from_date DATE;
SET @from_date = DATEFROMPARTS(2016, 1, 1);

DECLARE @to_date DATE;
SET @to_date = CONVERT(DATE, GETDATE());

SELECT
'CodedEvent' AS table_name,
CONVERT(DATE, ConsultationDate) AS event_date,
COUNT(*) AS event_count
FROM CodedEvent
WHERE CONVERT(DATE, ConsultationDate) >= @from_date
WHERE
CONVERT(DATE, ConsultationDate) >= @from_date
AND CONVERT(DATE, ConsultationDate) <= @to_date
GROUP BY CONVERT(DATE, ConsultationDate)

UNION ALL
Expand All @@ -16,7 +23,9 @@ SELECT
CONVERT(DATE, SeenDate) AS event_date,
COUNT(*) AS event_count
FROM Appointment
WHERE CONVERT(DATE, SeenDate) >= @from_date
WHERE
CONVERT(DATE, SeenDate) >= @from_date
AND CONVERT(DATE, SeenDate) <= @to_date
GROUP BY CONVERT(DATE, SeenDate)

UNION ALL
Expand All @@ -26,7 +35,9 @@ SELECT
Admission_Date AS event_date,
COUNT(*) AS event_count
FROM APCS
WHERE Admission_Date >= @from_date
WHERE
Admission_Date >= @from_date
AND Admission_Date <= @to_date
GROUP BY Admission_Date

UNION ALL
Expand All @@ -36,7 +47,9 @@ SELECT
DateOfDeath AS event_date,
COUNT(*) AS event_count
FROM CPNS
WHERE DateOfDeath >= @from_date
WHERE
DateOfDeath >= @from_date
AND DateOfDeath <= @to_date
GROUP BY DateOfDeath

UNION ALL
Expand All @@ -46,7 +59,9 @@ SELECT
Arrival_Date AS event_date,
COUNT(*) AS event_count
FROM EC
WHERE Arrival_Date >= @from_date
WHERE
Arrival_Date >= @from_date
AND Arrival_Date <= @to_date
GROUP BY Arrival_Date

UNION ALL
Expand All @@ -56,7 +71,9 @@ SELECT
Appointment_Date AS event_date,
COUNT(*) AS event_count
FROM OPA
WHERE Appointment_Date >= @from_date
WHERE
Appointment_Date >= @from_date
AND Appointment_Date <= @to_date
GROUP BY Appointment_Date

UNION ALL
Expand All @@ -66,7 +83,9 @@ SELECT
CONVERT(DATE, IcuAdmissionDateTime) AS event_date,
COUNT(*) AS event_count
FROM ICNARC
WHERE CONVERT(DATE, IcuAdmissionDateTime) >= @from_date
WHERE
CONVERT(DATE, IcuAdmissionDateTime) >= @from_date
AND CONVERT(DATE, IcuAdmissionDateTime) <= @to_date
GROUP BY CONVERT(DATE, IcuAdmissionDateTime)

UNION ALL
Expand All @@ -76,7 +95,9 @@ SELECT
dod AS event_date,
COUNT(*) AS event_count
FROM ONS_Deaths
WHERE dod >= @from_date
WHERE
dod >= @from_date
AND dod <= @to_date
GROUP BY dod

UNION ALL
Expand All @@ -86,7 +107,9 @@ SELECT
Earliest_Specimen_Date AS event_date,
COUNT(*) AS event_count
FROM SGSS_Positive
WHERE Earliest_Specimen_Date >= @from_date
WHERE
Earliest_Specimen_Date >= @from_date
AND Earliest_Specimen_Date <= @to_date
GROUP BY Earliest_Specimen_Date

UNION ALL
Expand All @@ -96,7 +119,9 @@ SELECT
Earliest_Specimen_Date AS event_date,
COUNT(*) AS event_count
FROM SGSS_Negative
WHERE Earliest_Specimen_Date >= @from_date
WHERE
Earliest_Specimen_Date >= @from_date
AND Earliest_Specimen_Date <= @to_date
GROUP BY Earliest_Specimen_Date

UNION ALL
Expand All @@ -106,7 +131,9 @@ SELECT
Specimen_Date AS event_date,
COUNT(*) AS event_count
FROM SGSS_AllTests_Positive
WHERE Specimen_Date >= @from_date
WHERE
Specimen_Date >= @from_date
AND Specimen_Date <= @to_date
GROUP BY Specimen_Date

UNION ALL
Expand All @@ -116,7 +143,9 @@ SELECT
Specimen_Date AS event_date,
COUNT(*) AS event_count
FROM SGSS_AllTests_Negative
WHERE Specimen_Date >= @from_date
WHERE
Specimen_Date >= @from_date
AND Specimen_Date <= @to_date
GROUP BY Specimen_Date

ORDER BY table_name, event_date, event_count
12 changes: 9 additions & 3 deletions analysis/render_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from jinja2 import Environment, FileSystemLoader, StrictUndefined

from analysis import utils
from analysis import config, utils


ENVIRONMENT = Environment(
Expand All @@ -23,9 +23,15 @@ def main():
utils.makedirs(f_out.parent)
rendered_report = render_report(
{
# FIXME: I don't know what's special about 2009-01-01 (the name
# `tpp_epoch_date` is my best guess), so I asked on Slack. For more
# information, see:
# https://bennettoxford.slack.com/archives/C03FB777L1M/p1681721217659849
# It's passed as a template variable so that we can format it consistently
# with other template variables.
"tpp_epoch_date": datetime.date(2009, 1, 1),
"from_date": datetime.date(2016, 1, 1), # analysis/query.sql
"to_date": datetime.date.today(),
"from_date": config.FROM_DATE,
"to_date": config.TO_DATE,
"plots": sorted((utils.OUTPUT_DIR / "plot").glob("*.png")),
}
)
Expand Down
9 changes: 9 additions & 0 deletions tests/test_aggregate.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@
import pandas
import pytest

from analysis import aggregate


def test_read_with_unparsable_date(tmp_path):
    """read() should raise when the event_date column can't become a DatetimeIndex.

    A far-future date (presumably beyond pandas' representable datetime
    range — confirm against pandas.Timestamp.max) leaves the parsed column
    as strings, which the assertion inside read() is expected to reject.
    """
    csv_path = tmp_path / "rows.csv"
    header = "table_name,event_date,event_count"
    row = "APCS,9999-01-01,1"
    csv_path.write_text("\n".join([header, row]))
    with pytest.raises(AssertionError):
        aggregate.read(csv_path)


def make_series(event_dates):
index = pandas.MultiIndex.from_product(
(["table_1", "table_2"], pandas.to_datetime(event_dates)),
Expand Down