Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch to inclusion table for T1OO handling #1034

Merged
merged 8 commits into from
Aug 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 25 additions & 16 deletions cohortextractor/tpp_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,10 @@
SLEEP = 4
BACKOFF_FACTOR = 4

T1OO_TABLE = "PatientsWithTypeOneDissent"

# This table has its name for historical reasons, and reads slightly oddly: it should be
# interpreted as "allowed patients with regard to type one dissents"
ALLOWED_PATIENTS_TABLE = "AllowedPatientsWithTypeOneDissent"


class TPPBackend:
Expand Down Expand Up @@ -414,24 +417,30 @@ def get_queries(self, covariate_definitions):

wheres = [f'{output_columns["population"]} = 1']

def get_t1oo_exclude_expressions():
# If this query has been explictly flagged as including T1OO patients then
# return unmodified
if self.include_t1oo:
return [], []
# Otherwise we add an extra LEFT OUTER JOIN on the T1OO table and
# WHERE clause which will exclude any patient IDs found in the T1OO table
return (
[
f"LEFT OUTER JOIN {T1OO_TABLE} ON {T1OO_TABLE}.Patient_ID = {patient_id_expr}"
],
[f"{T1OO_TABLE}.Patient_ID IS null"],
if not self.include_t1oo:
# If this query has not been explictly flagged as including T1OO patients
# then we add an extra JOIN on the allowed patients table to ensure that we
# only include patients which exist in that table
#
# PLEASE NOTE: This logic is referenced in our public documentation, so if
# we make any changes here we should ensure that the documentation is kept
# up-to-date:
# https://github.com/opensafely/documentation/blob/ea2e1645/docs/type-one-opt-outs.md
#
# From Cohort Extractor's point of view, the construction of the "allowed
# patients" table is opaque. For discussion of the approach currently used
# to populate this see:
# https://docs.google.com/document/d/1nBAwDucDCeoNeC5IF58lHk6LT-RJg6YZRp5RRkI7HI8/
joins.append(
f"JOIN {ALLOWED_PATIENTS_TABLE} ON {ALLOWED_PATIENTS_TABLE}.Patient_ID = {patient_id_expr}",
)
# This condition is theoretically redundant given the RIGHT JOIN above, but
# it feels safer to be explicit here
wheres.append(
f"{ALLOWED_PATIENTS_TABLE}.Patient_ID IS NOT NULL",
)

t100_join, t1oo_where = get_t1oo_exclude_expressions()
joins.extend(t100_join)
joins_str = "\n ".join(joins)
wheres.extend(t1oo_where)
where_str = " AND ".join(wheres)

joined_output_query = f"""
Expand Down
12 changes: 6 additions & 6 deletions tests/test_logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,8 @@ def test_study_definition_initial_stats_logging(logger):
# study definition - population, event_date_1, event_min_date and event_date_2,
# which is defined as a parameter to event_min_date
# tables - Patient, temp event table for each codelist
# joins - Patient to each event table, and t1oo
{"output_column_count": 5, "table_count": 3, "table_joins_count": 3},
# joins - Patient to each event table
{"output_column_count": 5, "table_count": 3, "table_joins_count": 2},
# variable_count is a count of the top-level variables defined in the study def (i.e. not event_date_2)
{"variable_count": 4},
# 2 variables use a codelist (event_date_1, and the nested event_date_2)
Expand All @@ -147,8 +147,8 @@ def test_stats_logging_tpp_backend(logger):
expected_initial_study_def_logs = [
# output columns include patient_id, and the 2 variables (population, event)
# defined in the study defniiton
# tables - Patient, temp event table, t1oo for codelist
{"output_column_count": 3, "table_count": 2, "table_joins_count": 2},
# tables - Patient, temp event table
{"output_column_count": 3, "table_count": 2, "table_joins_count": 1},
{"variable_count": 2},
{"variables_using_codelist_count": 1},
{"variable_using_codelist": "event", "codelist_size": 1},
Expand Down Expand Up @@ -299,7 +299,7 @@ def test_stats_logging_generate_cohort(
expected_initial_study_def_logs = [
# these 3 are logged from StudyDefinition instantiation
# patient_id, population, sex - all from patient table, but we make one temp # table per variable
{"output_column_count": 3, "table_count": 2, "table_joins_count": 2},
{"output_column_count": 3, "table_count": 2, "table_joins_count": 1},
{"variable_count": 2}, # population, sex
{"variables_using_codelist_count": 0},
# index_date_count logged from generate_cohort
Expand Down Expand Up @@ -451,7 +451,7 @@ def test_stats_logging_generate_cohort_with_index_dates(
{"index_date_count": 3},
{"min_index_date": "2020-01-01", "max_index_date": "2020-03-01"},
# output_column/table/joins_count is logged in tpp_backend on backend instantiation so it's repeated for each index date
*[{"output_column_count": 3, "table_count": 2, "table_joins_count": 2}] * 4,
*[{"output_column_count": 3, "table_count": 2, "table_joins_count": 1}] * 4,
*[
{"resetting_backend_index_date": ix_date}
for ix_date in expected_index_dates
Expand Down
69 changes: 44 additions & 25 deletions tests/test_tpp_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
ICNARC,
OPA,
UKRR,
AllowedPatientsWithTypeOneDissent,
APCS_Der,
Appointment,
BuildProgress,
Expand All @@ -54,7 +55,6 @@
Organisation,
Patient,
PatientAddress,
PatientsWithTypeOneDissent,
PotentialCareHomeAddress,
RegistrationHistory,
SGSS_AllTests_Negative,
Expand All @@ -73,20 +73,29 @@

@pytest.fixture(autouse=True)
def set_database_url(monkeypatch):
# The StudyDefinition code expects a single DATABASE_URL to tell it where
# to connect to, but the test environment needs to supply multiple
# connections (one for each backend type) so we copy the value in here
# The StudyDefinition code expects a single DATABASE_URL to tell it where to connect
# to, but the test environment needs to supply multiple connections (one for each
# backend type) so we copy the value in here. We append the `include_t1oo` parameter
# because in general test fixtures do not have sufficient registration history to
# pass the T1OO filter.
if "TPP_DATABASE_URL" in os.environ:
monkeypatch.setenv("DATABASE_URL", os.environ["TPP_DATABASE_URL"])
monkeypatch.setenv(
"DATABASE_URL",
f'{os.environ["TPP_DATABASE_URL"]}?opensafely_include_t1oo=true',
)


@pytest.fixture
def set_database_url_with_t1oo(monkeypatch):
def set_db_url(t1oo_value):
if "TPP_DATABASE_URL" in os.environ:
if t1oo_value is not None:
query_string = f"?opensafely_include_t1oo={t1oo_value}"
else:
query_string = ""
monkeypatch.setenv(
"DATABASE_URL",
f'{os.environ["TPP_DATABASE_URL"]}?opensafely_include_t1oo={t1oo_value}',
f'{os.environ["TPP_DATABASE_URL"]}{query_string}',
)

return set_db_url
Expand Down Expand Up @@ -141,7 +150,7 @@ def setup_function(function):
session.query(UKRR).delete()
session.query(Patient).delete()
session.query(BuildProgress).delete()
session.query(PatientsWithTypeOneDissent).delete()
session.query(AllowedPatientsWithTypeOneDissent).delete()

session.commit()

Expand Down Expand Up @@ -248,13 +257,14 @@ def test_minimal_study_with_reserved_keywords():
assert_results(study.to_dicts(), all=["M", "F"], asc=["40", "55"])


def test_minimal_study_with_t1oo_default():
def test_minimal_study_with_t1oo_default(set_database_url_with_t1oo):
set_database_url_with_t1oo(None)
# Test that type 1 opt-outs are excluded by default
session = make_session()
patient_1 = Patient(Patient_ID=1, DateOfBirth="1980-01-01", Sex="M")
patient_2 = Patient(Patient_ID=2, DateOfBirth="1965-01-01", Sex="F")
t1oo_1 = PatientsWithTypeOneDissent(Patient_ID=1)
session.add_all([patient_1, patient_2, t1oo_1])
allowed_2 = AllowedPatientsWithTypeOneDissent(Patient_ID=2)
session.add_all([patient_1, patient_2, allowed_2])
session.commit()
study = StudyDefinition(
population=patients.all(),
Expand All @@ -268,29 +278,38 @@ def test_minimal_study_with_t1oo_default():
@pytest.mark.parametrize(
"flag,expected",
[
("", ["1", "4"]),
("False", ["1", "4"]),
("false", ["1", "4"]),
("1", ["1", "4"]),
("True", ["1", "2", "3", "4"]),
("true", ["1", "2", "3", "4"]),
],
)
def test_minimal_study_with_t1oo_flag(set_database_url_with_t1oo, flag, expected):
@pytest.mark.parametrize("simple_population", [True, False])
def test_minimal_study_with_t1oo_flag(
set_database_url_with_t1oo, flag, expected, simple_population
):
set_database_url_with_t1oo(flag)
# Test that type 1 opt-outs are only included if flag is explicitly set to "True"
fixtures = [
Patient(Patient_ID=1, Sex="M"),
Patient(Patient_ID=2, Sex="F"),
Patient(Patient_ID=3, Sex="M"),
Patient(Patient_ID=4, Sex="F"),
AllowedPatientsWithTypeOneDissent(Patient_ID=1),
AllowedPatientsWithTypeOneDissent(Patient_ID=4),
]
session = make_session()
patient_1 = Patient(Patient_ID=1, DateOfBirth="1980-01-01", Sex="M")
patient_2 = Patient(Patient_ID=2, DateOfBirth="1965-01-01", Sex="F")
patient_3 = Patient(Patient_ID=3, DateOfBirth="1975-01-01", Sex="F")
patient_4 = Patient(Patient_ID=4, DateOfBirth="1985-01-01", Sex="F")
t1oo_2 = PatientsWithTypeOneDissent(Patient_ID=2)
t1oo_3 = PatientsWithTypeOneDissent(Patient_ID=3)
session.add_all([patient_1, patient_2, patient_3, patient_4, t1oo_2, t1oo_3])
session.add_all(fixtures)
session.commit()
study = StudyDefinition(
population=patients.all(),
)
if simple_population:
study = StudyDefinition(
population=patients.all(),
)
else:
study = StudyDefinition(
population=patients.satisfying(
"sex = 'M' OR sex = 'F'",
sex=patients.sex(),
),
)
assert_results(study.to_dicts(), patient_id=expected)


Expand Down
8 changes: 8 additions & 0 deletions tests/tpp_backend_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -1063,3 +1063,11 @@ class PatientsWithTypeOneDissent(Base):
# Patient_ID might be the primary key, TBC
pk = Column(Integer, primary_key=True)
Patient_ID = Column(types.BIGINT)


# Temporary placeholder name until we confirm the real one
class AllowedPatientsWithTypeOneDissent(Base):
__tablename__ = "AllowedPatientsWithTypeOneDissent"
# fake pk to satisfy the ORM
pk = Column(Integer, primary_key=True)
Patient_ID = Column(types.BIGINT)
Loading