Skip to content

Commit

Permalink
Merge pull request #14 from genes-and-health/improve-tools
Browse files Browse the repository at this point in the history
Improve tools
  • Loading branch information
CarolineMorton authored Jan 24, 2024
2 parents 1a170a7 + afab20d commit 625c40f
Show file tree
Hide file tree
Showing 24 changed files with 408 additions and 74 deletions.
3 changes: 3 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
include tretools/datasets/configs/NHS_D/apc.json
include tretools/datasets/configs/NHS_D/op.json
include tretools/datasets/configs/NHS_D/civ_reg.json
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
name="tretools",
version=version,
packages=find_namespace_packages(exclude=["tests"]),
include_package_data=False,
include_package_data=True,
url="https://github.com/genes-and-health/tre-tools",
description="Tools for working with the Genes and Health Trusted Research Environment",
author="Caroline Morton",
Expand Down
6 changes: 3 additions & 3 deletions tests/codelists/test_codelist.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,9 @@ def test_bad_opcs_wrong_format():
validation_check = Codelist.validate_opcs_code("A010.1")
assert validation_check == False

# Fourth character is a dot if present
validation_check = Codelist.validate_opcs_code("A01A")
assert validation_check == False
# Fourth character is a dot or a number if present
validation_check = Codelist.validate_opcs_code("A011")
assert validation_check == True

# Do not need a dot
validation_check = Codelist.validate_opcs_code("A01")
Expand Down
9 changes: 5 additions & 4 deletions tests/datasets/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,9 @@ def test_processed_log():
ingested_data = Dataset(path="tests/test_data/primary_care/procedures_many_diffs.csv", dataset_type="primary_care", coding_system="SNOMED")
ingested_data.log.append("test log")

assert ingested_data.log == ["test log"]
assert len(ingested_data.log) == 1
assert "Loaded data from tests/test_data/primary_care/procedures_many_diffs.csv using separator ',' with these values as null:" in ingested_data.log[0]
assert ingested_data.log[1] == "test log"
assert len(ingested_data.log) == 2


def test_log_gets_written():
Expand All @@ -51,7 +52,7 @@ def test_log_gets_written():

# check first line and how many lines in the log - should be 4 (including the empty line)
assert "first test log" in log
assert len(log.split("\n")) == 3
assert len(log.split("\n")) == 4

# write the log again but this time append
ingested_data.write_to_log("tests/test_data/primary_care/test_log.txt", overwrite_or_append="append")
Expand All @@ -60,7 +61,7 @@ def test_log_gets_written():
with open("tests/test_data/primary_care/test_log.txt", "r") as f:
log = f.read()

assert len(log.split("\n")) == 5
assert len(log.split("\n")) == 7

# delete the log file
os.remove("tests/test_data/primary_care/test_log.txt")
Expand Down
16 changes: 11 additions & 5 deletions tests/datasets/test_demograhics_dataset.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from tretools.datasets.demographic_dataset import DemographicDataset


import pytest
from datetime import datetime

DEMOGRAPHIC_MAPPING_FILE = "tests/test_data/demographics/mapping.txt"
Expand All @@ -19,26 +19,32 @@


def test_demographic_dataset():
data = DemographicDataset(DEMOGRAPHIC_MAPPING_FILE, DEMOGRAPHIC_FILE)
data = DemographicDataset(path_to_mapping_file=DEMOGRAPHIC_MAPPING_FILE, path_to_demographic_file=DEMOGRAPHIC_FILE)

assert data.mapped_data.shape == (3, 4)
assert data.demographics.shape == (3, 11)


def test_demographic_dataset_process():
data = DemographicDataset(DEMOGRAPHIC_MAPPING_FILE, DEMOGRAPHIC_FILE)
data = DemographicDataset(path_to_mapping_file=DEMOGRAPHIC_MAPPING_FILE, path_to_demographic_file=DEMOGRAPHIC_FILE)
data.process_dataset(MAPPING_CONFIG)

assert data.data.shape == (3, 3)


def test_demographic_dataset_process_round_to_day():
data = DemographicDataset(DEMOGRAPHIC_MAPPING_FILE, DEMOGRAPHIC_FILE)
data = DemographicDataset(path_to_mapping_file=DEMOGRAPHIC_MAPPING_FILE, path_to_demographic_file=DEMOGRAPHIC_FILE)
data.process_dataset(MAPPING_CONFIG, 9)

expected_dob = ["1983-10-09", "1979-01-09", "1948-06-09"]
expected_dob_as_dt = [datetime.strptime(x, "%Y-%m-%d").date() for x in expected_dob]

assert data.data["dob"].to_list() == expected_dob_as_dt



def test_cannot_process_if_already_data():
    """Processing is refused for a dataset built from an already-processed file."""
    already_processed = DemographicDataset(path="tests/test_data/demographics/processed.csv")

    with pytest.raises(ValueError) as raised:
        already_processed.process_dataset(MAPPING_CONFIG)

    expected_message = "This method should not be called if demographic data is already loaded."
    assert str(raised.value) == expected_message
100 changes: 99 additions & 1 deletion tests/datasets/test_processed_dataset.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import pytest
import datetime
import polars as pl

from tretools.datasets.processed_dataset import ProcessedDataset
from tretools.datasets.errors import DeduplicationError
from tretools.datasets.demographic_dataset import DemographicDataset
from tretools.codelists.codelist_types import CodelistType
from tretools.datasets.errors import DeduplicationError, CodeNotMappable

def test_load_processed_dataset():
observed_dataset = ProcessedDataset(path="tests/test_data/primary_care/processed_data.csv", dataset_type="primary_care", coding_system="SNOMED")
Expand Down Expand Up @@ -89,3 +92,98 @@ def test_dedupe_with_date_limit():
data = dedup.data
specific_row = data.filter((data["nhs_number"].eq("84950DE0614A5C241F7223FBCCD27BE87DB61915972C7E49EDF519B72A3A104A")) & (data["code"].eq("100000001")))
assert specific_row["date"][0] == "2018-11-05"


def test_mapped_to_icd():
    """Map a SNOMED-coded dataset to ICD10 and check the remaining rows and codes."""
    snomed_dataset = ProcessedDataset(
        path="tests/test_data/primary_care/processed_data.csv",
        dataset_type="primary_care",
        coding_system=CodelistType.SNOMED.value,
    )
    icd_dataset = snomed_dataset.map_snomed_to_icd(
        mapping_file="tests/test_data/mapping_files/snomed_icd_map.csv"
    )

    # The source data holds 7 rows: 3 x 100000001, 1 x 100000002 and 3 x 200000001.
    # The mapping file only covers 100000001 and 100000002, so 4 rows should remain.
    assert icd_dataset.data.shape == (4, 3)

    patient_id = "84950DE0614A5C241F7223FBCCD27BE87DB61915972C7E49EDF519B72A3A104A"
    patient_rows = icd_dataset.data.filter(icd_dataset.data["nhs_number"].eq(patient_id))
    patient_rows = patient_rows.sort(pl.col("date"))

    # Codes of this patient, in date order.
    for position, expected_code in enumerate(("A010", "A010", "A020")):
        assert patient_rows['code'][position] == expected_code


def test_mapped_to_icd_with_specific_col():
    """Mapping to ICD10 is rejected when the dataset is not SNOMED-coded."""
    # Build the dataset outside the ``raises`` block so that only the mapping
    # call itself can satisfy the expected exception.
    observed_dataset = ProcessedDataset(
        path="tests/test_data/primary_care/processed_data.csv",
        dataset_type="primary_care",
        coding_system=CodelistType.ICD10.value,
    )

    with pytest.raises(CodeNotMappable) as e:
        observed_dataset.map_snomed_to_icd(
            mapping_file="tests/test_data/mapping_files/snomed_icd_map.csv",
            snomed_col="conceptID",
            icd_col="mapTarget",
        )

    # Checked after the block has exited; a statement placed after the raising
    # call inside the block would never run.
    assert "Coding system must be SNOMED for mapping to ICD10" in str(e.value)

def test_mapped_to_icd_logs():
    """Check the log entries present after mapping a dataset loaded with a log file."""
    source_dataset = ProcessedDataset(
        path="tests/test_data/primary_care/processed_data.csv",
        dataset_type="primary_care",
        coding_system=CodelistType.SNOMED.value,
        log_path="tests/test_data/primary_care/processed_data_log.txt",
    )
    mapped_log = source_dataset.map_snomed_to_icd(
        mapping_file="tests/test_data/mapping_files/snomed_icd_map.csv"
    ).log

    # 11 entries are expected in total; entries 7 to 9 are checked against the
    # messages of the mapping step.
    assert len(mapped_log) == 11
    expected_fragments = {
        7: "Loading mapping file from tests/test_data/mapping_files/snomed_icd_map.csv",
        8: "Pre-mapping dataset has 7 rows",
        9: "Post-mapping dataset has 4 rows",
    }
    for index, fragment in expected_fragments.items():
        assert fragment in mapped_log[index]



def test_truncate_icd_to_3_digits():
    """Truncate ICD10 codes and check the row count and the resulting codes."""
    icd_dataset = ProcessedDataset(
        path="tests/test_data/barts_health/diagnosis.csv",
        dataset_type="secondary_care",
        coding_system=CodelistType.ICD10.value,
    )
    truncated = icd_dataset.truncate_icd_to_3_digits()

    # The original dataset has 10 rows and the truncated one should keep all 10.
    assert truncated.data.shape == (10, 3)

    # The first patient has two A01-family codes (A01 and A01X); after
    # truncation both should read A01.
    patient_id = "84950DE0614A5C241F7223FBCCD27BE87DB61915972C7E49EDF519B72A3A104A"
    patient_rows = truncated.data.filter(
        truncated.data["nhs_number"].eq(patient_id)
    ).sort(pl.col("date"), pl.col("code"))

    assert patient_rows['code'][0] == "A01"
    assert patient_rows['code'][1] == "A01"


def test_truncate_icd_to_3_digits_logs():
    """Check the log entries present after truncating a dataset loaded with a log file."""
    icd_dataset = ProcessedDataset(
        path="tests/test_data/barts_health/diagnosis.csv",
        dataset_type="secondary_care",
        coding_system=CodelistType.ICD10.value,
        log_path="tests/test_data/barts_health/diagnosis_log.txt",
    )
    log_entries = icd_dataset.truncate_icd_to_3_digits().log

    # 7 entries from the original dataset log, 1 from loading and 3 written by
    # the truncation function: 11 in total.
    assert len(log_entries) == 11
    assert "Post-truncation dataset has 10 rows" in log_entries[9]


def test_truncate_icd_to_3_digits_wrong_coding_system():
    """Truncation to 3 digits is rejected when the dataset is not ICD10-coded."""
    # Build the dataset outside the ``raises`` block so that only the
    # truncation call itself can satisfy the expected exception.
    observed_dataset = ProcessedDataset(
        path="tests/test_data/primary_care/processed_data.csv",
        dataset_type="primary_care",
        coding_system=CodelistType.SNOMED.value,
    )

    with pytest.raises(CodeNotMappable) as e:
        observed_dataset.truncate_icd_to_3_digits()

    # Checked after the block has exited; a statement placed after the raising
    # call inside the block would never run.
    assert "Coding system must be ICD10 for truncating to 3 digits" in str(e.value)

def test_removes_unrealistic_data():
    """Remove unrealistic dates without a birth-date check and inspect what is left."""
    source_dataset = ProcessedDataset(
        path="tests/test_data/primary_care/procedures_with_unrealistic_values.csv",
        dataset_type="primary_care",
        coding_system="SNOMED",
    )
    remaining = source_dataset.remove_unrealistic_dates(before_born=False).data

    # exactly one row is expected to remain after cleaning
    assert remaining.shape == (1, 3)

    # and that row carries the date 1910-01-01
    assert remaining["date"][0] == "1910-01-01"


def test_removes_unrealistic_data_before_born():
    """With the birth-date check enabled, no row of the fixture is expected to remain."""
    demographics = DemographicDataset(path="tests/test_data/demographics/processed.csv")
    source_dataset = ProcessedDataset(
        path="tests/test_data/primary_care/procedures_with_unrealistic_values.csv",
        dataset_type="primary_care",
        coding_system="SNOMED",
    )

    cleaned = source_dataset.remove_unrealistic_dates(
        before_born=True,
        demographic_dataset=demographics,
    )

    # zero rows, three columns
    assert cleaned.data.shape == (0, 3)



def test_removes_unrealistic_data_before_born_but_no_data():
    """Requesting the birth-date check without a demographic dataset raises ValueError."""
    source_dataset = ProcessedDataset(
        path="tests/test_data/primary_care/procedures_with_unrealistic_values.csv",
        dataset_type="primary_care",
        coding_system="SNOMED",
    )

    with pytest.raises(ValueError) as raised:
        source_dataset.remove_unrealistic_dates(before_born=True)

    expected_fragment = "A demographic dataset must be provided if before_born is True"
    assert expected_fragment in str(raised.value)
29 changes: 18 additions & 11 deletions tests/datasets/test_raw_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from tretools.datasets.raw_dataset import RawDataset
from tretools.datasets.errors import ColumnsValidationError, DeduplicationError
from tretools.datasets.dataset_enums.dataset_types import DatasetType



Expand Down Expand Up @@ -31,15 +32,6 @@ def test_check_col_validation_on_load():
assert observed_dataset.column_validation == True


def test_check_col_validation_second_time():
observed_dataset = RawDataset(path="tests/test_data/primary_care/processed_data.csv", dataset_type="primary_care", coding_system="SNOMED")

with pytest.raises(ColumnsValidationError) as e:
observed_dataset._standarise_column_names({"original_code": "code", "clinical_effective_date": "date", "pseudo_nhs_number": "nhs_number"})

assert "Column names have already been validated" in str(e.value)


def test__standarise_column_names():
observed_dataset = RawDataset(path="tests/test_data/primary_care/good_procedures_no_extra_cols.csv", dataset_type="primary_care", coding_system="SNOMED")
assert observed_dataset.column_validation == False
Expand Down Expand Up @@ -152,7 +144,7 @@ def test__drop_all_null_rows():
observed_dataset._drop_all_null_rows()

# check shape of data
assert observed_dataset.data.shape == (5, 4)
assert observed_dataset.data.shape == (3, 4)


def test__deduplicate():
Expand Down Expand Up @@ -245,7 +237,7 @@ def test_with_nhs_digital_with_incorrect_type():


def test_with_nhs_digital_with_process_dataset():
raw_data = RawDataset(path="tests/test_data/nhs_digital/civreg.txt", dataset_type="nhs_digital", coding_system="ICD10")
raw_data = RawDataset(path="tests/test_data/nhs_digital/civreg.txt", dataset_type=DatasetType.NHS_DIGITAL.value, coding_system="ICD10")

processed_data = raw_data.process_dataset(deduplication_options=["nhs_number", "code", "date"], nhs_digital_subtype="CIV_REG")

Expand Down Expand Up @@ -313,3 +305,18 @@ def test_with_nhs_digital_with_process_dataset_with_op():
assert third_patient_data['date'][0] == correct_date
assert third_patient_data['date'][1] == correct_date
assert third_patient_data['date'][2] == correct_date


def test_accepts_scientific_notation_values():
    """Load a file whose code column contains a value written in scientific notation."""
    dataset = RawDataset(
        path="tests/test_data/primary_care/procedures_with_scientific_notation.csv",
        dataset_type="primary_care",
        coding_system="SNOMED",
    )

    # every row is kept on load
    assert dataset.data.shape == (9, 5)

    # the original code column is read as int64
    assert dataset.data["original_code"].dtype == pl.Int64

    # 882784691000119e3 should come out as 882784691000119040; sorting in
    # descending order puts that value first
    dataset.data = dataset.data.sort('original_code', descending=True)
    largest_code = dataset.data["original_code"][0]
    assert largest_code == 882784691000119040

4 changes: 2 additions & 2 deletions tests/phenotype_report/test_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ def test_report_with_demographics():
primary_care = ProcessedDataset(PRIMARY_CARE_DATASET, "primary_care", "SNOMED")

# load the demographic data
demographic_data = DemographicDataset(DEMOGRAPHIC_MAPPING_FILE, DEMOGRAPHIC_FILE)
demographic_data = DemographicDataset(path_to_mapping_file=DEMOGRAPHIC_MAPPING_FILE, path_to_demographic_file=DEMOGRAPHIC_FILE)
demographic_data.process_dataset(MAPPING_CONFIG)

report = PhenotypeReport("Disease A")
Expand Down Expand Up @@ -210,7 +210,7 @@ def test_report_with_demographics():
# Now we are changing the rounding to the 1st of the month, therefore the patient with nhs number
# 84950DE0614A5C241F7223FBCCD27BE87DB61915972C7E49EDF519B72A3A104A should have an age of 35 as the
# patient was born on 1st Oct 1983 and had an event on 5th Oct 2018.
demographic_data = DemographicDataset(DEMOGRAPHIC_MAPPING_FILE, DEMOGRAPHIC_FILE)
demographic_data = DemographicDataset(path_to_mapping_file=DEMOGRAPHIC_MAPPING_FILE, path_to_demographic_file=DEMOGRAPHIC_FILE)
demographic_data.process_dataset(MAPPING_CONFIG, round_to_day_in_month=1)

report = PhenotypeReport("Disease A")
Expand Down
4 changes: 2 additions & 2 deletions tests/report_transformers/test_regenie_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import os


MAPPING_PATH = "tests/test_data/mapping_files/mapping_file.csv"
MAPPING_PATH = "tests/test_data/mapping_files/regenie_mapping_file.csv"
MAPPING_CONFIG = {"Pseudonhs_2023-11-08_uniq": "nhs_number",
"40028exomes_release_2023-JUL-07": "broad_id",
"51176GSA_Oct2023release": "gsa_id"
Expand All @@ -31,7 +31,7 @@ def test_summary_report_transformer():
assert set(first_mapping["gsa_id"]) == {"15001987654321_123456789012_R01C01"}

# make the summary report
result = regenie_reporter.transform()
result = regenie_reporter.transform("tests/report_transformers/regenie_reports")
first_phenotype = result[1]
assert set(first_phenotype['FID']) == {1}

Expand Down
2 changes: 1 addition & 1 deletion tests/report_transformers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def make_phenotype_reports_for_testing():
icd_codelist = Codelist(ICD_CODELIST, CodelistType.ICD10.value)

# load the demographic data
demographic_data = DemographicDataset(DEMOGRAPHIC_MAPPING_FILE, DEMOGRAPHIC_FILE)
demographic_data = DemographicDataset(path_to_mapping_file=DEMOGRAPHIC_MAPPING_FILE, path_to_demographic_file=DEMOGRAPHIC_FILE)
demographic_data.process_dataset(MAPPING_CONFIG)

# Make a list of PhenotypeReports
Expand Down
4 changes: 4 additions & 0 deletions tests/test_data/demographics/processed.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
nhs_number,gender,dob
84950DE0614A5C241F7223FBCCD27BE87DB61915972C7E49EDF519B72A3A104A,2,1983-10-15
73951AB0712D6E241E8222EDCCF28AE86DA72814078D6F48ECE512C91B5B104B,1,1979-01-15
53952EF0503F7F341D9121DBCCC39DE95EA83713167E5E57EDB613A60D4C104C,1,1948-06-15
3 changes: 3 additions & 0 deletions tests/test_data/mapping_files/snomed_icd_map.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
conceptID,mapTarget,ICD10_3digit
100000001,A010,A01
100000002,A020,A02
6 changes: 3 additions & 3 deletions tests/test_data/primary_care/good_procedures_with_nulls.csv
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
"pseudo_nhs_number","clinical_effective_date","original_code","original_term"
"84950DE0614A5C241F7223FBCCD27BE87DB61915972C7E49EDF519B72A3A104A","2018-10-05","100000001","Disease A - 1"
"84950DE0614A5C241F7223FBCCD27BE87DB61915972C7E49EDF519B72A3A104A","2018-11-15","100000001","Disease A - 1"
"84950DE0614A5C241F7223FBCCD27BE87DB61915972C7E49EDF519B72A3A104A","2019-02-12",,"Disease A - 2"
"84950DE0614A5C241F7223FBCCD27BE87DB61915972C7E49EDF519B72A3A104A","2019-02-12","NA","Disease A - 2"
"84950DE0614A5C241F7223FBCCD27BE87DB61915972C7E49EDF519B72A3A104A","","200000001","Disease B - 1"
"73951AB0712D6E241E8222EDCCF28AE86DA72814078D6F48ECE512C91B5B104B","2012-11-21","NULL","Disease A - 1"
"73951AB0712D6E241E8222EDCCF28AE86DA72814078D6F48ECE512C91B5B104B",,"100000001","Disease A - 1"
"53952EF0503F7F341D9121DBCCC39DE95EA83713167E5E57EDB613A60D4C104C","2016-07-19","200000001","Disease B - 1"
"53952EF0503F7F341D9121DBCCC39DE95EA83713167E5E57EDB613A60D4C104C","2016-08-20","200000001","Disease B - 1"
"44966CC0716B4C241F8223EDBCF77AE87DB71814979C7D47EDE416B81D4A104B","2015-10-19","100000002","Disease A - 2"
"53952EF0503F7F341D9121DBCCC39DE95EA83713167E5E57EDB613A60D4C104C","2016-08-20",".","Disease B - 1"
"44966CC0716B4C241F8223EDBCF77AE87DB71814979C7D47EDE416B81D4A104B","2015-10-19"," ","Disease A - 2"
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
"pseudo_nhs_number","id","clinical_effective_date","original_code","original_term"
"84950DE0614A5C241F7223FBCCD27BE87DB61915972C7E49EDF519B72A3A104A","6144854","2018-10-05","100000001","Disease A - 1"
"84950DE0614A5C241F7223FBCCD27BE87DB61915972C7E49EDF519B72A3A104A","6144858","2018-11-15","100000001","Disease A - 1"
"84950DE0614A5C241F7223FBCCD27BE87DB61915972C7E49EDF519B72A3A104A","6144859","2019-02-12","100000002","Disease A - 2"
"84950DE0614A5C241F7223FBCCD27BE87DB61915972C7E49EDF519B72A3A104A","6144860","2020-05-22","200000001","Disease B - 1"
"73951AB0712D6E241E8222EDCCF28AE86DA72814078D6F48ECE512C91B5B104B","6144855","2012-11-21","100000001","Disease A - 1"
"73951AB0712D6E241E8222EDCCF28AE86DA72814078D6F48ECE512C91B5B104B","6144861","2013-06-03","100000001","Disease A - 1"
"53952EF0503F7F341D9121DBCCC39DE95EA83713167E5E57EDB613A60D4C104C","6144856","2016-07-19","200000001","Disease B - 1"
"53952EF0503F7F341D9121DBCCC39DE95EA83713167E5E57EDB613A60D4C104C","6144862","2016-08-20","200000001","Disease B - 1"
"44966CC0716B4C241F8223EDBCF77AE87DB71814979C7D47EDE416B81D4A104B","6144857","2015-10-19","882784691000119e3","Disease A - 2"
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
nhs_number,code,date
73951AB0712D6E241E8222EDCCF28AE86DA72814078D6F48ECE512C91B5B104B,200000001,2030-01-01
73951AB0712D6E241E8222EDCCF28AE86DA72814078D6F48ECE512C91B5B104B,100000002,1900-01-10
73951AB0712D6E241E8222EDCCF28AE86DA72814078D6F48ECE512C91B5B104B,100000001,1910-01-01
2 changes: 1 addition & 1 deletion tretools/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.0.1
0.1.0
8 changes: 3 additions & 5 deletions tretools/codelists/codelist.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,12 +185,10 @@ def validate_opcs_code(code: str) -> bool:
- The code must be 3-5 characters long
- The first character must be a letter
- The second and third characters must be numbers
- The fourth character if present is a dot
- The fifth character if present is a number
- The fifth character must be present if the fourth character is a dot
- If there is a fourth character and it is a dot, there must be a number after the dot
- The fifth character, if present, is a number
"""
pattern = re.compile(r"^[A-Z]\d{2}(\.\d{1,2})?$")

pattern = re.compile(r"^[A-Z]\d{2}(\.\d{1,2}|\d{1,2})?$")
if len(code) > 5:
return False

Expand Down
Loading

0 comments on commit 625c40f

Please sign in to comment.