Merge pull request #14 from genes-and-health/improve-tools

Improve tools
genes-and-health · Jan 24, 2024 · 625c40f · 625c40f
2 parents 1a170a7 + afab20d
commit 625c40f
Show file tree

Hide file tree

Showing 24 changed files with 408 additions and 74 deletions.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,3 @@
+include tretools/datasets/configs/NHS_D/apc.json
+include tretools/datasets/configs/NHS_D/op.json
+include tretools/datasets/configs/NHS_D/civ_reg.json
diff --git a/setup.py b/setup.py
@@ -9,7 +9,7 @@
     name="tretools",
     version=version,
     packages=find_namespace_packages(exclude=["tests"]),
-    include_package_data=False,
+    include_package_data=True,
     url="https://github.com/genes-and-health/tre-tools",
     description="Tools for working with the Genes and Health Trusted Research Environment",
     author="Caroline Morton",

diff --git a/tests/codelists/test_codelist.py b/tests/codelists/test_codelist.py
@@ -112,9 +112,9 @@ def test_bad_opcs_wrong_format():
     validation_check = Codelist.validate_opcs_code("A010.1")
     assert validation_check == False
 
-    # Fourth character is a dot if present
-    validation_check = Codelist.validate_opcs_code("A01A")
-    assert validation_check == False
+    # Fourth character is a dot or a number if present
+    validation_check = Codelist.validate_opcs_code("A011")
+    assert validation_check == True
 
     # Do not need a dot
     validation_check = Codelist.validate_opcs_code("A01")

diff --git a/tests/datasets/test_base.py b/tests/datasets/test_base.py
@@ -33,8 +33,9 @@ def test_processed_log():
     ingested_data = Dataset(path="tests/test_data/primary_care/procedures_many_diffs.csv", dataset_type="primary_care", coding_system="SNOMED")
     ingested_data.log.append("test log")
 
-    assert ingested_data.log == ["test log"]
-    assert len(ingested_data.log) == 1
+    assert "Loaded data from tests/test_data/primary_care/procedures_many_diffs.csv using separator ',' with these values as null:" in ingested_data.log[0]
+    assert ingested_data.log[1] == "test log"
+    assert len(ingested_data.log) == 2
 
 
 def test_log_gets_written():
@@ -51,7 +52,7 @@ def test_log_gets_written():
 
     # check first line and how many lines in the log - should be 7 (6 logs and empty line)
     assert "first test log" in log
-    assert len(log.split("\n")) == 3
+    assert len(log.split("\n")) == 4
 
     # write the log again but this time append
     ingested_data.write_to_log("tests/test_data/primary_care/test_log.txt", overwrite_or_append="append")
@@ -60,7 +61,7 @@ def test_log_gets_written():
     with open("tests/test_data/primary_care/test_log.txt", "r") as f:
         log = f.read()
 
-    assert len(log.split("\n")) == 5
+    assert len(log.split("\n")) == 7
 
     # delete the log file
     os.remove("tests/test_data/primary_care/test_log.txt")

diff --git a/tests/datasets/test_demograhics_dataset.py b/tests/datasets/test_demograhics_dataset.py
@@ -1,6 +1,6 @@
 from tretools.datasets.demographic_dataset import DemographicDataset
 
-
+import pytest
 from datetime import datetime
 
 DEMOGRAPHIC_MAPPING_FILE = "tests/test_data/demographics/mapping.txt"
@@ -19,26 +19,32 @@
 
 
 def test_demographic_dataset():
-    data = DemographicDataset(DEMOGRAPHIC_MAPPING_FILE, DEMOGRAPHIC_FILE)
+    data = DemographicDataset(path_to_mapping_file=DEMOGRAPHIC_MAPPING_FILE, path_to_demographic_file=DEMOGRAPHIC_FILE)
 
     assert data.mapped_data.shape == (3, 4)
     assert data.demographics.shape == (3, 11)
 
 
 def test_demographic_dataset_process():
-    data = DemographicDataset(DEMOGRAPHIC_MAPPING_FILE, DEMOGRAPHIC_FILE)
+    data = DemographicDataset(path_to_mapping_file=DEMOGRAPHIC_MAPPING_FILE, path_to_demographic_file=DEMOGRAPHIC_FILE)
     data.process_dataset(MAPPING_CONFIG)
 
     assert data.data.shape == (3, 3)
 
 
 def test_demographic_dataset_process_round_to_day():
-    data = DemographicDataset(DEMOGRAPHIC_MAPPING_FILE, DEMOGRAPHIC_FILE)
+    data = DemographicDataset(path_to_mapping_file=DEMOGRAPHIC_MAPPING_FILE, path_to_demographic_file=DEMOGRAPHIC_FILE)
     data.process_dataset(MAPPING_CONFIG, 9)
 
     expected_dob = ["1983-10-09", "1979-01-09", "1948-06-09"]
     expected_dob_as_dt = [datetime.strptime(x, "%Y-%m-%d").date() for x in expected_dob]
 
     assert data.data["dob"].to_list() == expected_dob_as_dt
 
-
+
+def test_cannot_process_if_already_data():
+    data = DemographicDataset(path="tests/test_data/demographics/processed.csv")
+    with pytest.raises(ValueError) as e:
+        data.process_dataset(MAPPING_CONFIG)
+
+    assert str(e.value) == "This method should not be called if demographic data is already loaded."
diff --git a/tests/datasets/test_processed_dataset.py b/tests/datasets/test_processed_dataset.py
@@ -1,8 +1,11 @@
 import pytest
 import datetime
+import polars as pl
 
 from tretools.datasets.processed_dataset import ProcessedDataset
-from tretools.datasets.errors import DeduplicationError
+from tretools.datasets.demographic_dataset import DemographicDataset
+from tretools.codelists.codelist_types import CodelistType
+from tretools.datasets.errors import DeduplicationError, CodeNotMappable
 
 def test_load_processed_dataset():
     observed_dataset = ProcessedDataset(path="tests/test_data/primary_care/processed_data.csv", dataset_type="primary_care", coding_system="SNOMED")
@@ -89,3 +92,98 @@ def test_dedupe_with_date_limit():
     data = dedup.data
     specific_row = data.filter((data["nhs_number"].eq("84950DE0614A5C241F7223FBCCD27BE87DB61915972C7E49EDF519B72A3A104A")) & (data["code"].eq("100000001")))
     assert specific_row["date"][0] == "2018-11-05"
+
+
+def test_mapped_to_icd():
+    observed_dataset = ProcessedDataset(path="tests/test_data/primary_care/processed_data.csv", dataset_type="primary_care", coding_system=CodelistType.SNOMED.value)
+    mapped_dataset = observed_dataset.map_snomed_to_icd(mapping_file="tests/test_data/mapping_files/snomed_icd_map.csv")
+
+    # there are 7 rows in the original dataset, of these there are 3 instances of 100000001, 1 instance of 100000002 
+    # and 3 instances of 200000001. Our mapping file only has maps for 100000001 and 100000002. So, we should have
+    # ended up with 4 rows in the mapped dataset.
+    assert mapped_dataset.data.shape == (4, 3)
+
+    first_patient = mapped_dataset.data.filter(mapped_dataset.data["nhs_number"].eq("84950DE0614A5C241F7223FBCCD27BE87DB61915972C7E49EDF519B72A3A104A"))
+    first_patient = first_patient.sort(pl.col("date"))
+    assert first_patient['code'][0] == "A010"
+    assert first_patient['code'][1] == "A010"
+    assert first_patient['code'][2] == "A020"
+
+
+def test_mapped_to_icd_with_specific_col():
+    with pytest.raises(CodeNotMappable) as e:
+        observed_dataset = ProcessedDataset(path="tests/test_data/primary_care/processed_data.csv", dataset_type="primary_care", coding_system=CodelistType.ICD10.value)
+        observed_dataset.map_snomed_to_icd(mapping_file="tests/test_data/mapping_files/snomed_icd_map.csv", snomed_col="conceptID", icd_col="mapTarget")
+
+    assert "Coding system must be SNOMED for mapping to ICD10" in str(e.value)
+
+def test_mapped_to_icd_logs():
+    observed_dataset = ProcessedDataset(path="tests/test_data/primary_care/processed_data.csv", dataset_type="primary_care", coding_system=CodelistType.SNOMED.value, log_path="tests/test_data/primary_care/processed_data_log.txt")
+    mapped_dataset = observed_dataset.map_snomed_to_icd(mapping_file="tests/test_data/mapping_files/snomed_icd_map.csv")
+
+    # 7 log entries exist before mapping (indices 0-6) and the mapping function adds 4 more, giving 11 in total
+    assert len(mapped_dataset.log) == 11
+    assert "Loading mapping file from tests/test_data/mapping_files/snomed_icd_map.csv" in mapped_dataset.log[7]
+    assert "Pre-mapping dataset has 7 rows" in mapped_dataset.log[8]
+    assert "Post-mapping dataset has 4 rows" in mapped_dataset.log[9]
+
+
+
+def test_truncate_icd_to_3_digits():
+    observed_dataset = ProcessedDataset(path="tests/test_data/barts_health/diagnosis.csv", dataset_type="secondary_care", coding_system=CodelistType.ICD10.value)
+    truncated_dataset = observed_dataset.truncate_icd_to_3_digits()
+
+    # there are 10 rows in the original dataset. there should be 10 rows in the truncated dataset
+    assert truncated_dataset.data.shape == (10, 3)
+
+    # first patient. There are 2 codes with A01 (A01 and A01X). The truncated dataset should only have A01
+    first_patient = truncated_dataset.data.filter(truncated_dataset.data["nhs_number"].eq("84950DE0614A5C241F7223FBCCD27BE87DB61915972C7E49EDF519B72A3A104A"))
+    first_patient = first_patient.sort(pl.col("date"), pl.col("code"))
+
+    assert first_patient['code'][0] == "A01"
+    assert first_patient['code'][1] == "A01"
+
+
+def test_truncate_icd_to_3_digits_logs():
+    observed_dataset = ProcessedDataset(path="tests/test_data/barts_health/diagnosis.csv", dataset_type="secondary_care", coding_system=CodelistType.ICD10.value, log_path="tests/test_data/barts_health/diagnosis_log.txt")
+    truncated_dataset = observed_dataset.truncate_icd_to_3_digits()
+
+    # there are 7 logs in the original dataset, 1 from loading and 3 created by the truncation function
+    assert len(truncated_dataset.log) == 11
+    assert "Post-truncation dataset has 10 rows" in truncated_dataset.log[9]
+
+
+def test_truncate_icd_to_3_digits_wrong_coding_system():
+    with pytest.raises(CodeNotMappable) as e:
+        observed_dataset = ProcessedDataset(path="tests/test_data/primary_care/processed_data.csv", dataset_type="primary_care", coding_system=CodelistType.SNOMED.value)
+        observed_dataset.truncate_icd_to_3_digits()
+
+    assert "Coding system must be ICD10 for truncating to 3 digits" in str(e.value)
+
+def test_removes_unrealistic_data():
+    observed_dataset = ProcessedDataset(path="tests/test_data/primary_care/procedures_with_unrealistic_values.csv", dataset_type="primary_care", coding_system="SNOMED")
+    cleaned_dataset = observed_dataset.remove_unrealistic_dates(before_born=False)
+
+    # assert only the realistic row is kept (the 2030-01-01 and 1900-01-10 rows are dropped)
+    assert cleaned_dataset.data.shape == (1, 3)
+
+    # assert that the date is 1910-01-01
+    assert cleaned_dataset.data["date"][0] == "1910-01-01"
+
+
+def test_removes_unrealistic_data_before_born():
+    observed_dataset = ProcessedDataset(path="tests/test_data/primary_care/procedures_with_unrealistic_values.csv", dataset_type="primary_care", coding_system="SNOMED")
+    demographic_dataset = DemographicDataset(path="tests/test_data/demographics/processed.csv")
+
+    cleaned_dataset = observed_dataset.remove_unrealistic_dates(before_born=True, demographic_dataset=demographic_dataset)
+    assert cleaned_dataset.data.shape == (0, 3)
+
+
+
+def test_removes_unrealistic_data_before_born_but_no_data():
+    observed_dataset = ProcessedDataset(path="tests/test_data/primary_care/procedures_with_unrealistic_values.csv", dataset_type="primary_care", coding_system="SNOMED")
+
+    with pytest.raises(ValueError) as e:
+        observed_dataset.remove_unrealistic_dates(before_born=True)
+
+    assert "A demographic dataset must be provided if before_born is True" in str(e.value)
diff --git a/tests/datasets/test_raw_dataset.py b/tests/datasets/test_raw_dataset.py
@@ -4,6 +4,7 @@
 
 from tretools.datasets.raw_dataset import RawDataset
 from tretools.datasets.errors import ColumnsValidationError, DeduplicationError
+from tretools.datasets.dataset_enums.dataset_types import DatasetType
 
 
 
@@ -31,15 +32,6 @@ def test_check_col_validation_on_load():
     assert observed_dataset.column_validation == True
 
 
-def test_check_col_validation_second_time():
-    observed_dataset = RawDataset(path="tests/test_data/primary_care/processed_data.csv", dataset_type="primary_care", coding_system="SNOMED")
-
-    with pytest.raises(ColumnsValidationError) as e:
-        observed_dataset._standarise_column_names({"original_code": "code", "clinical_effective_date": "date", "pseudo_nhs_number": "nhs_number"})
-
-    assert "Column names have already been validated" in str(e.value)
-
-
 def test__standarise_column_names():
     observed_dataset = RawDataset(path="tests/test_data/primary_care/good_procedures_no_extra_cols.csv", dataset_type="primary_care", coding_system="SNOMED")
     assert observed_dataset.column_validation == False
@@ -152,7 +144,7 @@ def test__drop_all_null_rows():
     observed_dataset._drop_all_null_rows()
 
     # check shape of data
-    assert observed_dataset.data.shape == (5, 4)
+    assert observed_dataset.data.shape == (3, 4)
 
 
 def test__deduplicate():
@@ -245,7 +237,7 @@ def test_with_nhs_digital_with_incorrect_type():
 
 
 def test_with_nhs_digital_with_process_dataset():
-    raw_data = RawDataset(path="tests/test_data/nhs_digital/civreg.txt", dataset_type="nhs_digital", coding_system="ICD10")
+    raw_data = RawDataset(path="tests/test_data/nhs_digital/civreg.txt", dataset_type=DatasetType.NHS_DIGITAL.value, coding_system="ICD10")
 
     processed_data = raw_data.process_dataset(deduplication_options=["nhs_number", "code", "date"], nhs_digital_subtype="CIV_REG")
 
@@ -313,3 +305,18 @@ def test_with_nhs_digital_with_process_dataset_with_op():
     assert third_patient_data['date'][0] == correct_date
     assert third_patient_data['date'][1] == correct_date
     assert third_patient_data['date'][2] == correct_date
+
+
+def test_accepts_scientific_notation_values():
+    observed_dataset = RawDataset(path="tests/test_data/primary_care/procedures_with_scientific_notation.csv", dataset_type="primary_care", coding_system="SNOMED")
+
+    # assert no data is dropped
+    assert observed_dataset.data.shape == (9, 5)
+
+    # assert that original code column is int64
+    assert observed_dataset.data["original_code"].dtype == pl.Int64
+
+    # assert that 882784691000119e3 is converted to 882784691000119040
+    observed_dataset.data = observed_dataset.data.sort('original_code', descending=True)
+    assert observed_dataset.data["original_code"][0] == 882784691000119040
+
diff --git a/tests/phenotype_report/test_report.py b/tests/phenotype_report/test_report.py
@@ -172,7 +172,7 @@ def test_report_with_demographics():
     primary_care = ProcessedDataset(PRIMARY_CARE_DATASET, "primary_care", "SNOMED")
 
     # load the demographic data
-    demographic_data = DemographicDataset(DEMOGRAPHIC_MAPPING_FILE, DEMOGRAPHIC_FILE)
+    demographic_data = DemographicDataset(path_to_mapping_file=DEMOGRAPHIC_MAPPING_FILE, path_to_demographic_file=DEMOGRAPHIC_FILE)
     demographic_data.process_dataset(MAPPING_CONFIG)
 
     report = PhenotypeReport("Disease A")
@@ -210,7 +210,7 @@ def test_report_with_demographics():
     # Now we are changing the rounding to the 1st of the month, therefore the patient with nhs number
     # 84950DE0614A5C241F7223FBCCD27BE87DB61915972C7E49EDF519B72A3A104A should have an age of 35 as the
     # patient was born on 1st Oct 1982 and had an event on 5th Oct 2018.
-    demographic_data = DemographicDataset(DEMOGRAPHIC_MAPPING_FILE, DEMOGRAPHIC_FILE)
+    demographic_data = DemographicDataset(path_to_mapping_file=DEMOGRAPHIC_MAPPING_FILE, path_to_demographic_file=DEMOGRAPHIC_FILE)
     demographic_data.process_dataset(MAPPING_CONFIG, round_to_day_in_month=1)
 
     report = PhenotypeReport("Disease A")

diff --git a/tests/report_transformers/test_regenie_report.py b/tests/report_transformers/test_regenie_report.py
@@ -6,7 +6,7 @@
 import os
 
 
-MAPPING_PATH = "tests/test_data/mapping_files/mapping_file.csv"
+MAPPING_PATH = "tests/test_data/mapping_files/regenie_mapping_file.csv"
 MAPPING_CONFIG = {"Pseudonhs_2023-11-08_uniq": "nhs_number",
                   "40028exomes_release_2023-JUL-07": "broad_id",
                   "51176GSA_Oct2023release": "gsa_id"
@@ -31,7 +31,7 @@ def test_summary_report_transformer():
     assert set(first_mapping["gsa_id"]) == {"15001987654321_123456789012_R01C01"}
 
     # make the summary report
-    result = regenie_reporter.transform()
+    result = regenie_reporter.transform("tests/report_transformers/regenie_reports")
     first_phenotype = result[1]
     assert set(first_phenotype['FID']) == {1}
 

diff --git a/tests/report_transformers/utils.py b/tests/report_transformers/utils.py
@@ -35,7 +35,7 @@ def make_phenotype_reports_for_testing():
     icd_codelist = Codelist(ICD_CODELIST, CodelistType.ICD10.value)
 
     # load the demographic data
-    demographic_data = DemographicDataset(DEMOGRAPHIC_MAPPING_FILE, DEMOGRAPHIC_FILE)
+    demographic_data = DemographicDataset(path_to_mapping_file=DEMOGRAPHIC_MAPPING_FILE, path_to_demographic_file=DEMOGRAPHIC_FILE)
     demographic_data.process_dataset(MAPPING_CONFIG)
 
     # Make a list of PhenotypeReports

diff --git a/tests/test_data/demographics/processed.csv b/tests/test_data/demographics/processed.csv
@@ -0,0 +1,4 @@
+nhs_number,gender,dob
+84950DE0614A5C241F7223FBCCD27BE87DB61915972C7E49EDF519B72A3A104A,2,1983-10-15
+73951AB0712D6E241E8222EDCCF28AE86DA72814078D6F48ECE512C91B5B104B,1,1979-01-15
+53952EF0503F7F341D9121DBCCC39DE95EA83713167E5E57EDB613A60D4C104C,1,1948-06-15
diff --git a/tests/test_data/mapping_files/mapping_file.csv b/tests/test_data/mapping_files/regenie_mapping_file.csv
diff --git a/tests/test_data/mapping_files/snomed_icd_map.csv b/tests/test_data/mapping_files/snomed_icd_map.csv
@@ -0,0 +1,3 @@
+conceptID,mapTarget,ICD10_3digit
+100000001,A010,A01
+100000002,A020,A02
diff --git a/tests/test_data/primary_care/good_procedures_with_nulls.csv b/tests/test_data/primary_care/good_procedures_with_nulls.csv
@@ -1,10 +1,10 @@
 "pseudo_nhs_number","clinical_effective_date","original_code","original_term"
 "84950DE0614A5C241F7223FBCCD27BE87DB61915972C7E49EDF519B72A3A104A","2018-10-05","100000001","Disease A - 1"
 "84950DE0614A5C241F7223FBCCD27BE87DB61915972C7E49EDF519B72A3A104A","2018-11-15","100000001","Disease A - 1"
-"84950DE0614A5C241F7223FBCCD27BE87DB61915972C7E49EDF519B72A3A104A","2019-02-12",,"Disease A - 2"
+"84950DE0614A5C241F7223FBCCD27BE87DB61915972C7E49EDF519B72A3A104A","2019-02-12","NA","Disease A - 2"
 "84950DE0614A5C241F7223FBCCD27BE87DB61915972C7E49EDF519B72A3A104A","","200000001","Disease B - 1"
 "73951AB0712D6E241E8222EDCCF28AE86DA72814078D6F48ECE512C91B5B104B","2012-11-21","NULL","Disease A - 1"
 "73951AB0712D6E241E8222EDCCF28AE86DA72814078D6F48ECE512C91B5B104B",,"100000001","Disease A - 1"
 "53952EF0503F7F341D9121DBCCC39DE95EA83713167E5E57EDB613A60D4C104C","2016-07-19","200000001","Disease B - 1"
-"53952EF0503F7F341D9121DBCCC39DE95EA83713167E5E57EDB613A60D4C104C","2016-08-20","200000001","Disease B - 1"
-"44966CC0716B4C241F8223EDBCF77AE87DB71814979C7D47EDE416B81D4A104B","2015-10-19","100000002","Disease A - 2"
+"53952EF0503F7F341D9121DBCCC39DE95EA83713167E5E57EDB613A60D4C104C","2016-08-20",".","Disease B - 1"
+"44966CC0716B4C241F8223EDBCF77AE87DB71814979C7D47EDE416B81D4A104B","2015-10-19","               ","Disease A - 2"
diff --git a/tests/test_data/primary_care/procedures_with_scientific_notation.csv b/tests/test_data/primary_care/procedures_with_scientific_notation.csv
@@ -0,0 +1,10 @@
+"pseudo_nhs_number","id","clinical_effective_date","original_code","original_term"
+"84950DE0614A5C241F7223FBCCD27BE87DB61915972C7E49EDF519B72A3A104A","6144854","2018-10-05","100000001","Disease A - 1"
+"84950DE0614A5C241F7223FBCCD27BE87DB61915972C7E49EDF519B72A3A104A","6144858","2018-11-15","100000001","Disease A - 1"
+"84950DE0614A5C241F7223FBCCD27BE87DB61915972C7E49EDF519B72A3A104A","6144859","2019-02-12","100000002","Disease A - 2"
+"84950DE0614A5C241F7223FBCCD27BE87DB61915972C7E49EDF519B72A3A104A","6144860","2020-05-22","200000001","Disease B - 1"
+"73951AB0712D6E241E8222EDCCF28AE86DA72814078D6F48ECE512C91B5B104B","6144855","2012-11-21","100000001","Disease A - 1"
+"73951AB0712D6E241E8222EDCCF28AE86DA72814078D6F48ECE512C91B5B104B","6144861","2013-06-03","100000001","Disease A - 1"
+"53952EF0503F7F341D9121DBCCC39DE95EA83713167E5E57EDB613A60D4C104C","6144856","2016-07-19","200000001","Disease B - 1"
+"53952EF0503F7F341D9121DBCCC39DE95EA83713167E5E57EDB613A60D4C104C","6144862","2016-08-20","200000001","Disease B - 1"
+"44966CC0716B4C241F8223EDBCF77AE87DB71814979C7D47EDE416B81D4A104B","6144857","2015-10-19","882784691000119e3","Disease A - 2"
diff --git a/tests/test_data/primary_care/procedures_with_unrealistic_values.csv b/tests/test_data/primary_care/procedures_with_unrealistic_values.csv
@@ -0,0 +1,4 @@
+nhs_number,code,date
+73951AB0712D6E241E8222EDCCF28AE86DA72814078D6F48ECE512C91B5B104B,200000001,2030-01-01
+73951AB0712D6E241E8222EDCCF28AE86DA72814078D6F48ECE512C91B5B104B,100000002,1900-01-10
+73951AB0712D6E241E8222EDCCF28AE86DA72814078D6F48ECE512C91B5B104B,100000001,1910-01-01
diff --git a/tretools/VERSION b/tretools/VERSION
@@ -1 +1 @@
-0.0.1
+0.1.0
diff --git a/tretools/codelists/codelist.py b/tretools/codelists/codelist.py
@@ -185,12 +185,10 @@ def validate_opcs_code(code: str) -> bool:
         - The code must be 3-5 characters long
         - The first character must be a letter
         - The second and third characters must be numbers
-        - The fourth character if present is a dot
-        - The fifth character if present is a number
-        - The fifth character must be present if the fourth character is a dot
+        - The fourth character, if present, is a dot or a number; a dot must be followed by a number
+        - The fifth character, if present, is a number
         """
-        pattern = re.compile(r"^[A-Z]\d{2}(\.\d{1,2})?$")
-
+        pattern = re.compile(r"^[A-Z]\d{2}(\.\d{1,2}|\d{1,2})?$")
         if len(code) > 5:
             return False