Merge pull request #317 from EHDEN/changes_from_partial_run
Changes from partial run
SofiaMp authored Aug 4, 2021
2 parents a810929 + 7d72378 commit b9ebd75
Showing 8 changed files with 244 additions and 27 deletions.
56 changes: 56 additions & 0 deletions resources/synthetic_data/Hes_500,000/generate_gp_tables.py
@@ -0,0 +1,56 @@
import random
import pandas as pd
import numpy as np

# Number of unique ids
length = 500000
eid = list(range(length))

# Create a range of dates to sample from
min_d = pd.to_datetime('01/01/2000')
max_d = pd.to_datetime('31/12/2020')
d = (max_d - min_d).days + 1

# GP_clinical file
# Create a random number of records for each person id
gp_clin_temp = []
for i in eid:
x = random.randint(1, 11)
for j in range(x):
gp_clin_temp.append([i])

# Create the main GP data (gp_clinical)
gp_clin = pd.DataFrame(gp_clin_temp, columns=['eid'])
n_clin = len(gp_clin)
print(f'length of gp_clinical: {n_clin}')

# Add columns we use in the transformation with randomised values (gp_clinical)
gp_clin['event_dt'] = (min_d + pd.to_timedelta(np.random.randint(d, size=n_clin), unit='d')).strftime("%d/%m/%Y")

# Fill the unused columns with the most frequent value from the scan report (gp_clinical)
gp_clin['code_type'] = 0
gp_clin['code'] = '42L..'
gp_clin['value'] = 9.1000004

# GP_scripts file
# Create a random number of records for each person id
gp_scr_temp = []
for i in eid:
x = random.randint(1, 11)
for j in range(x):
gp_scr_temp.append([i])

# Create the main GP data (gp_scripts)
gp_scr = pd.DataFrame(gp_scr_temp, columns=['eid'])
n_scr = len(gp_scr)
print(f'length of gp_scripts: {n_scr}')

# Add columns we use in the transformation with randomised values (gp_scripts)
gp_scr['issue_date'] = (min_d + pd.to_timedelta(np.random.randint(d, size=n_scr), unit='d')).strftime("%d/%m/%Y")

# Fill the unused columns with the most frequent value from the scan report (gp_scripts)
gp_scr['dmd_code'] = 321177003

# Write to csv
gp_clin.to_csv('covid19_tpp_gp_clinical.csv', sep=',', index=False)
gp_scr.to_csv('covid19_tpp_gp_scripts.csv', sep=',', index=False)
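
The random-date idiom used above (draw uniform integer day offsets, add them to a base timestamp) recurs in all of these generators. A minimal standalone sketch of just that idiom, with illustrative names:

# Standalone sketch of the random-date idiom used above; variable names
# are illustrative. Draws 5 dates uniformly between min_d and max_d.
import numpy as np
import pandas as pd

min_d = pd.to_datetime('2000-01-01')
max_d = pd.to_datetime('2020-12-31')
span = (max_d - min_d).days + 1  # +1 so max_d itself can be drawn

offsets = np.random.randint(span, size=5)
dates = (min_d + pd.to_timedelta(offsets, unit='d')).strftime('%d/%m/%Y')
print(list(dates))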
129 changes: 129 additions & 0 deletions resources/synthetic_data/Hes_500,000/generate_hes_tables.py
@@ -0,0 +1,129 @@
import random
import pandas as pd
import numpy as np

# Number of unique ids
length = 7919
eid = list(range(length))

# Create a range of dates to sample from
min_d = pd.to_datetime('01/01/2000')
max_d = pd.to_datetime('31/12/2020')
d = (max_d - min_d).days + 1

# HESIN file
# Create a random number of records per person, with (eid, ins_index) unique
hes_temp = []
for i in eid:
x = random.randint(1, 137)
for j in range(x):
hes_temp.append([i, j])

# Create the main hospital data (hesin)
hesin = pd.DataFrame(hes_temp, columns=['eid', 'ins_index'])
n_hes = len(hesin)
print(f'length of main hes: {n_hes}')

# Add columns we use in the transformation with randomised values (hesin)
hesin['dsource'] = np.random.choice(['HES', 'SMR', 'PEDW'], size=n_hes)
hesin['epistart'] = (min_d + pd.to_timedelta(np.random.randint(d, size=n_hes), unit='d')).strftime("%d/%m/%Y")
hesin['epiend'] = (min_d + pd.to_timedelta(np.random.randint(d, size=n_hes), unit='d')).strftime("%d/%m/%Y")
hesin['spell_index'] = np.random.choice(range(1, 967), size=n_hes)
hesin['elecdate'] = '18/02/2010'
hesin['admidate'] = (min_d + pd.to_timedelta(np.random.randint(d, size=n_hes), unit='d')).strftime("%d/%m/%Y")
hesin['admimeth'] = np.random.choice(range(1, 101), size=n_hes)
hesin['admisorc'] = np.random.choice(range(1, 101), size=n_hes)
hesin['disdate'] = (min_d + pd.to_timedelta(np.random.randint(d, size=n_hes), unit='d')).strftime("%d/%m/%Y")
hesin['disdest'] = np.random.choice(range(1, 101), size=n_hes)

# Fill the unused columns with the most frequent value from the scan report (hesin)
hesin['source'] = 6
hesin['epidur'] = 0
hesin['bedyear'] = 0
hesin['epistat'] = 3
hesin['epitype'] = 1
hesin['epiorder'] = 1
hesin['spell_seq'] = 0
hesin['spelbgin'] = 2
hesin['spelend'] = 'Y'
hesin['speldur'] = 0
hesin['pctcode'] = '5N1'
hesin['gpprpct'] = '5N1'
hesin['category'] = 10
hesin['elecdur'] = 1
hesin['admimeth_uni'] = 1001
hesin['admisorc_uni'] = 1000
hesin['firstreg'] = 8
hesin['classpat_uni'] = 1000
hesin['classpat'] = 2
hesin['intmanag_uni'] = 200
hesin['intmanag'] = 2
hesin['mainspef_uni'] = 1350
hesin['mainspef'] = 100
hesin['tretspef_uni'] = 1490
hesin['tretspef'] = 300
hesin['operstat'] = 1
hesin['dismeth_uni'] = 1000
hesin['dismeth'] = 1
hesin['disdest_uni'] = 1000
hesin['carersi'] = 99

# HESIN_DIAG
# Create a random number of records per person, with (eid, ins_index) unique
diag_temp = []
for i in eid:
x = random.randint(1, 197)
for j in range(x):
diag_temp.append([i, j])

# Create the diagnosis data (hesin_diag)
hesin_diag = pd.DataFrame(diag_temp, columns=['eid', 'ins_index'])
n_diag = len(hesin_diag)
print(f'length of hes_diagnosis: {n_diag}')

# Add columns we use in the transformation with randomised values (hesin_diag)
hesin_diag['level'] = np.random.choice(range(1, 3), size=n_diag)
hesin_diag['diag_icd9'] = np.random.choice(['V252', '6262', '78909', '4549', '7890', '7865',
'6359', '4556', '1749', '5509', '5742', '6117',
'6289'], size=n_diag)
hesin_diag['diag_icd10'] = np.random.choice(['I10', 'E119', 'Z864', 'E780', 'J459', 'Z511',
'Z867', 'Z921', 'E039', 'I209', 'I48', 'I259',
'C509'], size=n_diag)

# Fill the unused columns with the most frequent value from the scan report (hesin_diag)
hesin_diag['arr_index'] = 0
hesin_diag['diag_icd9_nb'] = None
hesin_diag['diag_icd10_nb'] = 2

# HESIN_OPER
# Create a random number of records per person, with (eid, ins_index) unique
oper_temp = []
for i in eid:
x = random.randint(1, 163)
for j in range(x):
oper_temp.append([i, j])

# Create the operations data (hesin_oper)
hesin_oper = pd.DataFrame(oper_temp, columns=['eid', 'ins_index'])
n_oper = len(hesin_oper)
print(f'length of hes_operations: {n_oper}')

# Add columns we use in the transformation with randomised values (hesin_oper)
hesin_oper['level'] = np.random.choice(range(1, 2), size=n_oper)
hesin_oper['opdate'] = (min_d + pd.to_timedelta(np.random.randint(d, size=n_oper), unit='d')).strftime("%d/%m/%Y")
hesin_oper['oper3'] = np.random.choice([704, 687, 4011, 387, 6961, 6512, 3811,
608, 7421, 913, 979], size=n_oper)
hesin_oper['oper4'] = np.random.choice(['X998', 'Z942', 'Z943', 'G451', 'X403', 'Y981', 'H229',
'C751', 'C712', 'Y534', 'Z274'], size=n_oper)

# Fill the unused columns with the most frequent value from the scan report (hesin_oper)
hesin_oper['arr_index'] = 0
hesin_oper['oper3_nb'] = None
hesin_oper['oper4_nb'] = None
hesin_oper['posopdur'] = 0
hesin_oper['preopdur'] = 0

# Write to csv
hesin.to_csv('hesin.csv', sep=',', index=False)
hesin_diag.to_csv('hesin_diag.csv', sep=',', index=False)
hesin_oper.to_csv('hesin_oper.csv', sep=',', index=False)
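
The transformations changed below deduplicate hesin on (eid, ins_index) as a "fix for synthetic data"; with this generator the pair is already unique by construction, since ins_index runs 0..x-1 within each eid. A quick sanity check, assuming the CSVs above sit in the working directory:

# Sanity check on the generated file: (eid, ins_index) should be a unique
# key, since ins_index runs 0..x-1 within each eid. Assumes hesin.csv is
# in the current working directory.
import pandas as pd

hesin = pd.read_csv('hesin.csv', usecols=['eid', 'ins_index'])
dups = hesin.duplicated(subset=['eid', 'ins_index']).sum()
print(f'duplicate (eid, ins_index) pairs: {dups}')  # expected: 0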
@@ -10,20 +10,28 @@
 
 
 def covid19_emis_gp_clinical_scripts_to_visit_occurrence(wrapper: Wrapper) -> List[Wrapper.cdm.VisitOccurrence]:
-    clinical_source = wrapper.source_data.get_source_file('covid19_emis_gp_clinical.csv')
-    clinical = clinical_source.get_csv_as_df(apply_dtypes=False, usecols=['eid', 'event_dt'])
+    source = wrapper.source_data._source_dir
+    clinical = pd.read_csv(source / 'covid19_emis_gp_clinical.csv', usecols=['eid', 'event_dt'],
+                           dtype={'eid': 'Int32', 'event_id': 'datetime64'})
     clinical = clinical[["eid", "event_dt"]].rename(columns={'event_dt': 'date'})
 
-    scripts_source = wrapper.source_data.get_source_file('covid19_emis_gp_scripts.csv')
-    scripts = scripts_source.get_csv_as_df(apply_dtypes=False, usecols=['eid', 'issue_date'])
+    scripts = pd.read_csv(source / 'covid19_emis_gp_scripts.csv', usecols=['eid', 'issue_date'],
+                          dtype={'eid': 'Int32', 'event_id': 'datetime64'})
     scripts = scripts[["eid", "issue_date"]].rename(columns={'issue_date': 'date'})
 
-    df = pd.concat([scripts, clinical])
-    df = df.drop_duplicates(['eid', 'date'])
+    clinical = clinical.append(scripts)
+    del scripts # to reduce memory use
+
+    clinical = clinical.drop_duplicates(['eid', 'date'])
 
-    for _, row in df.iterrows():
-        visit_date = wrapper.get_gp_datetime(row['date'],
-                                             person_source_value=row['eid'],
+    for _, row in clinical.iterrows():
+        if row.isnull().any():
+            continue
+        eid = row['eid']
+        eid_str = str(eid)
+        date = row['date']
+        visit_date = wrapper.get_gp_datetime(date,
+                                             person_source_value=eid_str,
                                              format="%d/%m/%Y",
                                              default_date=None)
 
@@ -32,8 +40,8 @@ def covid19_emis_gp_clinical_scripts_to_visit_occurrence(wrapper: Wrapper) -> List[Wrapper.cdm.VisitOccurrence]:
             continue
 
         yield wrapper.cdm.VisitOccurrence(
-            visit_occurrence_id=create_gp_emis_visit_occurrence_id(row['eid'], visit_date),
-            person_id=row['eid'],
+            visit_occurrence_id=create_gp_emis_visit_occurrence_id(eid_str, visit_date),
+            person_id=eid,
             visit_concept_id=38004453, # Family Practice
             visit_start_date=visit_date.date(),
             visit_start_datetime=visit_date,
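
Two notes on this rewrite (the TPP loader below follows the identical pattern): reading with pd.read_csv plus usecols and a compact dtype is the main memory win, since only the two needed columns are parsed and eid is held as a nullable 32-bit integer; and the dtype key 'event_id' matches no selected column (perhaps intended as 'event_dt'), so it does not affect the columns actually read. DataFrame.append, current pandas style when this was merged, has since been removed in favour of pd.concat. A generic sketch of the lean-load pattern, with placeholder file and column names:

# Generic sketch of the lean-load pattern: parse only the needed columns and
# use a compact nullable integer dtype. File and column names are placeholders.
import pandas as pd

df = pd.read_csv('events.csv',
                 usecols=['eid', 'event_dt'],  # skip all other columns at parse time
                 dtype={'eid': 'Int32'})       # nullable 32-bit int instead of int64
# read_csv leaves dates as strings; parse explicitly if needed:
df['event_dt'] = pd.to_datetime(df['event_dt'], format='%d/%m/%Y', errors='coerce')
print(df.dtypes)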
@@ -10,20 +10,29 @@
 
 
 def covid19_tpp_gp_clinical_scripts_to_visit_occurrence(wrapper: Wrapper) -> List[Wrapper.cdm.VisitOccurrence]:
-    clinical_source = wrapper.source_data.get_source_file('covid19_tpp_gp_clinical.csv')
-    clinical = clinical_source.get_csv_as_df(apply_dtypes=False, usecols=['eid', 'event_dt'])
+    source = wrapper.source_data._source_dir
+    clinical = pd.read_csv(source / 'covid19_tpp_gp_clinical.csv', usecols=['eid', 'event_dt'],
+                           dtype={'eid': 'Int32', 'event_id': 'datetime64'})
     clinical = clinical[["eid", "event_dt"]].rename(columns={'event_dt': 'date'})
 
-    scripts_source = wrapper.source_data.get_source_file('covid19_tpp_gp_scripts.csv')
-    scripts = scripts_source.get_csv_as_df(apply_dtypes=False, usecols=['eid', 'issue_date'])
+    scripts = pd.read_csv(source / 'covid19_tpp_gp_scripts.csv', usecols=['eid', 'issue_date'],
+                          dtype={'eid': 'Int32', 'event_id': 'datetime64'})
     scripts = scripts[["eid", "issue_date"]].rename(columns={'issue_date': 'date'})
 
-    df = pd.concat([scripts, clinical])
-    df = df.drop_duplicates(['eid', 'date'])
+    clinical = clinical.append(scripts)
+    del scripts # to reduce memory use
+
+    clinical = clinical.drop_duplicates(['eid', 'date'])
 
-    for _, row in df.iterrows():
-        visit_date = wrapper.get_gp_datetime(row['date'],
-                                             person_source_value=row['eid'],
+    for _, row in clinical.iterrows():
+        if row.isnull().any():
+            continue
+        eid = row['eid']
+        eid_str = str(eid)
+        date = row['date']
+
+        visit_date = wrapper.get_gp_datetime(date,
+                                             person_source_value=eid_str,
                                              format="%d/%m/%Y",
                                              default_date=None)
 
@@ -32,8 +41,8 @@ def covid19_tpp_gp_clinical_scripts_to_visit_occurrence(wrapper: Wrapper) -> List[Wrapper.cdm.VisitOccurrence]:
             continue
 
         yield wrapper.cdm.VisitOccurrence(
-            visit_occurrence_id=create_gp_tpp_visit_occurrence_id(row['eid'], visit_date),
-            person_id=row['eid'],
+            visit_occurrence_id=create_gp_tpp_visit_occurrence_id(eid_str, visit_date),
+            person_id=eid,
             visit_concept_id=38004453, # Family Practice
             visit_start_date=visit_date.date(),
             visit_start_datetime=visit_date,
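
Both loaders now skip rows containing nulls before parsing dates. wrapper.get_gp_datetime itself is outside this diff; a hypothetical stand-in with the same shape, purely for illustration:

# Hypothetical stand-in for wrapper.get_gp_datetime (its implementation is
# not part of this diff): parse with a fixed format, fall back to a default
# on failure. The real method also takes person_source_value, omitted here.
from datetime import datetime
from typing import Optional

def get_gp_datetime_sketch(value: str, format: str = '%d/%m/%Y',
                           default_date: Optional[datetime] = None) -> Optional[datetime]:
    try:
        return datetime.strptime(value, format)
    except (TypeError, ValueError):
        return default_date

print(get_gp_datetime_sketch('05/08/2021'))  # 2021-08-05 00:00:00
print(get_gp_datetime_sketch('not-a-date'))  # None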
@@ -14,10 +14,14 @@
 
 
 def hesin_diag_to_condition_occurrence(wrapper: Wrapper) -> List[Wrapper.cdm.ConditionOccurrence]:
+    # Load hesin and hesin_diag tables, with selected columns to avoid memory failures
     hesin_diag_source = wrapper.source_data.get_source_file('hesin_diag.csv')
-    hesin_diag = hesin_diag_source.get_csv_as_df(apply_dtypes=False)
+    hesin_diag = hesin_diag_source.get_csv_as_df(apply_dtypes=False, usecols=['eid', 'ins_index',
+                                                                              'diag_icd9', 'diag_icd10',
+                                                                              'level'])
     hesin_source = wrapper.source_data.get_source_file('hesin.csv')
-    hesin = hesin_source.get_csv_as_df(apply_dtypes=False)
+    hesin = hesin_source.get_csv_as_df(apply_dtypes=False, usecols=['eid', 'ins_index', 'spell_index',
+                                                                    'admidate', 'dsource'])
+    hesin = hesin.drop_duplicates(subset=['eid', 'ins_index']) # fix for synthetic data
 
     # Merge HES diag with HES on EID and INS_INDEX to get ADMIDATE and drop duplicates.
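
The added drop_duplicates matters because the left merge that follows it fans out rows whenever the right-hand key (eid, ins_index) is duplicated. A toy illustration:

# Toy illustration: a left merge fans out rows when the right-hand key is
# duplicated, which is what the drop_duplicates fix prevents.
import pandas as pd

diag = pd.DataFrame({'eid': [1], 'ins_index': [0], 'diag_icd10': ['I10']})
hesin = pd.DataFrame({'eid': [1, 1], 'ins_index': [0, 0],
                      'admidate': ['01/01/2010', '02/02/2011']})

print(len(diag.merge(hesin, on=['eid', 'ins_index'], how='left')))  # 2: fan-out

hesin = hesin.drop_duplicates(subset=['eid', 'ins_index'])
print(len(diag.merge(hesin, on=['eid', 'ins_index'], how='left')))  # 1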
@@ -12,10 +12,14 @@
 
 
 def hesin_oper_to_procedure_occurrence(wrapper: Wrapper) -> List[Wrapper.cdm.ProcedureOccurrence]:
+    # Load hesin and hesin_oper tables, with selected columns to avoid memory failures
     hesin_oper_source = wrapper.source_data.get_source_file('hesin_oper.csv')
-    hesin_oper = hesin_oper_source.get_csv_as_df(apply_dtypes=False)
+    hesin_oper = hesin_oper_source.get_csv_as_df(apply_dtypes=False, usecols=['eid', 'ins_index',
+                                                                              'oper4', 'oper3',
+                                                                              'opdate', 'level'])
     hesin_source = wrapper.source_data.get_source_file('hesin.csv')
-    hesin = hesin_source.get_csv_as_df(apply_dtypes=False)
+    hesin = hesin_source.get_csv_as_df(apply_dtypes=False, usecols=['eid', 'ins_index', 'spell_index',
+                                                                    'dsource'])
+    hesin = hesin.drop_duplicates(subset=['eid', 'ins_index']) # fix for synthetic data
 
     df = hesin_oper.merge(hesin, on=['eid', 'ins_index'], how='left', suffixes=('', '_x'))
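
This merge passes suffixes=('', '_x'), so left-hand columns keep their names and only overlapping right-hand columns are renamed. A quick demonstration with toy frames:

# suffixes=('', '_x'): the left table's columns keep their names; only the
# right-hand copies of overlapping columns receive the suffix.
import pandas as pd

left = pd.DataFrame({'eid': [1], 'level': [1]})
right = pd.DataFrame({'eid': [1], 'level': [9]})

merged = left.merge(right, on='eid', how='left', suffixes=('', '_x'))
print(list(merged.columns))  # ['eid', 'level', 'level_x']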
2 changes: 1 addition & 1 deletion src/main/python/wrapper.py
@@ -114,7 +114,7 @@ def transform(self):
             self.execute_batch_transformation(hesin_oper_to_procedure_occurrence, bulk=True, batch_size=100000)
 
         if self.load_gp_covid19:
-            # these are expected to be the most memory heavy transformations. Execut last
+            # these are expected to be the most memory heavy transformations. Execute last
             self.execute_batch_transformation(covid19_emis_gp_clinical_scripts_to_visit_occurrence, bulk=True, batch_size=100000)
             self.execute_batch_transformation(covid19_tpp_gp_clinical_scripts_to_visit_occurrence, bulk=True, batch_size=100000)
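
execute_batch_transformation is not shown in this diff. As a rough sketch of the pattern its name suggests, consuming a record generator in fixed-size batches, under the assumption that transformations yield records one at a time:

# Hypothetical sketch of batched consumption of a record generator.
# Not the repo's implementation; names and signature are assumptions.
from itertools import islice
from typing import Callable, Iterator

def execute_in_batches(transformation: Callable[[], Iterator], batch_size: int = 100000) -> None:
    records = transformation()
    while True:
        batch = list(islice(records, batch_size))
        if not batch:
            break
        print(f'inserting {len(batch)} records')  # stand-in for a bulk insert

def demo_transformation() -> Iterator[int]:
    yield from range(250000)

execute_in_batches(demo_transformation)  # inserts 100000, 100000, 50000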
7 changes: 7 additions & 0 deletions src/test/R/main_test.sh
@@ -0,0 +1,7 @@
cd src/test/R
R -f run_create_tests.R
cd ../../../
python main.py -c config/config-test.yml
cd src/test/R
R -f run_evaluate_tests.R
cd ../../../
