diff --git a/resources/synthetic_data/Hes_500,000/generate_gp_tables.py b/resources/synthetic_data/Hes_500,000/generate_gp_tables.py new file mode 100644 index 00000000..ca929cba --- /dev/null +++ b/resources/synthetic_data/Hes_500,000/generate_gp_tables.py @@ -0,0 +1,56 @@ +import random +import pandas as pd +import numpy as np + +# Number of unique ids +length = 500000 +eid = list(range(length)) + +# Create a range of dates to take from +min_d = pd.to_datetime('01/01/2000') +max_d = pd.to_datetime('31/12/2020') +d = (max_d - min_d).days + 1 + +# GP_clinical file +# Create random number of indexes for each person with id +gp_clin_temp = [] +for i in eid: + x = random.randint(1, 11) + for j in range(x): + gp_clin_temp.append([i]) + +# Create the main GP data (gp_clinical) +gp_clin = pd.DataFrame(gp_clin_temp, columns=['eid']) +n_clin = len(gp_clin) +print(f'length of gp_clinical: {n_clin}') + +# Add columns we use in the transformation with randomised values (gp_clinical) +gp_clin['event_dt'] = (min_d + pd.to_timedelta(np.random.randint(d, size=n_clin), unit='d')).strftime("%d/%m/%Y") + +# Add to the unused the most frequent value from the scan report (gp_clinical) +gp_clin['code_type'] = 0 +gp_clin['code'] = '42L..' 
+gp_clin['value'] = 9.1000004 + +# GP_scripts file +# Create random number of indexes for each person with id +gp_scr_temp = [] +for i in eid: + x = random.randint(1, 11) + for j in range(x): + gp_scr_temp.append([i]) + +# Create the main GP data (gp_scripts) +gp_scr = pd.DataFrame(gp_scr_temp, columns=['eid']) +n_scr = len(gp_scr) +print(f'length of gp_scripts: {n_scr}') + +# Add columns we use in the transformation with randomised values (gp_scripts) +gp_scr['issue_date'] = (min_d + pd.to_timedelta(np.random.randint(d, size=n_scr), unit='d')).strftime("%d/%m/%Y") + +# Add to the unused the most frequent value from the scan report (gp_scripts) +gp_scr['dmd_code'] = 321177003 + +# Write to csv +gp_clin.to_csv('covid19_tpp_gp_clinical.csv', sep=',', index=False) +gp_scr.to_csv('covid19_tpp_gp_scripts.csv', sep=',', index=False) diff --git a/resources/synthetic_data/Hes_500,000/generate_hes_tables.py b/resources/synthetic_data/Hes_500,000/generate_hes_tables.py new file mode 100644 index 00000000..0ad4a38b --- /dev/null +++ b/resources/synthetic_data/Hes_500,000/generate_hes_tables.py @@ -0,0 +1,129 @@ +import random +import pandas as pd +import numpy as np + +# Number of unique ids +length = 7919 +eid = list(range(length)) + +# Create a range of dates to take from +min_d = pd.to_datetime('01/01/2000') +max_d = pd.to_datetime('31/12/2020') +d = (max_d - min_d).days + 1 + +# HESIN file +# Create random number of indexes for each person with id+index unique +hes_temp = [] +for i in eid: + x = random.randint(1, 137) + for j in range(x): + hes_temp.append([i, j]) + +# Create the main hospital data (hesin) +hesin = pd.DataFrame(hes_temp, columns=['eid', 'ins_index']) +n_hes = len(hesin) +print(f'length of main hes: {n_hes}') + +# Add columns we use in the transformation with randomised values (hesin) +hesin['dsource'] = np.random.choice(['HES', 'SMR', 'PEDW'], size=n_hes) +hesin['epistart'] = (min_d + pd.to_timedelta(np.random.randint(d, size=n_hes), 
unit='d')).strftime("%d/%m/%Y") +hesin['epiend'] = (min_d + pd.to_timedelta(np.random.randint(d, size=n_hes), unit='d')).strftime("%d/%m/%Y") +hesin['spell_index'] = np.random.choice(range(1, 967), size=n_hes) +hesin['elecdate'] = '18/02/2010' +hesin['admidate'] = (min_d + pd.to_timedelta(np.random.randint(d, size=n_hes), unit='d')).strftime("%d/%m/%Y") +hesin['admimeth'] = np.random.choice(range(1, 101), size=n_hes) +hesin['admisorc'] = np.random.choice(range(1, 101), size=n_hes) +hesin['disdate'] = (min_d + pd.to_timedelta(np.random.randint(d, size=n_hes), unit='d')).strftime("%d/%m/%Y") +hesin['disdest'] = np.random.choice(range(1, 101), size=n_hes) + +# Add to the unused the most frequent value from the scan report (hesin) +hesin['source'] = 6 +hesin['epidur'] = 0 +hesin['bedyear'] = 0 +hesin['epistat'] = 3 +hesin['epitype'] = 1 +hesin['epiorder'] = 1 +hesin['spell_seq'] = 0 +hesin['spelbgin'] = 2 +hesin['spelend'] = 'Y' +hesin['speldur'] = 0 +hesin['pctcode'] = '5N1' +hesin['gpprpct'] = '5N1' +hesin['category'] = 10 +hesin['elecdur'] = 1 +hesin['admimeth_uni'] = 1001 +hesin['admisorc_uni'] = 1000 +hesin['firstreg'] = 8 +hesin['classpat_uni'] = 1000 +hesin['classpat'] = 2 +hesin['intmanag_uni'] = 200 +hesin['intmanag'] = 2 +hesin['mainspef_uni'] = 1350 +hesin['mainspef'] = 100 +hesin['tretspef_uni'] = 1490 +hesin['tretspef'] = 300 +hesin['operstat'] = 1 +hesin['dismeth_uni'] = 1000 +hesin['dismeth'] = 1 +hesin['disdest_uni'] = 1000 +hesin['carersi'] = 99 + +# HESIN_DIAG +# Create random number of indexes for each person with id+index unique +diag_temp = [] +for i in eid: + x = random.randint(1, 197) + for j in range(x): + diag_temp.append([i, j]) + +# Create the diagnosis data (hesin_diag) +hesin_diag = pd.DataFrame(diag_temp, columns=['eid', 'ins_index']) +n_diag = len(hesin_diag) +print(f'length of hes_diagnosis: {n_diag}') + +# Add columns we use in the transformation with randomised values (hesin_diag) +hesin_diag['level'] = np.random.choice(range(1, 3), 
size=n_diag) +hesin_diag['diag_icd9'] = np.random.choice(['V252', '6262', '78909', '4549', '7890', '7865', + '6359', '4556', '1749', '5509', '5742', '6117', + '6289'], size=n_diag) +hesin_diag['diag_icd10'] = np.random.choice(['I10', 'E119', 'Z864', 'E780', 'J459', 'Z511', + 'Z867', 'Z921', 'E039', 'I209', 'I48', 'I259', + 'C509'], size=n_diag) + +# Add to the unused the most frequent value from the scan report (hesin_diag) +hesin_diag['arr_index'] = 0 +hesin_diag['diag_icd9_nb'] = None +hesin_diag['diag_icd10_nb'] = 2 + +# HESIN_OPER +# Create random number of indexes for each person with id+index unique +oper_temp = [] +for i in eid: + x = random.randint(1, 163) + for j in range(x): + oper_temp.append([i, j]) + +# Create the operations data (hesin_oper) +hesin_oper = pd.DataFrame(oper_temp, columns=['eid', 'ins_index']) +n_oper = len(oper_temp) +print(f'length of hes_operations: {n_oper}') + +# Add columns we use in the transformation with randomised values (hesin_oper) +hesin_oper['level'] = np.random.choice(range(1, 2), size=n_oper) +hesin_oper['opdate'] = (min_d + pd.to_timedelta(np.random.randint(d, size=n_oper), unit='d')).strftime("%d/%m/%Y") +hesin_oper['oper3'] = np.random.choice([704, 687, 4011, 387, 6961, 6512, 3811, + 608, 7421, 913, 979], size=n_oper) +hesin_oper['oper4'] = np.random.choice(['X998', 'Z942', 'Z943', 'G451', 'X403', 'Y981', 'H229', + 'C751', 'C712', 'Y534', 'Z274'], size=n_oper) + +# Add to the unused the most frequent value from the scan report (hesin_oper) +hesin_oper['arr_index'] = 0 +hesin_oper['oper3_nb'] = None +hesin_oper['oper4_nb'] = None +hesin_oper['posopdur'] = 0 +hesin_oper['preopdur'] = 0 + +# Write to csv +hesin.to_csv('hesin.csv', sep=',', index=False) +hesin_diag.to_csv('hesin_diag.csv', sep=',', index=False) +hesin_oper.to_csv('hesin_oper.csv', sep=',', index=False) diff --git a/src/main/python/transformation/covid19_emis_gp_clinical_scripts_to_visit_occurrence.py
b/src/main/python/transformation/covid19_emis_gp_clinical_scripts_to_visit_occurrence.py index e76f1505..f6242bda 100644 --- a/src/main/python/transformation/covid19_emis_gp_clinical_scripts_to_visit_occurrence.py +++ b/src/main/python/transformation/covid19_emis_gp_clinical_scripts_to_visit_occurrence.py @@ -10,20 +10,28 @@ def covid19_emis_gp_clinical_scripts_to_visit_occurrence(wrapper: Wrapper) -> List[Wrapper.cdm.VisitOccurrence]: - clinical_source = wrapper.source_data.get_source_file('covid19_emis_gp_clinical.csv') - clinical = clinical_source.get_csv_as_df(apply_dtypes=False, usecols=['eid', 'event_dt']) + source = wrapper.source_data._source_dir + clinical = pd.read_csv(source / 'covid19_emis_gp_clinical.csv', usecols=['eid', 'event_dt'], + dtype={'eid': 'Int32', 'event_id': 'datetime64'}) clinical = clinical[["eid", "event_dt"]].rename(columns={'event_dt': 'date'}) - scripts_source = wrapper.source_data.get_source_file('covid19_emis_gp_scripts.csv') - scripts = scripts_source.get_csv_as_df(apply_dtypes=False, usecols=['eid', 'issue_date']) + scripts = pd.read_csv(source / 'covid19_emis_gp_scripts.csv', usecols=['eid', 'issue_date'], + dtype={'eid': 'Int32', 'event_id': 'datetime64'}) scripts = scripts[["eid", "issue_date"]].rename(columns={'issue_date': 'date'}) - df = pd.concat([scripts, clinical]) - df = df.drop_duplicates(['eid', 'date']) + clinical = clinical.append(scripts) + del scripts # to reduce memory use - for _, row in df.iterrows(): - visit_date = wrapper.get_gp_datetime(row['date'], - person_source_value=row['eid'], + clinical = clinical.drop_duplicates(['eid', 'date']) + + for _, row in clinical.iterrows(): + if row.isnull().any(): + continue + eid = row['eid'] + eid_str = str(eid) + date = row['date'] + visit_date = wrapper.get_gp_datetime(date, + person_source_value=eid_str, format="%d/%m/%Y", default_date=None) @@ -32,8 +40,8 @@ def covid19_emis_gp_clinical_scripts_to_visit_occurrence(wrapper: Wrapper) -> Li continue yield 
wrapper.cdm.VisitOccurrence( - visit_occurrence_id=create_gp_emis_visit_occurrence_id(row['eid'], visit_date), - person_id=row['eid'], + visit_occurrence_id=create_gp_emis_visit_occurrence_id(eid_str, visit_date), + person_id=eid, visit_concept_id=38004453, # Family Practice visit_start_date=visit_date.date(), visit_start_datetime=visit_date, diff --git a/src/main/python/transformation/covid19_tpp_gp_clinical_scripts_to_visit_occurrence.py b/src/main/python/transformation/covid19_tpp_gp_clinical_scripts_to_visit_occurrence.py index 61d7cfd6..7f161229 100644 --- a/src/main/python/transformation/covid19_tpp_gp_clinical_scripts_to_visit_occurrence.py +++ b/src/main/python/transformation/covid19_tpp_gp_clinical_scripts_to_visit_occurrence.py @@ -10,20 +10,29 @@ def covid19_tpp_gp_clinical_scripts_to_visit_occurrence(wrapper: Wrapper) -> List[Wrapper.cdm.VisitOccurrence]: - clinical_source = wrapper.source_data.get_source_file('covid19_tpp_gp_clinical.csv') - clinical = clinical_source.get_csv_as_df(apply_dtypes=False, usecols=['eid', 'event_dt']) + source = wrapper.source_data._source_dir + clinical = pd.read_csv(source / 'covid19_tpp_gp_clinical.csv', usecols=['eid', 'event_dt'], + dtype={'eid': 'Int32', 'event_id': 'datetime64'}) clinical = clinical[["eid", "event_dt"]].rename(columns={'event_dt': 'date'}) - scripts_source = wrapper.source_data.get_source_file('covid19_tpp_gp_scripts.csv') - scripts = scripts_source.get_csv_as_df(apply_dtypes=False, usecols=['eid', 'issue_date']) + scripts = pd.read_csv(source / 'covid19_tpp_gp_scripts.csv', usecols=['eid', 'issue_date'], + dtype={'eid': 'Int32', 'event_id': 'datetime64'}) scripts = scripts[["eid", "issue_date"]].rename(columns={'issue_date': 'date'}) - df = pd.concat([scripts, clinical]) - df = df.drop_duplicates(['eid', 'date']) + clinical = clinical.append(scripts) + del scripts # to reduce memory use - for _, row in df.iterrows(): - visit_date = wrapper.get_gp_datetime(row['date'], - 
person_source_value=row['eid'], + clinical = clinical.drop_duplicates(['eid', 'date']) + + for _, row in clinical.iterrows(): + if row.isnull().any(): + continue + eid = row['eid'] + eid_str = str(eid) + date = row['date'] + + visit_date = wrapper.get_gp_datetime(date, + person_source_value=eid_str, format="%d/%m/%Y", default_date=None) @@ -32,8 +41,8 @@ def covid19_tpp_gp_clinical_scripts_to_visit_occurrence(wrapper: Wrapper) -> Lis continue yield wrapper.cdm.VisitOccurrence( - visit_occurrence_id=create_gp_tpp_visit_occurrence_id(row['eid'], visit_date), - person_id=row['eid'], + visit_occurrence_id=create_gp_tpp_visit_occurrence_id(eid_str, visit_date), + person_id=eid, visit_concept_id=38004453, # Family Practice visit_start_date=visit_date.date(), visit_start_datetime=visit_date, diff --git a/src/main/python/transformation/hesin_diag_to_condition_occurrence.py b/src/main/python/transformation/hesin_diag_to_condition_occurrence.py index 9a8e971b..be34fa79 100644 --- a/src/main/python/transformation/hesin_diag_to_condition_occurrence.py +++ b/src/main/python/transformation/hesin_diag_to_condition_occurrence.py @@ -14,10 +14,14 @@ def hesin_diag_to_condition_occurrence(wrapper: Wrapper) -> List[Wrapper.cdm.ConditionOccurrence]: + # Load hesin and hesin_diag tables, with selected columns to avoid memory failures hesin_diag_source = wrapper.source_data.get_source_file('hesin_diag.csv') - hesin_diag = hesin_diag_source.get_csv_as_df(apply_dtypes=False) + hesin_diag = hesin_diag_source.get_csv_as_df(apply_dtypes=False, usecols=['eid', 'ins_index', + 'diag_icd9', 'diag_icd10', + 'level']) hesin_source = wrapper.source_data.get_source_file('hesin.csv') - hesin = hesin_source.get_csv_as_df(apply_dtypes=False) + hesin = hesin_source.get_csv_as_df(apply_dtypes=False, usecols=['eid', 'ins_index', 'spell_index', + 'admidate', 'dsource']) hesin = hesin.drop_duplicates(subset=['eid', 'ins_index']) # fix for synthetic data # Merge HES diag with HES on EID and INS_INDEX to get 
ADMIDATE and drop duplicates. diff --git a/src/main/python/transformation/hesin_oper_to_procedure_occurrence.py b/src/main/python/transformation/hesin_oper_to_procedure_occurrence.py index 95f24e95..eb7eb660 100644 --- a/src/main/python/transformation/hesin_oper_to_procedure_occurrence.py +++ b/src/main/python/transformation/hesin_oper_to_procedure_occurrence.py @@ -12,10 +12,14 @@ def hesin_oper_to_procedure_occurrence(wrapper: Wrapper) -> List[Wrapper.cdm.ProcedureOccurrence]: + # Load hesin and hesin_oper tables, with selected columns to avoid memory failures hesin_oper_source = wrapper.source_data.get_source_file('hesin_oper.csv') - hesin_oper = hesin_oper_source.get_csv_as_df(apply_dtypes=False) + hesin_oper = hesin_oper_source.get_csv_as_df(apply_dtypes=False, usecols=['eid', 'ins_index', + 'oper4', 'oper3', + 'opdate', 'level']) hesin_source = wrapper.source_data.get_source_file('hesin.csv') - hesin = hesin_source.get_csv_as_df(apply_dtypes=False) + hesin = hesin_source.get_csv_as_df(apply_dtypes=False, usecols=['eid', 'ins_index', 'spell_index', + 'dsource']) hesin = hesin.drop_duplicates(subset=['eid', 'ins_index']) # fix for synthetic data df = hesin_oper.merge(hesin, on=['eid', 'ins_index'], how='left', suffixes=('', '_x')) diff --git a/src/main/python/wrapper.py b/src/main/python/wrapper.py index 327d9709..17af5469 100644 --- a/src/main/python/wrapper.py +++ b/src/main/python/wrapper.py @@ -114,7 +114,7 @@ def transform(self): self.execute_batch_transformation(hesin_oper_to_procedure_occurrence, bulk=True, batch_size=100000) if self.load_gp_covid19: - # these are expected to be the most memory heavy transformations. Execut last + # these are expected to be the most memory heavy transformations. 
Execute last self.execute_batch_transformation(covid19_emis_gp_clinical_scripts_to_visit_occurrence, bulk=True, batch_size=100000) self.execute_batch_transformation(covid19_tpp_gp_clinical_scripts_to_visit_occurrence, bulk=True, batch_size=100000) diff --git a/src/test/R/main_test.sh b/src/test/R/main_test.sh new file mode 100644 index 00000000..7f13c353 --- /dev/null +++ b/src/test/R/main_test.sh @@ -0,0 +1,7 @@ +cd src/test/R +R -f run_create_tests.R +cd ../../../ +python main.py -c config/config-test.yml +cd src/test/R +R -f run_evaluate_tests.R +cd ../../../ \ No newline at end of file