Merge pull request #317 from EHDEN/changes_from_partial_run
Changes from partial run
SofiaMp authored Aug 4, 2021
2 parents a810929 + 7d72378 commit b9ebd75
Showing 8 changed files with 244 additions and 27 deletions.
56 changes: 56 additions & 0 deletions resources/synthetic_data/Hes_500,000/generate_gp_tables.py
@@ -0,0 +1,56 @@
import random
import pandas as pd
import numpy as np

# Number of unique ids
length = 500000
eid = list(range(length))

# Create a range of dates to sample from
min_d = pd.to_datetime('01/01/2000')
max_d = pd.to_datetime('31/12/2020')
d = (max_d - min_d).days + 1

# GP_clinical file
# Create a random number of records for each person id
gp_clin_temp = []
for i in eid:
x = random.randint(1, 11)
for j in range(x):
gp_clin_temp.append([i])

# Create the main GP data (gp_clinical)
gp_clin = pd.DataFrame(gp_clin_temp, columns=['eid'])
n_clin = len(gp_clin)
print(f'length of gp_clinical: {n_clin}')

# Add columns we use in the transformation with randomised values (gp_clinical)
gp_clin['event_dt'] = (min_d + pd.to_timedelta(np.random.randint(d, size=n_clin), unit='d')).strftime("%d/%m/%Y")

# Fill the unused columns with the most frequent value from the scan report (gp_clinical)
gp_clin['code_type'] = 0
gp_clin['code'] = '42L..'
gp_clin['value'] = 9.1000004

# GP_scripts file
# Create a random number of records for each person id
gp_scr_temp = []
for i in eid:
x = random.randint(1, 11)
for j in range(x):
gp_scr_temp.append([i])

# Create the main GP data (gp_scripts)
gp_scr = pd.DataFrame(gp_scr_temp, columns=['eid'])
n_scr = len(gp_scr)
print(f'length of gp_scripts: {n_scr}')

# Add columns we use in the transformation with randomised values (gp_scripts)
gp_scr['issue_date'] = (min_d + pd.to_timedelta(np.random.randint(d, size=n_scr), unit='d')).strftime("%d/%m/%Y")

# Fill the unused columns with the most frequent value from the scan report (gp_scripts)
gp_scr['dmd_code'] = 321177003

# Write to csv
gp_clin.to_csv('covid19_tpp_gp_clinical.csv', sep=',', index=False)
gp_scr.to_csv('covid19_tpp_gp_scripts.csv', sep=',', index=False)
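
The random-date idiom used above (draw uniform integer day offsets, add them to a base timestamp) recurs in all of these generators. A minimal standalone sketch of just that idiom, with illustrative names:

# Standalone sketch of the random-date idiom used above; variable names
# are illustrative. Draws 5 dates uniformly between min_d and max_d.
import numpy as np
import pandas as pd

min_d = pd.to_datetime('2000-01-01')
max_d = pd.to_datetime('2020-12-31')
span = (max_d - min_d).days + 1  # +1 so max_d itself can be drawn

offsets = np.random.randint(span, size=5)
dates = (min_d + pd.to_timedelta(offsets, unit='d')).strftime('%d/%m/%Y')
print(list(dates))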
129 changes: 129 additions & 0 deletions resources/synthetic_data/Hes_500,000/generate_hes_tables.py
@@ -0,0 +1,129 @@
import random
import pandas as pd
import numpy as np

# Number of unique ids
length = 7919
eid = list(range(length))

# Create a range of dates to sample from
min_d = pd.to_datetime('01/01/2000')
max_d = pd.to_datetime('31/12/2020')
d = (max_d - min_d).days + 1

# HESIN file
# Create a random number of records per person, with (eid, ins_index) unique
hes_temp = []
for i in eid:
x = random.randint(1, 137)
for j in range(x):
hes_temp.append([i, j])

# Create the main hospital data (hesin)
hesin = pd.DataFrame(hes_temp, columns=['eid', 'ins_index'])
n_hes = len(hesin)
print(f'length of main hes: {n_hes}')

# Add columns we use in the transformation with randomised values (hesin)
hesin['dsource'] = np.random.choice(['HES', 'SMR', 'PEDW'], size=n_hes)
hesin['epistart'] = (min_d + pd.to_timedelta(np.random.randint(d, size=n_hes), unit='d')).strftime("%d/%m/%Y")
hesin['epiend'] = (min_d + pd.to_timedelta(np.random.randint(d, size=n_hes), unit='d')).strftime("%d/%m/%Y")
hesin['spell_index'] = np.random.choice(range(1, 967), size=n_hes)
hesin['elecdate'] = '18/02/2010'
hesin['admidate'] = (min_d + pd.to_timedelta(np.random.randint(d, size=n_hes), unit='d')).strftime("%d/%m/%Y")
hesin['admimeth'] = np.random.choice(range(1, 101), size=n_hes)
hesin['admisorc'] = np.random.choice(range(1, 101), size=n_hes)
hesin['disdate'] = (min_d + pd.to_timedelta(np.random.randint(d, size=n_hes), unit='d')).strftime("%d/%m/%Y")
hesin['disdest'] = np.random.choice(range(1, 101), size=n_hes)

# Fill the unused columns with the most frequent value from the scan report (hesin)
hesin['source'] = 6
hesin['epidur'] = 0
hesin['bedyear'] = 0
hesin['epistat'] = 3
hesin['epitype'] = 1
hesin['epiorder'] = 1
hesin['spell_seq'] = 0
hesin['spelbgin'] = 2
hesin['spelend'] = 'Y'
hesin['speldur'] = 0
hesin['pctcode'] = '5N1'
hesin['gpprpct'] = '5N1'
hesin['category'] = 10
hesin['elecdur'] = 1
hesin['admimeth_uni'] = 1001
hesin['admisorc_uni'] = 1000
hesin['firstreg'] = 8
hesin['classpat_uni'] = 1000
hesin['classpat'] = 2
hesin['intmanag_uni'] = 200
hesin['intmanag'] = 2
hesin['mainspef_uni'] = 1350
hesin['mainspef'] = 100
hesin['tretspef_uni'] = 1490
hesin['tretspef'] = 300
hesin['operstat'] = 1
hesin['dismeth_uni'] = 1000
hesin['dismeth'] = 1
hesin['disdest_uni'] = 1000
hesin['carersi'] = 99

# HESIN_DIAG
# Create a random number of records per person, with (eid, ins_index) unique
diag_temp = []
for i in eid:
x = random.randint(1, 197)
for j in range(x):
diag_temp.append([i, j])

# Create the diagnosis data (hesin_diag)
hesin_diag = pd.DataFrame(diag_temp, columns=['eid', 'ins_index'])
n_diag = len(hesin_diag)
print(f'length of hes_diagnosis: {n_diag}')

# Add columns we use in the transformation with randomised values (hesin_diag)
hesin_diag['level'] = np.random.choice(range(1, 3), size=n_diag)
hesin_diag['diag_icd9'] = np.random.choice(['V252', '6262', '78909', '4549', '7890', '7865',
'6359', '4556', '1749', '5509', '5742', '6117',
'6289'], size=n_diag)
hesin_diag['diag_icd10'] = np.random.choice(['I10', 'E119', 'Z864', 'E780', 'J459', 'Z511',
'Z867', 'Z921', 'E039', 'I209', 'I48', 'I259',
'C509'], size=n_diag)

# Fill the unused columns with the most frequent value from the scan report (hesin_diag)
hesin_diag['arr_index'] = 0
hesin_diag['diag_icd9_nb'] = None
hesin_diag['diag_icd10_nb'] = 2

# HESIN_OPER
# Create a random number of records per person, with (eid, ins_index) unique
oper_temp = []
for i in eid:
x = random.randint(1, 163)
for j in range(x):
oper_temp.append([i, j])

# Create the operations data (hesin_oper)
hesin_oper = pd.DataFrame(oper_temp, columns=['eid', 'ins_index'])
n_oper = len(hesin_oper)
print(f'length of hes_operations: {n_oper}')

# Add columns we use in the transformation with randomised values (hesin_oper)
hesin_oper['level'] = np.random.choice(range(1, 2), size=n_oper)
hesin_oper['opdate'] = (min_d + pd.to_timedelta(np.random.randint(d, size=n_oper), unit='d')).strftime("%d/%m/%Y")
hesin_oper['oper3'] = np.random.choice([704, 687, 4011, 387, 6961, 6512, 3811,
608, 7421, 913, 979], size=n_oper)
hesin_oper['oper4'] = np.random.choice(['X998', 'Z942', 'Z943', 'G451', 'X403', 'Y981', 'H229',
'C751', 'C712', 'Y534', 'Z274'], size=n_oper)

# Fill the unused columns with the most frequent value from the scan report (hesin_oper)
hesin_oper['arr_index'] = 0
hesin_oper['oper3_nb'] = None
hesin_oper['oper4_nb'] = None
hesin_oper['posopdur'] = 0
hesin_oper['preopdur'] = 0

# Write to csv
hesin.to_csv('hesin.csv', sep=',', index=False)
hesin_diag.to_csv('hesin_diag.csv', sep=',', index=False)
hesin_oper.to_csv('hesin_oper.csv', sep=',', index=False)
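
The transformations changed below deduplicate hesin on (eid, ins_index) as a "fix for synthetic data"; with this generator the pair is already unique by construction, since ins_index runs 0..x-1 within each eid. A quick sanity check, assuming the CSVs above sit in the working directory:

# Sanity check on the generated file: (eid, ins_index) should be a unique
# key, since ins_index runs 0..x-1 within each eid. Assumes hesin.csv is
# in the current working directory.
import pandas as pd

hesin = pd.read_csv('hesin.csv', usecols=['eid', 'ins_index'])
dups = hesin.duplicated(subset=['eid', 'ins_index']).sum()
print(f'duplicate (eid, ins_index) pairs: {dups}')  # expected: 0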
@@ -10,20 +10,28 @@
 
 
 def covid19_emis_gp_clinical_scripts_to_visit_occurrence(wrapper: Wrapper) -> List[Wrapper.cdm.VisitOccurrence]:
-    clinical_source = wrapper.source_data.get_source_file('covid19_emis_gp_clinical.csv')
-    clinical = clinical_source.get_csv_as_df(apply_dtypes=False, usecols=['eid', 'event_dt'])
+    source = wrapper.source_data._source_dir
+    clinical = pd.read_csv(source / 'covid19_emis_gp_clinical.csv', usecols=['eid', 'event_dt'],
+                           dtype={'eid': 'Int32', 'event_id': 'datetime64'})
     clinical = clinical[["eid", "event_dt"]].rename(columns={'event_dt': 'date'})
 
-    scripts_source = wrapper.source_data.get_source_file('covid19_emis_gp_scripts.csv')
-    scripts = scripts_source.get_csv_as_df(apply_dtypes=False, usecols=['eid', 'issue_date'])
+    scripts = pd.read_csv(source / 'covid19_emis_gp_scripts.csv', usecols=['eid', 'issue_date'],
+                          dtype={'eid': 'Int32', 'event_id': 'datetime64'})
     scripts = scripts[["eid", "issue_date"]].rename(columns={'issue_date': 'date'})
 
-    df = pd.concat([scripts, clinical])
-    df = df.drop_duplicates(['eid', 'date'])
+    clinical = clinical.append(scripts)
+    del scripts # to reduce memory use
+
+    clinical = clinical.drop_duplicates(['eid', 'date'])
 
-    for _, row in df.iterrows():
-        visit_date = wrapper.get_gp_datetime(row['date'],
-                                             person_source_value=row['eid'],
+    for _, row in clinical.iterrows():
+        if row.isnull().any():
+            continue
+        eid = row['eid']
+        eid_str = str(eid)
+        date = row['date']
+        visit_date = wrapper.get_gp_datetime(date,
+                                             person_source_value=eid_str,
                                              format="%d/%m/%Y",
                                              default_date=None)
 
@@ -32,8 +40,8 @@ def covid19_emis_gp_clinical_scripts_to_visit_occurrence(wrapper: Wrapper) -> List[Wrapper.cdm.VisitOccurrence]:
             continue
 
         yield wrapper.cdm.VisitOccurrence(
-            visit_occurrence_id=create_gp_emis_visit_occurrence_id(row['eid'], visit_date),
-            person_id=row['eid'],
+            visit_occurrence_id=create_gp_emis_visit_occurrence_id(eid_str, visit_date),
+            person_id=eid,
             visit_concept_id=38004453, # Family Practice
             visit_start_date=visit_date.date(),
             visit_start_datetime=visit_date,
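
Two notes on this rewrite (the TPP loader below follows the identical pattern): reading with pd.read_csv plus usecols and a compact dtype is the main memory win, since only the two needed columns are parsed and eid is held as a nullable 32-bit integer; and the dtype key 'event_id' matches no selected column (perhaps intended as 'event_dt'), so it does not affect the columns actually read. DataFrame.append, current pandas style when this was merged, has since been removed in favour of pd.concat. A generic sketch of the lean-load pattern, with placeholder file and column names:

# Generic sketch of the lean-load pattern: parse only the needed columns and
# use a compact nullable integer dtype. File and column names are placeholders.
import pandas as pd

df = pd.read_csv('events.csv',
                 usecols=['eid', 'event_dt'],  # skip all other columns at parse time
                 dtype={'eid': 'Int32'})       # nullable 32-bit int instead of int64
# read_csv leaves dates as strings; parse explicitly if needed:
df['event_dt'] = pd.to_datetime(df['event_dt'], format='%d/%m/%Y', errors='coerce')
print(df.dtypes)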
@@ -10,20 +10,29 @@
 
 
 def covid19_tpp_gp_clinical_scripts_to_visit_occurrence(wrapper: Wrapper) -> List[Wrapper.cdm.VisitOccurrence]:
-    clinical_source = wrapper.source_data.get_source_file('covid19_tpp_gp_clinical.csv')
-    clinical = clinical_source.get_csv_as_df(apply_dtypes=False, usecols=['eid', 'event_dt'])
+    source = wrapper.source_data._source_dir
+    clinical = pd.read_csv(source / 'covid19_tpp_gp_clinical.csv', usecols=['eid', 'event_dt'],
+                           dtype={'eid': 'Int32', 'event_id': 'datetime64'})
     clinical = clinical[["eid", "event_dt"]].rename(columns={'event_dt': 'date'})
 
-    scripts_source = wrapper.source_data.get_source_file('covid19_tpp_gp_scripts.csv')
-    scripts = scripts_source.get_csv_as_df(apply_dtypes=False, usecols=['eid', 'issue_date'])
+    scripts = pd.read_csv(source / 'covid19_tpp_gp_scripts.csv', usecols=['eid', 'issue_date'],
+                          dtype={'eid': 'Int32', 'event_id': 'datetime64'})
     scripts = scripts[["eid", "issue_date"]].rename(columns={'issue_date': 'date'})
 
-    df = pd.concat([scripts, clinical])
-    df = df.drop_duplicates(['eid', 'date'])
+    clinical = clinical.append(scripts)
+    del scripts # to reduce memory use
+
+    clinical = clinical.drop_duplicates(['eid', 'date'])
 
-    for _, row in df.iterrows():
-        visit_date = wrapper.get_gp_datetime(row['date'],
-                                             person_source_value=row['eid'],
+    for _, row in clinical.iterrows():
+        if row.isnull().any():
+            continue
+        eid = row['eid']
+        eid_str = str(eid)
+        date = row['date']
+
+        visit_date = wrapper.get_gp_datetime(date,
+                                             person_source_value=eid_str,
                                              format="%d/%m/%Y",
                                              default_date=None)
 
@@ -32,8 +41,8 @@ def covid19_tpp_gp_clinical_scripts_to_visit_occurrence(wrapper: Wrapper) -> List[Wrapper.cdm.VisitOccurrence]:
             continue
 
         yield wrapper.cdm.VisitOccurrence(
-            visit_occurrence_id=create_gp_tpp_visit_occurrence_id(row['eid'], visit_date),
-            person_id=row['eid'],
+            visit_occurrence_id=create_gp_tpp_visit_occurrence_id(eid_str, visit_date),
+            person_id=eid,
             visit_concept_id=38004453, # Family Practice
             visit_start_date=visit_date.date(),
             visit_start_datetime=visit_date,
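
Both loaders now skip rows containing nulls before parsing dates. wrapper.get_gp_datetime itself is outside this diff; a hypothetical stand-in with the same shape, purely for illustration:

# Hypothetical stand-in for wrapper.get_gp_datetime (its implementation is
# not part of this diff): parse with a fixed format, fall back to a default
# on failure. The real method also takes person_source_value, omitted here.
from datetime import datetime
from typing import Optional

def get_gp_datetime_sketch(value: str, format: str = '%d/%m/%Y',
                           default_date: Optional[datetime] = None) -> Optional[datetime]:
    try:
        return datetime.strptime(value, format)
    except (TypeError, ValueError):
        return default_date

print(get_gp_datetime_sketch('05/08/2021'))  # 2021-08-05 00:00:00
print(get_gp_datetime_sketch('not-a-date'))  # None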
@@ -14,10 +14,14 @@
 
 
 def hesin_diag_to_condition_occurrence(wrapper: Wrapper) -> List[Wrapper.cdm.ConditionOccurrence]:
+    # Load hesin and hesin_diag tables, with selected columns to avoid memory failures
     hesin_diag_source = wrapper.source_data.get_source_file('hesin_diag.csv')
-    hesin_diag = hesin_diag_source.get_csv_as_df(apply_dtypes=False)
+    hesin_diag = hesin_diag_source.get_csv_as_df(apply_dtypes=False, usecols=['eid', 'ins_index',
+                                                                              'diag_icd9', 'diag_icd10',
+                                                                              'level'])
     hesin_source = wrapper.source_data.get_source_file('hesin.csv')
-    hesin = hesin_source.get_csv_as_df(apply_dtypes=False)
+    hesin = hesin_source.get_csv_as_df(apply_dtypes=False, usecols=['eid', 'ins_index', 'spell_index',
+                                                                    'admidate', 'dsource'])
+    hesin = hesin.drop_duplicates(subset=['eid', 'ins_index']) # fix for synthetic data
 
     # Merge HES diag with HES on EID and INS_INDEX to get ADMIDATE and drop duplicates.
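
The added drop_duplicates matters because the left merge that follows it fans out rows whenever the right-hand key (eid, ins_index) is duplicated. A toy illustration:

# Toy illustration: a left merge fans out rows when the right-hand key is
# duplicated, which is what the drop_duplicates fix prevents.
import pandas as pd

diag = pd.DataFrame({'eid': [1], 'ins_index': [0], 'diag_icd10': ['I10']})
hesin = pd.DataFrame({'eid': [1, 1], 'ins_index': [0, 0],
                      'admidate': ['01/01/2010', '02/02/2011']})

print(len(diag.merge(hesin, on=['eid', 'ins_index'], how='left')))  # 2: fan-out

hesin = hesin.drop_duplicates(subset=['eid', 'ins_index'])
print(len(diag.merge(hesin, on=['eid', 'ins_index'], how='left')))  # 1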
@@ -12,10 +12,14 @@
 
 
 def hesin_oper_to_procedure_occurrence(wrapper: Wrapper) -> List[Wrapper.cdm.ProcedureOccurrence]:
+    # Load hesin and hesin_oper tables, with selected columns to avoid memory failures
     hesin_oper_source = wrapper.source_data.get_source_file('hesin_oper.csv')
-    hesin_oper = hesin_oper_source.get_csv_as_df(apply_dtypes=False)
+    hesin_oper = hesin_oper_source.get_csv_as_df(apply_dtypes=False, usecols=['eid', 'ins_index',
+                                                                              'oper4', 'oper3',
+                                                                              'opdate', 'level'])
     hesin_source = wrapper.source_data.get_source_file('hesin.csv')
-    hesin = hesin_source.get_csv_as_df(apply_dtypes=False)
+    hesin = hesin_source.get_csv_as_df(apply_dtypes=False, usecols=['eid', 'ins_index', 'spell_index',
+                                                                    'dsource'])
+    hesin = hesin.drop_duplicates(subset=['eid', 'ins_index']) # fix for synthetic data
 
     df = hesin_oper.merge(hesin, on=['eid', 'ins_index'], how='left', suffixes=('', '_x'))
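
This merge passes suffixes=('', '_x'), so left-hand columns keep their names and only overlapping right-hand columns are renamed. A quick demonstration with toy frames:

# suffixes=('', '_x'): the left table's columns keep their names; only the
# right-hand copies of overlapping columns receive the suffix.
import pandas as pd

left = pd.DataFrame({'eid': [1], 'level': [1]})
right = pd.DataFrame({'eid': [1], 'level': [9]})

merged = left.merge(right, on='eid', how='left', suffixes=('', '_x'))
print(list(merged.columns))  # ['eid', 'level', 'level_x']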
2 changes: 1 addition & 1 deletion src/main/python/wrapper.py
@@ -114,7 +114,7 @@ def transform(self):
             self.execute_batch_transformation(hesin_oper_to_procedure_occurrence, bulk=True, batch_size=100000)
 
         if self.load_gp_covid19:
-            # these are expected to be the most memory heavy transformations. Execut last
+            # these are expected to be the most memory heavy transformations. Execute last
             self.execute_batch_transformation(covid19_emis_gp_clinical_scripts_to_visit_occurrence, bulk=True, batch_size=100000)
             self.execute_batch_transformation(covid19_tpp_gp_clinical_scripts_to_visit_occurrence, bulk=True, batch_size=100000)
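
execute_batch_transformation is not shown in this diff. As a rough sketch of the pattern its name suggests, consuming a record generator in fixed-size batches, under the assumption that transformations yield records one at a time:

# Hypothetical sketch of batched consumption of a record generator.
# Not the repo's implementation; names and signature are assumptions.
from itertools import islice
from typing import Callable, Iterator

def execute_in_batches(transformation: Callable[[], Iterator], batch_size: int = 100000) -> None:
    records = transformation()
    while True:
        batch = list(islice(records, batch_size))
        if not batch:
            break
        print(f'inserting {len(batch)} records')  # stand-in for a bulk insert

def demo_transformation() -> Iterator[int]:
    yield from range(250000)

execute_in_batches(demo_transformation)  # inserts 100000, 100000, 50000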
7 changes: 7 additions & 0 deletions src/test/R/main_test.sh
@@ -0,0 +1,7 @@
cd src/test/R
R -f run_create_tests.R
cd ../../../
python main.py -c config/config-test.yml
cd src/test/R
R -f run_evaluate_tests.R
cd ../../../
