[DC-2692] adding changes based on stashed files
* sets some run_for_synthetic rules to False to avoid dropping too much test data
lrwb-aou committed Feb 27, 2023
1 parent 1423b95 commit d5dbaca
Showing 5 changed files with 35 additions and 7 deletions.
4 changes: 2 additions & 2 deletions data_steward/cdr_cleaner/cleaning_rules/drop_orphaned_pids.py
@@ -48,7 +48,7 @@
USING (person_id)
LEFT JOIN (SELECT DISTINCT person_id FROM `{{project}}.{{dataset}}.visit_detail`) vd
USING (person_id)
-where co.person_id is NULL AND d.person_id is NULL AND dee.person_id is NULL AND dre.person_id is NULL
+where co.person_id is NULL AND d.person_id is NULL AND dee.person_id is NULL AND dre.person_id is NULL
AND m.person_id is NULL AND n.person_id is NULL AND o.person_id is NULL AND po.person_id is NULL
AND s.person_id is NULL AND vo.person_id is NULL AND vd.person_id is NULL
""")
@@ -97,7 +97,7 @@ def __init__(self,
dataset_id=dataset_id,
sandbox_dataset_id=sandbox_dataset_id,
table_namer=table_namer,
-run_for_synthetic=True)
+run_for_synthetic=False)

def get_query_specs(self):
"""
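The flip above (repeated in the next file) disables this cleaning rule for synthetic runs, matching the commit message about not dropping too much test data. A minimal sketch of how such a flag can gate rule selection, assuming the pipeline's runner filters rules on a run_for_synthetic attribute; the runner and rule names here are illustrative, not the repo's actual classes:

    # Sketch: skip rules flagged run_for_synthetic=False when cleaning
    # synthetic data, so aggressive drops do not wipe out sparse test data.
    class Rule:
        def __init__(self, name: str, run_for_synthetic: bool):
            self.name = name
            self.run_for_synthetic = run_for_synthetic

    def rules_to_run(rules, synthetic: bool):
        if synthetic:
            return [r for r in rules if r.run_for_synthetic]
        return list(rules)

    rules = [Rule('DropOrphanedPids', run_for_synthetic=False),
             Rule('PopulateSurveyConductExt', run_for_synthetic=True)]
    print([r.name for r in rules_to_run(rules, synthetic=True)])
    # -> ['PopulateSurveyConductExt']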
@@ -82,7 +82,7 @@ def __init__(self,
dataset_id=dataset_id,
sandbox_dataset_id=sandbox_dataset_id,
table_namer=table_namer,
-run_for_synthetic=True)
+run_for_synthetic=False)

def get_query_specs(self):
"""
@@ -24,6 +24,14 @@
AND UPPER(qrai.type) = 'LANGUAGE'
""")

+QRID_RID_MAPPING_QUERY = JINJA_ENV.from_string("""
+-- must update the conduct_id now so the extension table is joinable on the de-identified survey_conduct table --
+UPDATE `{{project_id}}.{{dataset_id}}.survey_conduct_ext` t
+SET t.survey_conduct_id = m.research_response_id
+FROM `{{project_id}}.{{deid_questionnaire_response_map_dataset_id}}.{{deid_questionnaire_response_map}}` m
+WHERE t.survey_conduct_id = m.questionnaire_response_id
+""")


class PopulateSurveyConductExt(BaseCleaningRule):
"""
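The added QRID_RID_MAPPING_QUERY rewrites survey_conduct_ext.survey_conduct_id from the raw questionnaire response ID (QRID) to the de-identified research response ID (RID), so the extension table still joins to the de-identified survey_conduct table. A hedged sketch of rendering and running such a template with jinja2 and google-cloud-bigquery; the project, dataset, and mapping-table values are placeholders, not the pipeline's real configuration:

    from jinja2 import Template
    from google.cloud import bigquery

    # Placeholder values; the real pipeline supplies these from its run config.
    params = {
        'project_id': 'my-project',
        'dataset_id': 'deid_dataset',
        'deid_questionnaire_response_map_dataset_id': 'mapping_dataset',
        'deid_questionnaire_response_map': 'qrid_rid_map',  # hypothetical name
    }

    sql = Template("""
    UPDATE `{{project_id}}.{{dataset_id}}.survey_conduct_ext` t
    SET t.survey_conduct_id = m.research_response_id
    FROM `{{project_id}}.{{deid_questionnaire_response_map_dataset_id}}.{{deid_questionnaire_response_map}}` m
    WHERE t.survey_conduct_id = m.questionnaire_response_id
    """).render(**params)

    client = bigquery.Client(project=params['project_id'])
    client.query(sql).result()  # blocks until the UPDATE completes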
5 changes: 4 additions & 1 deletion data_steward/tools/create_rdr_snapshot.py
@@ -102,7 +102,10 @@ def main(raw_args=None):

# copy raw data into staging dataset
copy_raw_rdr_tables(bq_client, args.rdr_dataset, datasets.get('staging'))
-
+LOGGER.info("going to sleep")
+import time
+time.sleep(30)
+LOGGER.info("done sleeping")
# clean the RDR staging dataset
cleaning_args = [
'-p', args.curation_project_id, '-d',
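The LOGGER/sleep lines above read like temporary pacing to let the copied raw tables settle before cleaning starts. If that is the intent, polling for the copied tables is a more direct alternative to a fixed 30-second sleep; a sketch under that assumption (wait_for_table is not a helper from this repo):

    import time
    from google.api_core.exceptions import NotFound
    from google.cloud import bigquery

    def wait_for_table(client: bigquery.Client, fq_table: str,
                       timeout: float = 120.0, interval: float = 5.0) -> None:
        """Poll until fq_table ('project.dataset.table') is visible or timeout."""
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            try:
                client.get_table(fq_table)  # raises NotFound until the table exists
                return
            except NotFound:
                time.sleep(interval)
        raise TimeoutError(f'{fq_table} not visible after {timeout}s')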
23 changes: 20 additions & 3 deletions data_steward/tools/create_synthetic.py
@@ -207,7 +207,7 @@ def _remove_mapping_tables(bq_client, project_id, final_dataset_name):
bq_client.delete_table(table)


-def update_domain_table_id(client: BigQueryClient, fq_table: str):
+def _update_domain_table_id(client: BigQueryClient, fq_table: str):
que = (
f"UPDATE `{fq_table}` "
f"SET {fq_table.split('.')[-1]}_id = {fq_table.split('.')[-1]}_id + 1000000000000000 "
@@ -243,13 +243,30 @@ def create_tier(project_id: str, input_dataset: str, release_tag: str,
bq_client.copy_dataset(f'{project_id}.{input_dataset}',
f'{project_id}.{datasets[consts.STAGING]}')

+import time
+LOGGER.info("sleeping after the copy")
+time.sleep(30)
+LOGGER.info("Done sleeping after copy")

# 1. add mapping tables
for domain_table in combine_consts.DOMAIN_TABLES:
LOGGER.info(f'Mapping {domain_table}...')
generate_combined_mapping_tables(bq_client, domain_table,
datasets[consts.STAGING], '',
datasets[consts.STAGING])

+if domain_table != 'survey_conduct':
+    _update_domain_table_id(
+        bq_client,
+        f'{project_id}.{datasets[consts.STAGING]}.{domain_table}')

+_fix_survey_conduct_records(bq_client, project_id, datasets[consts.STAGING],
+                            datasets[consts.SANDBOX])

LOGGER.info("sleeping after generating mapping tables")
time.sleep(30)
LOGGER.info("Done sleeping generating mapping tables")

# Run cleaning rules
cleaning_args = [
'-p', project_id, '-d', datasets[consts.STAGING], '-b',
@@ -351,10 +368,10 @@ def main(raw_args=None) -> dict:
args.project_id, '--vocab_dataset', args.vocab_dataset, '--console_log'
])

-dataset_obj = dataset_obj.dataset_id
+dataset_id = dataset_obj.dataset_id

# Creates synthetic dataset and runs a subset of cleaning rules marked for synthetic data
-datasets = create_tier(args.project_id, dataset_obj, args.release_tag,
+datasets = create_tier(args.project_id, dataset_id, args.release_tag,
args.target_principal, **kwargs)

return datasets
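For context on _update_domain_table_id (renamed above with a leading underscore): it shifts every primary key in a domain table by 10^15, presumably to keep the staged IDs from colliding with IDs assigned later; survey_conduct is skipped, presumably because its IDs are rewritten through the QRID/RID map instead. A quick check of the statement the function builds; the trailing WHERE clause is cut off in the diff, and since BigQuery requires one, WHERE TRUE is an assumption here:

    def build_offset_update(fq_table: str) -> str:
        # Mirrors the f-string in the diff: the table's own name (last path
        # segment) is the stem of its primary key column. WHERE TRUE is an
        # assumed reconstruction of the clause elided from the diff.
        table = fq_table.split('.')[-1]
        return (f"UPDATE `{fq_table}` "
                f"SET {table}_id = {table}_id + 1000000000000000 "
                f"WHERE TRUE")

    print(build_offset_update('my-project.staging.observation'))
    # UPDATE `my-project.staging.observation` SET observation_id = observation_id + 1000000000000000 WHERE TRUE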

