[DC-2692] adding changes based on stashed files
* sets some run_for_synthetic rules to False to avoid dropping too much test data
lrwb-aou committed Feb 27, 2023
1 parent 1423b95 commit d5dbaca
Showing 5 changed files with 35 additions and 7 deletions.
4 changes: 2 additions & 2 deletions data_steward/cdr_cleaner/cleaning_rules/drop_orphaned_pids.py
@@ -48,7 +48,7 @@
USING (person_id)
LEFT JOIN (SELECT DISTINCT person_id FROM `{{project}}.{{dataset}}.visit_detail`) vd
USING (person_id)
-where co.person_id is NULL AND d.person_id is NULL AND dee.person_id is NULL AND dre.person_id is NULL
+where co.person_id is NULL AND d.person_id is NULL AND dee.person_id is NULL AND dre.person_id is NULL
AND m.person_id is NULL AND n.person_id is NULL AND o.person_id is NULL AND po.person_id is NULL
AND s.person_id is NULL AND vo.person_id is NULL AND vd.person_id is NULL
""")
@@ -97,7 +97,7 @@ def __init__(self,
dataset_id=dataset_id,
sandbox_dataset_id=sandbox_dataset_id,
table_namer=table_namer,
-run_for_synthetic=True)
+run_for_synthetic=False)

def get_query_specs(self):
"""
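The flip above (repeated in the next file) disables this cleaning rule for synthetic runs, matching the commit message about not dropping too much test data. A minimal sketch of how such a flag can gate rule selection, assuming the pipeline's runner filters rules on a run_for_synthetic attribute; the runner and rule names here are illustrative, not the repo's actual classes:

    # Sketch: skip rules flagged run_for_synthetic=False when cleaning
    # synthetic data, so aggressive drops do not wipe out sparse test data.
    class Rule:
        def __init__(self, name: str, run_for_synthetic: bool):
            self.name = name
            self.run_for_synthetic = run_for_synthetic

    def rules_to_run(rules, synthetic: bool):
        if synthetic:
            return [r for r in rules if r.run_for_synthetic]
        return list(rules)

    rules = [Rule('DropOrphanedPids', run_for_synthetic=False),
             Rule('PopulateSurveyConductExt', run_for_synthetic=True)]
    print([r.name for r in rules_to_run(rules, synthetic=True)])
    # -> ['PopulateSurveyConductExt']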
@@ -82,7 +82,7 @@ def __init__(self,
dataset_id=dataset_id,
sandbox_dataset_id=sandbox_dataset_id,
table_namer=table_namer,
-run_for_synthetic=True)
+run_for_synthetic=False)

def get_query_specs(self):
"""
@@ -24,6 +24,14 @@
AND UPPER(qrai.type) = 'LANGUAGE'
""")

+QRID_RID_MAPPING_QUERY = JINJA_ENV.from_string("""
+-- must update the conduct_id now so the extension table is joinable on the de-identified survey_conduct table --
+UPDATE `{{project_id}}.{{dataset_id}}.survey_conduct_ext` t
+SET t.survey_conduct_id = m.research_response_id
+FROM `{{project_id}}.{{deid_questionnaire_response_map_dataset_id}}.{{deid_questionnaire_response_map}}` m
+WHERE t.survey_conduct_id = m.questionnaire_response_id
+""")


class PopulateSurveyConductExt(BaseCleaningRule):
"""
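The added QRID_RID_MAPPING_QUERY rewrites survey_conduct_ext.survey_conduct_id from the raw questionnaire response ID (QRID) to the de-identified research response ID (RID), so the extension table still joins to the de-identified survey_conduct table. A hedged sketch of rendering and running such a template with jinja2 and google-cloud-bigquery; the project, dataset, and mapping-table values are placeholders, not the pipeline's real configuration:

    from jinja2 import Template
    from google.cloud import bigquery

    # Placeholder values; the real pipeline supplies these from its run config.
    params = {
        'project_id': 'my-project',
        'dataset_id': 'deid_dataset',
        'deid_questionnaire_response_map_dataset_id': 'mapping_dataset',
        'deid_questionnaire_response_map': 'qrid_rid_map',  # hypothetical name
    }

    sql = Template("""
    UPDATE `{{project_id}}.{{dataset_id}}.survey_conduct_ext` t
    SET t.survey_conduct_id = m.research_response_id
    FROM `{{project_id}}.{{deid_questionnaire_response_map_dataset_id}}.{{deid_questionnaire_response_map}}` m
    WHERE t.survey_conduct_id = m.questionnaire_response_id
    """).render(**params)

    client = bigquery.Client(project=params['project_id'])
    client.query(sql).result()  # blocks until the UPDATE completes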
5 changes: 4 additions & 1 deletion data_steward/tools/create_rdr_snapshot.py
@@ -102,7 +102,10 @@ def main(raw_args=None):

# copy raw data into staging dataset
copy_raw_rdr_tables(bq_client, args.rdr_dataset, datasets.get('staging'))
-
+LOGGER.info("going to sleep")
+import time
+time.sleep(30)
+LOGGER.info("done sleeping")
# clean the RDR staging dataset
cleaning_args = [
'-p', args.curation_project_id, '-d',
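The LOGGER/sleep lines above read like temporary pacing to let the copied raw tables settle before cleaning starts. If that is the intent, polling for the copied tables is a more direct alternative to a fixed 30-second sleep; a sketch under that assumption (wait_for_table is not a helper from this repo):

    import time
    from google.api_core.exceptions import NotFound
    from google.cloud import bigquery

    def wait_for_table(client: bigquery.Client, fq_table: str,
                       timeout: float = 120.0, interval: float = 5.0) -> None:
        """Poll until fq_table ('project.dataset.table') is visible or timeout."""
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            try:
                client.get_table(fq_table)  # raises NotFound until the table exists
                return
            except NotFound:
                time.sleep(interval)
        raise TimeoutError(f'{fq_table} not visible after {timeout}s')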
23 changes: 20 additions & 3 deletions data_steward/tools/create_synthetic.py
@@ -207,7 +207,7 @@ def _remove_mapping_tables(bq_client, project_id, final_dataset_name):
bq_client.delete_table(table)


-def update_domain_table_id(client: BigQueryClient, fq_table: str):
+def _update_domain_table_id(client: BigQueryClient, fq_table: str):
que = (
f"UPDATE `{fq_table}` "
f"SET {fq_table.split('.')[-1]}_id = {fq_table.split('.')[-1]}_id + 1000000000000000 "
@@ -243,13 +243,30 @@ def create_tier(project_id: str, input_dataset: str, release_tag: str,
bq_client.copy_dataset(f'{project_id}.{input_dataset}',
f'{project_id}.{datasets[consts.STAGING]}')

+import time
+LOGGER.info("sleeping after the copy")
+time.sleep(30)
+LOGGER.info("Done sleeping after copy")

# 1. add mapping tables
for domain_table in combine_consts.DOMAIN_TABLES:
LOGGER.info(f'Mapping {domain_table}...')
generate_combined_mapping_tables(bq_client, domain_table,
datasets[consts.STAGING], '',
datasets[consts.STAGING])

+if domain_table != 'survey_conduct':
+    _update_domain_table_id(
+        bq_client,
+        f'{project_id}.{datasets[consts.STAGING]}.{domain_table}')

+_fix_survey_conduct_records(bq_client, project_id, datasets[consts.STAGING],
+                            datasets[consts.SANDBOX])

LOGGER.info("sleeping after generating mapping tables")
time.sleep(30)
LOGGER.info("Done sleeping generating mapping tables")

# Run cleaning rules
cleaning_args = [
'-p', project_id, '-d', datasets[consts.STAGING], '-b',
@@ -351,10 +368,10 @@ def main(raw_args=None) -> dict:
args.project_id, '--vocab_dataset', args.vocab_dataset, '--console_log'
])

-dataset_obj = dataset_obj.dataset_id
+dataset_id = dataset_obj.dataset_id

# Creates synthetic dataset and runs a subset of cleaning rules marked for synthetic data
-datasets = create_tier(args.project_id, dataset_obj, args.release_tag,
+datasets = create_tier(args.project_id, dataset_id, args.release_tag,
args.target_principal, **kwargs)

return datasets
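For context on _update_domain_table_id (renamed above with a leading underscore): it shifts every primary key in a domain table by 10^15, presumably to keep the staged IDs from colliding with IDs assigned later; survey_conduct is skipped, presumably because its IDs are rewritten through the QRID/RID map instead. A quick check of the statement the function builds; the trailing WHERE clause is cut off in the diff, and since BigQuery requires one, WHERE TRUE is an assumption here:

    def build_offset_update(fq_table: str) -> str:
        # Mirrors the f-string in the diff: the table's own name (last path
        # segment) is the stem of its primary key column. WHERE TRUE is an
        # assumed reconstruction of the clause elided from the diff.
        table = fq_table.split('.')[-1]
        return (f"UPDATE `{fq_table}` "
                f"SET {table}_id = {table}_id + 1000000000000000 "
                f"WHERE TRUE")

    print(build_offset_update('my-project.staging.observation'))
    # UPDATE `my-project.staging.observation` SET observation_id = observation_id + 1000000000000000 WHERE TRUE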

