diff --git a/data_steward/analytics/cdr_ops/ad_hoc_analyses/search_sandbox_for_ids.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/search_sandbox_for_ids.py new file mode 100644 index 0000000000..00ad430b9c --- /dev/null +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/search_sandbox_for_ids.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 -*- +# --- +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: light +# format_version: '1.5' +# jupytext_version: 1.7.1 +# kernelspec: +# display_name: Python 3 +# language: python +# name: python3 +# --- + +# Purpose: Use this notebook to search for ids in sandbox datasets + +# + tags=["parameters"] +project_id = '' +sandbox_dataset_id = '' # Sandbox dataset to search in for the problem ids +search_field = '' # field in the sandbox tables expected to contain the ids. Example: observation_id +run_as = '' + +# + +from utils import auth +import pandas as pd +from gcloud.bq import BigQueryClient +from common import JINJA_ENV +from analytics.cdr_ops.notebook_utils import execute, IMPERSONATION_SCOPES, render_message + +pd.set_option('display.max_rows', None) +# - + +impersonation_creds = auth.get_impersonation_credentials( + run_as, target_scopes=IMPERSONATION_SCOPES) + +client = BigQueryClient(project_id, credentials=impersonation_creds) + +# # Create list of ids to search +# Run the following cell to create a list of ids to search for. Recommend using a LIMIT if the list is quite large.
+# OR
+# Manually create a list of ids called ids_list + +# + +tpl = JINJA_ENV.from_string(''' +{INSERT QUERY HERE} +''') +query = tpl.render() +ids = execute(client, query) + +ids_list = ids[search_field].to_list() + + +# - + +# # Get the tables that contain the search_field, from the sandbox dataset + +# + +tpl = JINJA_ENV.from_string(''' + SELECT + * + FROM + `{{project_id}}.{{sandbox_dataset_id}}.INFORMATION_SCHEMA.COLUMNS` + WHERE + column_name = '{{search_field}}' + ORDER BY table_name + +''') +query = tpl.render(sandbox_dataset_id=sandbox_dataset_id, + project_id=project_id, + search_field=search_field) +tables_in_dataset = execute(client, query) + +tables_list = tables_in_dataset['table_name'].to_list() +tables_list +# - + +# # Search in each sandbox table and print results + +queries = [] +for table in tables_list: + tpl = JINJA_ENV.from_string(''' + SELECT + '{{table}}' as table, + COUNT(*) AS n_{{search_field}}s_found + FROM + `{{project_id}}.{{sandbox_dataset_id}}.{{table}}` + WHERE {{search_field}} IN UNNEST ({{ids_list}}) + ''') + query = tpl.render(sandbox_dataset_id=sandbox_dataset_id, + project_id=project_id, + table=table, + ids_list=ids_list, + search_field=search_field) + queries.append(query) +execute(client, '\nUNION ALL\n'.join(queries)) + diff --git a/data_steward/analytics/cdr_ops/clean_rdr_export_qc.py b/data_steward/analytics/cdr_ops/clean_rdr_export_qc.py index 14248b11d9..16cacdae9b 100644 --- a/data_steward/analytics/cdr_ops/clean_rdr_export_qc.py +++ b/data_steward/analytics/cdr_ops/clean_rdr_export_qc.py @@ -26,7 +26,11 @@ # # QC for RDR Export # # Quality checks performed on a new RDR dataset and comparison with previous RDR dataset. -from common import CATI_TABLES, DEATH, FACT_RELATIONSHIP, JINJA_ENV, PIPELINE_TABLES +from IPython.display import display, HTML +from cdm import tables_to_map +from common import (AOU_DEATH, CATI_TABLES, DEATH, FACT_RELATIONSHIP, JINJA_ENV, + MAPPING_PREFIX, PIPELINE_TABLES) +from resources import CDM_TABLES from utils import auth from gcloud.bq import BigQueryClient from analytics.cdr_ops.notebook_utils import execute, IMPERSONATION_SCOPES, render_message @@ -37,221 +41,210 @@ client = BigQueryClient(project_id, credentials=impersonation_creds) # This list is created by querying the redcap surveys. 
In case of needed update, query provided in the comments of DC3407 -expected_strings = ['cidi5_20', 'cidi5_24', 'cidi5_28', 'cidi5_31', 'mhqukb_48_age', - 'mhqukb_50_number', 'mhqukb_51_number', 'mhqukb_52_number', - 'mhqukb_53_number', 'record_id', 'helpmewithconsent_name', - 'other_concerns', 'other_reasons', 'resultsconsent_emailmecopy', - 'resultsconsent_signaturedate', 'consentpii_helpwithconsentsignature', - 'extraconsent_signature_type', 'extraconsent_todaysdate', - 'piiaddress_streetaddress', 'piiaddress_streetaddress2', - 'piibirthinformation_birthdate', 'piicontactinformation_phone', - 'piiname_first', 'piiname_last', 'piiname_middle', - 'streetaddress_piicity', 'streetaddress_piizip', 'basics_11a_cope_a_33', - 'basics_xx', 'basics_xx20', 'cdc_covid_19_7_xx22_date', 'cope_a_126', - 'cope_a_160', 'cope_a_85', 'copect_50_xx19_cope_a_152', - 'copect_50_xx19_cope_a_198', 'copect_50_xx19_cope_a_57', - 'cu_covid_cope_a_204', 'eds_follow_up_1_xx', 'ipaq_1_cope_a_24', - 'ipaq_2_cope_a_160', 'ipaq_2_cope_a_85', 'ipaq_3_cope_a_24', - 'ipaq_4_cope_a_160', 'ipaq_4_cope_a_85', 'ipaq_5_cope_a_24', - 'ipaq_6_cope_a_160', 'ipaq_6_cope_a_85', 'lifestyle_2_xx12_cope_a_152', - 'lifestyle_2_xx12_cope_a_198', 'lifestyle_2_xx12_cope_a_57', - 'tsu_ds5_13_xx42_cope_a_226', 'cdc_covid_19_7_xx23_other_cope_a_204', - 'cdc_covid_19_n_a2', 'cdc_covid_19_n_a4', 'cdc_covid_19_n_a8', - 'cope_aou_xx_2_a', 'dmfs_29a', 'msds_17_c', - 'nhs_covid_fhc17b_cope_a_226', 'ehrconsentpii_helpwithconsentsignature', - 'ehrconsentpii_todaysdate', 'ehrconsentpii_todaysdateilhippawitness', - 'sensitivetype2_domesticviolence', 'sensitivetype2_genetictesting', - 'sensitivetype2_hivaids', 'sensitivetype2_mentalhealth', - 'sensitivetype2_substanceuse', 'signature_type', 'cidi5_15', - 'mhqukb_25_number', 'mhqukb_26_age', 'mhqukb_28_age', 'ss_2_age', - 'ss_3_age_1', 'ss_3_age_2', 'ss_3_number', - 'english_exploring_the_mind_consent_form', 'etm_help_name', - 'cdc_covid_xx_a_date1', 'cdc_covid_xx_a_date2', - 'cdc_covid_xx_b_firstdose_other', 'cdc_covid_xx_b_seconddose_other', - 'cdc_covid_xx_symptom_cope_350', - 'cdc_covid_xx_symptom_seconddose_cope_350', 'dmfs_29_seconddose_other', - 'othercancer_daughterfreetextbox', 'othercancer_fatherfreetextbox', - 'othercancer_grandparentfreetextbox', 'othercancer_motherfreetextbox', - 'othercancer_siblingfreetextbox', 'othercancer_sonfreetextbox', - 'othercondition_daughterfreetextbox', 'othercondition_fatherfreetextbox', - 'othercondition_grandparentfreetextbox', - 'othercondition_motherfreetextbox', 'othercondition_siblingfreetextbox', - 'othercondition_sonfreetextbox', 'cdc_covid_xx_b_other', - 'otherdelayedmedicalcare_freetext', - 'attemptquitsmoking_completelyquitage', 'otherspecify_otherdrugstextbox', - 'smoking_averagedailycigarettenumber', - 'smoking_currentdailycigarettenumber', - 'smoking_dailysmokestartingagenumber', 'smoking_numberofyearsnumber', - 'cdc_covid_xx_a_date10', 'cdc_covid_xx_a_date11', - 'cdc_covid_xx_a_date12', 'cdc_covid_xx_a_date13', - 'cdc_covid_xx_a_date14', 'cdc_covid_xx_a_date15', - 'cdc_covid_xx_a_date16', 'cdc_covid_xx_a_date17', 'cdc_covid_xx_a_date3', - 'cdc_covid_xx_a_date4', 'cdc_covid_xx_a_date5', 'cdc_covid_xx_a_date6', - 'cdc_covid_xx_a_date7', 'cdc_covid_xx_a_date8', 'cdc_covid_xx_a_date9', - 'cdc_covid_xx_b_dose10_other', 'cdc_covid_xx_b_dose11_other', - 'cdc_covid_xx_b_dose12_other', 'cdc_covid_xx_b_dose13_other', - 'cdc_covid_xx_b_dose14_other', 'cdc_covid_xx_b_dose15_other', - 'cdc_covid_xx_b_dose16_other', 'cdc_covid_xx_b_dose17_other', - 
'cdc_covid_xx_b_dose3_other', 'cdc_covid_xx_b_dose4_other', - 'cdc_covid_xx_b_dose5_other', 'cdc_covid_xx_b_dose6_other', - 'cdc_covid_xx_b_dose7_other', 'cdc_covid_xx_b_dose8_other', - 'cdc_covid_xx_b_dose9_other', 'cdc_covid_xx_symptom_cope_350_dose10', - 'cdc_covid_xx_symptom_cope_350_dose11', - 'cdc_covid_xx_symptom_cope_350_dose12', - 'cdc_covid_xx_symptom_cope_350_dose13', - 'cdc_covid_xx_symptom_cope_350_dose14', - 'cdc_covid_xx_symptom_cope_350_dose15', - 'cdc_covid_xx_symptom_cope_350_dose16', - 'cdc_covid_xx_symptom_cope_350_dose17', - 'cdc_covid_xx_symptom_cope_350_dose3', - 'cdc_covid_xx_symptom_cope_350_dose4', - 'cdc_covid_xx_symptom_cope_350_dose5', - 'cdc_covid_xx_symptom_cope_350_dose6', - 'cdc_covid_xx_symptom_cope_350_dose7', - 'cdc_covid_xx_symptom_cope_350_dose8', - 'cdc_covid_xx_symptom_cope_350_dose9', 'cdc_covid_xx_type_dose10_other', - 'cdc_covid_xx_type_dose11_other', 'cdc_covid_xx_type_dose12_other', - 'cdc_covid_xx_type_dose13_other', 'cdc_covid_xx_type_dose14_other', - 'cdc_covid_xx_type_dose15_other', 'cdc_covid_xx_type_dose16_other', - 'cdc_covid_xx_type_dose17_other', 'cdc_covid_xx_type_dose3_other', - 'cdc_covid_xx_type_dose4_other', 'cdc_covid_xx_type_dose5_other', - 'cdc_covid_xx_type_dose6_other', 'cdc_covid_xx_type_dose7_other', - 'cdc_covid_xx_type_dose8_other', 'cdc_covid_xx_type_dose9_other', - 'dmfs_29_additionaldose_other', - 'organtransplant_bloodvesseltransplantdate', - 'organtransplant_bonetransplantdate', - 'organtransplant_corneatransplantdate', - 'organtransplant_hearttransplantdate', - 'organtransplant_intestinetransplantdate', - 'organtransplant_kidneytransplantdate', - 'organtransplant_livertransplantdate', - 'organtransplant_lungtransplantdate', - 'organtransplant_otherorgantransplantdate', - 'organtransplant_othertissuetransplantdate', - 'organtransplant_pancreastransplantdate', - 'organtransplant_skintransplantdate', - 'organtransplant_valvetransplantdate', 'otherorgan_freetextbox', - 'othertissue_freetextbox', - 'outsidetravel6month_outsidetravel6monthhowlong', - 'outsidetravel6month_outsidetravel6monthwheretraveled', - 'overallhealth_hysterectomyhistoryage', - 'overallhealthovaryremovalhistoryage', - 'otherarthritis_daughterfreetextbox', 'otherarthritis_fatherfreetextbox', - 'otherarthritis_freetextbox', 'otherarthritis_grandparentfreetextbox', - 'otherarthritis_motherfreetextbox', 'otherarthritis_siblingfreetextbox', - 'otherarthritis_sonfreetextbox', - 'otherbonejointmuscle_daughterfreetextbox', - 'otherbonejointmuscle_fatherfreetextbox', - 'otherbonejointmuscle_freetextbox', - 'otherbonejointmuscle_grandparentfreetextbox', - 'otherbonejointmuscle_motherfreetextbox', - 'otherbonejointmuscle_siblingfreetextbox', - 'otherbonejointmuscle_sonfreetextbox', - 'otherbrainnervoussystem_daughterfreetextbox', - 'otherbrainnervoussystem_fatherfreetextbox', - 'otherbrainnervoussystem_freetextbox', - 'otherbrainnervoussystem_grandparentfreetextbox', - 'otherbrainnervoussystem_motherfreetextbox', - 'otherbrainnervoussystem_siblingfreetextbox', - 'otherbrainnervoussystem_sonfreetextbox', 'othercancer_freetextbox', - 'otherdiabetes_daughterfreetextbox', 'otherdiabetes_fatherfreetextbox', - 'otherdiabetes_freetextbox', 'otherdiabetes_grandparentfreetextbox', - 'otherdiabetes_motherfreetextbox', 'otherdiabetes_siblingfreetextbox', - 'otherdiabetes_sonfreetextbox', 'otherdiagnosis_daughterfreetextbox', - 'otherdiagnosis_fatherfreetextbox', 'otherdiagnosis_freetextbox', - 'otherdiagnosis_grandparentfreetextbox', - 'otherdiagnosis_motherfreetextbox', 
'otherdiagnosis_siblingfreetextbox', - 'otherdiagnosis_sonfreetextbox', - 'otherdigestivecondition_daughterfreetextbox', - 'otherdigestivecondition_fatherfreetextbox', - 'otherdigestivecondition_freetextbox', - 'otherdigestivecondition_grandparentfreetextbox', - 'otherdigestivecondition_motherfreetextbox', - 'otherdigestivecondition_siblingfreetextbox', - 'otherdigestivecondition_sonfreetextbox', - 'otherhearingeye_daughterfreetextbox', - 'otherhearingeye_fatherfreetextbox', 'otherhearingeye_freetextbox', - 'otherhearingeye_grandparentfreetextbox', - 'otherhearingeye_motherfreetextbox', - 'otherhearingeye_siblingfreetextbox', 'otherhearingeye_sonfreetextbox', - 'otherheartorbloodcondition_daughterfreetextbox', - 'otherheartorbloodcondition_fatherfreetextbox', - 'otherheartorbloodcondition_freetextbox', - 'otherheartorbloodcondition_grandparentfreetextbox', - 'otherheartorbloodcondition_motherfreetextbox', - 'otherheartorbloodcondition_siblingfreetextbox', - 'otherheartorbloodcondition_sonfreetextbox', - 'otherhormoneendocrine_daughterfreetextbox', - 'otherhormoneendocrine_fatherfreetextbox', - 'otherhormoneendocrine_freetextbox', - 'otherhormoneendocrine_grandparentfreetextbox', - 'otherhormoneendocrine_motherfreetextbox', - 'otherhormoneendocrine_siblingfreetextbox', - 'otherhormoneendocrine_sonfreetextbox', - 'otherinfectiousdisease_freetextbox', - 'otherkidneycondition_daughterfreetextbox', - 'otherkidneycondition_fatherfreetextbox', - 'otherkidneycondition_freetextbox', - 'otherkidneycondition_grandparentfreetextbox', - 'otherkidneycondition_motherfreetextbox', - 'otherkidneycondition_siblingfreetextbox', - 'otherkidneycondition_sonfreetextbox', - 'othermentalhealthsubstanceuse_daughterfreetextbox', - 'othermentalhealthsubstanceuse_fatherfreetextbox', - 'othermentalhealthsubstanceuse_freetextbox', - 'othermentalhealthsubstanceuse_grandparentfreetextb', - 'othermentalhealthsubstanceuse_motherfreetextbox', - 'othermentalhealthsubstanceuse_siblingfreetextbox', - 'othermentalhealthsubstanceuse_sonfreetextbox', - 'otherrespiratory_daughterfreetextbox', - 'otherrespiratory_fatherfreetextbox', 'otherrespiratory_freetextbox', - 'otherrespiratory_grandparentfreetextbox', - 'otherrespiratory_motherfreetextbox', - 'otherrespiratory_siblingfreetextbox', 'otherrespiratory_sonfreetextbox', - 'otherthyroid_daughterfreetextbox', 'otherthyroid_fatherfreetextbox', - 'otherthyroid_freetextbox', 'otherthyroid_grandparentfreetextbox', - 'otherthyroid_motherfreetextbox', 'otherthyroid_siblingfreetextbox', - 'otherthyroid_sonfreetextbox', 'self_reported_height_cm', - 'self_reported_height_ft', 'self_reported_height_in', - 'self_reported_weight_kg', 'self_reported_weight_pounds', - 'sdoh_eds_follow_up_1_xx', 'urs_8c', 'aian_tribe', - 'aiannoneofthesedescribeme_aianfreetext', - 'blacknoneofthesedescribeme_blackfreetext', - 'employmentworkaddress_addresslineone', - 'employmentworkaddress_addresslinetwo', 'employmentworkaddress_city', - 'employmentworkaddress_country', 'employmentworkaddress_zipcode', - 'hispanicnoneofthesedescribeme_hispanicfreetext', - 'livingsituation_howmanypeople', - 'livingsituation_livingsituationfreetext', - 'livingsituation_peopleunder18', - 'menanoneofthesedescribeme_menafreetext', - 'nhpinoneofthesedescribeme_nhpifreetext', - 'noneofthesedescribeme_asianfreetext', 'otherhealthplan_freetext', - 'persononeaddress_persononeaddresscity', - 'persononeaddress_persononeaddresszipcode', - 'secondarycontactinfo_persononeaddressone', - 'secondarycontactinfo_persononeaddresstwo', - 
'secondarycontactinfo_persononeemail', - 'secondarycontactinfo_persononefirstname', - 'secondarycontactinfo_persononelastname', - 'secondarycontactinfo_persononemiddleinitial', - 'secondarycontactinfo_persononetelephone', - 'secondarycontactinfo_secondcontactsaddressone', - 'secondarycontactinfo_secondcontactsaddresstwo', - 'secondarycontactinfo_secondcontactsemail', - 'secondarycontactinfo_secondcontactsfirstname', - 'secondarycontactinfo_secondcontactslastname', - 'secondarycontactinfo_secondcontactsmiddleinitial', - 'secondarycontactinfo_secondcontactsnumber', - 'secondcontactsaddress_secondcontactcity', - 'secondcontactsaddress_secondcontactzipcode', - 'sexatbirthnoneofthese_sexatbirthtextbox', - 'socialsecurity_socialsecuritynumber', - 'somethingelse_sexualitysomethingelsetextbox', - 'specifiedgender_specifiedgendertextbox', 'thebasics_countryborntextbox', - 'whatraceethnicity_raceethnicitynoneofthese', - 'whitenoneofthesedescribeme_whitefreetext', 'timeofday', - 'wearconsent_todaysdate'] +expected_strings = [ + 'cidi5_20', 'cidi5_24', 'cidi5_28', 'cidi5_31', 'mhqukb_48_age', + 'mhqukb_50_number', 'mhqukb_51_number', 'mhqukb_52_number', + 'mhqukb_53_number', 'record_id', 'helpmewithconsent_name', 'other_concerns', + 'other_reasons', 'resultsconsent_emailmecopy', + 'resultsconsent_signaturedate', 'consentpii_helpwithconsentsignature', + 'extraconsent_signature_type', 'extraconsent_todaysdate', + 'piiaddress_streetaddress', 'piiaddress_streetaddress2', + 'piibirthinformation_birthdate', 'piicontactinformation_phone', + 'piiname_first', 'piiname_last', 'piiname_middle', 'streetaddress_piicity', + 'streetaddress_piizip', 'basics_11a_cope_a_33', 'basics_xx', 'basics_xx20', + 'cdc_covid_19_7_xx22_date', 'cope_a_126', 'cope_a_160', 'cope_a_85', + 'copect_50_xx19_cope_a_152', 'copect_50_xx19_cope_a_198', + 'copect_50_xx19_cope_a_57', 'cu_covid_cope_a_204', 'eds_follow_up_1_xx', + 'ipaq_1_cope_a_24', 'ipaq_2_cope_a_160', 'ipaq_2_cope_a_85', + 'ipaq_3_cope_a_24', 'ipaq_4_cope_a_160', 'ipaq_4_cope_a_85', + 'ipaq_5_cope_a_24', 'ipaq_6_cope_a_160', 'ipaq_6_cope_a_85', + 'lifestyle_2_xx12_cope_a_152', 'lifestyle_2_xx12_cope_a_198', + 'lifestyle_2_xx12_cope_a_57', 'tsu_ds5_13_xx42_cope_a_226', + 'cdc_covid_19_7_xx23_other_cope_a_204', 'cdc_covid_19_n_a2', + 'cdc_covid_19_n_a4', 'cdc_covid_19_n_a8', 'cope_aou_xx_2_a', 'dmfs_29a', + 'msds_17_c', 'nhs_covid_fhc17b_cope_a_226', + 'ehrconsentpii_helpwithconsentsignature', 'ehrconsentpii_todaysdate', + 'ehrconsentpii_todaysdateilhippawitness', 'sensitivetype2_domesticviolence', + 'sensitivetype2_genetictesting', 'sensitivetype2_hivaids', + 'sensitivetype2_mentalhealth', 'sensitivetype2_substanceuse', + 'signature_type', 'cidi5_15', 'mhqukb_25_number', 'mhqukb_26_age', + 'mhqukb_28_age', 'ss_2_age', 'ss_3_age_1', 'ss_3_age_2', 'ss_3_number', + 'english_exploring_the_mind_consent_form', 'etm_help_name', + 'cdc_covid_xx_a_date1', 'cdc_covid_xx_a_date2', + 'cdc_covid_xx_b_firstdose_other', 'cdc_covid_xx_b_seconddose_other', + 'cdc_covid_xx_symptom_cope_350', 'cdc_covid_xx_symptom_seconddose_cope_350', + 'dmfs_29_seconddose_other', 'othercancer_daughterfreetextbox', + 'othercancer_fatherfreetextbox', 'othercancer_grandparentfreetextbox', + 'othercancer_motherfreetextbox', 'othercancer_siblingfreetextbox', + 'othercancer_sonfreetextbox', 'othercondition_daughterfreetextbox', + 'othercondition_fatherfreetextbox', 'othercondition_grandparentfreetextbox', + 'othercondition_motherfreetextbox', 'othercondition_siblingfreetextbox', + 'othercondition_sonfreetextbox', 
'cdc_covid_xx_b_other', + 'otherdelayedmedicalcare_freetext', 'attemptquitsmoking_completelyquitage', + 'otherspecify_otherdrugstextbox', 'smoking_averagedailycigarettenumber', + 'smoking_currentdailycigarettenumber', + 'smoking_dailysmokestartingagenumber', 'smoking_numberofyearsnumber', + 'cdc_covid_xx_a_date10', 'cdc_covid_xx_a_date11', 'cdc_covid_xx_a_date12', + 'cdc_covid_xx_a_date13', 'cdc_covid_xx_a_date14', 'cdc_covid_xx_a_date15', + 'cdc_covid_xx_a_date16', 'cdc_covid_xx_a_date17', 'cdc_covid_xx_a_date3', + 'cdc_covid_xx_a_date4', 'cdc_covid_xx_a_date5', 'cdc_covid_xx_a_date6', + 'cdc_covid_xx_a_date7', 'cdc_covid_xx_a_date8', 'cdc_covid_xx_a_date9', + 'cdc_covid_xx_b_dose10_other', 'cdc_covid_xx_b_dose11_other', + 'cdc_covid_xx_b_dose12_other', 'cdc_covid_xx_b_dose13_other', + 'cdc_covid_xx_b_dose14_other', 'cdc_covid_xx_b_dose15_other', + 'cdc_covid_xx_b_dose16_other', 'cdc_covid_xx_b_dose17_other', + 'cdc_covid_xx_b_dose3_other', 'cdc_covid_xx_b_dose4_other', + 'cdc_covid_xx_b_dose5_other', 'cdc_covid_xx_b_dose6_other', + 'cdc_covid_xx_b_dose7_other', 'cdc_covid_xx_b_dose8_other', + 'cdc_covid_xx_b_dose9_other', 'cdc_covid_xx_symptom_cope_350_dose10', + 'cdc_covid_xx_symptom_cope_350_dose11', + 'cdc_covid_xx_symptom_cope_350_dose12', + 'cdc_covid_xx_symptom_cope_350_dose13', + 'cdc_covid_xx_symptom_cope_350_dose14', + 'cdc_covid_xx_symptom_cope_350_dose15', + 'cdc_covid_xx_symptom_cope_350_dose16', + 'cdc_covid_xx_symptom_cope_350_dose17', + 'cdc_covid_xx_symptom_cope_350_dose3', + 'cdc_covid_xx_symptom_cope_350_dose4', + 'cdc_covid_xx_symptom_cope_350_dose5', + 'cdc_covid_xx_symptom_cope_350_dose6', + 'cdc_covid_xx_symptom_cope_350_dose7', + 'cdc_covid_xx_symptom_cope_350_dose8', + 'cdc_covid_xx_symptom_cope_350_dose9', 'cdc_covid_xx_type_dose10_other', + 'cdc_covid_xx_type_dose11_other', 'cdc_covid_xx_type_dose12_other', + 'cdc_covid_xx_type_dose13_other', 'cdc_covid_xx_type_dose14_other', + 'cdc_covid_xx_type_dose15_other', 'cdc_covid_xx_type_dose16_other', + 'cdc_covid_xx_type_dose17_other', 'cdc_covid_xx_type_dose3_other', + 'cdc_covid_xx_type_dose4_other', 'cdc_covid_xx_type_dose5_other', + 'cdc_covid_xx_type_dose6_other', 'cdc_covid_xx_type_dose7_other', + 'cdc_covid_xx_type_dose8_other', 'cdc_covid_xx_type_dose9_other', + 'dmfs_29_additionaldose_other', 'organtransplant_bloodvesseltransplantdate', + 'organtransplant_bonetransplantdate', + 'organtransplant_corneatransplantdate', + 'organtransplant_hearttransplantdate', + 'organtransplant_intestinetransplantdate', + 'organtransplant_kidneytransplantdate', + 'organtransplant_livertransplantdate', 'organtransplant_lungtransplantdate', + 'organtransplant_otherorgantransplantdate', + 'organtransplant_othertissuetransplantdate', + 'organtransplant_pancreastransplantdate', + 'organtransplant_skintransplantdate', 'organtransplant_valvetransplantdate', + 'otherorgan_freetextbox', 'othertissue_freetextbox', + 'outsidetravel6month_outsidetravel6monthhowlong', + 'outsidetravel6month_outsidetravel6monthwheretraveled', + 'overallhealth_hysterectomyhistoryage', + 'overallhealthovaryremovalhistoryage', 'otherarthritis_daughterfreetextbox', + 'otherarthritis_fatherfreetextbox', 'otherarthritis_freetextbox', + 'otherarthritis_grandparentfreetextbox', 'otherarthritis_motherfreetextbox', + 'otherarthritis_siblingfreetextbox', 'otherarthritis_sonfreetextbox', + 'otherbonejointmuscle_daughterfreetextbox', + 'otherbonejointmuscle_fatherfreetextbox', + 'otherbonejointmuscle_freetextbox', + 'otherbonejointmuscle_grandparentfreetextbox', + 
'otherbonejointmuscle_motherfreetextbox', + 'otherbonejointmuscle_siblingfreetextbox', + 'otherbonejointmuscle_sonfreetextbox', + 'otherbrainnervoussystem_daughterfreetextbox', + 'otherbrainnervoussystem_fatherfreetextbox', + 'otherbrainnervoussystem_freetextbox', + 'otherbrainnervoussystem_grandparentfreetextbox', + 'otherbrainnervoussystem_motherfreetextbox', + 'otherbrainnervoussystem_siblingfreetextbox', + 'otherbrainnervoussystem_sonfreetextbox', 'othercancer_freetextbox', + 'otherdiabetes_daughterfreetextbox', 'otherdiabetes_fatherfreetextbox', + 'otherdiabetes_freetextbox', 'otherdiabetes_grandparentfreetextbox', + 'otherdiabetes_motherfreetextbox', 'otherdiabetes_siblingfreetextbox', + 'otherdiabetes_sonfreetextbox', 'otherdiagnosis_daughterfreetextbox', + 'otherdiagnosis_fatherfreetextbox', 'otherdiagnosis_freetextbox', + 'otherdiagnosis_grandparentfreetextbox', 'otherdiagnosis_motherfreetextbox', + 'otherdiagnosis_siblingfreetextbox', 'otherdiagnosis_sonfreetextbox', + 'otherdigestivecondition_daughterfreetextbox', + 'otherdigestivecondition_fatherfreetextbox', + 'otherdigestivecondition_freetextbox', + 'otherdigestivecondition_grandparentfreetextbox', + 'otherdigestivecondition_motherfreetextbox', + 'otherdigestivecondition_siblingfreetextbox', + 'otherdigestivecondition_sonfreetextbox', + 'otherhearingeye_daughterfreetextbox', 'otherhearingeye_fatherfreetextbox', + 'otherhearingeye_freetextbox', 'otherhearingeye_grandparentfreetextbox', + 'otherhearingeye_motherfreetextbox', 'otherhearingeye_siblingfreetextbox', + 'otherhearingeye_sonfreetextbox', + 'otherheartorbloodcondition_daughterfreetextbox', + 'otherheartorbloodcondition_fatherfreetextbox', + 'otherheartorbloodcondition_freetextbox', + 'otherheartorbloodcondition_grandparentfreetextbox', + 'otherheartorbloodcondition_motherfreetextbox', + 'otherheartorbloodcondition_siblingfreetextbox', + 'otherheartorbloodcondition_sonfreetextbox', + 'otherhormoneendocrine_daughterfreetextbox', + 'otherhormoneendocrine_fatherfreetextbox', + 'otherhormoneendocrine_freetextbox', + 'otherhormoneendocrine_grandparentfreetextbox', + 'otherhormoneendocrine_motherfreetextbox', + 'otherhormoneendocrine_siblingfreetextbox', + 'otherhormoneendocrine_sonfreetextbox', + 'otherinfectiousdisease_freetextbox', + 'otherkidneycondition_daughterfreetextbox', + 'otherkidneycondition_fatherfreetextbox', + 'otherkidneycondition_freetextbox', + 'otherkidneycondition_grandparentfreetextbox', + 'otherkidneycondition_motherfreetextbox', + 'otherkidneycondition_siblingfreetextbox', + 'otherkidneycondition_sonfreetextbox', + 'othermentalhealthsubstanceuse_daughterfreetextbox', + 'othermentalhealthsubstanceuse_fatherfreetextbox', + 'othermentalhealthsubstanceuse_freetextbox', + 'othermentalhealthsubstanceuse_grandparentfreetextb', + 'othermentalhealthsubstanceuse_motherfreetextbox', + 'othermentalhealthsubstanceuse_siblingfreetextbox', + 'othermentalhealthsubstanceuse_sonfreetextbox', + 'otherrespiratory_daughterfreetextbox', + 'otherrespiratory_fatherfreetextbox', 'otherrespiratory_freetextbox', + 'otherrespiratory_grandparentfreetextbox', + 'otherrespiratory_motherfreetextbox', 'otherrespiratory_siblingfreetextbox', + 'otherrespiratory_sonfreetextbox', 'otherthyroid_daughterfreetextbox', + 'otherthyroid_fatherfreetextbox', 'otherthyroid_freetextbox', + 'otherthyroid_grandparentfreetextbox', 'otherthyroid_motherfreetextbox', + 'otherthyroid_siblingfreetextbox', 'otherthyroid_sonfreetextbox', + 'self_reported_height_cm', 'self_reported_height_ft', + 
'self_reported_height_in', 'self_reported_weight_kg', + 'self_reported_weight_pounds', 'sdoh_eds_follow_up_1_xx', 'urs_8c', + 'aian_tribe', 'aiannoneofthesedescribeme_aianfreetext', + 'blacknoneofthesedescribeme_blackfreetext', + 'employmentworkaddress_addresslineone', + 'employmentworkaddress_addresslinetwo', 'employmentworkaddress_city', + 'employmentworkaddress_country', 'employmentworkaddress_zipcode', + 'hispanicnoneofthesedescribeme_hispanicfreetext', + 'livingsituation_howmanypeople', 'livingsituation_livingsituationfreetext', + 'livingsituation_peopleunder18', 'menanoneofthesedescribeme_menafreetext', + 'nhpinoneofthesedescribeme_nhpifreetext', + 'noneofthesedescribeme_asianfreetext', 'otherhealthplan_freetext', + 'persononeaddress_persononeaddresscity', + 'persononeaddress_persononeaddresszipcode', + 'secondarycontactinfo_persononeaddressone', + 'secondarycontactinfo_persononeaddresstwo', + 'secondarycontactinfo_persononeemail', + 'secondarycontactinfo_persononefirstname', + 'secondarycontactinfo_persononelastname', + 'secondarycontactinfo_persononemiddleinitial', + 'secondarycontactinfo_persononetelephone', + 'secondarycontactinfo_secondcontactsaddressone', + 'secondarycontactinfo_secondcontactsaddresstwo', + 'secondarycontactinfo_secondcontactsemail', + 'secondarycontactinfo_secondcontactsfirstname', + 'secondarycontactinfo_secondcontactslastname', + 'secondarycontactinfo_secondcontactsmiddleinitial', + 'secondarycontactinfo_secondcontactsnumber', + 'secondcontactsaddress_secondcontactcity', + 'secondcontactsaddress_secondcontactzipcode', + 'sexatbirthnoneofthese_sexatbirthtextbox', + 'socialsecurity_socialsecuritynumber', + 'somethingelse_sexualitysomethingelsetextbox', + 'specifiedgender_specifiedgendertextbox', 'thebasics_countryborntextbox', + 'whatraceethnicity_raceethnicitynoneofthese', + 'whitenoneofthesedescribeme_whitefreetext', 'timeofday', + 'wearconsent_todaysdate' +] # # Table comparison # The export should generally contain the same tables from month to month. @@ -271,6 +264,53 @@ query = tpl.render(new_rdr=new_rdr, old_rdr=old_rdr, project_id=project_id) execute(client, query) +# # Confirm all the expected tables are in this dataset +# This QC confirms all the expected tables are present in this dataset. If not, +# our pipeline might not be working as expected. Missing tables will +# break the combined dataset generation. If this check fails, fix the issues before +# proceeding to the next step. +# See DC-3454 for the background. + +expected_domain_tables = [ + table_name for table_name in CDM_TABLES if table_name != DEATH +] + [AOU_DEATH] +expected_mapping_tables = [ + f'{MAPPING_PREFIX}{table_name}' for table_name in tables_to_map() +] +expected_tables = expected_domain_tables + expected_mapping_tables + +tpl = JINJA_ENV.from_string(''' +WITH expected_tables AS ( +{% for table in expected_tables %} + SELECT '{{table}}' AS table_id + {% if not loop.last -%} UNION ALL {% endif %} +{% endfor %} +) +SELECT table_id AS missing_table FROM expected_tables +WHERE table_id NOT IN (SELECT table_id FROM `{{project_id}}.{{new_rdr}}.__TABLES__`) +''') +query = tpl.render(project_id=project_id, + new_rdr=new_rdr, + expected_tables=expected_tables) +df = execute(client, query) + +# + +success_msg = 'All the expected tables are present in this dataset.' +failure_msg = ''' +{code_count} tables are missing. Check if the missing tables are important ones.
+If a missing table is NOT important (e.g., expected to be empty), simply create an
+empty table in its place (a sketch follows this check) and move on, and open an
+investigation ticket so we can look into it later.
+If it is an important table, troubleshoot and figure out why the table is missing in the
+dataset before moving on to the next steps.
+'''
+
+render_message(df,
+               success_msg,
+               failure_msg,
+               failure_msg_args={'code_count': len(df)})
+
+# -
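# Below is an editor's sketch of the "create an empty table" remediation described
# above; it is not part of this changeset. It assumes the BigQuery DDL form
# `CREATE TABLE ... LIKE` and that the previous export (`old_rdr`) contains the
# table; `missing_table` is a hypothetical placeholder for a name reported by the
# check.

create_empty_tpl = JINJA_ENV.from_string('''
CREATE TABLE IF NOT EXISTS `{{project_id}}.{{new_rdr}}.{{missing_table}}`
LIKE `{{project_id}}.{{old_rdr}}.{{missing_table}}`
''')
# Example usage with a hypothetical missing table name:
# query = create_empty_tpl.render(project_id=project_id, new_rdr=new_rdr,
#                                 old_rdr=old_rdr, missing_table='death')
# execute(client, query)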
# ## Row count comparison
# Generally the row count of clinical tables should increase from one export to the next.

@@ -404,6 +444,7 @@
query = tpl.render(new_rdr=new_rdr, project_id=project_id)
execute(client, query)

+# +
# # Check numeric data in value_as_string
# Some numeric data is expected in value_as_string. For example, zip codes or other contact-specific information.
#

@@ -422,15 +463,15 @@
GROUP BY 1
ORDER BY 2 DESC
""")

-query = tpl.render(new_rdr=new_rdr, project_id=project_id,expected_strings=expected_strings)
+query = tpl.render(new_rdr=new_rdr,
+                   project_id=project_id,
+                   expected_strings=expected_strings)
df = execute(client, query)

success_msg = 'All records with a number in value_as_string are expected to be text.'
failure_msg = 'Some records that have a number in value_as_string might not be expected. See description.'

-render_message(df,
-               success_msg,
-               failure_msg)
+render_message(df, success_msg, failure_msg)
# -

# # All COPE `questionnaire_response_id`s are in COPE version map

@@ -476,8 +517,8 @@
# - If all 10 surveys are not represented in the query results, this check FAILS.
# Notify RDR of the missing survey data.
# - If any of the *_failure columns have a result > 0 this check FAILS.
-# Notify RDR that there are surveys with observation_dates outside of the survey's expected implementation range.
-#
+# Notify RDR that there are surveys with observation_dates outside of the survey's expected implementation range.
+#
+# Note: Some failures are known.
+# * 1209 May failures
+# * 24961 Nov failures
+# * 44 Feb failures
+# * 18 Fall failures

tpl = JINJA_ENV.from_string("""
SELECT
@@ -508,7 +554,9 @@
# Concepts of class Qualifier Value are permitted as a value and
# Concepts of class Topic and PPI Modifier are permitted as a question
# Discrepancies (listed below) can be caused by misclassified entries in Athena or
-# invalid payloads in the RDR and in further upstream data sources.
+#
+# Notify the survey team PM of any failures. These should be corrected in collaboration with Odysseus.

tpl = JINJA_ENV.from_string('''
WITH ppi_concept_code AS (
@@ -547,6 +595,8 @@
# # Identify Questions That Don't Exist in the RDR Export
# This identifies questions as indicated by a PPI vocabulary and Question concept_class_id that
# do not exist in the dataset.
+#
+# This is intended as a visual check only. These concepts might be expected from the RDR or the survey implementers, but they are not being received by Curation.

tpl = JINJA_ENV.from_string("""
with question_codes as (select c.concept_id, c.concept_name, c.concept_class_id
@@ -569,6 +619,10 @@
# # Identify Questions That Don't Exist in the Cleaned RDR Export
# This identifies questions as indicated by a PPI vocabulary and Question concept_class_id that
# do not exist in the cleaned dataset but did exist in the raw dataset.
+#
+# This is a list of questions that are dropped entirely in the cleaning process. Some drops/updates are expected, such as the smoking and insurance concepts, which are modified specifically by RDR cleaning classes.
+#
+# Investigate any concepts dropped unexpectedly by the RDR cleaning classes.

tpl = JINJA_ENV.from_string("""
with question_codes as (select c.concept_id, c.concept_name, c.concept_class_id
@@ -627,54 +681,47 @@
query = tpl.render(new_rdr=new_rdr, project_id=project_id)
execute(client, query)

-# ## Participants must have basics data
-# Identify any participants who don't have any responses
-# to questions in the basics survey module (see [DC-706](https://precisionmedicineinitiative.atlassian.net/browse/DC-706)). These should be
-# reported to the RDR as they are supposed to be filtered out
-# from the RDR export.
-
-# +
-BASICS_MODULE_CONCEPT_ID = 1586134
+# # All participants have basics data in the RDR dataset
+# There should not be any data in the clean RDR dataset for participants who don't have Basics data.
+# Note: This check uses the same logic to determine 'basics data' as the RDR CR `drop_participants_without_any_basics`.
+#
+# **If this check fails**, investigate and ensure all participants lacking Basics data are dropped.
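# The next code cell implements the check itself. When it fails, a per-participant
# variant such as this editor's sketch (not part of this changeset) can show what
# Basics data, if any, a flagged participant has. 1586134 is the 'The Basics' module
# concept id used by the previous version of this check; `sample_person_id` is a
# hypothetical placeholder.

spot_check_tpl = JINJA_ENV.from_string("""
SELECT o.person_id, o.observation_source_value, o.observation_concept_id
FROM `{{project_id}}.{{new_rdr}}.observation` o
JOIN `{{project_id}}.{{new_rdr}}.concept_ancestor` ca
  ON ca.descendant_concept_id = o.observation_concept_id
WHERE ca.ancestor_concept_id = 1586134  -- 'The Basics' module --
  AND o.person_id = {{sample_person_id}}
""")
# query = spot_check_tpl.render(project_id=project_id, new_rdr=new_rdr,
#                               sample_person_id=123)  # hypothetical person_id
# execute(client, query)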
# Note: This assumes that concept_ancestor sufficiently
# represents the hierarchy
tpl = JINJA_ENV.from_string("""
-WITH
-
-  -- all PPI question concepts in the basics survey module --
-  basics_concept AS
-  (SELECT
-    c.concept_id
-   ,c.concept_name
-   ,c.concept_code
-  FROM `{{DATASET_ID}}.concept_ancestor` ca
-  JOIN `{{DATASET_ID}}.concept` c
-   ON ca.descendant_concept_id = c.concept_id
-  WHERE 1=1
-   AND ancestor_concept_id={{BASICS_MODULE_CONCEPT_ID}}
-   AND c.vocabulary_id='PPI'
-   AND c.concept_class_id='Question')
-
-  -- maps pids to all their associated basics questions in the rdr --
-,pid_basics AS
- (SELECT
-   person_id
-  ,ARRAY_AGG(DISTINCT c.concept_code IGNORE NULLS) basics_codes
-  FROM `{{DATASET_ID}}.observation` o
-  JOIN basics_concept c
-   ON o.observation_concept_id = c.concept_id
-  WHERE 1=1
-  GROUP BY 1)
-
-  -- list all pids for whom no basics questions are found --
-SELECT *
-FROM `{{DATASET_ID}}.person`
-WHERE person_id not in (select person_id from pid_basics)
+WITH pids_with_basics AS ( -- pids with basics data --
+SELECT
+person_id
+FROM `{{project_id}}.{{new_rdr}}.concept_ancestor`
+INNER JOIN `{{project_id}}.{{new_rdr}}.observation` o ON observation_concept_id = descendant_concept_id
+INNER JOIN `{{project_id}}.{{new_rdr}}.concept` d ON d.concept_id = descendant_concept_id
+WHERE ancestor_concept_id = 1586134
+
+UNION DISTINCT
+
+SELECT
+person_id
+FROM `{{project_id}}.{{new_rdr}}.concept`
+JOIN `{{project_id}}.{{new_rdr}}.concept_ancestor`
+ON (concept_id = ancestor_concept_id)
+JOIN `{{project_id}}.{{new_rdr}}.observation`
+ON (descendant_concept_id = observation_concept_id)
+WHERE concept_class_id = 'Module'
+AND concept_name IN ('The Basics')
+AND questionnaire_response_id IS NOT NULL
+)
+
+SELECT
+'persons_without_basics_in_rdr' as issue,
+COUNT(DISTINCT person_id) as n
+FROM `{{project_id}}.{{new_rdr}}.person`
+WHERE person_id NOT IN (SELECT person_id FROM pids_with_basics)
+
""")
-query = tpl.render(DATASET_ID=new_rdr,
-                   BASICS_MODULE_CONCEPT_ID=BASICS_MODULE_CONCEPT_ID)
+query = tpl.render(project_id=project_id, new_rdr=new_rdr)
execute(client, query)
-# -

# # Date conformance check
# COPE surveys contain some concepts that must enforce dates in the observation.value_as_string field.

@@ -832,26 +879,6 @@
query = tpl.render(new_rdr=new_rdr, project_id=project_id)
execute(client, query)

-# # Checks for basics survey module
-# Participants with data in other survey modules must also have data from the basics survey module.
-# This check identifies survey responses associated with participants that do not have any responses
-# associated with the basics survey module.
-# In ideal circumstances, this query will not return any results.
- -tpl = JINJA_ENV.from_string(''' -SELECT DISTINCT person_id FROM `{{project_id}}.{{new_rdr}}.observation` -JOIN `{{project_id}}.{{new_rdr}}.concept` on (observation_source_concept_id=concept_id) -WHERE vocabulary_id = 'PPI' AND person_id NOT IN ( -SELECT DISTINCT person_id FROM `{{project_id}}.{{new_rdr}}.concept` -JOIN `{{project_id}}.{{new_rdr}}.concept_ancestor` on (concept_id=ancestor_concept_id) -JOIN `{{project_id}}.{{new_rdr}}.observation` on (descendant_concept_id=observation_concept_id) -WHERE concept_class_id='Module' -AND concept_name IN ('The Basics') -AND questionnaire_response_id IS NOT NULL) -''') -query = tpl.render(new_rdr=new_rdr, project_id=project_id) -execute(client, query) - # ## Participants must be 18 years of age or older to consent # # AOU participants are required to be 18+ years of age at the time of consent @@ -901,6 +928,8 @@ # According to [this ticket](https://precisionmedicineinitiative.atlassian.net/browse/DC-1792), # the RDR export should not contain some operational concepts that are irrelevant to researchers. # Any violations should be reported to the RDR team. +# +# If the check fails with a 403 error, run it in the Bigquery UI. tpl = JINJA_ENV.from_string(""" SELECT @@ -1061,89 +1090,7 @@ # - -# # COPE survey mapping - -# There is a known issue that COPE survey questions all map to the module -# 1333342 (COPE survey with no version specified). This check is to confirm -# if this issue still exists in the vocabulary or not. -# If this issue is fixed, each COPE survey questions will have mapping to -# individual COPE survey modules and will no longer have mapping to 1333342. -# cope_question_concept_ids are collected using the SQL listed in DC-2641: -# [DC-2641](https://precisionmedicineinitiative.atlassian.net/browse/DC-2641). 
- -cope_question_concept_ids = [ - 596884, 596885, 596886, 596887, 596888, 702686, 713888, 715711, 715713, - 715714, 715719, 715720, 715721, 715722, 715723, 715724, 715725, 715726, - 903629, 903630, 903631, 903632, 903633, 903634, 903635, 903641, 903642, - 1310051, 1310052, 1310053, 1310054, 1310056, 1310058, 1310060, 1310062, - 1310065, 1310066, 1310067, 1310132, 1310133, 1310134, 1310135, 1310136, - 1310137, 1310138, 1310139, 1310140, 1310141, 1310142, 1310144, 1310145, - 1310146, 1310147, 1310148, 1332734, 1332735, 1332737, 1332738, 1332739, - 1332741, 1332742, 1332744, 1332745, 1332746, 1332747, 1332748, 1332749, - 1332750, 1332751, 1332752, 1332753, 1332754, 1332755, 1332756, 1332762, - 1332763, 1332767, 1332769, 1332792, 1332793, 1332794, 1332795, 1332796, - 1332797, 1332800, 1332801, 1332802, 1332803, 1332804, 1332805, 1332806, - 1332807, 1332808, 1332819, 1332820, 1332822, 1332824, 1332826, 1332828, - 1332829, 1332830, 1332831, 1332832, 1332833, 1332835, 1332843, 1332847, - 1332848, 1332849, 1332853, 1332854, 1332861, 1332862, 1332863, 1332866, - 1332867, 1332868, 1332869, 1332870, 1332871, 1332872, 1332874, 1332876, - 1332878, 1332880, 1332935, 1332937, 1332944, 1332998, 1333004, 1333011, - 1333012, 1333013, 1333014, 1333015, 1333016, 1333017, 1333018, 1333019, - 1333020, 1333021, 1333022, 1333023, 1333024, 1333102, 1333104, 1333105, - 1333118, 1333119, 1333120, 1333121, 1333156, 1333163, 1333164, 1333165, - 1333166, 1333167, 1333168, 1333182, 1333183, 1333184, 1333185, 1333186, - 1333187, 1333188, 1333189, 1333190, 1333191, 1333192, 1333193, 1333194, - 1333195, 1333200, 1333216, 1333221, 1333234, 1333235, 1333274, 1333275, - 1333276, 1333277, 1333278, 1333279, 1333280, 1333281, 1333285, 1333286, - 1333287, 1333288, 1333289, 1333291, 1333292, 1333293, 1333294, 1333295, - 1333296, 1333297, 1333298, 1333299, 1333300, 1333301, 1333303, 1333311, - 1333312, 1333313, 1333314, 1333324, 1333325, 1333326, 1333327, 1333328 -] - -tpl = JINJA_ENV.from_string(""" -WITH question_topic_module AS ( - SELECT - cr1.concept_id_1 AS question, - cr1.concept_id_2 AS topic, - cr2.concept_id_2 AS module - FROM `{{projcet_id}}.{{dataset}}.concept_relationship` cr1 - JOIN `{{projcet_id}}.{{dataset}}.concept` c1 ON cr1.concept_id_2 = c1.concept_id - JOIN `{{projcet_id}}.{{dataset}}.concept_relationship` cr2 ON c1.concept_id = cr2.concept_id_1 - JOIN `{{projcet_id}}.{{dataset}}.concept` c2 ON cr2.concept_id_2 = c2.concept_id - WHERE cr1.concept_id_1 IN ({{cope_question_concept_ids}}) - AND c1.concept_class_id = 'Topic' - AND c2.concept_class_id = 'Module' -) -SELECT DISTINCT question FROM question_topic_module -WHERE module = 1333342 -""") -query = tpl.render( - new_rdr=new_rdr, - project_id=project_id, - dataset=new_rdr, - cope_question_concept_ids=", ".join( - str(concept_id) for concept_id in cope_question_concept_ids)) -df = execute(client, query) - -# + -success_msg = ''' - The mapping issue is resolved. Double-check each concept is mapped to individual COPE module. - Once we double-checked it, we can remove this QC from this notebook. -''' -failure_msg = ''' - The mapping issue still exists. There are {code_count} concepts for COPE questions - that map to 1333342. Notify Odysseus that the issue still persists. - For pipeline, we can use cope_survey_semantic_version_map to diffrentiate COPE module versions, - so we can still move on. See DC-2641 for detail. 
-''' - -render_message(df, - success_msg, - failure_msg, - failure_msg_args={'code_count': len(df)}) -# - - -# ### RDR date cutoff check +# ## RDR date cutoff check # Check that survey dates are not beyond the RDR cutoff date, also check observation. query = JINJA_ENV.from_string(""" @@ -1224,7 +1171,7 @@ ''' success_msg = 'All death records follow the technical requirement for the CDR V8 release.' failure_msg = ''' - {code_count} records do not follow the technical requirement for the CDR V8 release. + {code_count} records do not follow the technical requirement for the CDR V8 release. Investigate and confirm if (a) bad data is coming from RDR, (b) the requirement has changed, or (c) something else. ''' render_message(df_if_empty, success_msg_if_empty, failure_msg_if_empty) @@ -1238,3 +1185,132 @@ success_msg, failure_msg, failure_msg_args={'code_count': len(df)}) +# - + +# # Remote self-reported physical measurement (PM) + +# From CDR V8, Curation receives self-reported height and weight records from RDR. +# We calculate BMI using those records at RDR data stage. +# We must ensure (1) the BMI is calculated correctly, and (2) self-reported +# height/weight/BMI records are cleaned in the same way as in-person PM. + +# + +query_extreme_check = JINJA_ENV.from_string(""" +SELECT * FROM `{{project_id}}.{{dataset}}.measurement` m +WHERE (m.measurement_source_concept_id = 903133 AND m.value_as_number NOT BETWEEN 90 AND 228) +OR (m.measurement_source_concept_id = 903121 AND m.value_as_number NOT BETWEEN 30 AND 250) +OR (m.measurement_source_concept_id = 903124 AND m.value_as_number NOT BETWEEN 10 AND 125) +""").render(project_id=project_id, dataset=new_rdr) +df_extreme_check = execute(client, query_extreme_check) + +query_duplicate_check = JINJA_ENV.from_string(""" +SELECT person_id, measurement_source_concept_id, measurement_type_concept_id, COUNT(*) +FROM `{{project_id}}.{{dataset}}.measurement` m +WHERE measurement_source_concept_id IN (903133, 903121, 903124) +GROUP BY person_id, measurement_source_concept_id, measurement_type_concept_id +HAVING COUNT(*) > 1 +""").render(project_id=project_id, dataset=new_rdr) +df_duplicate_check = execute(client, query_duplicate_check) + +query = JINJA_ENV.from_string(""" +WITH clean_rdr_height AS ( + SELECT COUNT(*) AS row_count FROM `{{project_id}}.{{dataset}}.measurement` + WHERE measurement_type_concept_id = 32865 AND measurement_concept_id = 3036277 +), clean_rdr_weight AS ( + SELECT COUNT(*) AS row_count FROM `{{project_id}}.{{dataset}}.measurement` + WHERE measurement_type_concept_id = 32865 AND measurement_concept_id = 3025315 +), clean_rdr_bmi AS ( + SELECT COUNT(*) AS row_count FROM `{{project_id}}.{{dataset}}.measurement` + WHERE measurement_type_concept_id = 32865 AND measurement_concept_id = 3038553 +), raw_rdr_height AS ( + SELECT COUNT(*) AS row_count FROM `{{project_id}}.{{raw_dataset}}.measurement` + WHERE measurement_type_concept_id = 32865 AND measurement_concept_id = 3036277 +), raw_rdr_weight AS ( + SELECT COUNT(*) AS row_count FROM `{{project_id}}.{{raw_dataset}}.measurement` + WHERE measurement_type_concept_id = 32865 AND measurement_concept_id = 3025315 +), raw_rdr_bmi AS ( + SELECT COUNT(*) AS row_count FROM `{{project_id}}.{{raw_dataset}}.measurement` + WHERE measurement_type_concept_id = 32865 AND measurement_concept_id = 3038553 +) +SELECT + clean_rdr_height.row_count AS clean_height_rows, + clean_rdr_weight.row_count AS clean_weight_rows, + clean_rdr_bmi.row_count AS clean_bmi_rows, + raw_rdr_height.row_count AS raw_height_rows, 
+query = JINJA_ENV.from_string("""
+WITH clean_rdr_height AS (
+  SELECT COUNT(*) AS row_count FROM `{{project_id}}.{{dataset}}.measurement`
+  WHERE measurement_type_concept_id = 32865 AND measurement_concept_id = 3036277
+), clean_rdr_weight AS (
+  SELECT COUNT(*) AS row_count FROM `{{project_id}}.{{dataset}}.measurement`
+  WHERE measurement_type_concept_id = 32865 AND measurement_concept_id = 3025315
+), clean_rdr_bmi AS (
+  SELECT COUNT(*) AS row_count FROM `{{project_id}}.{{dataset}}.measurement`
+  WHERE measurement_type_concept_id = 32865 AND measurement_concept_id = 3038553
+), raw_rdr_height AS (
+  SELECT COUNT(*) AS row_count FROM `{{project_id}}.{{raw_dataset}}.measurement`
+  WHERE measurement_type_concept_id = 32865 AND measurement_concept_id = 3036277
+), raw_rdr_weight AS (
+  SELECT COUNT(*) AS row_count FROM `{{project_id}}.{{raw_dataset}}.measurement`
+  WHERE measurement_type_concept_id = 32865 AND measurement_concept_id = 3025315
+), raw_rdr_bmi AS (
+  SELECT COUNT(*) AS row_count FROM `{{project_id}}.{{raw_dataset}}.measurement`
+  WHERE measurement_type_concept_id = 32865 AND measurement_concept_id = 3038553
+)
+SELECT
+  clean_rdr_height.row_count AS clean_height_rows,
+  clean_rdr_weight.row_count AS clean_weight_rows,
+  clean_rdr_bmi.row_count AS clean_bmi_rows,
+  raw_rdr_height.row_count AS raw_height_rows,
+  raw_rdr_weight.row_count AS raw_weight_rows,
+  raw_rdr_bmi.row_count AS raw_bmi_rows
+FROM clean_rdr_height
+CROSS JOIN clean_rdr_weight
+CROSS JOIN clean_rdr_bmi
+CROSS JOIN raw_rdr_height
+CROSS JOIN raw_rdr_weight
+CROSS JOIN raw_rdr_bmi
+""").render(project_id=project_id, dataset=new_rdr, raw_dataset=raw_rdr)
+df = execute(client, query)
+(clean_height_rows, clean_weight_rows, clean_bmi_rows, raw_height_rows,
+ raw_weight_rows,
+ raw_bmi_rows) = (df.clean_height_rows[0], df.clean_weight_rows[0],
+                  df.clean_bmi_rows[0], df.raw_height_rows[0],
+                  df.raw_weight_rows[0], df.raw_bmi_rows[0])
+
+success_msg_extreme_check = 'No extreme height/weight/BMI record is found in measurement.'
+failure_msg_extreme_check = '''
+    Extreme height/weight/BMI records are found in measurement. Ensure DropExtremeMeasurements ran as expected.
+    Investigate why the extreme records are not dropped as designed.
+'''
+success_msg_duplicate_check = 'Height/weight/BMI records are correctly limited to at most one record per person per measurement type (in-person or self-reported).'
+failure_msg_duplicate_check = '''
+    Some participants still have multiple height/weight/BMI records for in-person or self-reported PM.
+    DropMultipleMeasurements is supposed to clean such records. Ensure the CR ran as expected and investigate why there are still duplicates.
+'''
+
+render_message(df_extreme_check, success_msg_extreme_check,
+               failure_msg_extreme_check)
+
+render_message(df_duplicate_check, success_msg_duplicate_check,
+               failure_msg_duplicate_check)
+
+check_status = "Cannot tell success or failure. Read the message below."
+msg = (
+    f"In {raw_rdr} (Raw RDR), the numbers of self-reported PM records are as follows: <br>"
+    f"height - {raw_height_rows}, weight - {raw_weight_rows}, BMI - {raw_bmi_rows} <br>"
+    f"In {new_rdr} (Clean RDR), the numbers of self-reported PM records are as follows: <br>"
+    f"height - {clean_height_rows}, weight - {clean_weight_rows}, BMI - {clean_bmi_rows} <br><br>"
+    "Check the following: <br>"
+    "(1) Raw RDR must have 0 BMI records, <br>"
+    "(2) Clean RDR must have fewer height/weight records than Raw RDR, because we ran DropExtremeMeasurements and DropMultipleMeasurements, <br>"
+    "(3) Clean RDR must have about as many BMI records as its height/weight records. The numbers do not have to match exactly because of the "
+    "CRs mentioned in (2) or some participants missing height/weight. They have to be close enough. <br>"
+    "If any of (1) - (3) does not look good, investigate and discuss within Curation to decide next steps.")
+
+display(
+    HTML(f'''
+        <h3>Check Status: {check_status}</h3>
+        <p>{msg}</p>
''' + )) +# - + +# # Visual check survey_conduct record drop +# +# Review the results. Some surveys are expected to be dropped. +# Investigate any potential issues. Overly extensive cleaning, missing surveys, etc. + +tpl = JINJA_ENV.from_string(''' +WITH raw_survey AS (SELECT survey_source_value as raw_survey, survey_concept_id, COUNT(survey_conduct_id) as raw_count + FROM `{{project_id}}.{{raw_rdr}}.survey_conduct` + WHERE NOT (REGEXP_CONTAINS(survey_source_value,'(?i)SNAP|cope')) + GROUP BY 1, 2), +clean_survey AS (SELECT survey_source_value as clean_survey, survey_concept_id, COUNT(survey_conduct_id) as clean_count + FROM `{{project_id}}.{{new_rdr}}.survey_conduct` + WHERE NOT (REGEXP_CONTAINS(survey_source_value,'(?i)SNAP|cope')) + GROUP BY 1, 2) +SELECT *, clean_count - raw_count as cleaned_records +FROM raw_survey +FULL OUTER JOIN clean_survey +USING (survey_concept_id) +ORDER BY 6 +''') +query = tpl.render(new_rdr=new_rdr, raw_rdr=raw_rdr, project_id=project_id) +execute(client, query) + + diff --git a/data_steward/analytics/cdr_ops/combined.py b/data_steward/analytics/cdr_ops/combined.py index 045751db07..a075698082 100644 --- a/data_steward/analytics/cdr_ops/combined.py +++ b/data_steward/analytics/cdr_ops/combined.py @@ -25,6 +25,7 @@ from common import JINJA_ENV, MAPPED_CLINICAL_DATA_TABLES from cdr_cleaner.cleaning_rules.negative_ages import date_fields +from cdr_cleaner.cleaning_rules.remove_ehr_data_without_consent import EHR_UNCONSENTED_PARTICIPANTS_LOOKUP_TABLE as UNCONSENTED from utils import auth from gcloud.bq import BigQueryClient from analytics.cdr_ops.notebook_utils import execute, IMPERSONATION_SCOPES, render_message @@ -50,7 +51,7 @@ # ## Check for duplicates across all unique identifier fields. # This query gathers any duplicates of the {table}_id from each OMOP table listed. -# The OMOP tables `death` and `fact_relationship` are excluded from the check because they do not have primary key fields. +# The OMOP tables `death` and `fact_relationship` are excluded from the check because they do not have primary key fields. `aou_death` is also excluded since it uses 'uuid'. # The output of this query should be empty. If any duplicates are found there may be a bug in the pipeline. # # Specific to duplicates in observation:
@@ -60,6 +61,7 @@
# If any duplicates are found there may be a bug in the pipeline -
# particularly in `ehr_union.move_ehr_person_to_observation`.
+
query = f"""
DECLARE i INT64 DEFAULT 0;
DECLARE tables ARRAY<STRING>;
@@ -68,7 +70,7 @@
"care_site", "condition_era", "device_cost", "device_exposure", "dose_era",
"drug_exposure", "location", "measurement", "note", "note_nlp", "person",
"procedure_cost", "procedure_occurrence", "provider", "specimen",
-"survey_conduct", "visit_cost", "visit_detail", "visit_occurrence", "aou_death"];
+"survey_conduct", "visit_cost", "visit_detail", "visit_occurrence"];

CREATE TEMPORARY TABLE non_unique_primary_keys(table_name STRING, key_column int64);

@@ -120,7 +122,7 @@
SELECT
 "{{table_name}}" AS table_name
,"{{date_field}}" AS date_field
-,t.{{date_field}} AS date_value
+,DATE(t.{{date_field}}) AS date_value
,p.birth_datetime AS birth_datetime
FROM `{{dataset_id}}.{{table_name}}` t
JOIN `{{dataset_id}}.person` p
@@ -128,17 +130,21 @@
WHERE
(
    -- age <= 0y --
-    t.{{date_field}} < DATE(p.birth_datetime)
+    DATE(t.{{date_field}}) < DATE(p.birth_datetime)
    -- age >= 150y --
-    OR {{PIPELINE_TABLES}}.calculate_age(t.{{date_field}}, EXTRACT(DATE FROM p.birth_datetime)) >= 150
+    OR pipeline_tables.calculate_age(DATE(t.{{date_field}}), EXTRACT(DATE FROM p.birth_datetime)) >= 150
)
+AND
+p.birth_datetime IS NOT NULL
+AND
+t.{{date_field}} IS NOT NULL
{% if not loop.last -%} UNION ALL {% endif %}
{% endfor %}
''')
-query = tpl.render(dataset_id=DATASET_ID, date_fields=date_fields)
+query = tpl.render(dataset_id=DATASET_ID, date_fields=date_fields, PIPELINE_TABLES="pipeline_tables")
execute(client, query)

# ## PPI records should never follow death date

@@ -146,7 +152,6 @@
# +
query = JINJA_ENV.from_string("""
-query = f'''
WITH
 ppi_concept AS
@@ -251,7 +256,7 @@
    || 'USING (' || table_name ||'_id) '
    || 'LEFT JOIN consented c '
    || ' USING (person_id)'
-    || 'WHERE m.src_hpo_id <> "rdr" AND c.person_id IS NULL)'
+    || 'WHERE m.src_hpo_id NOT IN (\\"ce\\", \\"vibrent\\", \\"healthpro\\") AND c.person_id IS NULL)'
    , ' UNION ALL ')
FROM `{{DATASET_ID}}.INFORMATION_SCHEMA.COLUMNS` c
JOIN `{{DATASET_ID}}.__TABLES__` t
@@ -307,7 +312,7 @@
(SELECT
  d.table_schema AS table_schema
 ,d.table_name AS table_name
-,pk.column_name AS key_field
+,CASE WHEN pk.column_name = 'aou_death_id' THEN '0' ELSE pk.column_name END AS key_field
 ,d.column_name AS date_field
 ,ts.column_name AS timestamp_field
FROM `{DATASET_ID}.INFORMATION_SCHEMA.COLUMNS` d
@@ -456,7 +461,7 @@
cols = 3
rows = math.ceil(total_plots / cols)

-fig, axes = plt.subplots(rows, cols, figsize=(5, 5), squeeze=False)
+fig, axes = plt.subplots(rows, cols, figsize=(10, 10), squeeze=False)

k = 0
while k < total_plots:
@@ -767,21 +772,21 @@ def verify_dataset_labels(dataset):
# +
query = JINJA_ENV.from_string("""
WITH qc_aou_death AS (
-    SELECT
-        aou_death_id,
+    SELECT
+        aou_death_id,
        CASE WHEN aou_death_id IN (
-            SELECT aou_death_id FROM `{{project_id}}.{{dataset_id}}.aou_death`
+            SELECT aou_death_id FROM `{{project_id}}.{{dataset}}.aou_death`
            WHERE death_date IS NOT NULL -- NULL death_date records must not become primary --
            QUALIFY RANK() OVER (
-                PARTITION BY person_id
+                PARTITION BY person_id
                ORDER BY
                    LOWER(src_id) NOT LIKE '%healthpro%' DESC, -- EHR records are chosen over HealthPro ones --
                    death_date ASC, -- Earliest death_date records are chosen over later ones --
                    death_datetime ASC NULLS LAST, -- Earliest non-NULL death_datetime records are chosen over later or NULL ones --
                    src_id ASC -- EHR site that alphabetically comes first is chosen --
-            ) = 1
+            ) = 1
        ) THEN TRUE ELSE FALSE END AS primary_death_record
-    FROM `{{project}}.{{dataset}}.aou_death`
+    FROM `{{project}}.{{dataset}}.aou_death`
)
SELECT ad.aou_death_id
FROM `{{project_id}}.{{dataset}}.aou_death` ad
@@ -793,7 +798,7 @@ def verify_dataset_labels(dataset):
success_msg = 'All death records have the correct `primary_death_record` values.'
failure_msg = '''
-    {code_count} records do not have the correct `primary_death_record` values.
+    {code_count} records do not have the correct `primary_death_record` values.
    Investigate and confirm if (a) any logic is incorrect, (b) the requirement has changed, or (c) something else.
'''
render_message(df,
@@ -828,9 +833,9 @@ def verify_dataset_labels(dataset):
ext_template = JINJA_ENV.from_string("""
SELECT
  table_id
-FROM
+FROM
  `{{project_id}}.{{dataset}}.__TABLES__`
-WHERE
+WHERE
  table_id LIKE '%_ext%'
""")
ext_tables_query = ext_template.render(project_id=PROJECT_ID,
@@ -847,7 +852,7 @@ def verify_dataset_labels(dataset):
  `{{project_id}}.{{dataset}}.{{table_name}}`
WHERE
  NOT REGEXP_CONTAINS(src_id, r'(?i)(Portal)|(EHR site)')
-OR
+OR
  src_id IS NULL
GROUP BY 1,2
""")
@@ -857,3 +862,45 @@ def verify_dataset_labels(dataset):
    result.append(query)
results = '\nUNION ALL\n'.join(result)
execute(client, results)
+
+# ## Verify no participant in the pdr_ehr_dup_report list has EHR data.
+#
+# Curation will receive a table of unconsented PIDs, stored in the table `...combined_sandbox._ehr_unconsented_pids`, from the PDR. This check verifies that no mapped EHR table contains these PIDs. If any of these PIDs are found, a failure is raised; otherwise the check passes. See [DC-3435](https://precisionmedicineinitiative.atlassian.net/browse/DC-3435)
+
+unconsented_records_tpl = JINJA_ENV.from_string("""
+SELECT \'{{domain_table}}\' AS domain_table, person_id
+FROM
+  `{{project}}.{{dataset}}.{{domain_table}}` d
+JOIN
+  `{{project}}.{{dataset}}.{{mapping_domain_table}}` md
+USING
+  ({{domain_table}}_id)
+WHERE
+  person_id IN (
+    SELECT
+      person_id
+    FROM
+      `{{project}}.{{sandbox_dataset}}.{{unconsented_lookup}}`)
+  AND src_dataset_id LIKE '%ehr%'
+
+""")

+# +
+success_msg_if_empty = "All PIDs with EHR data are found to have consented."
+failure_msg_if_empty = "EHR data is found for PIDs who have not consented to contribute EHR data."
+
+sub_queries = []
+for table in MAPPED_CLINICAL_DATA_TABLES:
+    query = unconsented_records_tpl.render(
+        project=PROJECT_ID,
+        dataset=DATASET_ID,
+        domain_table=table,
+        mapping_domain_table=f'_mapping_{table}',
+        sandbox_dataset=f'{DATASET_ID}_sandbox',
+        unconsented_lookup=UNCONSENTED)
+
+    sub_queries.append(query)
+
+full_query = '\nUNION ALL\n'.join(sub_queries)
+result = execute(client, full_query)
+render_message(result, success_msg_if_empty, failure_msg_if_empty)
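# Editor's follow-up sketch, not part of this changeset: when the check above fails,
# a per-table summary of the offending rows helps scope the investigation. This
# assumes `result` is the pandas DataFrame returned by execute() above.

if not result.empty:
    # Count distinct unconsented participants per affected domain table.
    summary = (result.groupby('domain_table')['person_id']
                     .nunique()
                     .sort_values(ascending=False))
    print(summary)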
diff --git a/data_steward/analytics/cdr_ops/controlled_tier_qc/README.md b/data_steward/analytics/cdr_ops/controlled_tier_qc/README.md
index 7a7159d656..77d4ec1d31 100644
--- a/data_steward/analytics/cdr_ops/controlled_tier_qc/README.md
+++ b/data_steward/analytics/cdr_ops/controlled_tier_qc/README.md
@@ -4,7 +4,7 @@
There are 2 notebooks for the controlled tier QC process.
1. check_controlled_tier.py
This notebook is the main notebook for the controlled tier QC.
It covers the quality checks from the following tickets: [DC-1370](https://precisionmedicineinitiative.atlassian.net/browse/DC-1370), [DC-1377](https://precisionmedicineinitiative.atlassian.net/browse/DC-1377), [DC-1346](https://precisionmedicineinitiative.atlassian.net/browse/DC-1346), [DC-1348](https://precisionmedicineinitiative.atlassian.net/browse/DC-1348), [DC-1355](https://precisionmedicineinitiative.atlassian.net/browse/DC-1355), [DC-1357](https://precisionmedicineinitiative.atlassian.net/browse/DC-1357), [DC-1359](https://precisionmedicineinitiative.atlassian.net/browse/DC-1359), [DC-1362](https://precisionmedicineinitiative.atlassian.net/browse/DC-1362), [DC-1364](https://precisionmedicineinitiative.atlassian.net/browse/DC-1364), [DC-1366](https://precisionmedicineinitiative.atlassian.net/browse/DC-1366),
-[DC-1368](https://precisionmedicineinitiative.atlassian.net/browse/DC-1368), [DC-1373](https://precisionmedicineinitiative.atlassian.net/browse/DC-1373), [DC-1382](https://precisionmedicineinitiative.atlassian.net/browse/DC-1382), [DC-1388](https://precisionmedicineinitiative.atlassian.net/browse/DC-1388), [DC-1496](https://precisionmedicineinitiative.atlassian.net/browse/DC-1496), [DC-1527](https://precisionmedicineinitiative.atlassian.net/browse/DC-1527), [DC-1535](https://precisionmedicineinitiative.atlassian.net/browse/DC-1535), [DC-2112](https://precisionmedicineinitiative.atlassian.net/browse/DC-2112)
+[DC-1368](https://precisionmedicineinitiative.atlassian.net/browse/DC-1368), [DC-1373](https://precisionmedicineinitiative.atlassian.net/browse/DC-1373), [DC-1388](https://precisionmedicineinitiative.atlassian.net/browse/DC-1388), [DC-1496](https://precisionmedicineinitiative.atlassian.net/browse/DC-1496), [DC-1535](https://precisionmedicineinitiative.atlassian.net/browse/DC-1535), [DC-2112](https://precisionmedicineinitiative.atlassian.net/browse/DC-2112)

2. check_controlled_tier_covid_concept_no_suppression.py
This notebook is for [DC-2119](https://precisionmedicineinitiative.atlassian.net/browse/DC-2119).
DC-2119 is not included in `check_controlled_tier.py` because of the following reasons: diff --git a/data_steward/analytics/cdr_ops/controlled_tier_qc/check_controlled_tier.py b/data_steward/analytics/cdr_ops/controlled_tier_qc/check_controlled_tier.py index 01d03456c7..16ef5377ad 100644 --- a/data_steward/analytics/cdr_ops/controlled_tier_qc/check_controlled_tier.py +++ b/data_steward/analytics/cdr_ops/controlled_tier_qc/check_controlled_tier.py @@ -30,9 +30,11 @@ pd.set_option('display.width', None) # If you want to ignore specific QC rule(s): Remove those rules from to_include -to_include = ['DC-1370', 'DC-1377', 'DC-1346', 'DC-1348', 'DC-1355', 'DC-1357', 'DC-1359', - 'DC-1362', 'DC-1364', 'DC-1366', 'DC-1368', 'DC-1373', 'DC-1382', 'DC-1388', - 'DC-1496', 'DC-1527', 'DC-1535', 'DC-2112'] +to_include = [ + 'DC-1370', 'DC-1377', 'DC-1346', 'DC-1348', 'DC-1355', 'DC-1357', 'DC-1359', + 'DC-1362', 'DC-1364', 'DC-1366', 'DC-1368', 'DC-1373', 'DC-1388', 'DC-1496', + 'DC-1535', 'DC-2112' +] checks = run_qc(project_id, post_deid_dataset, pre_deid_dataset, @@ -96,10 +98,6 @@ display_check_detail_of_rule(checks, 'DC-1373', to_include) -# # [DC-1382: Record Suppression of some cancer condition](https://precisionmedicineinitiative.atlassian.net/browse/DC-1382) - -display_check_detail_of_rule(checks, 'DC-1382', to_include) - # # [DC-1388: Free Text survey response are suppressed](https://precisionmedicineinitiative.atlassian.net/browse/DC-1388) display_check_detail_of_rule(checks, 'DC-1388', to_include) @@ -108,10 +106,6 @@ display_check_detail_of_rule(checks, 'DC-1496', to_include) -# # [DC-1527: Suppression of organ transplant rows](https://precisionmedicineinitiative.atlassian.net/browse/DC-1527) - -display_check_detail_of_rule(checks, 'DC-1527', to_include) - # # [DC-1535: Suppression of geolocation records](https://precisionmedicineinitiative.atlassian.net/browse/DC-1535) display_check_detail_of_rule(checks, 'DC-1535', to_include) diff --git a/data_steward/analytics/cdr_ops/controlled_tier_qc/check_controlled_tier_part2.py b/data_steward/analytics/cdr_ops/controlled_tier_qc/check_controlled_tier_part2.py index 6f4dafda12..9a5c4ff0ce 100644 --- a/data_steward/analytics/cdr_ops/controlled_tier_qc/check_controlled_tier_part2.py +++ b/data_steward/analytics/cdr_ops/controlled_tier_qc/check_controlled_tier_part2.py @@ -52,12 +52,16 @@ # 14 all other observation concept ids WITH dates similar to birth dates other than the 3 above should be removed # # 15 All the descendants of ancestor_concept_id IN (4054924, 141771) -- motor vehicle accidents should be dropped in condition_occurrence table +# +# 19 Test for the presences of at least one race/ethnicity sub category +# # + tags=["parameters"] # Parameters project_id = "" rt_dataset = "" ct_dataset = "" +deid_sandbox = "" earliest_ehr_date = "" cut_off_date = "" @@ -76,23 +80,14 @@ df = pd.DataFrame(columns=['query', 'result']) # wear_consent and wear_consent_ptsc question and module concepts where not in multiple surveys. 
-# The concepts found in multiple surveys are: 'resultsconsent_helpmewithconsent' and 'helpmewithconsent_name' -WEAR_SURVEY_CODES = ['havesmartphone', - 'wearwatch', - 'usetracker', - 'wear12months', - 'receivesms', - 'frequency', - 'agreetoshare', - 'onlyparticipantinhousehold', - 'haveaddress', - 'resultsconsent_wear', - 'email_help_consent', - 'timeofday', - 'wearconsent_signature', - 'wearconsent_todaysdate', - 'wear_consent', - 'wear_consent_ptsc'] +# The concepts found in multiple surveys are: 'resultsconsent_helpmewithconsent' and 'helpmewithconsent_name' +WEAR_SURVEY_CODES = [ + 'havesmartphone', 'wearwatch', 'usetracker', 'wear12months', 'receivesms', + 'frequency', 'agreetoshare', 'onlyparticipantinhousehold', 'haveaddress', + 'resultsconsent_wear', 'email_help_consent', 'timeofday', + 'wearconsent_signature', 'wearconsent_todaysdate', 'wear_consent', + 'wear_consent_ptsc' +] # # Query1: all the birthdates are set to 15th June of the birth year in person table # @@ -101,7 +96,7 @@ # step1 , to get the tables AND columns that have person_id, size >1 AND DATE columns AND save to a data frame query = JINJA_ENV.from_string(""" -SELECT +SELECT 'person' as table_name, 'birth_datetime' as column_name, count (*) as row_counts_failures @@ -150,7 +145,7 @@ column_name FROM `{{project_id}}.{{ct_dataset}}.INFORMATION_SCHEMA.COLUMNS` - WHERE + WHERE column_name='person_id' ), table2 AS ( SELECT @@ -158,28 +153,28 @@ row_count FROM `{{project_id}}.{{ct_dataset}}.__TABLES__` - WHERE + WHERE row_count>1) - + SELECT table_name, column_name FROM `{{project_id}}.{{ct_dataset}}.INFORMATION_SCHEMA.COLUMNS` c - WHERE + WHERE table_name IN ( SELECT DISTINCT table_name FROM table2 - WHERE + WHERE table_name IN ( SELECT DISTINCT table_name FROM table1)) - AND c.data_type IN ('DATE','TIMESTAMP') - AND NOT REGEXP_CONTAINS(column_name, r'(?i)(_PAR)') + AND c.data_type IN ('DATE','TIMESTAMP') + AND NOT REGEXP_CONTAINS(column_name, r'(?i)(_PAR)') AND NOT REGEXP_CONTAINS(column_name, r'(?i)(birth)') """) @@ -201,21 +196,21 @@ def my_sql(table_name, column_name): FROM {{project_id}}.{{rt_dataset}}.person JOIN - {{project_id}}.{{rt_dataset}}._deid_map + {{project_id}}.{{deid_sandbox}}._deid_map USING (person_id) - ) - -SELECT + ) + +SELECT '{{table_name}}' AS table_name, '{{column_name}}' AS column_name, COUNT(*) AS row_counts_failure, -CASE WHEN +CASE WHEN COUNT(*) > 0 THEN 1 ELSE 0 END AS Failure_birth_date - + FROM `{{project_id}}.{{ct_dataset}}.{{table_name}}` c JOIN rt_map r USING (person_id) WHERE DATE(c.{{column_name}})< r.birth_date @@ -223,6 +218,7 @@ def my_sql(table_name, column_name): q = query.render(project_id=project_id, rt_dataset=rt_dataset, ct_dataset=ct_dataset, + deid_sandbox=deid_sandbox, table_name=table_name, column_name=column_name) df11 = execute(client, q) @@ -276,14 +272,14 @@ def my_sql(table_name, column_name): def my_sql(table_name, column_name): query = JINJA_ENV.from_string(""" - + WITH df1 as ( -SELECT +SELECT person_id,c.{{column_name}}, DATE_ADD(d.death_date, INTERVAL 30 DAY) AS after_death_30_days - + FROM `{{project_id}}.{{ct_dataset}}.{{table_name}}` c -FULL JOIN `{{project_id}}.{{ct_dataset}}.aou_death` d USING (person_id) +FULL JOIN `{{project_id}}.{{ct_dataset}}.aou_death` d USING (person_id) WHERE DATE(c.{{column_name}}) > d.death_date AND d.primary_death_record = TRUE AND c.{{table_name}}_concept_id NOT IN (4013886, 4135376, 4271761) @@ -293,7 +289,7 @@ def my_sql(table_name, column_name): '{{table_name}}' AS table_name, '{{column_name}}' AS column_name, COUNT(*) AS 
row_counts_failure, -CASE WHEN +CASE WHEN COUNT(*) > 0 THEN 1 ELSE 0 END @@ -339,13 +335,13 @@ def my_sql(table_name, column_name): def my_sql(table_name, column_name): query = JINJA_ENV.from_string(""" - + WITH death_30_days as ( -SELECT +SELECT c.{{column_name}}, DATE_ADD(d.death_date, INTERVAL 30 DAY) AS after_death_30_days FROM `{{project_id}}.{{ct_dataset}}.{{table_name}}` c - JOIN `{{project_id}}.{{ct_dataset}}.aou_death` d USING (person_id) + JOIN `{{project_id}}.{{ct_dataset}}.aou_death` d USING (person_id) WHERE DATE(c.{{column_name}}) > d.death_date AND d.primary_death_record = TRUE ) @@ -354,7 +350,7 @@ def my_sql(table_name, column_name): '{{table_name}}' AS table_name, '{{column_name}}' AS column_name, COUNT(*) AS row_counts_failure, -CASE WHEN +CASE WHEN COUNT(*) > 0 THEN 1 ELSE 0 END @@ -418,12 +414,12 @@ def my_sql(table_name, column_name): def my_sql(table_name, column_name): query = JINJA_ENV.from_string(""" - -SELECT + +SELECT '{{table_name}}' AS table_name, '{{column_name}}' AS column_name, COUNT(*) AS row_counts_failure, -CASE WHEN +CASE WHEN COUNT(*) > 0 THEN 1 ELSE 0 END @@ -475,7 +471,13 @@ def my_sql(table_name, column_name): }, ignore_index=True) -# # query 7 All participants have basics,done +# # query 7 All participants have basics data +# +# There should not be any data in the CT datasets for participants who don't have basics data. +# +# If this check fails investigate. Ensure all participants lacking basics data have been dropped. +# +# Note: Since the CR `drop_participants_without_any_basics` occurs in the RDR cleaning stage it is possible that a small number of participants have their Basics data dropped between the rdr and CT pipeline stages. At this time(V8), participants without Basics data in the CT dataset are allowed in the CDR if they had Basics data in the rdr stage. 
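+# The check below is essentially an anti-join: count persons who appear in the
+# dataset but have no 'The Basics' module response in observation. A minimal
+# sketch of that shape, for orientation only -- the real query that follows
+# also unions every person_id-bearing table, excludes a few concepts, and
+# filters by src_id:
+basics_sketch = JINJA_ENV.from_string("""
+SELECT COUNT(*) AS persons_without_basics
+FROM `{{project_id}}.{{ct_dataset}}.person`
+WHERE person_id NOT IN (
+    SELECT person_id
+    FROM `{{project_id}}.{{ct_dataset}}.observation` o
+    JOIN `{{project_id}}.{{ct_dataset}}.concept_ancestor` ca
+        ON ca.descendant_concept_id = o.observation_concept_id
+    JOIN `{{project_id}}.{{ct_dataset}}.concept` c
+        ON c.concept_id = ca.ancestor_concept_id
+    WHERE c.concept_name = 'The Basics'
+        AND c.concept_class_id = 'Module')
+""").render(project_id=project_id, ct_dataset=ct_dataset)
+# execute(client, basics_sketch)  # optional: run the simplified version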
# + query = JINJA_ENV.from_string(""" @@ -484,28 +486,28 @@ def my_sql(table_name, column_name): person_basics as ( SELECT distinct person_id -FROM -`{{project_id}}.{{ct_dataset}}.concept` +FROM +`{{project_id}}.{{ct_dataset}}.concept` JOIN `{{project_id}}.{{ct_dataset}}.concept_ancestor` on (concept_id=ancestor_concept_id) JOIN `{{project_id}}.{{ct_dataset}}.observation` on (descendant_concept_id=observation_concept_id) JOIN `{{project_id}}.{{ct_dataset}}.observation_ext` USING(observation_id) -WHERE observation_concept_id NOT IN (40766240,43528428,1585389) +WHERE observation_concept_id NOT IN (40766240,43528428,1585389) AND concept_class_id='Module' -AND concept_name IN ('The Basics') +AND concept_name IN ('The Basics') AND NOT REGEXP_CONTAINS(src_id, r'(?i)(PPI/PM)|(EHR site)') AND questionnaire_response_id is not null) -SELECT +SELECT 'observation' AS table_name, 'person_id' AS column_name, COUNT(*) AS row_counts_failure, -CASE WHEN +CASE WHEN COUNT(*) > 0 THEN 1 ELSE 0 END AS Failure_bascis -FROM person_all +FROM person_all WHERE person_id NOT IN (SELECT person_id FROM person_basics) """) @@ -574,21 +576,21 @@ def my_sql(table_name, column_name): ), person_yes as ( - SELECT - distinct person_id - FROM + SELECT + distinct person_id + FROM `{{project_id}}.{{ct_dataset}}.observation` - WHERE + WHERE observation_concept_id = 1586099 - AND + AND value_source_concept_id = 1586100 ) -SELECT +SELECT 'person_ehr' AS table_name, 'person_id' AS column_name, COUNT(*) AS row_counts_failure, -CASE WHEN +CASE WHEN COUNT(*) > 0 THEN 1 ELSE 0 END @@ -634,7 +636,7 @@ def my_sql(table_name, column_name): column_name FROM `{{project_id}}.{{ct_dataset}}.INFORMATION_SCHEMA.COLUMNS` - WHERE + WHERE column_name='person_id' ), table2 AS ( SELECT @@ -642,51 +644,65 @@ def my_sql(table_name, column_name): row_count FROM `{{project_id}}.{{ct_dataset}}.__TABLES__` - WHERE + WHERE row_count>1) - + SELECT table_name, column_name FROM `{{project_id}}.{{ct_dataset}}.INFORMATION_SCHEMA.COLUMNS` c - WHERE + WHERE table_name IN ( SELECT DISTINCT table_name FROM table2 - WHERE + WHERE (table_name IN ( SELECT DISTINCT table_name FROM table1)) - AND REGEXP_CONTAINS(column_name, r'(?i)(_id)') - AND NOT REGEXP_CONTAINS(table_name, r'(?i)(person)') - AND NOT REGEXP_CONTAINS(column_name, r'(?i)(_PAR)') - AND NOT REGEXP_CONTAINS(column_name, r'(?i)(person_)') - AND NOT REGEXP_CONTAINS(column_name, r'(?i)(_concept)') + AND REGEXP_CONTAINS(column_name, r'(?i)(_id)') + AND NOT REGEXP_CONTAINS(table_name, r'(?i)(person)') + AND NOT REGEXP_CONTAINS(table_name, r'(?i)(aou_death)') + AND NOT REGEXP_CONTAINS(column_name, r'(?i)(_PAR)') + AND NOT REGEXP_CONTAINS(column_name, r'(?i)(person_)') + AND NOT REGEXP_CONTAINS(column_name, r'(?i)(_concept)') AND NOT REGEXP_CONTAINS(column_name, r'(?i)(_site)') - AND NOT REGEXP_CONTAINS(column_name, r'(?i)(provider)') - AND NOT REGEXP_CONTAINS(column_name, r'(?i)(response)') - AND NOT REGEXP_CONTAINS(column_name, r'(?i)(location)') - AND NOT REGEXP_CONTAINS(column_name, r'(?i)(source)') - AND NOT REGEXP_CONTAINS(column_name, r'(?i)(visit_occurrence)') - AND NOT REGEXP_CONTAINS(column_name, r'(?i)(unique)') + AND NOT REGEXP_CONTAINS(column_name, r'(?i)(provider)') + AND NOT REGEXP_CONTAINS(column_name, r'(?i)(response)') + AND NOT REGEXP_CONTAINS(column_name, r'(?i)(location)') + AND NOT REGEXP_CONTAINS(column_name, r'(?i)(source)') + AND NOT REGEXP_CONTAINS(column_name, r'(?i)(visit_occurrence)') + AND NOT REGEXP_CONTAINS(column_name, r'(?i)(visit_detail)') + AND NOT REGEXP_CONTAINS(column_name, 
r'(?i)(unique)') ) - + OR ( (table_name IN ( SELECT DISTINCT table_name FROM table1)) - AND REGEXP_CONTAINS(table_name, r'(?i)(visit)') - AND REGEXP_CONTAINS(column_name, r'(?i)(visit_occurrence)') + AND REGEXP_CONTAINS(table_name, r'(?i)(visit_occurrence)') + AND REGEXP_CONTAINS(column_name, r'(?i)(visit_occurrence)') AND NOT REGEXP_CONTAINS(column_name, r'(?i)(preceding)') ) OR ( + (table_name IN ( + SELECT + DISTINCT table_name + FROM + table1)) + AND REGEXP_CONTAINS(table_name, r'(?i)(visit_detail)') + AND REGEXP_CONTAINS(column_name, r'(?i)(visit_detail_id)') + AND NOT REGEXP_CONTAINS(column_name, r'(?i)(preceding)') ) + """) + +""" +OR ( (table_name IN ( SELECT DISTINCT table_name @@ -694,9 +710,9 @@ def my_sql(table_name, column_name): table1)) AND REGEXP_CONTAINS(table_name, r'(?i)(person)') AND NOT REGEXP_CONTAINS(table_name, r'(?i)(person_ext)') - AND REGEXP_CONTAINS(column_name, r'(?i)(person_id)') + AND REGEXP_CONTAINS(column_name, r'(?i)(person_id)') ) - """) +""" q = query.render(project_id=project_id, ct_dataset=ct_dataset) target_tables = execute(client, q) @@ -709,17 +725,17 @@ def my_sql(table_name, column_name): def my_sql(table_name, column_name): query = JINJA_ENV.from_string(""" - SELECT + SELECT '{{table_name}}' AS table_name, '{{column_name}}' AS column_name, COUNT(*) AS row_counts_failure, -CASE WHEN +CASE WHEN COUNT(*) > 0 THEN 1 ELSE 0 END AS Failure_primary_key_match - + FROM `{{project_id}}.{{ct_dataset}}.{{table_name}}` c JOIN `{{project_id}}.{{ct_dataset}}.{{table_name}}_ext` ext USING ({{column_name}}) WHERE c.{{column_name}} !=ext.{{column_name}} @@ -776,7 +792,7 @@ def my_sql(table_name, column_name): column_name FROM `{{project_id}}.{{ct_dataset}}.INFORMATION_SCHEMA.COLUMNS` - WHERE + WHERE column_name='person_id' ), table2 AS ( SELECT @@ -784,49 +800,73 @@ def my_sql(table_name, column_name): row_count FROM `{{project_id}}.{{ct_dataset}}.__TABLES__` WHERE row_count>1) - + SELECT table_name, column_name FROM `{{project_id}}.{{ct_dataset}}.INFORMATION_SCHEMA.COLUMNS` c - WHERE + WHERE table_name IN ( SELECT DISTINCT table_name FROM table2 - WHERE + WHERE (table_name IN ( SELECT DISTINCT table_name FROM table1)) - AND REGEXP_CONTAINS(column_name, r'(?i)(_id)') - AND NOT REGEXP_CONTAINS(table_name, r'(?i)(person)') - AND NOT REGEXP_CONTAINS(column_name, r'(?i)(_PAR)') - AND NOT REGEXP_CONTAINS(column_name, r'(?i)(person_)') - AND NOT REGEXP_CONTAINS(column_name, r'(?i)(_concept)') + AND REGEXP_CONTAINS(column_name, r'(?i)(_id)') + AND NOT REGEXP_CONTAINS(table_name, r'(?i)(person)') + AND NOT REGEXP_CONTAINS(table_name, r'(?i)(aou_death)') + AND NOT REGEXP_CONTAINS(column_name, r'(?i)(_PAR)') + AND NOT REGEXP_CONTAINS(column_name, r'(?i)(person_)') + AND NOT REGEXP_CONTAINS(column_name, r'(?i)(_concept)') AND NOT REGEXP_CONTAINS(column_name, r'(?i)(_site)') - AND NOT REGEXP_CONTAINS(column_name, r'(?i)(provider)') - AND NOT REGEXP_CONTAINS(column_name, r'(?i)(response)') - AND NOT REGEXP_CONTAINS(column_name, r'(?i)(location)') - AND NOT REGEXP_CONTAINS(column_name, r'(?i)(source)') - AND NOT REGEXP_CONTAINS(column_name, r'(?i)(visit_occurrence)') - AND NOT REGEXP_CONTAINS(column_name, r'(?i)(unique)') + AND NOT REGEXP_CONTAINS(column_name, r'(?i)(provider)') + AND NOT REGEXP_CONTAINS(column_name, r'(?i)(response)') + AND NOT REGEXP_CONTAINS(column_name, r'(?i)(location)') + AND NOT REGEXP_CONTAINS(column_name, r'(?i)(source)') + AND NOT REGEXP_CONTAINS(column_name, r'(?i)(visit_occurrence)') + AND NOT REGEXP_CONTAINS(column_name, r'(?i)(visit_detail)') + 
AND NOT REGEXP_CONTAINS(column_name, r'(?i)(unique)') ) - + OR ( (table_name IN ( SELECT DISTINCT table_name FROM table1)) - AND REGEXP_CONTAINS(table_name, r'(?i)(visit)') - AND REGEXP_CONTAINS(column_name, r'(?i)(visit_occurrence)') + AND REGEXP_CONTAINS(table_name, r'(?i)(visit_occurrence)') + AND REGEXP_CONTAINS(column_name, r'(?i)(visit_occurrence)') + AND NOT REGEXP_CONTAINS(column_name, r'(?i)(preceding)') ) + + OR ( + (table_name IN ( + SELECT + DISTINCT table_name + FROM + table1)) + AND REGEXP_CONTAINS(table_name, r'(?i)(visit_detail)') + AND REGEXP_CONTAINS(column_name, r'(?i)(visit_detail_id)') AND NOT REGEXP_CONTAINS(column_name, r'(?i)(preceding)') ) OR ( + (table_name IN ( + SELECT + DISTINCT table_name + FROM + table1)) + AND REGEXP_CONTAINS(table_name, r'(?i)(aou_death)') + AND REGEXP_CONTAINS(column_name, r'(?i)(aou_death_id)') + AND NOT REGEXP_CONTAINS(column_name, r'(?i)(src)') ) + """) + +""" +OR ( (table_name IN ( SELECT DISTINCT table_name @@ -834,9 +874,9 @@ def my_sql(table_name, column_name): table1)) AND REGEXP_CONTAINS(table_name, r'(?i)(person)') AND NOT REGEXP_CONTAINS(table_name, r'(?i)(person_ext)') - AND REGEXP_CONTAINS(column_name, r'(?i)(person_id)') + AND REGEXP_CONTAINS(column_name, r'(?i)(person_id)') ) - """) +""" q = query.render(project_id=project_id, ct_dataset=ct_dataset) target_tables = execute(client, q) @@ -849,14 +889,14 @@ def my_sql(table_name, column_name): def my_sql(table_name, column_name): query = JINJA_ENV.from_string(""" - -SELECT + +SELECT '{{table_name}}' AS table_name, '{{column_name}}' AS column_name, {{column_name}}, COUNT(*) AS row_counts_failure, -CASE WHEN +CASE WHEN COUNT(*) > 0 THEN 1 ELSE 0 END @@ -917,7 +957,7 @@ def my_sql(table_name, column_name): column_name FROM `{{project_id}}.{{ct_dataset}}.INFORMATION_SCHEMA.COLUMNS` - WHERE + WHERE column_name='person_id' ), table2 AS ( SELECT @@ -925,29 +965,29 @@ def my_sql(table_name, column_name): row_count FROM `{{project_id}}.{{ct_dataset}}.__TABLES__` - WHERE + WHERE row_count>1) - + SELECT table_name, column_name FROM `{{project_id}}.{{ct_dataset}}.INFORMATION_SCHEMA.COLUMNS` c - WHERE + WHERE table_name IN ( SELECT DISTINCT table_name FROM table2 - WHERE + WHERE table_name IN ( SELECT DISTINCT table_name FROM table1)) - AND REGEXP_CONTAINS(column_name, r'(?i)(_concept_id)') - AND NOT REGEXP_CONTAINS(column_name, r'(?i)(_PAR)') - AND NOT REGEXP_CONTAINS(column_name, r'(?i)(_source)') + AND REGEXP_CONTAINS(column_name, r'(?i)(_concept_id)') + AND NOT REGEXP_CONTAINS(column_name, r'(?i)(_PAR)') + AND NOT REGEXP_CONTAINS(column_name, r'(?i)(_source)') """) q = query.render(project_id=project_id, ct_dataset=ct_dataset) @@ -961,18 +1001,18 @@ def my_sql(table_name, column_name): def my_sql(table_name, column_name): query = JINJA_ENV.from_string(""" - -SELECT + +SELECT '{{table_name}}' AS table_name, '{{column_name}}' AS column_name, COUNT(*) AS row_counts_failure, -CASE WHEN +CASE WHEN COUNT(*) > 0 THEN 1 ELSE 0 END AS Failure_primary_key_match - + FROM `{{project_id}}.{{ct_dataset}}.concept` c JOIN `{{project_id}}.{{ct_dataset}}.{{table_name}}` ON (concept_id={{column_name}}) WHERE standard_concept !='S' @@ -1025,25 +1065,25 @@ def my_sql(table_name, column_name): query = JINJA_ENV.from_string(""" WITH rows_having_brith_date as ( - + SELECT distinct observation_id - FROM + FROM `{{project_id}}.{{rt_dataset}}.observation` ob JOIN {{project_id}}.{{rt_dataset}}.person p USING (person_id) WHERE observation_concept_id in (4013886, 4135376, 4271761) AND 
observation_date=DATE(p.birth_datetime) - ) + ) SELECT 'observation' AS table_name, 'observation_date' AS column_name, COUNT(*) AS row_counts_failure, -CASE WHEN +CASE WHEN COUNT(*) > 0 THEN 1 ELSE 0 END AS Failure_birth_date_cut_Off -FROM `{{project_id}}.{{ct_dataset}}.observation` +FROM `{{project_id}}.{{ct_dataset}}.observation` WHERE observation_id IN (SELECT observation_id FROM rows_having_brith_date) AND observation_date != '{{cut_off_date}}' """) @@ -1084,19 +1124,19 @@ def my_sql(table_name, column_name): query = JINJA_ENV.from_string(""" WITH rows_having_brith_date as ( - + SELECT observation_id FROM {{project_id}}.{{rt_dataset}}.observation ob JOIN {{project_id}}.{{rt_dataset}}.person p USING (person_id) WHERE observation_concept_id NOT IN (4013886, 4135376, 4271761) AND observation_date=DATE(p.birth_datetime) - ) + ) -SELECT +SELECT 'observation' AS table_name, 'observation_date' AS column_name, COUNT(*) AS row_counts_failure, -CASE WHEN +CASE WHEN COUNT(*) > 0 THEN 1 ELSE 0 END @@ -1142,12 +1182,12 @@ def my_sql(table_name, column_name): 'condition_occurrence' AS table_name, 'concept_id' AS column_name, COUNT(*) AS row_counts_failure, -CASE WHEN +CASE WHEN COUNT(*) > 0 THEN 1 ELSE 0 END AS Failure_no_two_concept_ids -FROM `{{project_id}}.{{ct_dataset}}.condition_occurrence` +FROM `{{project_id}}.{{ct_dataset}}.condition_occurrence` JOIN `{{project_id}}.{{ct_dataset}}.concept` c ON (condition_concept_id=c.concept_id) JOIN `{{project_id}}.{{ct_dataset}}.concept_ancestor` ON (c.concept_id=descendant_concept_id) WHERE ancestor_concept_id IN (4054924, 141771) @@ -1192,7 +1232,7 @@ def query_template(table_era): query = JINJA_ENV.from_string(""" WITH df1 AS ( - SELECT + SELECT `{{table_era}}_id` FROM `{{project_id}}.{{ct_dataset}}.{{table_era}}` @@ -1256,7 +1296,7 @@ def query_template(table_era): # 2. Wear study participants are also found in the CDR person table. # 3. Wear study participants have primary consent records in observation. # -# **If check fails:**
+# **If check fails:**
# * The issue `participant with multiple records` means that those participants have multiple rows in the wear_study table, which should not be possible. Investigate the issue. Start with the CR that creates the wear_study table.
# * The issue `not in person table` means that participants exist in the wear_study table that aren't in the person table, which should not be possible. Investigate the issue. Start with the CR that creates the wear_study table.
# * The issue `no primary consent` means that participants exist in the wear_study table that do not have proper primary consent. Investigate the issue. It is possible that there is another way to determine primary consent.
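+# Each failure mode above shows up as rows in the validation query's result;
+# the notebook then books one PASS/Failure line into the summary dataframe.
+# A minimal sketch of that recurring bookkeeping, written as a hypothetical
+# helper (the cells below inline the same logic):
+def record_check(df, df1, name):
+    """Append a PASS/Failure row for a check whose result has a bad_rows column."""
+    status = 'PASS' if df1['bad_rows'].sum() == 0 else 'Failure'
+    detail = 'is as expected.' if status == 'PASS' else 'is not as expected. See the description.'
+    return df.append({'query': f'{name} {detail}', 'result': status},
+                     ignore_index=True)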
@@ -1288,7 +1328,7 @@ def query_template(table_era): SELECT person_id FROM `{{project_id}}.{{ct_dataset}}.person` o ) - + UNION ALL SELECT @@ -1300,17 +1340,16 @@ def query_template(table_era): FROM latest_primary_consent_records cte LEFT JOIN ( -- any positive primary consent -- SELECT * - FROM `{{project_id}}.{{ct_dataset}}.observation` o - WHERE REGEXP_CONTAINS(o.observation_source_value, '(?i)extraconsent_agreetoconsent') - AND o.value_as_concept_id = 45877994 + FROM `{{project_id}}.{{ct_dataset}}.observation` + WHERE REGEXP_CONTAINS(observation_source_value, '(?i)extraconsent_agreetoconsent') + AND value_as_concept_id = 45877994) o ON cte.person_id = o.person_id - AND cte.latest_consent_date = o.observation_date + AND cte.latest_date = o.observation_date WHERE o.person_id IS NOT NULL ) """) -q = query.render(project_id=project_id, - ct_dataset=ct_dataset) +q = query.render(project_id=project_id, ct_dataset=ct_dataset) df1 = execute(client, q) if df1['bad_rows'].sum() == 0: @@ -1323,11 +1362,15 @@ def query_template(table_era): else: df = df.append( { - 'query': 'Query17 wear_study table is not as expected. See notes in the description.', - 'result': 'Failure' + 'query': + 'Query17 wear_study table is not as expected. See notes in the description.', + 'result': + 'Failure' }, ignore_index=True) +# - +df1 # + # Query 18: Check that wear_consent records are suppressed in the 'observation' and 'survey_conduct' tables @@ -1350,21 +1393,135 @@ def query_template(table_era): COUNT(*) AS bad_rows FROM `{{project_id}}.{{ct_dataset}}.survey_conduct` sc -WHERE sc.survey_concept_id IN (2100000011,2100000012) +WHERE sc.survey_concept_id IN (2100000011,2100000012) GROUP BY 1 """) q = query.render(project_id=project_id, - ct_dataset=ct_dataset, - wear_codes=WEAR_SURVEY_CODES) -df1=execute(client, q) -if df1['bad_rows'].sum()==0: - df = df.append({'query' : 'Query18 wear_consent records are cleaned as expected.', 'result' : 'PASS'}, - ignore_index = True) + ct_dataset=ct_dataset, + wear_codes=WEAR_SURVEY_CODES) +df1 = execute(client, q) +if df1['bad_rows'].sum() == 0: + df = df.append( + { + 'query': 'Query18 wear_consent records are cleaned as expected.', + 'result': 'PASS' + }, + ignore_index=True) else: - df = df.append({'query' : 'Query18 wear_consent records have not been cleaned as expected.', 'result' : 'Failure'}, - ignore_index = True) + df = df.append( + { + 'query': + 'Query18 wear_consent records have not been cleaned as expected.', + 'result': + 'Failure' + }, + ignore_index=True) df1 +# + +# Query 19: Check for the existence of at least one race/ethnicity sub-categories +# - + +query = JINJA_ENV.from_string(""" +SELECT COUNT(*), value_source_value FROM `{{project_id}}.{{ct_dataset}}.observation` +WHERE value_source_concept_id IN (1585605, 1585606, 1585607, 1585608, 1585609, 1585610, 1585611, 1585612,1585613, 1585614, -- Asian -- + 1585616, 1585617, 1585618, 1585619, 1585620, 1585621, 1585622, 1585623, 1585624, 1585625, 1585626, 1585627, -- African -- + 1585345, 1585346, 1586086, 1586087, 1586088, 1586089, 1586090, 1586091, 1586092, 1586093, -- Spanish -- + 1585316, 1585633, 1585630, 1585631, 1585629, 1585319, 1585318, 1585317, 1585632, -- Middle Eastern -- + 1585321, 1585322, 1585323, 1585324, 1585325, 1585328, 1585329, 1585330, -- Pacific -- + 1585339, 1585332, 1585334, 1585337, 1585338, 1585340, 1585341, 1585342, -- European -- + 1586149) -- None of these fully Describe me -- + +GROUP BY value_source_value +ORDER BY value_source_value +""") +q = query.render( + project_id=project_id, 
+ ct_dataset=ct_dataset, +) +result = execute(client, q) +if not result.empty: + df = df.append( + { + 'query': + 'Query19 existence of at least one race/ethnicity sub-categories', + 'result': + 'PASS' + }, + ignore_index=True) +else: + df = df.append( + { + 'query': + 'Query19 At least one race/ethnicity sub-categories DOES NOT exist', + 'result': + 'Failure' + }, + ignore_index=True) +result + +# Query 20: verify all race/ethnicity values are top-level, not subcategories +# - + +query = JINJA_ENV.from_string(""" +SELECT race_concept_id, + race_source_concept_id, + race_source_value, + race_concept_id, + ethnicity_concept_id, + ethnicity_source_concept_id, + ethnicity_source_value +FROM `{{project_id}}.{{ct_dataset}}.person` +WHERE race_source_concept_id NOT IN (1586141, 1586142, 1586143, 1586144, 1586145, 1586146, 903079, 0) +AND ethnicity_concept_id NOT IN (38003563, 38003564, 1586148, 903079, 903096, 0) +""") +q = query.render(project_id=project_id, ct_dataset=ct_dataset) +result = execute(client, q) +if result.empty: + df = df.append( + { + 'query': 'Query20 All race/ethnicity categories are top-level', + 'result': 'PASS' + }, + ignore_index=True) +else: + df = df.append( + { + 'query': 'Query20 All race/ethnicity categories are NOT top-level.', + 'result': 'Failure', + }, + ignore_index=True) +result + +# Query 20: verify free text responses for Race/ethnicity is suppressed +# - + +query = JINJA_ENV.from_string(""" +SELECT * FROM `{{project_id}}.{{ct_dataset}}.observation` +WHERE value_as_concept_id = 1586149 +""") +q = query.render(project_id=project_id, ct_dataset=ct_dataset) +result = execute(client, q) +if result.empty: + df = df.append( + { + 'query': 'No text found for "NoneOfTheseDescribeMe" free text.', + 'result': 'PASS' + }, + ignore_index=True) +else: + df = df.append( + { + 'query': '"NoneOfTheseDescribeMe" response have free text.', + 'result': 'Failure' + }, + ignore_index=True) +result + +# - + + +# # Results Summary # + def highlight_cells(val): @@ -1372,4 +1529,4 @@ def highlight_cells(val): return f'background-color: {color}' -df.style.applymap(highlight_cells).set_properties(**{'text-align': 'left'}) +df.style.applymap(highlight_cells).set_properties(**{'text-align': 'left'}) \ No newline at end of file diff --git a/data_steward/analytics/cdr_ops/controlled_tier_qc/csv/Controlled_Tier_Check_Description.csv b/data_steward/analytics/cdr_ops/controlled_tier_qc/csv/Controlled_Tier_Check_Description.csv index 7d9bd55925..2935b7a70e 100644 --- a/data_steward/analytics/cdr_ops/controlled_tier_qc/csv/Controlled_Tier_Check_Description.csv +++ b/data_steward/analytics/cdr_ops/controlled_tier_qc/csv/Controlled_Tier_Check_Description.csv @@ -11,9 +11,7 @@ DC-1364,Concept,check_concept_suppression,Verify Explicit identifier record supp DC-1366,Concept,check_concept_suppression,Verify Race/Ethnicity record suppression DC-1368,Field,check_vehicle_accident_suppression,Verify Motor Vehicle Accident record suppression DC-1373,Field,check_field_suppression,Verify identifying field suppression works -DC-1382,Field,check_field_cancer_concept_suppression,Verify record suppression of some cancer condition concepts DC-1388,Field,check_field_freetext_response_suppression,Verify Free text survey response records are suppressed DC-1496,Concept,check_concept_suppression,Verify suppression of some COPE survey responses -DC-1527,Concept,check_concept_suppression,Verify suppression of organ transplant concepts DC-1535,Field,check_field_geolocation_records_suppression,Verify suppression of 
geolocation records DC-2112,Concept,check_concept_suppression,Verify suppression of some COPE survey responses diff --git a/data_steward/analytics/cdr_ops/controlled_tier_qc/csv/Controlled_Tier_Concept_Level.csv b/data_steward/analytics/cdr_ops/controlled_tier_qc/csv/Controlled_Tier_Concept_Level.csv index eafc522e4d..a032d67973 100644 --- a/data_steward/analytics/cdr_ops/controlled_tier_qc/csv/Controlled_Tier_Concept_Level.csv +++ b/data_steward/analytics/cdr_ops/controlled_tier_qc/csv/Controlled_Tier_Concept_Level.csv @@ -291,12 +291,6 @@ observation,observation_source_value,,tsu_ds5_13_xx42_cope_a_226,YES,DC-1496 observation,observation_source_value,,cu_covid_cope_a_204,YES,DC-1496 observation,observation_source_value,,dmfs_29a,YES,DC-1496 observation,observation_source_value,,msds_17_c,YES,DC-1496 -observation,value_source_concept_id,1585807,,YES,DC-1527 -observation,value_source_concept_id,1585808,,YES,DC-1527 -observation,value_as_concept_id,1585807,,YES,DC-1527 -observation,value_as_concept_id,1585808,,YES,DC-1527 -observation,value_source_value,,OrganTransplantDescription_OtherOrgan,YES,DC-1527 -observation,value_source_value,,OrganTransplantDescription_OtherTissue,YES,DC-1527 observation,observation_source_concept_id,766006,,YES,DC-2112 observation,observation_source_concept_id,765982,,YES,DC-2112 observation,observation_source_concept_id,766032,,YES,DC-2112 diff --git a/data_steward/analytics/cdr_ops/controlled_tier_qc/csv/Controlled_Tier_Field_Level.csv b/data_steward/analytics/cdr_ops/controlled_tier_qc/csv/Controlled_Tier_Field_Level.csv index fe2846b05e..e8875c11ec 100644 --- a/data_steward/analytics/cdr_ops/controlled_tier_qc/csv/Controlled_Tier_Field_Level.csv +++ b/data_steward/analytics/cdr_ops/controlled_tier_qc/csv/Controlled_Tier_Field_Level.csv @@ -141,86 +141,6 @@ cost,currency_concept_id,INT64,YES,,DC-1368 cost,revenue_code_concept_id,INT64,YES,,DC-1368 dose_era,drug_concept_id,INT64,NO,,DC-1368 dose_era,unit_concept_id,INT64,NO,,DC-1368 -observation,observation_concept_id,INT64,NO,,DC-1382 -observation,observation_type_concept_id,INT64,NO,,DC-1382 -observation,value_as_concept_id,INT64,YES,,DC-1382 -observation,qualifier_concept_id,INT64,YES,,DC-1382 -observation,unit_concept_id,INT64,YES,,DC-1382 -observation,observation_source_concept_id,INT64,YES,,DC-1382 -observation,value_source_concept_id,INT64,YES,,DC-1382 -drug_era,drug_concept_id,INT64,NO,,DC-1382 -fact_relationship,relationship_concept_id,INT64,NO,,DC-1382 -observation_period,period_type_concept_id,INT64,NO,,DC-1382 -procedure_cost,currency_concept_id,INT64,YES,,DC-1382 -procedure_cost,revenue_code_concept_id,INT64,YES,,DC-1382 -visit_occurrence,visit_concept_id,INT64,NO,,DC-1382 -visit_occurrence,visit_type_concept_id,INT64,NO,,DC-1382 -visit_occurrence,visit_source_concept_id,INT64,YES,,DC-1382 -visit_occurrence,admitting_source_concept_id,INT64,YES,,DC-1382 -visit_occurrence,discharge_to_concept_id,INT64,YES,,DC-1382 -drug_strength,drug_concept_id,INT64,NO,,DC-1382 -drug_strength,ingredient_concept_id,INT64,NO,,DC-1382 -drug_strength,amount_unit_concept_id,INT64,YES,,DC-1382 -drug_strength,numerator_unit_concept_id,INT64,YES,,DC-1382 -drug_strength,denominator_unit_concept_id,INT64,YES,,DC-1382 -condition_era,condition_concept_id,INT64,NO,,DC-1382 -measurement,measurement_concept_id,INT64,NO,,DC-1382 -measurement,measurement_type_concept_id,INT64,NO,,DC-1382 -measurement,operator_concept_id,INT64,YES,,DC-1382 -measurement,value_as_concept_id,INT64,YES,,DC-1382 
-measurement,unit_concept_id,INT64,YES,,DC-1382 -measurement,measurement_source_concept_id,INT64,YES,,DC-1382 -visit_cost,currency_concept_id,INT64,YES,,DC-1382 -domain,domain_concept_id,INT64,NO,,DC-1382 -provider,specialty_concept_id,INT64,YES,,DC-1382 -provider,gender_concept_id,INT64,YES,,DC-1382 -provider,specialty_source_concept_id,INT64,YES,,DC-1382 -provider,gender_source_concept_id,INT64,YES,,DC-1382 -person,gender_concept_id,INT64,NO,,DC-1382 -person,race_concept_id,INT64,NO,,DC-1382 -person,ethnicity_concept_id,INT64,NO,,DC-1382 -person,gender_source_concept_id,INT64,YES,,DC-1382 -person,race_source_concept_id,INT64,YES,,DC-1382 -person,ethnicity_source_concept_id,INT64,YES,,DC-1382 -drug_cost,currency_concept_id,INT64,YES,,DC-1382 -cohort_attribute,value_as_concept_id,INT64,YES,,DC-1382 -procedure_occurrence,procedure_concept_id,INT64,NO,,DC-1382 -procedure_occurrence,procedure_type_concept_id,INT64,NO,,DC-1382 -procedure_occurrence,modifier_concept_id,INT64,YES,,DC-1382 -procedure_occurrence,procedure_source_concept_id,INT64,YES,,DC-1382 -care_site,place_of_service_concept_id,INT64,YES,,DC-1382 -specimen,specimen_concept_id,INT64,NO,,DC-1382 -specimen,specimen_type_concept_id,INT64,NO,,DC-1382 -specimen,unit_concept_id,INT64,YES,,DC-1382 -specimen,anatomic_site_concept_id,INT64,YES,,DC-1382 -specimen,disease_status_concept_id,INT64,YES,,DC-1382 -death,death_type_concept_id,INT64,NO,,DC-1382 -death,cause_concept_id,INT64,YES,,DC-1382 -death,cause_source_concept_id,INT64,YES,,DC-1382 -device_exposure,device_concept_id,INT64,NO,,DC-1382 -device_exposure,device_type_concept_id,INT64,NO,,DC-1382 -device_exposure,device_source_concept_id,INT64,YES,,DC-1382 -device_cost,currency_concept_id,INT64,YES,,DC-1382 -condition_occurrence,condition_concept_id,INT64,NO,,DC-1382 -condition_occurrence,condition_type_concept_id,INT64,NO,,DC-1382 -condition_occurrence,condition_source_concept_id,INT64,YES,,DC-1382 -condition_occurrence,condition_status_concept_id,INT64,YES,,DC-1382 -cohort_definition,definition_type_concept_id,INT64,NO,,DC-1382 -cohort_definition,subject_concept_id,INT64,NO,,DC-1382 -attribute_definition,attribute_type_concept_id,INT64,NO,,DC-1382 -drug_exposure,drug_concept_id,INT64,NO,,DC-1382 -drug_exposure,drug_type_concept_id,INT64,NO,,DC-1382 -drug_exposure,route_concept_id,INT64,YES,,DC-1382 -drug_exposure,drug_source_concept_id,INT64,YES,,DC-1382 -note,note_type_concept_id,INT64,NO,,DC-1382 -note,note_class_concept_id,INT64,NO,,DC-1382 -note,encoding_concept_id,INT64,NO,,DC-1382 -note,language_concept_id,INT64,NO,,DC-1382 -cost,cost_type_concept_id,INT64,NO,,DC-1382 -cost,currency_concept_id,INT64,YES,,DC-1382 -cost,revenue_code_concept_id,INT64,YES,,DC-1382 -dose_era,drug_concept_id,INT64,NO,,DC-1382 -dose_era,unit_concept_id,INT64,NO,,DC-1382 observation,observation_concept_id,INT64,NO,,DC-1388 observation,observation_type_concept_id,INT64,NO,,DC-1388 observation,value_as_concept_id,INT64,YES,,DC-1388 diff --git a/data_steward/analytics/cdr_ops/controlled_tier_qc/sql/query_templates.py b/data_steward/analytics/cdr_ops/controlled_tier_qc/sql/query_templates.py index bc47f8dc35..23693cd325 100644 --- a/data_steward/analytics/cdr_ops/controlled_tier_qc/sql/query_templates.py +++ b/data_steward/analytics/cdr_ops/controlled_tier_qc/sql/query_templates.py @@ -231,7 +231,7 @@ ) SELECT '{{ table_name }}' AS table_name, - IFNULL(SUM(CASE WHEN output_zip != expected_zip THEN 1 ELSE 0 END), 0) AS n_row_violation + IFNULL(SUM(CASE WHEN output_zip != expected_zip AND output_zip != 
'000**' THEN 1 ELSE 0 END), 0) AS n_row_violation FROM transformed_zips """ @@ -262,4 +262,4 @@ AND (concept_class_id = 'Question' OR concept_class_id = 'Topic') AND vocabulary_id = 'PPI' ) -""" \ No newline at end of file +""" diff --git a/data_steward/analytics/cdr_ops/cross_ct_rt/CDR_QC_RT_vs_CT.py b/data_steward/analytics/cdr_ops/cross_ct_rt/CDR_QC_RT_vs_CT.py index fa2c260287..9a775805eb 100644 --- a/data_steward/analytics/cdr_ops/cross_ct_rt/CDR_QC_RT_vs_CT.py +++ b/data_steward/analytics/cdr_ops/cross_ct_rt/CDR_QC_RT_vs_CT.py @@ -21,14 +21,17 @@ project_id = "" rt_dataset = "" ct_dataset = "" +combined_sandbox_dataset = "" +withdrawn_pids_table = "" maximum_age = "" run_as = "" # + import pandas as pd +from IPython.display import display from analytics.cdr_ops.notebook_utils import execute, IMPERSONATION_SCOPES -from common import JINJA_ENV, PIPELINE_TABLES +from common import JINJA_ENV, PIPELINE_TABLES, PID_RID_MAPPING from gcloud.bq import BigQueryClient from utils import auth @@ -119,14 +122,14 @@ SELECT table_name FROM `{{project_id}}.{{rt_dataset}}.INFORMATION_SCHEMA.COLUMNS` where lower(column_name) = 'person_id' - AND not REGEXP_CONTAINS(table_name, r'(?i)(death)|(copy)') + AND not REGEXP_CONTAINS(table_name, r'(?i)(death)|(copy)|(wear)') UNION DISTINCT SELECT table_name FROM `{{project_id}}.{{ct_dataset}}.INFORMATION_SCHEMA.COLUMNS` where lower(column_name) = 'person_id' - AND not REGEXP_CONTAINS(table_name, r'(?i)(death)|(copy)') + AND not REGEXP_CONTAINS(table_name, r'(?i)(death)|(copy)|(wear)') ); @@ -205,14 +208,14 @@ SELECT table_name, column_name FROM `{{project_id}}.{{rt_dataset}}.INFORMATION_SCHEMA.COLUMNS` where REGEXP_CONTAINS(table_name, r'(?i)(_ext)') - AND NOT REGEXP_CONTAINS(column_name, r'(?i)(src_id)|(survey_version_concept_id)') + AND NOT REGEXP_CONTAINS(column_name, r'(?i)(src_id)|(survey_version_concept_id)|(language)') UNION DISTINCT SELECT table_name, column_name FROM `{{project_id}}.{{ct_dataset}}.INFORMATION_SCHEMA.COLUMNS` where REGEXP_CONTAINS(table_name, r'(?i)(_ext)') - AND NOT REGEXP_CONTAINS(column_name, r'(?i)(src_id)|(survey_version_concept_id)') + AND NOT REGEXP_CONTAINS(column_name, r'(?i)(src_id)|(survey_version_concept_id)|(language)') ) SELECT AS STRUCT table_name, column_name FROM data @@ -561,7 +564,7 @@ SELECT DISTINCT person_id, {{PIPELINE_TABLES}}.calculate_age(CURRENT_DATE, birth_datetime) AS age, CASE WHEN {{PIPELINE_TABLES}}.calculate_age(CURRENT_DATE, birth_datetime) < {{maximum_age}} - THEN 1 ELSE 0 END AS Failure +THEN 1 ELSE 0 END AS Failure FROM ct_person_id JOIN `{{project_id}}.{{ct_dataset}}.person` USING (person_id) @@ -602,10 +605,10 @@ # # Query8: Verify the wear_study dateshift # -# RT dates should have been shifted back by the number of days designated to each +# RT dates should have been shifted back by the number of days designated to each # participant via the primary_pid_rid_mapping table. # -# The following query will find any rows in the wear_study tables where the RT date plus the date shift is not equal to the +# The following query will find any rows in the wear_study tables where the RT date plus the date shift is not equal to the # CT date. If there are resulting rows, make sure the pipeline dateshift ran properly. 
# + @@ -634,10 +637,73 @@ df1 = execute(client, q) if df1['bad_rows'].sum() == 0: + df = df.append( + { + 'query': 'Query8 Wear_study dates are as expected.', + 'result': 'PASS' + }, + ignore_index=True) +else: df = df.append( { 'query': - 'Query8 Wear_study dates are as expected.', + 'Query8 Wear_study dates are not aligned properly. See description.', + 'result': + 'FAIL' + }, + ignore_index=True) + display(df1) +# - + +# # Query9: Verify data for withdrawn pids is removed from all tables in rt dataset. + +# + +person_id_tables_query = JINJA_ENV.from_string(''' +SELECT table_name +FROM `{{project_id}}.{{dataset_id}}.INFORMATION_SCHEMA.COLUMNS` +WHERE column_name = "person_id" +''').render(project_id=project_id, dataset_id=rt_dataset) +pid_table_list = client.query(person_id_tables_query).to_dataframe().get( + 'table_name').to_list() + +query = JINJA_ENV.from_string(""" + SELECT + \{{table_name}}\ AS table_name, + COUNT(*) AS total_records + FROM + `{{project_id}}.{{dataset_id}}.{{table_name}}` + WHERE person_id IN ( + SELECT + pr.research_id + FROM + `{{project_id}}.{{pipeline_tables}}.{{pid_rid_mapping}}` AS pr + JOIN + `{{project_id}}.{{combined_sandbox_dataset}}.{{withdrawn_pids_table}}` AS w + ON + pr.person_id = w.person_id + ) +""") + +row_counts_queries_list = [] +for table in pid_table_list: + row_counts_queries_list.append( + query.render(project_id=project_id, + dataset_id=rt_dataset, + table_name=table, + combined_sandbox_dataset=combined_sandbox_dataset, + withdrawn_pids_table=withdrawn_pids_table, + pipeline_tables=PIPELINE_TABLES, + pid_rid_mapping=PID_RID_MAPPING)) + +row_counts_union_all_query = '\nUNION ALL\n'.join(row_counts_queries_list) + +df1 = execute(client, row_counts_union_all_query) + +if df1['total_records'].sum() == 0: + df = df.append( + { + 'query': + 'Query9: Verify data for withdrawn pids is removed from all tables in rt dataset.', 'result': 'PASS' }, @@ -646,13 +712,77 @@ df = df.append( { 'query': - 'Query8 Wear_study dates are not aligned properly. See description.', + 'Query9: Verify data for withdrawn pids is removed from all tables in rt dataset.', 'result': 'FAIL' }, - ignore_index=True - ) - display(df1) + ignore_index=True) +display(df1) +# - + +# # Query10: Verify data for withdrawn pids is removed from all tables in ct dataset. 
+ +# + +person_id_tables_query = JINJA_ENV.from_string(''' +SELECT table_name +FROM `{{project_id}}.{{dataset_id}}.INFORMATION_SCHEMA.COLUMNS` +WHERE column_name = "person_id" +''').render(project_id=project_id, dataset_id=ct_dataset) +pid_table_list = client.query(person_id_tables_query).to_dataframe().get( + 'table_name').to_list() + +query = JINJA_ENV.from_string(""" + SELECT + \{{table_name}}\ AS table_name, + COUNT(*) AS total_records + FROM + `{{project_id}}.{{dataset_id}}.{{table_name}}` + WHERE person_id IN ( + SELECT + pr.research_id + FROM + `{{project_id}}.{{pipeline_tables}}.{{pid_rid_mapping}}` AS pr + JOIN + `{{project_id}}.{{combined_sandbox_dataset}}.{{withdrawn_pids_table}}` AS w + ON + pr.person_id = w.person_id + ) +""") + +row_counts_queries_list = [] +for table in pid_table_list: + row_counts_queries_list.append( + query.render(project_id=project_id, + dataset_id=ct_dataset, + table_name=table, + combined_sandbox_dataset=combined_sandbox_dataset, + withdrawn_pids_table=withdrawn_pids_table, + pipeline_tables=PIPELINE_TABLES, + pid_rid_mapping=PID_RID_MAPPING)) + +row_counts_union_all_query = '\nUNION ALL\n'.join(row_counts_queries_list) + +df1 = execute(client, row_counts_union_all_query) + +if df1['total_records'].sum() == 0: + df = df.append( + { + 'query': + 'Query10: Verify data for withdrawn pids is removed from all tables in ct dataset.', + 'result': + 'PASS' + }, + ignore_index=True) +else: + df = df.append( + { + 'query': + 'Query10: Verify data for withdrawn pids is removed from all tables in ct dataset.', + 'result': + 'FAIL' + }, + ignore_index=True) +display(df1) # - # # Summary_CDR_QC_RT_vs_CT_comparison diff --git a/data_steward/analytics/cdr_ops/curation_dashboard.py b/data_steward/analytics/cdr_ops/curation_dashboard.py index f55c47636f..6a37ae6920 100644 --- a/data_steward/analytics/cdr_ops/curation_dashboard.py +++ b/data_steward/analytics/cdr_ops/curation_dashboard.py @@ -6,7 +6,7 @@ # extension: .py # format_name: light # format_version: '1.5' -# jupytext_version: 1.3.0 +# jupytext_version: 1.7.1 # kernelspec: # display_name: Python 3 # language: python @@ -20,12 +20,23 @@ import seaborn as sns from common import PIPELINE_TABLES -from notebooks.defaults import DEFAULT_DATASETS from utils import bq warnings.filterwarnings('ignore') sns.set() +RDR = '' +UNIONED = '' +VOCABULARY = '' +COMBINED = '' +RT_DATASET = '' +CT_DATASET = '' + +ALL_RDR = [] +ALL_UNIONED = [] +ALL_COMBINED = [] +ALL_RT_DATASET = [] +ALL_CT_DATASET = [] # - @@ -52,7 +63,7 @@ def row_counts(dataset_ids): # # RDR data volume over time -rdr_df = row_counts(DEFAULT_DATASETS.trend.rdr) +rdr_df = row_counts(ALL_RDR + [RDR]) rdr_df = rdr_df.pivot(index='table_id', columns='dataset_id', values='row_count') @@ -60,7 +71,7 @@ def row_counts(dataset_ids): # # EHR data volume over time -unioned_df = row_counts(DEFAULT_DATASETS.trend.unioned) +unioned_df = row_counts(ALL_UNIONED + [UNIONED]) unioned_df = unioned_df.pivot(index='table_id', columns='dataset_id', values='row_count') @@ -68,32 +79,38 @@ def row_counts(dataset_ids): # ## Combined data volume over time -combined_df = row_counts(DEFAULT_DATASETS.trend.combined) +combined_df = row_counts(ALL_COMBINED + [COMBINED]) combined_df = combined_df.pivot(index='table_id', columns='dataset_id', values='row_count') combined_df.to_csv('%s.csv' % 'combined_diff') +ct_df = row_counts(ALL_CT_DATASET + [CT_DATASET]) +ct_df = ct_df.pivot(index='table_id', + columns='dataset_id', + values='row_count') +ct_df.to_csv('%s.csv' % 'ct_diff') + # # 
Characterization of EHR data -q = """ +q = f""" SELECT (2018 - r.year_of_birth) AS age, gc.concept_name AS gender, rc.concept_name AS race, ec.concept_name AS ethnicity, CASE WHEN e.person_id IS NULL THEN 'no' ELSE 'yes' END AS has_ehr_data -FROM {latest.rdr}.person r - LEFT JOIN `{latest.unioned}.person` e +FROM `{RDR}.person` r + LEFT JOIN `{UNIONED}.person` e ON r.person_id = e.person_id -JOIN `{latest.vocabulary}.concept` gc +JOIN `{VOCABULARY}.concept` gc ON r.gender_concept_id = gc.concept_id -JOIN `{latest.vocabulary}.concept` rc +JOIN `{VOCABULARY}.concept` rc ON r.race_concept_id = rc.concept_id -JOIN `{latest.vocabulary}.concept` ec +JOIN `{VOCABULARY}.concept` ec ON r.ethnicity_concept_id = ec.concept_id ORDER BY age, gender, race -""".format(latest=DEFAULT_DATASETS.latest) +""" df = bq.query(q) # ## Presence of EHR data by race @@ -132,13 +149,14 @@ def row_counts(dataset_ids): gc.concept_name AS gender, rc.concept_name AS race, ec.concept_name AS ethnicity -FROM `{DEFAULT_DATASETS.latest.unioned}.person` p -JOIN `{DEFAULT_DATASETS.latest.vocabulary}.concept` gc +FROM `{UNIONED}.person` p +JOIN `{VOCABULARY}.concept` gc ON p.gender_concept_id = gc.concept_id -JOIN `{DEFAULT_DATASETS.latest.vocabulary}.concept` rc +JOIN `{VOCABULARY}.concept` rc ON p.race_concept_id = rc.concept_id -JOIN `{DEFAULT_DATASETS.latest.vocabulary}.concept` ec +JOIN `{VOCABULARY}.concept` ec ON p.ethnicity_concept_id = ec.concept_id +WHERE p.birth_datetime IS NOT NULL ORDER BY age, gender, race ''') @@ -174,21 +192,21 @@ def row_counts(dataset_ids): def gender_by_race(dataset_id): - df = bq.query(''' + df = bq.query(f''' SELECT c1.concept_name AS gender, c2.concept_name AS race, COUNT(1) AS `count` FROM `{dataset_id}.person` p - JOIN `{latest.vocabulary}.concept` c1 + JOIN `{VOCABULARY}.concept` c1 ON p.gender_concept_id = c1.concept_id - JOIN `{latest.vocabulary}.concept` c2 + JOIN `{VOCABULARY}.concept` c2 ON p.race_concept_id = c2.concept_id GROUP BY c2.concept_name, c1.concept_name - '''.format(dataset_id=dataset_id, latest=DEFAULT_DATASETS.latest)) + ''') df['race'] = df['race'].astype('category') df['gender'] = df['gender'].astype('category') - g = sns.FacetGrid(df, col='race', hue='gender', col_wrap=5) + g = sns.FacetGrid(df, col='race', sharey=False, hue='gender', col_wrap=5) g.map(sns.barplot, 'gender', 'count', ci=None) g.set_xticklabels([]) g.set_axis_labels('', '') @@ -197,12 +215,13 @@ def gender_by_race(dataset_id): # ## RDR -gender_by_race(DEFAULT_DATASETS.latest.rdr) +gender_by_race(RDR) # ## EHR -gender_by_race(DEFAULT_DATASETS.latest.unioned) +gender_by_race(UNIONED) # ## CDR -gender_by_race(DEFAULT_DATASETS.latest.combined) +gender_by_race(COMBINED) + diff --git a/data_steward/analytics/cdr_ops/ehr_union_qc.py b/data_steward/analytics/cdr_ops/ehr_union_qc.py index bc2488620a..a3ce3985c2 100644 --- a/data_steward/analytics/cdr_ops/ehr_union_qc.py +++ b/data_steward/analytics/cdr_ops/ehr_union_qc.py @@ -25,10 +25,13 @@ # - # + -from common import JINJA_ENV +from cdm import tables_to_map +from common import AOU_DEATH, DEATH, JINJA_ENV, MAPPING_PREFIX +from resources import CDM_TABLES from utils import auth from gcloud.bq import BigQueryClient from analytics.cdr_ops.notebook_utils import execute, IMPERSONATION_SCOPES, render_message + # - impersonation_creds = auth.get_impersonation_credentials( @@ -78,6 +81,53 @@ execute(client, query) # - +# # Confirm all the expected tables are in this dataset +# This QC confirms all the expected tables are present in this dataset. 
If not,
+# our pipeline might not be working as expected. Missing tables will
+# break the combined dataset generation. If this check fails, fix the issues before
+# proceeding to the next step.
+# See DC-3454 for the background.
+
+expected_domain_tables = [
+    table_name for table_name in CDM_TABLES if table_name != DEATH
+] + [AOU_DEATH]
+expected_mapping_tables = [
+    f'{MAPPING_PREFIX}{table_name}' for table_name in tables_to_map()
+]
+expected_tables = expected_domain_tables + expected_mapping_tables
+
+tpl = JINJA_ENV.from_string('''
+WITH expected_tables AS (
+{% for table in expected_tables %}
+    SELECT '{{table}}' AS table_id
+    {% if not loop.last -%} UNION ALL {% endif %}
+{% endfor %}
+)
+SELECT table_id AS missing_table FROM expected_tables
+WHERE table_id NOT IN (SELECT table_id FROM `{{project_id}}.{{dataset}}.__TABLES__`)
+''')
+query = tpl.render(project_id=PROJECT_ID,
+                   dataset=CURRENT_UNIONED_EHR_DATASET_ID,
+                   expected_tables=expected_tables)
+df = execute(client, query)
+
+# +
+success_msg = 'All the expected tables are present in this dataset.'
+failure_msg = '''
+{code_count} tables are missing. Check if the missing tables are important ones.<br>
+If it is NOT important (e.g., expected to be empty), simply create an empty table for that
+and move on. Create an investigation ticket so we can investigate it later.<br>
+If it is an important table, troubleshoot and figure out why the table is missing in the +dataset before moving on to the next steps. +''' + +render_message(df, + success_msg, + failure_msg, + failure_msg_args={'code_count': len(df)}) + +# - + # ## Participant counts per hpo_site compared to ehr_ops. # + diff --git a/data_steward/analytics/cdr_ops/fitbit_qc.py b/data_steward/analytics/cdr_ops/fitbit_qc.py index 8066b32d8f..9027fa3492 100644 --- a/data_steward/analytics/cdr_ops/fitbit_qc.py +++ b/data_steward/analytics/cdr_ops/fitbit_qc.py @@ -37,7 +37,7 @@ date_columns = { 'activity_summary': 'date', 'heart_rate_summary': 'date', - 'heart_rate_minute_level': 'datetime', + 'heart_rate_intraday': 'datetime', 'steps_intraday': 'datetime', 'sleep_level': 'sleep_date', 'sleep_daily_summary': 'sleep_date', @@ -54,7 +54,7 @@ # Used in the 'Validate fitbit fields' query. table_fields_values = { 'device': { - 'battery': ['high', 'medium', 'low'] + 'battery': ['High', 'Medium', 'Low','Empty'] }, 'sleep_level': { 'level': [ @@ -63,46 +63,13 @@ ] }, 'sleep_daily_summary': { - 'is_main_sleep': ['Peak', 'Cardio', 'Fat Burn', 'Out of Range'] + 'is_main_sleep': ['true', 'false'] }, 'heart_rate_summary': { - 'zone_name': ['true', 'false'] + 'zone_name': ['Peak', 'Cardio', 'Fat Burn', 'Out of Range'] } } -# ## Verify all participants have digital health sharing consent - -# + -health_sharing_consent_check = JINJA_ENV.from_string(""" -SELECT - '{{table_name}}' as table, - COUNT(1) bad_rows -FROM - `{{project}}.{{dataset}}.{{table_name}}` -WHERE - person_id NOT IN ( - SELECT - person_id - FROM - `{{project}}.{{sandbox_dataset}}.digital_health_sharing_status` d - WHERE - status = 'YES' - AND d.wearable = 'fitbit') -""") - -queries_list = [] -for table in FITBIT_TABLES: - queries_list.append( - health_sharing_consent_check.render(project=project_id, - dataset=fitbit_dataset, - table_name=table, - sandbox_dataset=sandbox_dataset)) - -union_all_query = '\nUNION ALL\n'.join(queries_list) - -execute(client, union_all_query) -# - - # ## Identify person_ids that are not in the person table # This check verifies that person_ids are valid. That they exist in the CDM person table and are not null. There should be no bad rows. # @@ -215,15 +182,8 @@ COUNT(1) bad_rows FROM `{{project}}.{{dataset}}.{{table_name}}` t -WHERE t.src_id NOT IN ( - SELECT - hpo_id - FROM - `{{project_id}}.{{pipeline_tables}}.{{site_maskings}}` - WHERE - REGEXP_CONTAINS(src_id, r'(?i)Participant Portal') -) -OR t.src_id IS NULL +WHERE t.src_id IS NULL + """) queries_list = [] @@ -302,3 +262,5 @@ union_all_query = '\nUNION ALL\n'.join(queries_list) execute(client, union_all_query) +# - + diff --git a/data_steward/analytics/cdr_ops/raw_fitbit_qc.py b/data_steward/analytics/cdr_ops/raw_fitbit_qc.py new file mode 100644 index 0000000000..511a43710d --- /dev/null +++ b/data_steward/analytics/cdr_ops/raw_fitbit_qc.py @@ -0,0 +1,305 @@ +# -*- coding: utf-8 -*- +# --- +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: light +# format_version: '1.5' +# jupytext_version: 1.7.1 +# kernelspec: +# display_name: Python 3 +# language: python +# name: python3 +# --- + +# ## Notebook parameters + +# + tags=["parameters"] +project_id = "" # identifies the project containing the datasets +dataset_id = "" # raw fitbit dataset. Most likely it ends with `_backup`. +run_as = "" # Service account email for impersonation +# - + +# # QC for Raw Fitbit +# Quality checks for raw fitbit data. 
+# Run this QC notebook as soon as we load the Fitbit tables into the curation project.
+# See DC-3444's attachment for the original list of validation criteria.
+
+from common import JINJA_ENV
+from utils import auth
+from gcloud.bq import BigQueryClient
+from analytics.cdr_ops.notebook_utils import execute, IMPERSONATION_SCOPES, render_message
+from IPython.display import display, HTML
+
+impersonation_creds = auth.get_impersonation_credentials(
+    run_as, target_scopes=IMPERSONATION_SCOPES)
+
+client = BigQueryClient(project_id, credentials=impersonation_creds)
+
+# # STEPS_INTRADAY table
+
+# Validation criteria for steps_intraday are the following:
+# - The table includes both PTSC and CE data per the src_id field
+
+# +
+query = JINJA_ENV.from_string("""
+SELECT src_id, COUNT(*) AS row_count
+FROM `{{project_id}}.{{dataset}}.steps_intraday`
+GROUP BY src_id ORDER BY src_id
+""").render(project_id=project_id, dataset=dataset_id)
+df = execute(client, query)
+
+check_status = "Look at the result and see if it meets all the following criteria."
+msg = (
+    "The result must show that<br>"
+    "(1) The table has records from both PTSC and CE, and<br>"
+    "(2) all the records' src_ids are either PTSC or CE (= No other src_id in this table)<br>"
+    "If any of (1) - (2) does not look good, the source records are not properly prepared. "
+    "Bring up the issue to the RDR team so they can fix it.")
+
+display(
+    HTML(
+        f'''

Check Status: {check_status}

{msg}

+'''
+    ))
+
+display(df)
+
+# -
+
+# # HEART_RATE_INTRADAY table
+
+# Validation criteria for heart_rate_intraday are the following:
+# - The table includes both PTSC and CE data per the src_id field
+
+# +
+query = JINJA_ENV.from_string("""
+SELECT src_id, COUNT(*) AS row_count
+FROM `{{project_id}}.{{dataset}}.heart_rate_intraday`
+GROUP BY src_id ORDER BY src_id
+""").render(project_id=project_id, dataset=dataset_id)
+df = execute(client, query)
+
+check_status = "Look at the result and see if it meets all the following criteria."
+msg = (
+    "The result must show that<br>"
+    "(1) The table has records from both PTSC and CE, and<br>"
+    "(2) all the records' src_ids are either PTSC or CE (= No other src_id in this table)<br>"
+    "If any of (1) - (2) does not look good, the source records are not properly prepared. "
+    "Bring up the issue to the RDR team so they can fix it.")
+
+display(
+    HTML(
+        f'''

Check Status: {check_status}

{msg}

''' + )) + +display(df) + +# - + +# # HEART_RATE_SUMMARY table + +# Validation criteria for heart_rate_summary is the following: +# - The table includes both PTSC and CE data per the src_id field +# - At least 40% of participants should have at least all 4 zone names for at least one date + +# + + +src_ids_check = JINJA_ENV.from_string(""" +SELECT src_id, COUNT(*) as row_count +FROM `{{project_id}}.{{dataset}}.heart_rate_summary` +GROUP BY src_id ORDER BY src_id +""").render(project_id=project_id, dataset=dataset_id) + +zone_names_check = JINJA_ENV.from_string(""" +with four_zones_for_at_least_one_date AS ( + SELECT + COUNT(DISTINCT zone_name) AS zone_names, + person_id, + date + FROM + `{{project_id}}.{{dataset}}.heart_rate_summary` + GROUP BY + person_id, date + HAVING zone_names > 3 +) + +SELECT + ROUND((COUNT(DISTINCT person_id)/( + SELECT + COUNT(DISTINCT person_id) + FROM + `{{project_id}}.{{dataset}}.heart_rate_summary` + ))*100,2) AS percentage +FROM + four_zones_for_at_least_one_date +""").render(project_id=project_id, dataset=dataset_id) + +src_ids_check_results = execute(client, src_ids_check) +zones_check_results = execute(client, zone_names_check) + +display(src_ids_check_results) +display(zones_check_results) + +check_status = "Look at the result and see if it meets all the following criteria." +msg = ( + "The result must show that
<br>"
+    "(1) The table has records from both PTSC and CE,<br>"
+    "(2) all the records' src_ids are either PTSC or CE, i.e. no other src_id appears in this table, and<br>"
+    "(3) the percentage value returned is equal to or greater than 40.<br>"
+    "If any of (1) - (3) does not look good, the source records are not properly prepared. "
+    "Bring up the issue to the RDR team so they can fix it.")
+
+display(
+    HTML(f'''
+        <h3>Check Status: {check_status}</h3>
+        <p>{msg}</p>
''' + )) +# - + +# # ACTIVITY_SUMMARY table + +# Validation criteria for activity_summary is the following: +# - The table includes both PTSC and CE data per the src_id field + +# + +query = JINJA_ENV.from_string(""" +SELECT src_id, COUNT(*) as row_count +FROM `{{project_id}}.{{dataset}}.activity_summary` +GROUP BY src_id ORDER BY src_id +""").render(project_id=project_id, dataset=dataset_id) +df = execute(client, query) + +check_status = "Look at the result and see if it meets all the following criteria." +msg = ( + "The result must show that
<br>"
+    "(1) The table has records from both PTSC and CE, and<br>"
+    "(2) all the records' src_ids are either PTSC or CE, i.e. no other src_id appears in this table.<br>"
+    "If any of (1) - (2) does not look good, the source records are not properly prepared. "
+    "Bring up the issue to the RDR team so they can fix it.")
+
+display(
+    HTML(f'''
+        <h3>Check Status: {check_status}</h3>
+        <p>{msg}</p>
''' + )) + +display(df) + +# - + +# # SLEEP_DAILY_SUMMARY table + +# Validation criteria for sleep_daily_summary is the following: +# - The table includes both PTSC and CE data per the src_id field + +# + + +query = JINJA_ENV.from_string(""" +SELECT src_id, COUNT(*) as row_count +FROM `{{project_id}}.{{dataset}}.sleep_daily_summary` +GROUP BY src_id ORDER BY src_id +""").render(project_id=project_id, dataset=dataset_id) +df = execute(client, query) + +check_status = "Look at the result and see if it meets all the following criteria." +msg = ( + "The result must show that
<br>"
+    "(1) The table has records from both PTSC and CE, and<br>"
+    "(2) all the records' src_ids are either PTSC or CE, i.e. no other src_id appears in this table.<br>"
+    "If any of (1) - (2) does not look good, the source records are not properly prepared. "
+    "Bring up the issue to the RDR team so they can fix it.")
+
+display(
+    HTML(f'''
+        <h3>Check Status: {check_status}</h3>
+        <p>{msg}</p>
''' + )) + +display(df) +# - + +# # SLEEP_LEVEL table + +# Validation criteria for sleep_level is the following: +# - The table includes both PTSC and CE data per the src_id field +# - At least 40% of participants have at least all sleep level names (awake, light, asleep, deep, restless, wake, rem, unknown) for at least one date + +# + +check_src_ids = JINJA_ENV.from_string(""" +SELECT src_id, COUNT(*) as row_count +FROM `{{project_id}}.{{dataset}}.sleep_level` +GROUP BY src_id ORDER BY src_id +""").render(project_id=project_id, dataset=dataset_id) + +check_sleep_levels = JINJA_ENV.from_string(""" +with all_levels_for_at_least_one_date AS ( + SELECT + COUNT(DISTINCT level) AS levels, + person_id, + sleep_date + FROM + `{{project_id}}.{{dataset}}.sleep_level` + GROUP BY + person_id, sleep_date + HAVING levels > 7 +) + +SELECT + ROUND((COUNT(DISTINCT person_id)/( + SELECT + COUNT(DISTINCT person_id) + FROM + `{{project_id}}.{{dataset}}.sleep_level` + ))*100,2) AS percentage +FROM + all_levels_for_at_least_one_date +""").render(project_id=project_id, dataset=dataset_id) + +src_ids_check_results = execute(client, check_src_ids) +sleep_levels_check_results = execute(client, check_sleep_levels) + +display(src_ids_check_results) +display(sleep_levels_check_results) + +check_status = "Look at the result and see if it meets all the following criteria." +msg = ( + "The result must show that
<br>"
+    "(1) The table has records from both PTSC and CE,<br>"
+    "(2) all the records' src_ids are either PTSC or CE, i.e. no other src_id appears in this table, and<br>"
+    "(3) the percentage value returned is equal to or greater than 40.<br>"
+    "If any of (1) - (3) does not look good, the source records are not properly prepared. "
+    "Bring up the issue to the RDR team so they can fix it.")
+
+display(
+    HTML(f'''
+        <h3>Check Status: {check_status}</h3>
+        <p>{msg}</p>
''' + )) +# - + +# # DEVICE table + +# Validation criteria for device is the following: +# - The table includes both PTSC and CE data per the src_id field + +# + +query = JINJA_ENV.from_string(""" +SELECT src_id, COUNT(*) as row_count +FROM `{{project_id}}.{{dataset}}.device` +GROUP BY src_id ORDER BY src_id +""").render(project_id=project_id, dataset=dataset_id) +df = execute(client, query) + +check_status = "Look at the result and see if it meets all the following criteria." +msg = ( + "The result must show that
<br>"
+    "(1) The table has records from both PTSC and CE, and<br>"
+    "(2) all the records' src_ids are either PTSC or CE, i.e. no other src_id appears in this table.<br>"
+    "If any of (1) - (2) does not look good, the source records are not properly prepared. "
+    "Bring up the issue to the RDR team so they can fix it.")
+
+display(
+    HTML(f'''
+        <h3>Check Status: {check_status}</h3>
+        <p>{msg}</p>
''' + )) + +display(df) + +# - \ No newline at end of file diff --git a/data_steward/analytics/cdr_ops/raw_rdr_export_qc.py b/data_steward/analytics/cdr_ops/raw_rdr_export_qc.py index 2553091687..b08e79e721 100644 --- a/data_steward/analytics/cdr_ops/raw_rdr_export_qc.py +++ b/data_steward/analytics/cdr_ops/raw_rdr_export_qc.py @@ -19,13 +19,16 @@ new_rdr = "" run_as = "" rdr_cutoff_date = "" +vocabulary = "" # - # # QC for RDR Export # # Quality checks performed on a new RDR dataset and comparison with previous RDR dataset. import pandas as pd -from common import CATI_TABLES, DEATH, FACT_RELATIONSHIP, JINJA_ENV, PIPELINE_TABLES, SITE_MASKING_TABLE_ID, SRC_ID_TABLES +from common import (CATI_TABLES, DEATH, FACT_RELATIONSHIP, PROCEDURE_OCCURRENCE, + JINJA_ENV, PIPELINE_TABLES, SITE_MASKING_TABLE_ID, SRC_ID_TABLES, + AOU_DEATH) from utils import auth from gcloud.bq import BigQueryClient from analytics.cdr_ops.notebook_utils import execute, IMPERSONATION_SCOPES, render_message @@ -312,7 +315,7 @@ # Rows that are greater than 999,999,999,999,999 the will be listed out here. domain_table_list = [ - table for table in CATI_TABLES if table not in [DEATH, FACT_RELATIONSHIP] + table for table in CATI_TABLES if table not in [DEATH, FACT_RELATIONSHIP, PROCEDURE_OCCURRENCE] ] queries = [] for table in domain_table_list: @@ -429,7 +432,7 @@ # - # # Check the ETL mapped `concept_id`s to question codes -# If most concepts are mapped, this check passes. If only some concepts are not mapping properly these are most likely known vocabulary issues. +# If most concepts are mapped, this check passes. If only some concepts are not mapping properly these are most likely known vocabulary issues or linked to new surveys not yet in Athena. # # **If the check fails.** Investigate. If none, or only a few, of the codes are being mapped notify rdr. # @@ -444,8 +447,7 @@ ,COUNTIF(observation_source_concept_id IS NOT NULL AND observation_source_concept_id != 0 AND observation_concept_id IS NOT NULL AND observation_concept_id != 0) AS n_mapped_by_etl FROM `{{project_id}}.{{new_rdr}}.observation` LEFT JOIN (SELECT concept_id_1 FROM `{{project_id}}.{{new_rdr}}.concept_relationship` WHERE relationship_id = 'Maps to') cr1 -ON observation_source_concept_id = cr1.concept_id_1 -WHERE cr1.concept_id_1 IS NOT NULL +ON observation_source_concept_id = cr1.concept_id_1 GROUP BY 2 ) @@ -554,35 +556,6 @@ query = tpl.render(new_rdr=new_rdr, project_id=project_id) execute(client, query) -# # Check for duplicates - -tpl = JINJA_ENV.from_string(""" -with duplicates AS ( - SELECT - person_id - ,observation_datetime - ,observation_source_value - ,value_source_value - ,value_as_number - ,value_as_string - -- ,questionnaire_response_id -- - ,COUNT(1) AS n_data - FROM `{{project_id}}.{{new_rdr}}.observation` - INNER JOIN `{{project_id}}.{{new_rdr}}.cope_survey_semantic_version_map` - USING (questionnaire_response_id) -- For COPE only -- - GROUP BY 1,2,3,4,5,6 -) -SELECT - n_data AS duplicates - ,COUNT(1) AS n_duplicates -FROM duplicates -WHERE n_data > 1 -GROUP BY 1 -ORDER BY 2 DESC -""") -query = tpl.render(new_rdr=new_rdr, project_id=project_id) -execute(client, query) - # # Check numeric data in value_as_string # Some numeric data is expected in value_as_string. For example, zip codes or other contact specific information. 
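For orientation, since the hunk below shows only the one changed regex line: a minimal standalone version of this kind of scan could look like the following sketch. It is illustrative only and not part of this change; it assumes the notebook's usual project_id/new_rdr parameters and the JINJA_ENV/execute helpers, and the allow-list regex here is hypothetical.

# Hypothetical sketch: question codes whose value_as_string is purely numeric,
# excluding codes where numeric strings are expected.
tpl = JINJA_ENV.from_string('''
SELECT
  observation_source_value
  ,COUNT(*) AS n
FROM `{{project_id}}.{{new_rdr}}.observation`
WHERE SAFE_CAST(value_as_string AS INT64) IS NOT NULL
AND NOT REGEXP_CONTAINS(LOWER(observation_source_value),
                        r'(?i)zip|address|email|number')  -- assumed allow-list --
GROUP BY 1
ORDER BY 2 DESC
''')
query = tpl.render(project_id=project_id, new_rdr=new_rdr)
execute(client, query)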
# @@ -598,7 +571,7 @@ WHERE SAFE_CAST(value_as_string AS INT64) IS NOT NULL AND value_source_concept_id = 0 AND LOWER(observation_source_value) NOT IN UNNEST ({{expected_strings}}) -AND NOT REGEXP_CONTAINS(LOWER(observation_source_value), '(?i)snap|signature|address|email|number|cohortgroup') +AND NOT REGEXP_CONTAINS(LOWER(observation_source_value), r'(?i)snap|signature|address|email|number|cohortgroup') GROUP BY 1 ORDER BY 2 DESC """) @@ -702,6 +675,34 @@ query = tpl.render(new_rdr=new_rdr, project_id=project_id) execute(client, query) +# # Check the expectations of survey_conduct - survey list +# +# Confirm that all expected surveys have records in survey_conduct. Check ignores snap surveys because these surveys are not expected in any release. +# +# Generally the list of surveys should increase from one export to the next. +# +# Investigate any surveys that were available in the previous export but not in the current export. +# Also make sure that any new expected surveys are listed in the current rdr. + +tpl = JINJA_ENV.from_string(''' +WITH old_survey AS (SELECT survey_source_value as old_raw, survey_concept_id, COUNT(survey_conduct_id) as old_count + FROM `{{project_id}}.{{old_rdr}}.survey_conduct` + WHERE NOT (REGEXP_CONTAINS(survey_source_value,'(?i)SNAP|cope')) + GROUP BY 1, 2), +new_survey AS (SELECT survey_source_value as new_raw, survey_concept_id, COUNT(survey_conduct_id) as new_count + FROM `{{project_id}}.{{new_rdr}}.survey_conduct` + WHERE NOT (REGEXP_CONTAINS(survey_source_value,'(?i)SNAP|cope')) + GROUP BY 1, 2) +SELECT *, new_count - old_count as diff +FROM old_survey +FULL OUTER JOIN new_survey +USING (survey_concept_id) +WHERE survey_concept_id != 0 +ORDER BY 3,5 +''') +query = tpl.render(new_rdr=new_rdr, old_rdr=old_rdr, project_id=project_id) +execute(client, query) + # # Class of PPI Concepts using vocabulary.py # Concept codes which appear in `observation.observation_source_value` should belong to concept class Question. # Concept codes which appear in `observation.value_source_value` should belong to concept class Answer. 
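The hunks that follow repoint this check from the export's own concept table to the pinned vocabulary dataset. As orientation for the question-side half of the idea, here is a simplified, hypothetical sketch, not the notebook's exact query:

# Hypothetical sketch: PPI codes used as questions whose concept_class_id is
# not 'Question' (the real check also tolerates Topic and PPI Modifier).
tpl = JINJA_ENV.from_string('''
SELECT
  o.observation_source_value
  ,c.concept_class_id
  ,COUNT(*) AS n
FROM `{{project_id}}.{{new_rdr}}.observation` o
JOIN `{{project_id}}.{{vocabulary}}.concept` c
ON LOWER(c.concept_code) = LOWER(o.observation_source_value)
WHERE c.vocabulary_id = 'PPI'
AND LOWER(c.concept_class_id) <> 'question'
GROUP BY 1, 2
ORDER BY 3 DESC
''')
query = tpl.render(project_id=project_id, new_rdr=new_rdr, vocabulary=vocabulary)
execute(client, query)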
@@ -734,14 +735,14 @@ ,concept_class_id ,n FROM ppi_concept_code -JOIN `{{project_id}}.{{new_rdr}}.concept` +JOIN `{{project_id}}.{{vocabulary}}.concept` ON LOWER(concept_code)=LOWER(code) WHERE LOWER(concept_class_id)<>LOWER(expected_concept_class_id) AND CASE WHEN expected_concept_class_id = 'Question' THEN concept_class_id NOT IN('Topic','PPI Modifier') END AND concept_class_id != 'Qualifier Value' ORDER BY 1, 2, 3 ''') -query = tpl.render(new_rdr=new_rdr, project_id=project_id) +query = tpl.render(new_rdr=new_rdr, project_id=project_id, vocabulary=vocabulary) execute(client, query) # # Identify Questions That Dont Exist in the RDR Export @@ -750,12 +751,12 @@ tpl = JINJA_ENV.from_string(""" with question_codes as (select c.concept_id, c.concept_name, c.concept_class_id -from `{{project_id}}.{{new_rdr}}.concept` as c +from `{{project_id}}.{{vocabulary}}.concept` as c where REGEXP_CONTAINS(c.vocabulary_id, r'(?i)(ppi)') and REGEXP_CONTAINS(c.concept_class_id, r'(?i)(question)')) , used_q_codes as ( select distinct o.observation_source_concept_id, o.observation_source_value from `{{project_id}}.{{new_rdr}}.observation` as o - join `{{project_id}}.{{new_rdr}}.concept` as c + join `{{project_id}}.{{vocabulary}}.concept` as c on o.observation_source_concept_id = c.concept_id where REGEXP_CONTAINS(c.vocabulary_id, r'(?i)(ppi)') and REGEXP_CONTAINS(c.concept_class_id, r'(?i)(question)') ) @@ -763,7 +764,7 @@ from question_codes where concept_id not in (select observation_source_concept_id from used_q_codes) """) -query = tpl.render(new_rdr=new_rdr, project_id=project_id) +query = tpl.render(new_rdr=new_rdr, project_id=project_id, vocabulary=vocabulary) execute(client, query) # # Make sure previously corrected missing data still exists @@ -798,55 +799,6 @@ query = tpl.render(new_rdr=new_rdr, project_id=project_id) execute(client, query) -# ## Participants must have basics data -# Identify any participants who have don't have any responses -# to questions in the basics survey module (see [DC-706](https://precisionmedicineinitiative.atlassian.net/browse/DC-706)). These should be -# reported to the RDR as they are supposed to be filtered out -# from the RDR export. - -# + -BASICS_MODULE_CONCEPT_ID = 1586134 - -# Note: This assumes that concept_ancestor sufficiently -# represents the hierarchy -tpl = JINJA_ENV.from_string(""" -WITH - - -- all PPI question concepts in the basics survey module -- - basics_concept AS - (SELECT - c.concept_id - ,c.concept_name - ,c.concept_code - FROM `{{DATASET_ID}}.concept_ancestor` ca - JOIN `{{DATASET_ID}}.concept` c - ON ca.descendant_concept_id = c.concept_id - WHERE 1=1 - AND ancestor_concept_id={{BASICS_MODULE_CONCEPT_ID}} - AND c.vocabulary_id='PPI' - AND c.concept_class_id='Question') - - -- maps pids to all their associated basics questions in the rdr -- -,pid_basics AS - (SELECT - person_id - ,ARRAY_AGG(DISTINCT c.concept_code IGNORE NULLS) basics_codes - FROM `{{DATASET_ID}}.observation` o - JOIN basics_concept c - ON o.observation_concept_id = c.concept_id - WHERE 1=1 - GROUP BY 1) - - -- list all pids for whom no basics questions are found -- -SELECT * -FROM `{{DATASET_ID}}.person` -WHERE person_id not in (select person_id from pid_basics) -""") -query = tpl.render(DATASET_ID=new_rdr, - BASICS_MODULE_CONCEPT_ID=BASICS_MODULE_CONCEPT_ID) -execute(client, query) -# - - # # Date conformance check # COPE surveys contain some concepts that must enforce dates in the observation.value_as_string field. 
# For the observation_source_concept_id = 715711, if the value in value_as_string does not meet a standard date format @@ -949,27 +901,7 @@ query = tpl.render(new_rdr=new_rdr, project_id=project_id) execute(client, query) -# # Checks for basics survey module -# Participants with data in other survey modules must also have data from the basics survey module. -# This check identifies survey responses associated with participants that do not have any responses -# associated with the basics survey module. -# In ideal circumstances, this query will not return any results. - -tpl = JINJA_ENV.from_string(''' -SELECT DISTINCT person_id FROM `{{project_id}}.{{new_rdr}}.observation` -JOIN `{{project_id}}.{{new_rdr}}.concept` on (observation_source_concept_id=concept_id) -WHERE vocabulary_id = 'PPI' AND person_id NOT IN ( -SELECT DISTINCT person_id FROM `{{project_id}}.{{new_rdr}}.concept` -JOIN `{{project_id}}.{{new_rdr}}.concept_ancestor` on (concept_id=ancestor_concept_id) -JOIN `{{project_id}}.{{new_rdr}}.observation` on (descendant_concept_id=observation_concept_id) -WHERE concept_class_id='Module' -AND concept_name IN ('The Basics') -AND questionnaire_response_id IS NOT NULL) -''') -query = tpl.render(new_rdr=new_rdr, project_id=project_id) -execute(client, query) - -# ## Participants must be 18 years of age or older to consent +# # Participants must be 18 years of age or older to consent # # AOU participants are required to be 18+ years of age at the time of consent # ([DC-1724](https://precisionmedicineinitiative.atlassian.net/browse/DC-1724)), @@ -1033,89 +965,7 @@ query = tpl.render(new_rdr=new_rdr, project_id=project_id) execute(client, query) -# # COPE survey mapping - -# There is a known issue that COPE survey questions all map to the module -# 1333342 (COPE survey with no version specified). This check is to confirm -# if this issue still exists in the vocabulary or not. -# If this issue is fixed, each COPE survey questions will have mapping to -# individual COPE survey modules and will no longer have mapping to 1333342. -# cope_question_concept_ids are collected using the SQL listed in DC-2641: -# [DC-2641](https://precisionmedicineinitiative.atlassian.net/browse/DC-2641). 
- -cope_question_concept_ids = [ - 596884, 596885, 596886, 596887, 596888, 702686, 713888, 715711, 715713, - 715714, 715719, 715720, 715721, 715722, 715723, 715724, 715725, 715726, - 903629, 903630, 903631, 903632, 903633, 903634, 903635, 903641, 903642, - 1310051, 1310052, 1310053, 1310054, 1310056, 1310058, 1310060, 1310062, - 1310065, 1310066, 1310067, 1310132, 1310133, 1310134, 1310135, 1310136, - 1310137, 1310138, 1310139, 1310140, 1310141, 1310142, 1310144, 1310145, - 1310146, 1310147, 1310148, 1332734, 1332735, 1332737, 1332738, 1332739, - 1332741, 1332742, 1332744, 1332745, 1332746, 1332747, 1332748, 1332749, - 1332750, 1332751, 1332752, 1332753, 1332754, 1332755, 1332756, 1332762, - 1332763, 1332767, 1332769, 1332792, 1332793, 1332794, 1332795, 1332796, - 1332797, 1332800, 1332801, 1332802, 1332803, 1332804, 1332805, 1332806, - 1332807, 1332808, 1332819, 1332820, 1332822, 1332824, 1332826, 1332828, - 1332829, 1332830, 1332831, 1332832, 1332833, 1332835, 1332843, 1332847, - 1332848, 1332849, 1332853, 1332854, 1332861, 1332862, 1332863, 1332866, - 1332867, 1332868, 1332869, 1332870, 1332871, 1332872, 1332874, 1332876, - 1332878, 1332880, 1332935, 1332937, 1332944, 1332998, 1333004, 1333011, - 1333012, 1333013, 1333014, 1333015, 1333016, 1333017, 1333018, 1333019, - 1333020, 1333021, 1333022, 1333023, 1333024, 1333102, 1333104, 1333105, - 1333118, 1333119, 1333120, 1333121, 1333156, 1333163, 1333164, 1333165, - 1333166, 1333167, 1333168, 1333182, 1333183, 1333184, 1333185, 1333186, - 1333187, 1333188, 1333189, 1333190, 1333191, 1333192, 1333193, 1333194, - 1333195, 1333200, 1333216, 1333221, 1333234, 1333235, 1333274, 1333275, - 1333276, 1333277, 1333278, 1333279, 1333280, 1333281, 1333285, 1333286, - 1333287, 1333288, 1333289, 1333291, 1333292, 1333293, 1333294, 1333295, - 1333296, 1333297, 1333298, 1333299, 1333300, 1333301, 1333303, 1333311, - 1333312, 1333313, 1333314, 1333324, 1333325, 1333326, 1333327, 1333328 -] - -tpl = JINJA_ENV.from_string(""" -WITH question_topic_module AS ( - SELECT - cr1.concept_id_1 AS question, - cr1.concept_id_2 AS topic, - cr2.concept_id_2 AS module - FROM `{{projcet_id}}.{{dataset}}.concept_relationship` cr1 - JOIN `{{projcet_id}}.{{dataset}}.concept` c1 ON cr1.concept_id_2 = c1.concept_id - JOIN `{{projcet_id}}.{{dataset}}.concept_relationship` cr2 ON c1.concept_id = cr2.concept_id_1 - JOIN `{{projcet_id}}.{{dataset}}.concept` c2 ON cr2.concept_id_2 = c2.concept_id - WHERE cr1.concept_id_1 IN ({{cope_question_concept_ids}}) - AND c1.concept_class_id = 'Topic' - AND c2.concept_class_id = 'Module' -) -SELECT DISTINCT question FROM question_topic_module -WHERE module = 1333342 -""") -query = tpl.render( - new_rdr=new_rdr, - project_id=project_id, - dataset=new_rdr, - cope_question_concept_ids=", ".join( - str(concept_id) for concept_id in cope_question_concept_ids)) -df = execute(client, query) - -# + -success_msg = ''' - The mapping issue is resolved. Double-check each concept is mapped to individual COPE module. - Once we double-checked it, we can remove this QC from this notebook. -''' -failure_msg = ''' - The mapping issue still exists. There are {code_count} concepts for COPE questions - that map to 1333342. Notify Odysseus that the issue still persists. - For pipeline, we can use cope_survey_semantic_version_map to diffrentiate COPE module versions, - so we can still move on. See DC-2641 for detail. 
-''' - -render_message(df, - success_msg, - failure_msg, - failure_msg_args={'code_count': len(df)}) -# - - -# ### RDR date cutoff check +# # RDR date cutoff check # Check that survey dates are not beyond the RDR cutoff date, also check observation. query = JINJA_ENV.from_string(""" @@ -1230,7 +1080,7 @@ src_ids_table = ids_template.render(project_id=project_id, pipeline=PIPELINE_TABLES, site_maskings=SITE_MASKING_TABLE_ID) -for table in SRC_ID_TABLES: +for table in set(SRC_ID_TABLES) - {AOU_DEATH} | {DEATH}: tpl = JINJA_ENV.from_string(""" SELECT \'{{table_name}}\' AS table_name, @@ -1250,30 +1100,216 @@ all_queries = '\nUNION ALL\n'.join(queries) execute(client, f'{src_ids_table}\n{all_queries}') + # # Check Wear Consent Counts +# This query checks the count of wear consent observations. If the number of observations is not decreasing this check will pass. # -# `Wear_consent` and `wear_consent_ptsc` records should be seen in the export. -# -# Results expectations: The result should be roughly 16 rows. Differences here most likely aren't an issue. -# -# **Visual check:**
-# * **PASS** The result **includes** observation_source_value: `resultsconsent_wear`
-# * **FAIL** The result **does not include** observation_source_value: `resultsconsent_wear`. If this row does not exist, confirm the finding, and report to RDR. These records are required for proper suppression of wear fitbit records. +# **If this check fails** Investigate why the number of observations have decreased. These data are important for the creation of the wear_study table and therefore data suppression +# + # Get counts of wear_consent records query = JINJA_ENV.from_string(""" + SELECT - observation_source_value, - COUNT(*) AS n -FROM - `{{project_id}}.{{new_rdr}}.observation` o - LEFT JOIN `{{project_id}}.{{new_rdr}}.survey_conduct` sc - ON sc.survey_conduct_id = o.questionnaire_response_id -WHERE sc.survey_concept_id IN (2100000011,2100000012) -- captures questions asked in multiple surveys -- -OR LOWER(observation_source_value) IN UNNEST ({{wear_codes}}) -- captures those that might be missing from survey_conduct -- -GROUP BY 1 + curr.observation_source_value AS concept + ,prev.row_count AS _{{old_rdr}} + ,curr.row_count AS _{{new_rdr}} + ,(curr.row_count - prev.row_count) row_diff +FROM (SELECT DISTINCT observation_source_value, COUNT(*) as row_count + FROM `{{project_id}}.{{new_rdr}}.observation` o + WHERE observation_source_value = 'resultsconsent_wear' + GROUP BY 1) curr +JOIN (SELECT DISTINCT observation_source_value, COUNT(*) row_count + FROM `{{project_id}}.{{old_rdr}}.observation` o + WHERE observation_source_value = 'resultsconsent_wear' + GROUP BY 1) prev +USING (observation_source_value) +GROUP BY 1,2,3 """).render(project_id=project_id, new_rdr=new_rdr, - wear_codes=WEAR_SURVEY_CODES) + old_rdr=old_rdr) +df = execute(client, query) + +if sum(df['row_diff']) < 0: + display(df, + HTML(f''' +
+                <h3>Check Status: <b>FAILURE</b></h3>
+                <p>Wear consent records have been lost since the last rdr. Investigate. See description.</p>
+            '''))
+else:
+    display(df,
+            HTML(f'''
+                <h3>Check Status: <b>PASS</b></h3>
+                <p>An increasing number of wear consents is expected.</p>
+            '''))
+# -
+
+# # Check Wear Consent Mapping
+# This mapping is required to keep these observations from being dropped in the rdr cleaning stage. It is also required to create the wear_study table.
+#
+# **If this check fails**, verify the query results before notifying the rdr team.
+
+# +
+query = JINJA_ENV.from_string("""
+SELECT
+    'Mandatory mapping to standard is missing' as issue,
+    COUNT(*) AS n
+FROM `{{project_id}}.{{new_rdr}}.observation` o
+WHERE observation_source_value = 'resultsconsent_wear'
+AND (observation_source_concept_id != 2100000010 OR
+     value_source_concept_id NOT IN (2100000008, 2100000009, 903096) -- wear_no, wear_yes, pmi_skip --
+    )
+""").render(project_id=project_id,
+            new_rdr=new_rdr)
+df = execute(client, query)
+
+if sum(df['n']) != 0:
+    display(df,
+            HTML(f'''
+                <h3>Check Status: <b>FAILURE</b></h3>
+                <p>These are mandatory mappings. Investigate. See description.</p>
+            '''))
+else:
+    display(df,
+            HTML(f'''
+                <h3>Check Status: <b>PASS</b></h3>
+                <p>All mandatory wear consent records are mapped as expected.</p>
+            '''))
+# -
+# # Check consent_validation for the expected number of consent statuses
+#
+# The 'consent_validation' table is renamed from 'consent' in the rdr import script. This table is used to suppress data in `remove_ehr_data_without_consent.py`.
+#
+# **"Have duplicate consent statuses"** These participants have multiple consent_validation records with the same status.<br>
+# **"Discrepancy btn consent_validation and obs"** Where a consent_validation record has no matching record in observation, or vice versa. These participants will not be flagged as consenting.<br>
+# **"Consent status is NULL"** Where consent_for_electronic_health_records (the consent status) is NULL and the observation consent was not skipped.<br>
+# **"Varying consent statuses per consent answer"** Where a single consent record in observation has conflicting consent statuses in consent_validation. See the sketch and the full query below.
+#
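Before the full SQL check, a minimal pandas sketch of just the discrepancy idea, matching observation consent rows to consent_validation rows on person and authored datetime. Illustrative only and not part of this change; it assumes the notebook's execute helper returns a DataFrame and that both pulls fit comfortably in memory.

# Hypothetical sketch: consent rows present on only one side of the join.
obs = execute(client, JINJA_ENV.from_string('''
SELECT person_id, CAST(observation_datetime AS DATETIME) AS authored
FROM `{{project_id}}.{{new_rdr}}.observation`
WHERE observation_source_value = 'EHRConsentPII_ConsentPermission'
''').render(project_id=project_id, new_rdr=new_rdr))
cv = execute(client, JINJA_ENV.from_string('''
SELECT person_id, consent_for_electronic_health_records_authored AS authored
FROM `{{project_id}}.{{new_rdr}}.consent_validation`
''').render(project_id=project_id, new_rdr=new_rdr))
mismatch = obs.merge(cv, on=['person_id', 'authored'], how='outer', indicator=True)
display(mismatch[mismatch['_merge'] != 'both'])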
+
+# +
+# Count of participants with multiple validation statuses for their ehr consent records.
+query = JINJA_ENV.from_string("""
+
+WITH obs_consents AS (SELECT
+*
+FROM `{{project_id}}.{{new_rdr}}.observation`
+WHERE observation_source_value = 'EHRConsentPII_ConsentPermission' ),
+
+issue_queries AS (
+SELECT
+"Have duplicate consent statuses" AS issue,
+COUNT(*) AS n
+FROM (SELECT DISTINCT * EXCEPT (consent_for_electronic_health_records_authored) FROM `{{project_id}}.{{new_rdr}}.consent_validation` )
+GROUP BY person_id
+HAVING n>1
+
+UNION ALL
+
+SELECT
+"Discrepancy btn consent_validation and obs" AS issue,
+cv.person_id
+FROM `{{project_id}}.{{new_rdr}}.consent_validation` cv
+FULL OUTER JOIN obs_consents o
+ON cv.person_id = o.person_id AND cv.consent_for_electronic_health_records_authored = CAST(o.observation_datetime AS DATETIME)
+WHERE cv.person_id IS NULL OR o.person_id IS NULL
+
+UNION ALL
+
+SELECT
+"Consent status is NULL" AS issue,
+COUNT(*) AS n
+FROM obs_consents o
+FULL OUTER JOIN `{{project_id}}.{{new_rdr}}.consent_validation` cv
+ON cv.person_id = o.person_id AND cv.consent_for_electronic_health_records_authored = CAST(o.observation_datetime AS DATETIME)
+WHERE consent_for_electronic_health_records IS NULL
+AND value_source_value != 'PMI_Skip'
+GROUP BY cv.person_id
+
+UNION ALL
+
+SELECT
+"Varying consent statuses per consent answer" as issue
+,COUNT(DISTINCT(consent_for_electronic_health_records)) as n
+FROM obs_consents o
+FULL OUTER JOIN ( -- yes and no consent status only --
+    SELECT *
+    FROM `{{project_id}}.{{new_rdr}}.consent_validation` cv
+    WHERE consent_for_electronic_health_records IN ('SUBMITTED', 'SUBMITTED_NO')
+    ) cv
+ON cv.person_id = o.person_id AND cv.consent_for_electronic_health_records_authored = CAST(o.observation_datetime AS DATETIME)
+GROUP BY o.person_id, value_source_value
+HAVING n >1
+
+)
+SELECT DISTINCT issue,
+COUNT(*) AS n_person_ids
+FROM issue_queries
+GROUP BY issue
+ORDER BY issue
+
+""").render(project_id=project_id,
+            new_rdr=new_rdr)
+df = execute(client, query)
+
+success_msg = 'consent_validation passes these checks'
+failure_msg = '''
+    consent_validation has issues. Investigate. <br>
+''' + +render_message(df, + success_msg, + failure_msg) +# - +# # Check to catch duplicate observation_ids + +tpl = JINJA_ENV.from_string(''' +SELECT + observation_id, + COUNT(observation_id) AS n +FROM + `{{project_id}}.{{new_rdr}}.observation` +GROUP BY + observation_id +HAVING n>1 +''') +query = tpl.render(project_id=project_id, new_rdr=new_rdr) execute(client, query) + diff --git a/data_steward/analytics/cdr_ops/report_runner.py b/data_steward/analytics/cdr_ops/report_runner.py index fb4d12a5cc..200d9dad15 100644 --- a/data_steward/analytics/cdr_ops/report_runner.py +++ b/data_steward/analytics/cdr_ops/report_runner.py @@ -19,7 +19,7 @@ from nbconvert import HTMLExporter # Project imports -from utils import pipeline_logging +from utils import pipeline_logging LOGGER = logging.getLogger(__name__) IPYNB_SUFFIX = '.ipynb' @@ -47,18 +47,18 @@ def create_ipynb_from_py(py_path) -> str: def create_html_from_ipynb(surrogate_output_path): """ Create a html page from the output of the jupyter notebook - :param surrogate_output_path: - :return: + :param surrogate_output_path: + :return: """ # Convert output ipynb to html output_path = PurePath(surrogate_output_path).with_suffix(HTML_SUFFIX) html_exporter = HTMLExporter() html_exporter.template_name = 'classic' - with open(surrogate_output_path, 'r') as f: + with open(surrogate_output_path, 'r', encoding='utf-8') as f: written_nb = nbformat.reads(f.read(), as_version=4) (body, resources) = html_exporter.from_notebook_node(written_nb) - with open(output_path, 'w') as f: + with open(output_path, 'w', encoding='utf-8') as f: f.write(body) LOGGER.info(f'Notebook exported to {output_path}') @@ -68,12 +68,12 @@ def create_html_from_ipynb(surrogate_output_path): def infer_required(param_properties: OrderedDict) -> OrderedDict: """ - This function infers whether the notebook parameter is required or not based on the following - heuristics: if the default value is 'None' (notebook translates None to a string version of + This function infers whether the notebook parameter is required or not based on the following + heuristics: if the default value is 'None' (notebook translates None to a string version of None) or '""' or '\'\'' (string consists of double quotes or single quotes only) - - :param param_properties: - :return: + + :param param_properties: + :return: """ def is_required(param_value): @@ -91,10 +91,10 @@ def is_required(param_value): def infer_notebook_params(notebook_path) -> List[Tuple[str, OrderedDict]]: """ - A helper function to infer the notebook params + A helper function to infer the notebook params - :param notebook_path: - :return: + :param notebook_path: + :return: """ return [(name, infer_required(properties)) @@ -103,10 +103,10 @@ def infer_notebook_params(notebook_path) -> List[Tuple[str, OrderedDict]]: def display_notebook_help(notebook_path): """ - A helper function to display + A helper function to display - :param notebook_path: - :return: + :param notebook_path: + :return: """ print(f'Parameters inferred for notebook {PurePath(notebook_path).stem}:') for _, properties in infer_notebook_params(notebook_path): @@ -133,7 +133,7 @@ def is_parameter_required(properties: OrderedDict): """ This functions checks if the notebook parameter is required :param properties: the properties associated with the parameter - :return: + :return: """ for key, value in properties.items(): if key == PARAMETER_REQUIRED: @@ -143,11 +143,11 @@ def is_parameter_required(properties: OrderedDict): def validate_notebook_params(notebook_path, 
provided_params: Dict[str, str]): """ - This function validates the provided parameters passed to the notebook - - :param notebook_path: - :param provided_params: provided parameters from the arg parser - :return: + This function validates the provided parameters passed to the notebook + + :param notebook_path: + :param provided_params: provided parameters from the arg parser + :return: """ notebook_param_dict = dict(infer_notebook_params(notebook_path)) @@ -197,12 +197,12 @@ def validate_notebook_params(notebook_path, provided_params: Dict[str, str]): def main(notebook_jupytext_path, params, output_path, help_notebook=False): """ - - :param notebook_jupytext_path: - :param params: - :param output_path: - :param help_notebook: - :return: + + :param notebook_jupytext_path: + :param params: + :param output_path: + :param help_notebook: + :return: """ # Output name defaults to ipynb_path if the output_path is an empty string @@ -265,7 +265,7 @@ def __call__(self, value, **kwargs): if __name__ == '__main__': - pipeline_logging.configure(logging.INFO, add_console_handler=True) + pipeline_logging.configure(logging.INFO, add_console_handler=True) parser = argparse.ArgumentParser( description= diff --git a/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report10_extra.py b/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report10_extra.py index 399c37c8b0..dc20879d6a 100644 --- a/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report10_extra.py +++ b/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report10_extra.py @@ -423,12 +423,12 @@ query = JINJA_ENV.from_string(""" SELECT COUNT (*) AS n_row_not_pass FROM `{{project_id}}.{{pipeline}}.site_maskings` as c -LEFT JOIN `{{project_id}}.{{deid_sand}}.site_maskings` as r +LEFT JOIN `{{project_id}}.{{combine}}_sandbox.site_maskings` as r USING (hpo_id) WHERE c.src_id != r.src_id -- registered tier did use the stabilized maskings for cross pipeline compatibility -- """) -q = query.render(project_id=project_id, pipeline=pipeline, deid_sand=deid_sand) +q = query.render(project_id=project_id, pipeline=pipeline, combine=combine) df1 = execute(client, q) if df1.loc[0].sum() == 0: df = df.append( @@ -878,11 +878,11 @@ def my_sql(table_name, column_name): FROM latest_primary_consent_records cte LEFT JOIN ( -- any positive primary consent -- SELECT * - FROM `{{project_id}}.{{rt_cdr_deid}}.observation` o - WHERE REGEXP_CONTAINS(o.observation_source_value, '(?i)extraconsent_agreetoconsent') - AND o.value_as_concept_id = 45877994 + FROM `{{project_id}}.{{rt_cdr_deid}}.observation` + WHERE REGEXP_CONTAINS(observation_source_value, '(?i)extraconsent_agreetoconsent') + AND value_as_concept_id = 45877994) o ON cte.person_id = o.person_id - AND cte.latest_consent_date = o.observation_date + AND cte.latest_date = o.observation_date WHERE o.person_id IS NOT NULL ) @@ -905,6 +905,7 @@ def my_sql(table_name, column_name): 'result': 'Failure' }, ignore_index=True) +df1 # + # Query 14: Check that wear_consent records are suppressed in the 'observation' and 'survey_conduct' tables diff --git a/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report12_extra.py b/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report12_extra.py index f06d19e264..0b3e3c77fb 100644 --- a/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report12_extra.py +++ b/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report12_extra.py @@ -27,6 +27,7 @@ pipeline="" rt_cdr_deid = "" deid_sand="" +reg_combined="" rdr_dataset="" rdr_sandbox="" run_as="" @@ -58,14 +59,14 @@ 
(SELECT DISTINCT research_id FROM `{{project_id}}.{{combine}}.observation` com -JOIN `{{project_id}}.{{deid_sand}}._deid_map` m ON com.person_id = m.person_id +JOIN `{{project_id}}.{{reg_combined}}._deid_map` m ON com.person_id = m.person_id WHERE observation_source_concept_id = 1586140 AND value_source_concept_id = 1586141 ) AND value_source_concept_id NOT IN (2000000008, 2000000001,1586147) """) -q = query.render(project_id=project_id,rt_cdr_deid=rt_cdr_deid,combine=combine,deid_sand=deid_sand) +q = query.render(project_id=project_id,rt_cdr_deid=rt_cdr_deid,combine=combine,deid_sand=deid_sand, reg_combined=reg_combined) df1=execute(client, q) df1 @@ -168,7 +169,7 @@ COUNT (DISTINCT m.person_id) AS row_counts_failure FROM `{{project_id}}.{{rt_cdr_deid}}.survey_conduct` deid JOIN `{{project_id}}.{{rt_cdr_deid}}.survey_conduct_ext` using (survey_conduct_id) - JOIN `{{project_id}}.{{deid_sand}}._deid_map` m ON deid.person_id = m.research_id + JOIN `{{project_id}}.{{reg_combined}}._deid_map` m ON deid.person_id = m.research_id WHERE language ='en' AND m.person_id NOT IN (SELECT DISTINCT person_id FROM `{{project_id}}.{{rdr_dataset}}.observation` @@ -182,7 +183,7 @@ COUNT (DISTINCT m.person_id) AS row_counts_failure FROM `{{project_id}}.{{rt_cdr_deid}}.survey_conduct` deid JOIN `{{project_id}}.{{rt_cdr_deid}}.survey_conduct_ext` using (survey_conduct_id) - JOIN `{{project_id}}.{{deid_sand}}._deid_map` m ON deid.person_id = m.research_id + JOIN `{{project_id}}.{{reg_combined}}._deid_map` m ON deid.person_id = m.research_id WHERE language ='es' AND m.person_id NOT IN (SELECT DISTINCT person_id FROM `{{project_id}}.{{rdr_dataset}}.observation` @@ -197,7 +198,7 @@ COUNT (DISTINCT m.person_id) AS row_counts_failure FROM `{{project_id}}.{{rt_cdr_deid}}.survey_conduct` deid JOIN `{{project_id}}.{{rt_cdr_deid}}.survey_conduct_ext` using (survey_conduct_id) - JOIN `{{project_id}}.{{deid_sand}}._deid_map` m ON deid.person_id = m.research_id + JOIN `{{project_id}}.{{reg_combined}}._deid_map` m ON deid.person_id = m.research_id WHERE ( assisted_source_value ='Telephone' or assisted_concept_id=42530794) AND m.person_id NOT IN (SELECT DISTINCT person_id FROM `{{project_id}}.{{rdr_dataset}}.observation` @@ -251,7 +252,7 @@ COUNT (DISTINCT person_id) AS row_counts_failure FROM `{{project_id}}.{{rt_cdr_deid}}.survey_conduct` d WHERE person_id NOT IN (SELECT research_id - FROM `{{project_id}}.{{deid_sand}}._deid_map` )) + FROM `{{project_id}}.{{reg_combined}}._deid_map` )) SELECT * FROM df1 UNION DISTINCT @@ -275,7 +276,7 @@ ORDER BY check """) -q = query.render(project_id=project_id,rt_cdr_deid=rt_cdr_deid,combine=combine,deid_sand=deid_sand,rdr_dataset=rdr_dataset,rdr_sandbox=rdr_sandbox,pipeline=pipeline) +q = query.render(project_id=project_id,rt_cdr_deid=rt_cdr_deid,combine=combine,deid_sand=deid_sand,rdr_dataset=rdr_dataset,rdr_sandbox=rdr_sandbox,pipeline=pipeline, reg_combined=reg_combined) df1=execute(client, q) df1.shape diff --git a/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report1_generalization_rule.py b/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report1_generalization_rule.py index f36f6d521a..7e47cb1ac1 100644 --- a/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report1_generalization_rule.py +++ b/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report1_generalization_rule.py @@ -113,28 +113,40 @@ # - Verify that if the value_source_concept_id in OBSERVATION table populates: 903079, # the value_as_concept_id field in de-id table populates 1177221 # -# - Verify that 
if the value_source_concept_id field in OBSERVATION table populates: 903096 , -# the value_as_concept_id field in de-id table populates: 903096 # query = JINJA_ENV.from_string(""" WITH df1 AS ( -SELECT distinct value_source_concept_id,value_as_concept_id +SELECT value_source_concept_id,value_as_concept_id FROM `{{project_id}}.{{deid_cdr}}.observation` - WHERE value_source_concept_id in (2000000001,2000000008,1586142,1586143,1586146,1586147,1586148,903079,903096) + WHERE value_source_concept_id in (2000000001,2000000008,1586142,1586143,1586146,1586147,1586148,903079) ) -SELECT COUNT (*) AS n_row_not_pass FROM df1 +SELECT '2000000001' as issue, COUNT (*) AS n_row_not_pass FROM df1 WHERE (value_source_concept_id=2000000001 AND value_as_concept_id !=2000000001) - OR (value_source_concept_id=2000000008 AND value_as_concept_id !=2000000008) - OR (value_source_concept_id=1586142 AND value_as_concept_id !=45879439) - OR (value_source_concept_id=1586143 AND value_as_concept_id !=1586143) - OR (value_source_concept_id=1586146 AND value_as_concept_id !=45877987) - OR (value_source_concept_id=1586147 AND value_as_concept_id !=1586147) - OR (value_source_concept_id=1586148 AND value_as_concept_id !=45882607) - OR (value_source_concept_id=903079 AND value_as_concept_id !=1177221) - OR (value_source_concept_id=903096 AND value_as_concept_id !=903096) +UNION ALL +SELECT '2000000008' as issue,COUNT (*) AS n_row_not_pass FROM df1 +WHERE (value_source_concept_id=2000000008 AND value_as_concept_id !=2000000008) +UNION ALL +SELECT '1586142' as issue,COUNT (*) AS n_row_not_pass FROM df1 +WHERE (value_source_concept_id=1586142 AND value_as_concept_id !=45879439) +UNION ALL +SELECT '1586143' as issue,COUNT (*) AS n_row_not_pass FROM df1 +WHERE (value_source_concept_id=1586143 AND value_as_concept_id !=1586143) +UNION ALL +SELECT '1586146' as issue,COUNT (*) AS n_row_not_pass FROM df1 +WHERE (value_source_concept_id=1586146 AND value_as_concept_id !=45877987) +UNION ALL +SELECT '1586147' as issue,COUNT (*) AS n_row_not_pass FROM df1 +WHERE (value_source_concept_id=1586147 AND value_as_concept_id !=1586147) +UNION ALL +SELECT '1586148' as issue,COUNT (*) AS n_row_not_pass FROM df1 +WHERE (value_source_concept_id=1586148 AND value_as_concept_id !=45882607) +UNION ALL +SELECT '903079' as issue,COUNT (*) AS n_row_not_pass FROM df1 +WHERE (value_source_concept_id=903079 AND value_as_concept_id !=1177221) + """) q = query.render(project_id=project_id, @@ -232,14 +244,14 @@ ignore_index=True) df1 -# # 2.3 Account for multiple SELECTions for sexual orientation (DC-859) +# # 2.3 Account for multiple selections for sexual orientation (DC-859) # # -# Verify that if a person hAS multiple SELECTion for TheBasics_SexualOrientation in pre-deid_com_cdr dataset, then the the value_source_concept_id field in OBSERVATION table populates: 2000000003, for those person_id in deid dataset +# Verify that if a person hAS multiple selection for TheBasics_SexualOrientation in pre-deid_com_cdr dataset, then the the value_source_concept_id field in OBSERVATION table populates: 2000000003, for those person_id in deid dataset # -# Account for multiple SELECTions for sexual orientation (DC-859) +# Account for multiple selections for sexual orientation (DC-859) # -# 1. Find person_ids that have more than 1 sexual_orientation SELECTions in the non-deid datasets (494038847, 326626269,353533275, 697092658,887791634,895181663) +# 1. 
Find person_ids that have more than 1 sexual_orientation selections in the non-deid datasets (494038847, 326626269,353533275, 697092658,887791634,895181663) # # # 2 Find the person_id of those in the map tables (1872508, 1303111, 2051219, 1488177, 1278442, 1159723) @@ -263,9 +275,10 @@ WHERE value_as_concept_id =2000000003 ) -SELECT COUNT (distinct person_id) AS n_PERSON_ID_not_pass FROM df1 -WHERE countp >1 -AND person_id NOT IN (SELECT distinct person_id FROM df2 ) +SELECT COUNT(DISTINCT person_id) AS n_person_id_not_pass +FROM `{{project_id}}.{{deid_cdr}}.person` +WHERE person_id IN (SELECT person_id FROM df1 WHERE countp >1) +AND person_id NOT IN (SELECT person_id FROM df2) """) q = query.render(project_id=project_id, @@ -293,7 +306,7 @@ ignore_index=True) df1 -# # 3 GR_03 Gender Generalization Rule +# # 3.1 GR_03 Gender Generalization Rule # # Verify that the field identified for de-identification action in OBSERVATION table follow the Gender Generalization Rule for the de-id table. # @@ -354,7 +367,7 @@ df = df.append( { 'query': - 'Query3 GR03 Gender_value_source_concept_id matched value_as_concept_id in observation', + 'Query3.1 GR03 Gender_value_source_concept_id matched value_as_concept_id in observation', 'result': 'PASS' }, @@ -363,14 +376,14 @@ df = df.append( { 'query': - 'Query3 GR03 Gender_value_source_concept_id matched value_as_concept_id in observation', + 'Query3.1 GR03 Gender_value_source_concept_id matched value_as_concept_id in observation', 'result': 'Failure' }, ignore_index=True) df1 -# # 3 Biological Sex Generalization Rule +# # 3.2 Biological Sex Generalization Rule # # Verify that the field identified for de-identification action in OBSERVATION table follow the Biological Sex Generalization Rule for the de-id table. # @@ -415,15 +428,19 @@ if df1.eq(0).any().any(): df = df.append( { - 'query': 'Query3 Biological Sex Generalization Rule in observation', - 'result': 'PASS' + 'query': + 'Query3.2 Biological Sex Generalization Rule in observation', + 'result': + 'PASS' }, ignore_index=True) else: df = df.append( { - 'query': 'Query3 Biological Sex Generalization Rule in observation', - 'result': 'Failure' + 'query': + 'Query3.2 Biological Sex Generalization Rule in observation', + 'result': + 'Failure' }, ignore_index=True) df1 @@ -560,24 +577,29 @@ # - Verify that if the value_source_concept_id in OBSERVATION table populates: 903079, # the value_as_concept_id field in de-id table populates 1177221 # -# - Verify that if the value_source_concept_id in OBSERVATION table populates: 903096, -# the value_as_concept_id field in de-id table populates 903096 query = JINJA_ENV.from_string(""" WITH df1 AS ( -SELECT distinct value_source_concept_id,value_as_concept_id +SELECT value_source_concept_id,value_as_concept_id FROM `{{project_id}}.{{deid_cdr}}.observation` -WHERE value_source_concept_id IN (2000000007, 2000000006,1585945,1585946,903079,903096) +WHERE value_source_concept_id IN (2000000007, 2000000006,1585945,1585946,903079) ) -SELECT COUNT (*) AS n_row_not_pass FROM df1 +SELECT '2000000007' as issue, COUNT (*) AS n_row_not_pass FROM df1 WHERE (value_source_concept_id=2000000007 AND value_as_concept_id !=2000000007) -OR (value_source_concept_id=2000000006 AND value_as_concept_id !=2000000006) -OR (value_source_concept_id=1585945 AND value_as_concept_id !=43021808) -OR (value_source_concept_id=1585946 AND value_as_concept_id !=4260980) -OR (value_source_concept_id=903096 AND value_as_concept_id !=903096) -OR (value_source_concept_id=903079 AND value_as_concept_id 
!=1177221) +UNION ALL +SELECT '2000000006' as issue,COUNT (*) AS n_row_not_pass FROM df1 +WHERE(value_source_concept_id=2000000006 AND value_as_concept_id !=2000000006) +UNION ALL +SELECT '1585945' as issue,COUNT (*) AS n_row_not_pass FROM df1 +WHERE(value_source_concept_id=1585945 AND value_as_concept_id !=43021808) +UNION ALL +SELECT '1585946' as issue,COUNT (*) AS n_row_not_pass FROM df1 +WHERE(value_source_concept_id=1585946 AND value_as_concept_id !=4260980) +UNION ALL +SELECT '903079' as issue,COUNT (*) AS n_row_not_pass FROM df1 +WHERE (value_source_concept_id=903079 AND value_as_concept_id !=1177221) """) q = query.render(project_id=project_id, @@ -618,24 +640,25 @@ # the value_as_concept_id field in de-id table populates 2000000004 " # - Verify that if the value_source_concept_id in OBSERVATION table populates: 903079, # the value_as_concept_id field in de-id table populates 1177221 -# - Verify that if the value_source_concept_id in OBSERVATION table populates: 903096, -# the value_as_concept_id field in de-id table populates 903096 +# # query = JINJA_ENV.from_string(""" WITH df1 AS ( -SELECT distinct value_source_concept_id,value_as_concept_id +SELECT value_source_concept_id,value_as_concept_id FROM `{{project_id}}.{{deid_cdr}}.observation` -WHERE value_source_concept_id IN (2000000005, 2000000004,903079,903096) +WHERE value_source_concept_id IN (2000000005, 2000000004,903079) ) -SELECT COUNT (*) AS n_row_not_pass FROM df1 +SELECT '2000000005' as issue, COUNT (*) AS n_row_not_pass FROM df1 WHERE (value_source_concept_id=2000000005 AND value_as_concept_id !=2000000005) -OR (value_source_concept_id=2000000004 AND value_as_concept_id !=2000000004) -OR (value_source_concept_id=903079 AND value_as_concept_id !=1177221) -OR (value_source_concept_id=903096 AND value_as_concept_id !=903096) - +UNION ALL +SELECT '2000000004' as issue, COUNT (*) AS n_row_not_pass FROM df1 +WHERE (value_source_concept_id=2000000004 AND value_as_concept_id !=2000000004) +UNION ALL +SELECT '903079' as issue, COUNT (*) AS n_row_not_pass FROM df1 +WHERE (value_source_concept_id=903079 AND value_as_concept_id !=1177221) """) q = query.render(project_id=project_id, @@ -703,6 +726,39 @@ # - +# # 7 PMI_Skip records +# All the PMI_Skip records (value_source_concept_id = 903096) must have either +# 903096 or 2000000010 as value_as_concept_id. +# +# See JIRA ticket DC-3494 for more context. 
+ +# + +query = JINJA_ENV.from_string(""" +SELECT COUNT(*) AS n_row_not_pass +FROM `{{project_id}}.{{deid_cdr}}.observation` +WHERE value_source_concept_id = 903096 +AND value_as_concept_id NOT IN (903096, 2000000010) +""").render(project_id=project_id, deid_cdr=deid_cdr) +df1 = execute(client, query) + +if df1.loc[0].sum() == 0: + df = df.append( + { + 'query': 'Query7 PMI_Skip mapping post deid', + 'result': 'PASS' + }, + ignore_index=True) +else: + df = df.append( + { + 'query': 'Query7 PMI_Skip mapping post deid', + 'result': 'Failure' + }, + ignore_index=True) +df1 + +# - + # # Summary_cdr_deid_Generalization_rule @@ -713,3 +769,4 @@ def highlight_cells(val): df.style.applymap(highlight_cells).set_properties(**{'text-align': 'left'}) +# - diff --git a/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report2_row_suppression.py b/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report2_row_suppression.py index cdaee165a4..b82405b1d0 100644 --- a/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report2_row_suppression.py +++ b/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report2_row_suppression.py @@ -76,8 +76,8 @@ df1 # # 2 Verify all fields identified for suppression in the OBSERVATION table have been removed from the table in the deid dataset. - -# ## error in new cdr +# +# If there are issues view them in more detail using query 2.1 query = JINJA_ENV.from_string(""" WITH df1 AS ( @@ -114,6 +114,8 @@ ignore_index = True) df1 +# ## 2.1 Detailed view of issues found in check 2 + # + query = JINJA_ENV.from_string(""" WITH df1 AS ( @@ -143,8 +145,8 @@ # - # # 3 Verify all fields identified for suppression in the OBSERVATION table have been removed from the table in the deid dataset. - -# ## error in new cdr +# +# If there are issues view them in more detail using query 3.1 query = JINJA_ENV.from_string(""" WITH df1 AS ( @@ -174,6 +176,8 @@ ignore_index = True) df1 +# ## 3.1 Detailed view of issues found in check 3 + # + query = JINJA_ENV.from_string(""" WITH df1 AS ( @@ -231,8 +235,8 @@ df1 # # 5 Verify all fields identified for suppression in the OBSERVATION table have been removed from the table in the deid dataset. - -# ## error in new cdr +# +# If there are issues view them in more detail using query 5.1 query = JINJA_ENV.from_string(""" WITH df1 AS ( @@ -262,15 +266,18 @@ ignore_index = True) df1 +# ## 5.1 Detailed view of issues found in check 5 + # + query = JINJA_ENV.from_string(""" -WITH df1 AS ( +WITH df1 AS ( SELECT observation_id FROM `{{project_id}}.{{com_cdr}}.observation` WHERE observation_source_value LIKE '%Gender%' OR observation_source_value LIKE '%Sexuality%' OR observation_source_value LIKE '%SexAtBirthNoneOfThese_%' ) + SELECT distinct observation_source_value,value_source_value, value_as_string FROM `{{project_id}}.{{deid_cdr}}.observation` WHERE observation_id IN (SELECT observation_id FROM df1) @@ -464,8 +471,8 @@ df1 # # 12 Verify all fields identified for suppression in the OBSERVATION table have been removed from the table in the deid dataset. 
- -# ## error in new cdr +# +# If there are issues view them in more detail using query 12.1 query = JINJA_ENV.from_string(""" WITH df1 AS ( @@ -498,6 +505,8 @@ ignore_index = True) df1 +# ## 12.1 Detailed view of issues found in check 12 + # + query = JINJA_ENV.from_string(""" WITH df1 AS ( diff --git a/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report4_dateshift.py b/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report4_dateshift.py index 3ea73b256d..71e02f81bf 100644 --- a/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report4_dateshift.py +++ b/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report4_dateshift.py @@ -524,7 +524,7 @@ query = JINJA_ENV.from_string(""" WITH df1 AS ( -SELECT COUNT (*) AS n_row_not_pass +SELECT 'observation' as table, COUNT (*) AS n_row_not_pass FROM `{{project_id}}.{{com_cdr}}.observation` non_deid JOIN `{{project_id}}.{{pipeline}}.pid_rid_mapping` m ON m.person_id=non_deid.person_id @@ -533,7 +533,7 @@ ), df2 AS ( -SELECT COUNT (*) AS n_row_not_pass +SELECT 'measurement' as table, COUNT (*) AS n_row_not_pass FROM `{{project_id}}.{{com_cdr}}.measurement` non_deid JOIN `{{project_id}}.{{pipeline}}.pid_rid_mapping` m ON m.person_id=non_deid.person_id @@ -542,7 +542,7 @@ ), df3 AS ( -SELECT COUNT (*) AS n_row_not_pass +SELECT 'condition_occurrence' as table, COUNT (*) AS n_row_not_pass FROM `{{project_id}}.{{com_cdr}}.condition_occurrence` non_deid JOIN `{{project_id}}.{{pipeline}}.pid_rid_mapping` m ON m.person_id=non_deid.person_id @@ -551,7 +551,7 @@ ), df4 AS ( -SELECT COUNT (*) AS n_row_not_pass +SELECT 'drug_exposure' as table, COUNT (*) AS n_row_not_pass FROM `{{project_id}}.{{com_cdr}}.drug_exposure` non_deid JOIN `{{project_id}}.{{pipeline}}.pid_rid_mapping` m ON m.person_id=non_deid.person_id @@ -560,7 +560,7 @@ ), df5 AS ( -SELECT COUNT (*) AS n_row_not_pass +SELECT 'device_exposure' as table, COUNT (*) AS n_row_not_pass FROM `{{project_id}}.{{com_cdr}}.device_exposure` non_deid JOIN `{{project_id}}.{{pipeline}}.pid_rid_mapping` m ON m.person_id=non_deid.person_id @@ -569,7 +569,7 @@ ), df6 AS ( -SELECT COUNT (*) AS n_row_not_pass +SELECT 'procedure_occurrence' as table, COUNT (*) AS n_row_not_pass FROM `{{project_id}}.{{com_cdr}}.procedure_occurrence` non_deid JOIN `{{project_id}}.{{pipeline}}.pid_rid_mapping` m ON m.person_id=non_deid.person_id @@ -578,7 +578,7 @@ ), df7 AS ( -SELECT COUNT (*) AS n_row_not_pass +SELECT 'visit_occurrence' as table, COUNT (*) AS n_row_not_pass FROM `{{project_id}}.{{com_cdr}}.visit_occurrence` non_deid JOIN `{{project_id}}.{{pipeline}}.pid_rid_mapping` m ON m.person_id=non_deid.person_id @@ -587,7 +587,7 @@ ), df8 AS ( -SELECT COUNT (*) AS n_row_not_pass +SELECT 'specimen' as table, COUNT (*) AS n_row_not_pass FROM `{{project_id}}.{{com_cdr}}.specimen` non_deid JOIN `{{project_id}}.{{pipeline}}.pid_rid_mapping` m ON m.person_id=non_deid.person_id @@ -597,15 +597,13 @@ SELECT * FROM df1 -JOIN df2 USING(n_row_not_pass) -JOIN df3 USING(n_row_not_pass) -JOIN df4 USING(n_row_not_pass) -JOIN df5 USING(n_row_not_pass) -JOIN df6 USING(n_row_not_pass) -JOIN df7 USING(n_row_not_pass) -JOIN df8 USING(n_row_not_pass) - - +UNION ALL SELECT * FROM df2 +UNION ALL SELECT * FROM df3 +UNION ALL SELECT * FROM df4 +UNION ALL SELECT * FROM df5 +UNION ALL SELECT * FROM df6 +UNION ALL SELECT * FROM df7 +UNION ALL SELECT * FROM df8 """) q = query.render(project_id=project_id, @@ -613,7 +611,7 @@ com_cdr=com_cdr, deid_cdr=deid_cdr) df1 = execute(client, q) -if df1.eq(0).any().any(): +if 
df1['n_row_not_pass'].sum() == 0: df = df.append( { 'query': @@ -704,4 +702,3 @@ def highlight_cells(val): df.style.applymap(highlight_cells).set_properties(**{'text-align': 'left'}) -# - diff --git a/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report6_fitdata.py b/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report6_fitdata.py index 4249040f60..98a42d6ebb 100644 --- a/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report6_fitdata.py +++ b/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report6_fitdata.py @@ -37,7 +37,7 @@ date_columns = { 'activity_summary': 'date', 'heart_rate_summary': 'date', - 'heart_rate_minute_level': 'datetime', + 'heart_rate_intraday': 'datetime', 'steps_intraday': 'datetime', 'sleep_level': 'sleep_date', 'sleep_daily_summary': 'sleep_date', @@ -149,20 +149,20 @@ if sum(result['bad_rows']) == 0: summary = summary.append({ - 'query': 'Date Shift Query', + 'query': 'Max Age Query', 'result': 'PASS' }, ignore_index=True) else: summary = summary.append({ - 'query': 'Date Shift Query', + 'query': 'Max Age Query', 'result': 'Failure' }, ignore_index=True) result # - -# # Verify that correct date shift is applied to the fitbit data +# # Verify that correct date shift is applied to the RT fitbit data # # DC-1005 # @@ -172,7 +172,9 @@ # # **Note: Should a failure occur during this (long) query, it is advisable to replace `FITBIT_TABLES` with the table in question** # -# [DC-1786] date shifting should be checked against activity_summary, heart_rate_summary, heart_rate_minute_level, and steps_intraday. +# [DC-1786] date shifting should be checked against activity_summary, heart_rate_summary, heart_rate_intraday, and steps_intraday. +# +# Reminder: Date shifting only occurs in RT. # + query = JINJA_ENV.from_string(""" @@ -272,7 +274,7 @@ # # [DC-1788] Add additional person existence check to Fitbit notebook # -# This check should fail if a person_id in the activity_summary, heart_rate_summary, heart_rate_minute_level, or steps_intra_day tables does not exist in a corresponding RT de-identified dataset. +# This check should fail if a person_id in the activity_summary, heart_rate_summary, heart_rate_intraday, or steps_intra_day tables does not exist in a corresponding RT de-identified dataset. # + query = JINJA_ENV.from_string(""" @@ -338,10 +340,10 @@ OR device_id IS NULL ), check_uuid_unique AS ( -SELECT DISTINCT device_id +SELECT device_id FROM `{{project_id}}.{{deid_cdr_fitbit}}.device` -GROUP BY person_id, device_id -HAVING COUNT(device_id) > 1 +GROUP BY device_id +HAVING COUNT(DISTINCT person_id) > 1 ) SELECT 'not_research_device_ids' as issue, COUNT(*) as bad_rows FROM not_research_device_ids @@ -361,19 +363,15 @@ if sum(result['bad_rows']) == 0: summary = summary.append( { - 'query': - 'Query7 device_id was deidentified properly for all records.', - 'result': - 'PASS' + 'query': 'device_id Deidentification Query', + 'result': 'PASS' }, ignore_index=True) else: summary = summary.append( { - 'query': - 'Query7 device_id was not deidentified properly. 
See query description for hints.', - 'result': - 'Failure' + 'query': 'device_id Deidentification Query', + 'result': 'Failure' }, ignore_index=True) result @@ -445,14 +443,14 @@ if sum(result['bad_src_id_match_rows']) == 0: summary = summary.append( { - 'query': 'Query8 Check de-identification of src_ids.', + 'query': 'src_id Deidentification Query', 'result': 'PASS' }, ignore_index=True) else: summary = summary.append( { - 'query': 'Query8 Check de-identification of src_ids.', + 'query': 'src_id Deidentification Query', 'result': 'Failure' }, ignore_index=True) diff --git a/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report7_cope_survey.py b/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report7_cope_survey.py index c5756d50c7..f834cb5fcc 100644 --- a/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report7_cope_survey.py +++ b/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report7_cope_survey.py @@ -59,7 +59,7 @@ # these concept_ids should be suppressed query = JINJA_ENV.from_string(""" select OMOP_conceptID,New_Requirement -from `{{project_id}}.{{sandbox}}.temp_cope_privacy_rules` +from `{{project_id}}.curation_sandbox.temp_cope_privacy_rules` where New_Requirement like 'suppress%' or New_Requirement like 'row suppression' """) q = query.render(project_id=project_id,sandbox=sandbox) @@ -77,10 +77,10 @@ JOIN `{{project_id}}.{{deid_cdr}}.concept` c ON ob.observation_source_concept_id=c.concept_id WHERE observation_source_concept_id IN -(select OMOP_conceptID from `{{project_id}}.{{sandbox}}.temp_cope_privacy_rules` +(select OMOP_conceptID from `{{project_id}}.curation_sandbox.temp_cope_privacy_rules` where New_Requirement like 'suppress%' or New_Requirement like 'row suppression') OR observation_concept_id IN -(select OMOP_conceptID from `{{project_id}}.{{sandbox}}.temp_cope_privacy_rules` +(select OMOP_conceptID from `{{project_id}}.curation_sandbox.temp_cope_privacy_rules` where New_Requirement like 'suppress%' or New_Requirement like 'row suppression') GROUP BY 1,2,3,4,5 ORDER BY n_row_not_pass DESC @@ -381,6 +381,7 @@ def my_sql(table_name,column_name): # use a loop to get table name AND column name AND run sql function result = [my_sql (table_name, column_name) for table_name, column_name in zip(target_tables['table_name'], target_tables['column_name'])] result +# if Row_count is '0' in "Combined" dataset as well, '0' showing up in this check is not a problem # + # AND then get the result back FROM loop result list diff --git a/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report8_household_state_genera.py b/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report8_household_state_genera.py index 3a9fb84421..2824f4b217 100644 --- a/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report8_household_state_genera.py +++ b/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report8_household_state_genera.py @@ -17,7 +17,9 @@ import urllib import pandas as pd -from common import JINJA_ENV +from common import (CONDITION_OCCURRENCE, DEVICE_EXPOSURE, DRUG_EXPOSURE, + JINJA_ENV, MEASUREMENT, OBSERVATION, PROCEDURE_OCCURRENCE, + VISIT_OCCURRENCE) from utils import auth from gcloud.bq import BigQueryClient from analytics.cdr_ops.notebook_utils import execute, IMPERSONATION_SCOPES @@ -25,13 +27,12 @@ # + tags=["parameters"] project_id = "" -deid_cdr="" +deid_cdr = "" combine = "" -reg_combine='' +reg_combine = '' pipeline = "" deid_sand = "" -pid_threshold="" -run_as="" +run_as = "" # + impersonation_creds = auth.get_impersonation_credentials( @@ -41,7 
+42,7 @@ # - # df will have a summary in the end -df = pd.DataFrame(columns = ['query', 'result']) +df = pd.DataFrame(columns=['query', 'result']) # # Query1 Verify that if the observation_source_concept_id field in OBSERVATION table populates: 1585890, the value_as_concept_id field in de-id table should populate : 2000000012 # @@ -49,14 +50,14 @@ # # Expected result: # -# Null is the value poplulated in the value_as_number fields +# Null is the value populated in the value_as_number fields # # AND 2000000012, 2000000010 AND 903096 are the values that are populated in value_as_concept_id field in the deid table. # -# Per Francis, the other two values are valid. so it is pass. +# Per Francis, the other two values are valid, so it is a pass. # + -query=JINJA_ENV.from_string(""" +query = JINJA_ENV.from_string(""" SELECT COUNT (*) AS n_row_not_pass FROM `{{project_id}}.{{deid_cdr}}.observation` @@ -64,15 +65,23 @@ observation_source_concept_id = 1585890 AND value_as_concept_id NOT IN (2000000012,2000000010,903096) """) -q = query.render(project_id=project_id,deid_cdr=deid_cdr) -df1=execute(client, q) - -if df1.loc[0].sum()==0: - df = df.append({'query' : 'Query1 observation_source_concept_id 1585890', 'result' : 'PASS'}, - ignore_index = True) +q = query.render(project_id=project_id, deid_cdr=deid_cdr) +df1 = execute(client, q) + +if df1.loc[0].sum() == 0: + df = df.append( + { + 'query': 'Query1 observation_source_concept_id 1585890', + 'result': 'PASS' + }, + ignore_index=True) else: - df = df.append({'query' : 'Query1 observation_source_concept_id 1585890', 'result' : 'Failure'}, - ignore_index = True) + df = df.append( + { + 'query': 'Query1 observation_source_concept_id 1585890', + 'result': 'Failure' + }, + ignore_index=True) df1 # - @@ -87,7 +96,7 @@ # ## one row had error in new cdr # + -query=JINJA_ENV.from_string(""" +query = JINJA_ENV.from_string(""" SELECT COUNT (*) AS n_row_not_pass FROM `{{project_id}}.{{deid_cdr}}.observation` @@ -95,19 +104,27 @@ observation_source_concept_id = 1333023 AND value_as_concept_id NOT IN (2000000012,2000000010,903096) """) -q = query.render(project_id=project_id,deid_cdr=deid_cdr) -df1=execute(client, q) - -if df1.loc[0].sum()==0: - df = df.append({'query' : 'Query2 observation_source_concept_id 1333023', 'result' : 'PASS'}, - ignore_index = True) +q = query.render(project_id=project_id, deid_cdr=deid_cdr) +df1 = execute(client, q) + +if df1.loc[0].sum() == 0: + df = df.append( + { + 'query': 'Query2 observation_source_concept_id 1333023', + 'result': 'PASS' + }, + ignore_index=True) else: - df = df.append({'query' : 'Query2 observation_source_concept_id 1333023', 'result' : 'Failure'}, - ignore_index = True) + df = df.append( + { + 'query': 'Query2 observation_source_concept_id 1333023', + 'result': 'Failure' + }, + ignore_index=True) df1 # - -query=JINJA_ENV.from_string(""" +query = JINJA_ENV.from_string(""" SELECT * FROM `{{project_id}}.{{deid_cdr}}.observation` @@ -115,8 +132,8 @@ observation_source_concept_id = 1333023 AND value_as_concept_id NOT IN (2000000012,2000000010,903096) """) -q = query.render(project_id=project_id,deid_cdr=deid_cdr) -df1=execute(client, q) +q = query.render(project_id=project_id, deid_cdr=deid_cdr) +df1 = execute(client, q) df1 # # Query3 Verify that if the observation_source_concept_id field in OBSERVATION table populates: 1585889, the value_as_concept_id field in de-id table should populate : 2000000013 @@ -125,44 +142,52 @@ # # expected results: # -# Null 
is the value poplulated in the value_as_number fields +# Null is the value populated in the value_as_number fields # # AND 2000000013, 2000000010 AND 903096 are the values that are populated in value_as_concept_id field in the deid table. # + -query=JINJA_ENV.from_string(""" +query = JINJA_ENV.from_string(""" SELECT COUNT (*) AS n_row_not_pass FROM `{{project_id}}.{{deid_cdr}}.observation` WHERE observation_source_concept_id = 1585889 AND value_as_concept_id NOT IN (2000000013,2000000010,903096) """) -q = query.render(project_id=project_id,deid_cdr=deid_cdr) -df1=execute(client, q) - -if df1.loc[0].sum()==0: - df = df.append({'query' : 'Query3 observation_source_concept_id 1585889', 'result' : 'PASS'}, - ignore_index = True) +q = query.render(project_id=project_id, deid_cdr=deid_cdr) +df1 = execute(client, q) + +if df1.loc[0].sum() == 0: + df = df.append( + { + 'query': 'Query3 observation_source_concept_id 1585889', + 'result': 'PASS' + }, + ignore_index=True) else: - df = df.append({'query' : 'Query3 observation_source_concept_id 1585889', 'result' : 'Failure'}, - ignore_index = True) + df = df.append( + { + 'query': 'Query3 observation_source_concept_id 1585889', + 'result': 'Failure' + }, + ignore_index=True) df1 # - # # Query4 Verify that if the observation_source_concept_id field in OBSERVATION table populates: 1333015, the value_as_concept_id field in de-id table should populate : 2000000013 # -# Generalization Rules for reference +# Generalization Rules for reference # # Living Situation: COPE survey Generalize household size >10 # # expected results: # -# Null is the value poplulated in the value_as_number fields +# Null is the value populated in the value_as_number fields # # AND 2000000013, 2000000010 AND 903096 are the values that are populated in value_as_concept_id field in the deid table. # + -query=JINJA_ENV.from_string(""" +query = JINJA_ENV.from_string(""" SELECT COUNT (*) AS n_row_not_pass FROM `{{project_id}}.{{deid_cdr}}.observation` @@ -170,127 +195,101 @@ observation_source_concept_id = 1333015 AND value_as_concept_id NOT IN (2000000013,2000000010,903096) """) -q = query.render(project_id=project_id,deid_cdr=deid_cdr) -df1=execute(client, q) - -if df1.loc[0].sum()==0: - df = df.append({'query' : 'Query4 observation_source_concept_id 1333015', 'result' : 'PASS'}, - ignore_index = True) +q = query.render(project_id=project_id, deid_cdr=deid_cdr) +df1 = execute(client, q) + +if df1.loc[0].sum() == 0: + df = df.append( + { + 'query': 'Query4 observation_source_concept_id 1333015', + 'result': 'PASS' + }, + ignore_index=True) else: - df = df.append({'query' : 'Query4 observation_source_concept_id 1333015', 'result' : 'Failure'}, - ignore_index = True) + df = df.append( + { + 'query': 'Query4 observation_source_concept_id 1333015', + 'result': 'Failure' + }, + ignore_index=True) df1 # - -# # Query5 update to verifie that value_as_concept_id and value_source_concept_id are set to 2000000011 for states with less than 200 participants. +# # Query5 State generalization cleaning rules +# `ConflictingHpoStateGeneralize` and `GeneralizeStateByPopulation` update participants' state records in observation to +# value_source_concept_id = 2000000011 and value_as_concept_id = 2000000011 based on their criteria. +# Query5.1 and Query5.2 check if the state generalization is working as expected. 
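A minimal sketch of the Jinja loop pattern the rewritten Query5.1 below uses to build participant_hpo_sites, replacing seven hand-copied UNION blocks with one template. It assumes the repo's JINJA_ENV behaves like a stock jinja2.Environment; the dataset and table names here are illustrative only.

from jinja2 import Environment

# Stand-in for common.JINJA_ENV (assumed to be a standard jinja2 Environment).
tpl = Environment().from_string('''
{% for table in tables %}
SELECT DISTINCT person_id
FROM `{{dataset}}.{{table}}`
{% if not loop.last -%} UNION DISTINCT {% endif %}
{% endfor %}''')

# Renders one SELECT per domain table, chained with UNION DISTINCT; adding or
# removing a table means editing the list, not copy-pasting another SQL block.
print(tpl.render(dataset='example_dataset', tables=['observation', 'measurement']))

The real render call drives the same loop with the CDM table constants (CONDITION_OCCURRENCE, OBSERVATION, etc.) imported at the top of the notebook.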
# -# Set the value_source_concept_id = 2000000011 and value_as_concept_id =2000000011 -# -# DC-2377 and DC-1614, DC-2782, DC-2785 +# Related tickets: DC-2377, DC-1614, DC-2782, DC-2785, DC-3504 # ## Query5.1 Generalize state info (2000000011) for participants who have EHR data from states other than the state they are currently living in. +# The CR `ConflictingHpoStateGeneralize` takes care of this criteria. Look at the CR's sandbox tables and related logics if this QC fails. # + query = JINJA_ENV.from_string(""" -with df_ehr_site as ( -SELECT distinct com.person_id, research_id deid_pid, map_state.State EHR_site_state,'observation' table -FROM `{{project_id}}.{{reg_combine}}.observation` com -join `{{project_id}}.{{reg_combine}}._mapping_observation` map on map.observation_id=com.observation_id -JOIN `{{project_id}}.{{deid_sand}}._deid_map` deid_map on deid_map.person_id=com.person_id -JOIN `{{project_id}}.{{reg_combine}}._mapping_src_hpos_to_allowed_states` map_state ON map_state.src_hpo_id=map.src_hpo_id -JOIN `{{project_id}}.{{pipeline}}.site_maskings` mask ON hpo_id=map_state.src_hpo_id - -union distinct - -SELECT distinct com.person_id, research_id deid_pid, map_state.State EHR_site_state,'condition' table -FROM `{{project_id}}.{{reg_combine}}.condition_occurrence` com -join `{{project_id}}.{{reg_combine}}._mapping_condition_occurrence` map on map.condition_occurrence_id=com.condition_occurrence_id -JOIN `{{project_id}}.{{deid_sand}}._deid_map` deid_map on deid_map.person_id=com.person_id -JOIN `{{project_id}}.{{reg_combine}}._mapping_src_hpos_to_allowed_states` map_state ON map_state.src_hpo_id=map.src_hpo_id -JOIN `{{project_id}}.{{pipeline}}.site_maskings` mask ON hpo_id=map_state.src_hpo_id - -union distinct - -SELECT distinct com.person_id, research_id deid_pid, map_state.State EHR_site_state,'measurement' table -FROM `{{project_id}}.{{reg_combine}}.measurement` com -join `{{project_id}}.{{reg_combine}}._mapping_measurement` map on map.measurement_id=com.measurement_id -JOIN `{{project_id}}.{{deid_sand}}._deid_map` deid_map on deid_map.person_id=com.person_id -JOIN `{{project_id}}.{{reg_combine}}._mapping_src_hpos_to_allowed_states` map_state ON map_state.src_hpo_id=map.src_hpo_id -JOIN `{{project_id}}.{{pipeline}}.site_maskings` mask ON hpo_id=map_state.src_hpo_id - -union distinct - -SELECT distinct com.person_id, research_id deid_pid, map_state.State EHR_site_state,'device_exposure' table -FROM `{{project_id}}.{{reg_combine}}.device_exposure` com -join `{{project_id}}.{{reg_combine}}._mapping_device_exposure` map on map.device_exposure_id=com.device_exposure_id -JOIN `{{project_id}}.{{deid_sand}}._deid_map` deid_map on deid_map.person_id=com.person_id -JOIN `{{project_id}}.{{reg_combine}}._mapping_src_hpos_to_allowed_states` map_state ON map_state.src_hpo_id=map.src_hpo_id -JOIN `{{project_id}}.{{pipeline}}.site_maskings` mask ON hpo_id=map_state.src_hpo_id - -union distinct - -SELECT distinct com.person_id, research_id deid_pid, map_state.State EHR_site_state,'drug_exposure' table -FROM `{{project_id}}.{{reg_combine}}.drug_exposure` com -join `{{project_id}}.{{reg_combine}}._mapping_drug_exposure` map on map.drug_exposure_id=com.drug_exposure_id -JOIN `{{project_id}}.{{deid_sand}}._deid_map` deid_map on deid_map.person_id=com.person_id -JOIN `{{project_id}}.{{reg_combine}}._mapping_src_hpos_to_allowed_states` map_state ON map_state.src_hpo_id=map.src_hpo_id -JOIN `{{project_id}}.{{pipeline}}.site_maskings` mask ON hpo_id=map_state.src_hpo_id - -union distinct - -SELECT 
distinct com.person_id, research_id deid_pid, map_state.State EHR_site_state,'procedure' table -FROM `{{project_id}}.{{reg_combine}}.procedure_occurrence` com -join `{{project_id}}.{{reg_combine}}._mapping_procedure_occurrence` map on map.procedure_occurrence_id=com.procedure_occurrence_id -JOIN `{{project_id}}.{{deid_sand}}._deid_map` deid_map on deid_map.person_id=com.person_id -JOIN `{{project_id}}.{{reg_combine}}._mapping_src_hpos_to_allowed_states` map_state ON map_state.src_hpo_id=map.src_hpo_id -JOIN `{{project_id}}.{{pipeline}}.site_maskings` mask ON hpo_id=map_state.src_hpo_id - -union distinct - -SELECT distinct com.person_id, research_id deid_pid, map_state.State EHR_site_state,'visit' table -FROM `{{project_id}}.{{reg_combine}}.visit_occurrence` com -join `{{project_id}}.{{reg_combine}}._mapping_visit_occurrence` map on map.visit_occurrence_id=com.visit_occurrence_id -JOIN `{{project_id}}.{{deid_sand}}._deid_map` deid_map on deid_map.person_id=com.person_id -JOIN `{{project_id}}.{{reg_combine}}._mapping_src_hpos_to_allowed_states` map_state ON map_state.src_hpo_id=map.src_hpo_id -JOIN `{{project_id}}.{{pipeline}}.site_maskings` mask ON hpo_id=map_state.src_hpo_id +with participant_hpo_sites as ( + +{% for table in tables %} +SELECT DISTINCT deid.person_id, mask.value_source_concept_id +FROM `{{project_id}}.{{deid_cdr}}.{{table}}` deid +JOIN `{{project_id}}.{{deid_cdr}}.{{table}}_ext` ext ON deid.{{table}}_id = ext.{{table}}_id +JOIN `{{project_id}}.{{pipeline}}.site_maskings` mask ON ext.src_id = mask.src_id + +{% if not loop.last -%} UNION DISTINCT {% endif %} + +{% endfor %} ), df2 as ( -SELECT distinct deid.person_id, -com.value_source_concept_id, -com.value_source_value com_residency_state, EHR_site_state +SELECT DISTINCT deid.person_id FROM `{{project_id}}.{{deid_cdr}}.observation` deid -join `{{project_id}}.{{reg_combine}}.observation` com on com.observation_id=deid.observation_id -join df_ehr_site on deid.person_id=df_ehr_site.deid_pid -where deid.observation_source_concept_id = 1585249 -and deid.value_source_concept_id !=2000000011 -and com.value_source_value !=EHR_site_state +JOIN participant_hpo_sites hpo ON deid.person_id = hpo.person_id +WHERE deid.observation_source_concept_id = 1585249 +AND deid.value_source_concept_id != hpo.value_source_concept_id +AND (deid.value_source_concept_id != 2000000011 OR + deid.value_as_concept_id != 2000000011 + ) ) select count (*) AS row_counts_failure_state_generalization from df2 -""") - -q = query.render(project_id=project_id,deid_cdr=deid_cdr,reg_combine=reg_combine,deid_sand=deid_sand,pipeline=pipeline) - -df1=execute(client, q) - -if df1.loc[0].sum()==0: - df = df.append({'query' : 'Query5.1 state generalization to 2000000011', 'result' : 'PASS'}, - ignore_index = True) +""").render(project_id=project_id, + deid_cdr=deid_cdr, + reg_combine=reg_combine, + deid_sand=deid_sand, + pipeline=pipeline, + tables=[ + CONDITION_OCCURRENCE, DEVICE_EXPOSURE, DRUG_EXPOSURE, + MEASUREMENT, OBSERVATION, PROCEDURE_OCCURRENCE, VISIT_OCCURRENCE + ]) + +df1 = execute(client, query) + +if df1.loc[0].sum() == 0: + df = df.append( + { + 'query': 'Query5.1 state generalization to 2000000011', + 'result': 'PASS' + }, + ignore_index=True) else: - df = df.append({'query' : 'Query5.1 state generalization to 2000000011', 'result' : 'Failure'}, - ignore_index = True) + df = df.append( + { + 'query': 'Query5.1 state generalization to 2000000011', + 'result': 'Failure' + }, + ignore_index=True) df1 # - # ## Query5.2 Generalize state info for participants 
where the identified number of participants living in the state without EHR records from a different state < 200 (the generalization threshold)” +# The CR `GeneralizeStateByPopulation` takes care of this criteria. Look at the CR's sandbox tables and related logics if this QC fails. # + -query=JINJA_ENV.from_string(""" +query = JINJA_ENV.from_string(""" with df_state_pid_200 as ( select value_source_concept_id, count(distinct person_id) as participant_count @@ -306,25 +305,36 @@ and (value_source_concept_id !=2000000011 or value_as_concept_id !=2000000011) and value_source_concept_id in (select value_source_concept_id from df_state_pid_200) """) -q = query.render(project_id=project_id,deid_cdr=deid_cdr,pid_threshold=pid_threshold,reg_combine=reg_combine) -df1=execute(client, q) - -if df1.loc[0].sum()==0: - df = df.append({'query' : 'Query5.2 state generalization if counts <200', 'result' : 'PASS'}, - ignore_index = True) +q = query.render(project_id=project_id, + deid_cdr=deid_cdr, + reg_combine=reg_combine) +df1 = execute(client, q) + +if df1.loc[0].sum() == 0: + df = df.append( + { + 'query': 'Query5.2 state generalization if counts <200', + 'result': 'PASS' + }, + ignore_index=True) else: - df = df.append({'query' : 'Query5.2 state generalization if counts <200', 'result' : 'Failure'}, - ignore_index = True) + df = df.append( + { + 'query': 'Query5.2 state generalization if counts <200', + 'result': 'Failure' + }, + ignore_index=True) df1 - # - # # Summary_deid_household AND state generalization + # + def highlight_cells(val): color = 'red' if 'Failure' in val else 'white' - return f'background-color: {color}' + return f'background-color: {color}' + df.style.applymap(highlight_cells).set_properties(**{'text-align': 'left'}) diff --git a/data_steward/bq_utils.py b/data_steward/bq_utils.py index c46fb87da7..346478f001 100644 --- a/data_steward/bq_utils.py +++ b/data_steward/bq_utils.py @@ -71,6 +71,7 @@ def get_combined_snapshot_dataset_id(): return os.environ.get('COMBINED_SNAPSHOT') +@deprecated(reason='Use common.COMBINED_DATASET_ID instead') def get_combined_dataset_id(): return os.environ.get('COMBINED_DATASET_ID') diff --git a/data_steward/cdr_cleaner/clean_cdr.py b/data_steward/cdr_cleaner/clean_cdr.py index d25636afea..be59f7d3da 100644 --- a/data_steward/cdr_cleaner/clean_cdr.py +++ b/data_steward/cdr_cleaner/clean_cdr.py @@ -17,6 +17,7 @@ from cdr_cleaner.cleaning_rules.calculate_primary_death_record import CalculatePrimaryDeathRecord from cdr_cleaner.cleaning_rules.clean_by_birth_year import CleanByBirthYear from cdr_cleaner.cleaning_rules.convert_pre_post_coordinated_concepts import ConvertPrePostCoordinatedConcepts +from cdr_cleaner.cleaning_rules.create_aian_lookup import CreateAIANLookup from cdr_cleaner.cleaning_rules.create_expected_ct_list import StoreExpectedCTList from cdr_cleaner.cleaning_rules.domain_alignment import DomainAlignment import cdr_cleaner.cleaning_rules.drop_duplicate_states as drop_duplicate_states @@ -32,6 +33,7 @@ from cdr_cleaner.cleaning_rules.populate_survey_conduct_ext import PopulateSurveyConductExt from cdr_cleaner.cleaning_rules.remove_invalid_procedure_source_records import RemoveInvalidProcedureSourceRecords from cdr_cleaner.cleaning_rules.remove_non_matching_participant import RemoveNonMatchingParticipant +from cdr_cleaner.cleaning_rules.sandbox_and_remove_withdrawn_pids import SandboxAndRemoveWithdrawnPids from cdr_cleaner.cleaning_rules.remove_records_with_wrong_date import RemoveRecordsWithWrongDate from 
cdr_cleaner.cleaning_rules.remove_participants_under_18years import RemoveParticipantsUnder18Years from cdr_cleaner.cleaning_rules.round_ppi_values_to_nearest_integer import RoundPpiValuesToNearestInteger @@ -168,8 +170,10 @@ RDR_CLEANING_CLASSES = [ (StoreNewPidRidMappings,), (CreateDeidQuestionnaireResponseMap,), + (CreateAIANLookup,), (TruncateRdrData,), (RemoveParticipantsUnder18Years,), + (SandboxAndRemoveWithdrawnPids,), # execute SetConceptIdsForSurveyQuestionAnswers before PpiBranching gets executed # since PpiBranching relies on fully mapped concepts ( @@ -341,9 +345,9 @@ (TableSuppression,), (ControlledTierReplacedConceptSuppression,), (GeneralizeZipCodes,), # Should run after any data remapping rules - (RaceEthnicityRecordSuppression, - ), # Should run after any data remapping rules - (MotorVehicleAccidentSuppression,), + # (RaceEthnicityRecordSuppression,), # Should run after any data remapping rules + ( + MotorVehicleAccidentSuppression,), (VehicularAccidentConceptSuppression,), (ExplicitIdentifierSuppression,), (GeoLocationConceptSuppression,), @@ -391,7 +395,6 @@ (FitbitPIDtoRID,), (FitbitDeidSrcID,), (RemoveNonExistingPids,), # assumes CT dataset is ready for reference - (DropViaSurveyConduct,), ] DATA_CONSISTENCY_CLEANING_CLASSES = [ diff --git a/data_steward/cdr_cleaner/cleaning_rules/backfill_lifestyle.py b/data_steward/cdr_cleaner/cleaning_rules/backfill_lifestyle.py index 3dbb411b45..66d7cc01e2 100644 --- a/data_steward/cdr_cleaner/cleaning_rules/backfill_lifestyle.py +++ b/data_steward/cdr_cleaner/cleaning_rules/backfill_lifestyle.py @@ -33,7 +33,7 @@ def __init__(self, 'This rule extends the abstract class AbstractBackfillSurveyRecords for the skip record creation.' ) - super().__init__(issue_numbers=['DC3099'], + super().__init__(issue_numbers=['DC3099', 'DC3458'], description=desc, affected_datasets=[cdr_consts.RDR], affected_tables=[OBSERVATION], @@ -71,4 +71,4 @@ def validate_rule(self, client): clean_engine.add_console_logging(ARGS.console_log) clean_engine.clean_dataset(ARGS.project_id, ARGS.dataset_id, ARGS.sandbox_dataset_id, - [(BackfillLifestyle,)]) \ No newline at end of file + [(BackfillLifestyle,)]) diff --git a/data_steward/cdr_cleaner/cleaning_rules/backfill_overall_health.py b/data_steward/cdr_cleaner/cleaning_rules/backfill_overall_health.py index 328dc763fc..6009dd8023 100644 --- a/data_steward/cdr_cleaner/cleaning_rules/backfill_overall_health.py +++ b/data_steward/cdr_cleaner/cleaning_rules/backfill_overall_health.py @@ -40,7 +40,7 @@ def __init__(self, 'This rule extends the abstract class AbstractBackfillSurveyRecords for the skip record creation.' 
) - super().__init__(issue_numbers=['DC3098'], + super().__init__(issue_numbers=['DC3098', 'DC3458'], description=desc, affected_datasets=[cdr_consts.RDR], affected_tables=[OBSERVATION], diff --git a/data_steward/cdr_cleaner/cleaning_rules/backfill_survey_records.py b/data_steward/cdr_cleaner/cleaning_rules/backfill_survey_records.py index 65a08efe12..67f91a88d3 100644 --- a/data_steward/cdr_cleaner/cleaning_rules/backfill_survey_records.py +++ b/data_steward/cdr_cleaner/cleaning_rules/backfill_survey_records.py @@ -20,22 +20,24 @@ # Project imports from cdr_cleaner.cleaning_rules.base_cleaning_rule import BaseCleaningRule, query_spec_list from constants.cdr_cleaner.clean_cdr import QUERY -from common import JINJA_ENV, OBSERVATION, PERSON +from common import JINJA_ENV, OBSERVATION, PERSON, MAPPING_PREFIX from resources import fields_for LOGGER = logging.getLogger(__name__) -BACKFILL_QUERY = JINJA_ENV.from_string(""" -INSERT INTO `{{project}}.{{dataset}}.{{obs}}` -({{obs_fields}}) +SANDBOX_QUERY = JINJA_ENV.from_string(""" +CREATE OR REPLACE TABLE `{{project}}.{{sandbox_dataset}}.{{sandbox_table}}` AS ( WITH person_who_answered_survey AS ( SELECT person_id, + m.src_id, MAX(observation_date) AS observation_date, MAX(observation_datetime) AS observation_datetime, - FROM `{{project}}.{{dataset}}.{{obs}}` + FROM `{{project}}.{{dataset}}.{{obs}}` o + JOIN `{{project}}.{{dataset}}.{{mapping_obs}}` m + USING (observation_id) WHERE observation_source_concept_id IN ({{backfill_concepts}}) - GROUP BY person_id + GROUP BY person_id, src_id ), backfill_survey AS ( SELECT DISTINCT @@ -62,7 +64,8 @@ pwas.observation_datetime, bs.observation_type_concept_id, bs.observation_source_value, - bs.observation_source_concept_id + bs.observation_source_concept_id, + pwas.src_id FROM person_who_answered_survey pwas CROSS JOIN backfill_survey bs {% if additional_backfill_conditions -%} @@ -113,8 +116,25 @@ CAST(NULL AS STRING) AS qualifier_source_value, 903096 AS value_source_concept_id, 'PMI_Skip' AS value_source_value, - NULL AS questionnaire_response_id + NULL AS questionnaire_response_id, + src_id FROM missing_survey +) +""") + +BACKFILL_QUERY = JINJA_ENV.from_string(""" +INSERT INTO `{{project}}.{{dataset}}.{{obs}}` +({{obs_fields}}) +SELECT {{obs_fields}} +FROM `{{project}}.{{sandbox_dataset}}.{{sandbox_table}}` +""") + +# create _mapping_observation records for the backfilled observations +APPEND_MAPPING_QUERY = JINJA_ENV.from_string(""" +INSERT INTO `{{project}}.{{dataset}}.{{mapping_obs}}` +(observation_id, src_id) +SELECT observation_id, src_id +FROM `{{project}}.{{sandbox_dataset}}.{{sandbox_table}}` """) @@ -170,19 +190,54 @@ def setup_rule(self, client): pass def get_sandbox_tablenames(self): - # No sandbox table exists for this CR because it runs only an INSERT statement. 
- return [] + """ + generates sandbox table names + """ + sandbox_table = self.sandbox_table_for(OBSERVATION) + return [sandbox_table] def get_query_specs(self) -> query_spec_list: - query = BACKFILL_QUERY.render( - project=self.project_id, - dataset=self.dataset_id, - obs=OBSERVATION, - pers=PERSON, - obs_fields=', '.join( - field['name'] for field in fields_for(OBSERVATION)), - backfill_concepts=", ".join( - [str(concept_id) for concept_id in self.backfill_concepts]), - additional_backfill_conditions=self.additional_backfill_conditions) - - return [{QUERY: query}] + query_list = [] + query_list.append({ + QUERY: + SANDBOX_QUERY.render( + project=self.project_id, + dataset=self.dataset_id, + sandbox_dataset=self.sandbox_dataset_id, + sandbox_table=self.sandbox_table_for(OBSERVATION), + obs=OBSERVATION, + mapping_obs=MAPPING_PREFIX + OBSERVATION, + pers=PERSON, + rdr_obs_fields=', '.join( + field['name'] + for field in fields_for('rdr_observation')), + backfill_concepts=", ".join([ + str(concept_id) for concept_id in self.backfill_concepts + ]), + additional_backfill_conditions=self. + additional_backfill_conditions) + }) + + query_list.append({ + QUERY: + BACKFILL_QUERY.render( + project=self.project_id, + dataset=self.dataset_id, + sandbox_dataset=self.sandbox_dataset_id, + sandbox_table=self.sandbox_table_for(OBSERVATION), + obs=OBSERVATION, + obs_fields=', '.join( + field['name'] for field in fields_for(OBSERVATION))) + }) + + query_list.append({ + QUERY: + APPEND_MAPPING_QUERY.render( + project=self.project_id, + dataset=self.dataset_id, + mapping_obs=MAPPING_PREFIX + OBSERVATION, + sandbox_dataset=self.sandbox_dataset_id, + sandbox_table=self.sandbox_table_for(OBSERVATION)) + }) + + return query_list diff --git a/data_steward/cdr_cleaner/cleaning_rules/backfill_the_basics.py b/data_steward/cdr_cleaner/cleaning_rules/backfill_the_basics.py index fd1e1a25f2..a30f7acd4e 100644 --- a/data_steward/cdr_cleaner/cleaning_rules/backfill_the_basics.py +++ b/data_steward/cdr_cleaner/cleaning_rules/backfill_the_basics.py @@ -37,7 +37,7 @@ def __init__(self, 'This rule extends the abstract class AbstractBackfillSurveyRecords for the skip record creation.' ) - super().__init__(issue_numbers=['DC3097'], + super().__init__(issue_numbers=['DC3097', 'DC3458'], description=desc, affected_datasets=[cdr_consts.RDR], affected_tables=[OBSERVATION], diff --git a/data_steward/cdr_cleaner/cleaning_rules/clean_by_birth_year.py b/data_steward/cdr_cleaner/cleaning_rules/clean_by_birth_year.py index 48c4d7996c..5e4fe8d1c0 100644 --- a/data_steward/cdr_cleaner/cleaning_rules/clean_by_birth_year.py +++ b/data_steward/cdr_cleaner/cleaning_rules/clean_by_birth_year.py @@ -1,5 +1,5 @@ """ -Year of birth should not be in the future, before 1800, or indicate +Year of birth should not be in the future, before 1900, or indicate the participant is less than 18 years old. Using rule 18, 19 in Achilles Heel for reference. 
""" @@ -13,8 +13,8 @@ LOGGER = logging.getLogger(__name__) -ISSUE_NUMBERS = ['DC392', 'DC809'] -MIN_YEAR_OF_BIRTH = 1800 +ISSUE_NUMBERS = ['DC392', 'DC809', 'DC3538'] +MIN_YEAR_OF_BIRTH = 1900 MAX_YEAR_OF_BIRTH = '(EXTRACT(YEAR FROM CURRENT_DATE()) - 17)' LIST_PERSON_ID_TABLES = JINJA_ENV.from_string(""" @@ -67,7 +67,7 @@ def __init__(self, """ desc = ( 'Sandbox and remove records when the participant\'s year of birth ' - 'indicates he/she was born before 1800, in the last 17 years, or in ' + 'indicates he/she was born before 1900, in the last 17 years, or in ' 'the future.') person_id_tables = resources.get_person_id_tables(AOU_REQUIRED) diff --git a/data_steward/cdr_cleaner/cleaning_rules/convert_pre_post_coordinated_concepts.py b/data_steward/cdr_cleaner/cleaning_rules/convert_pre_post_coordinated_concepts.py index 960b80f263..238451ceee 100644 --- a/data_steward/cdr_cleaner/cleaning_rules/convert_pre_post_coordinated_concepts.py +++ b/data_steward/cdr_cleaner/cleaning_rules/convert_pre_post_coordinated_concepts.py @@ -8,6 +8,9 @@ This cleaning rule converts the observation records with pre-coordinated mapping to post-coordinated mapping. +This cleaning rule must run after CreateAIANLookup because CreateAIANLookup references +PRE-coordinated concepts for making the AIAN lookup table. + Original Issues: DC-2617 """ @@ -17,6 +20,7 @@ # Project imports import constants.cdr_cleaner.clean_cdr as cdr_consts from cdr_cleaner.cleaning_rules.base_cleaning_rule import BaseCleaningRule, query_spec_list +from cdr_cleaner.cleaning_rules.create_aian_lookup import CreateAIANLookup from cdr_cleaner.cleaning_rules.set_unmapped_question_answer_survey_concepts import ( SetConceptIdsForSurveyQuestionsAnswers) from cdr_cleaner.cleaning_rules.update_family_history_qa_codes import ( @@ -26,7 +30,7 @@ LOGGER = logging.getLogger(__name__) -JIRA_ISSUE_NUMBERS = ['DC2617'] +JIRA_ISSUE_NUMBERS = ['DC2617', 'DC3457'] MAPPING_QUERY = JINJA_ENV.from_string(""" CREATE OR REPLACE TABLE `{{project}}.{{sandbox_dataset}}.{{sandbox_table}}_mapping` AS @@ -139,6 +143,36 @@ ON c.concept_id = m.concept_id """) +INSERT_MAPPING_QUERY = JINJA_ENV.from_string(""" +INSERT INTO `{{project}}.{{dataset}}._mapping_observation` +(observation_id, src_id) +SELECT + -- New observation_ids are assigned in the same way as INSERT_QUERY -- + -- so IDs align between observation and its mapping table -- + ROW_NUMBER() OVER( + PARTITION BY o.observation_id + ORDER BY + m.new_observation_concept_id, + m.new_value_as_concept_id + ) * 100000000000 + o.observation_id AS observation_id, + om.src_id +FROM `{{project}}.{{sandbox_dataset}}.{{sandbox_table}}` o +JOIN `{{project}}.{{dataset}}.concept` c +ON o.value_source_value = c.concept_code +JOIN `{{project}}.{{sandbox_dataset}}.{{sandbox_table}}_mapping` m +ON c.concept_id = m.concept_id +JOIN `{{project}}.{{dataset}}._mapping_observation` om +ON o.observation_id = om.observation_id +""") + +DELETE_MAPPING_QUERY = JINJA_ENV.from_string(""" +DELETE FROM `{{project}}.{{dataset}}._mapping_observation` +WHERE observation_id IN ( + SELECT observation_id + FROM `{{project}}.{{sandbox_dataset}}.{{sandbox_table}}` +) +""") + class ConvertPrePostCoordinatedConcepts(BaseCleaningRule): @@ -167,7 +201,7 @@ def __init__(self, sandbox_dataset_id=sandbox_dataset_id, depends_on=[ SetConceptIdsForSurveyQuestionsAnswers, - UpdateFamilyHistoryCodes + UpdateFamilyHistoryCodes, CreateAIANLookup ], table_namer=table_namer) @@ -217,7 +251,28 @@ def get_query_specs(self, *args, **keyword_args) -> query_spec_list: 
field['name'] for field in fields_for(OBSERVATION))) } - return [sandbox_query_dict, delete_query_dict, insert_query_dict] + insert_mapping_query_dict = { + cdr_consts.QUERY: + INSERT_MAPPING_QUERY.render( + project=self.project_id, + sandbox_dataset=self.sandbox_dataset_id, + sandbox_table=self.sandbox_table_for(OBSERVATION), + dataset=self.dataset_id) + } + + delete_mapping_query_dict = { + cdr_consts.QUERY: + DELETE_MAPPING_QUERY.render( + project=self.project_id, + sandbox_dataset=self.sandbox_dataset_id, + sandbox_table=self.sandbox_table_for(OBSERVATION), + dataset=self.dataset_id) + } + + return [ + sandbox_query_dict, insert_query_dict, insert_mapping_query_dict, + delete_query_dict, delete_mapping_query_dict + ] def setup_rule(self, client, *args, **keyword_args): """ diff --git a/data_steward/cdr_cleaner/cleaning_rules/create_aian_lookup.py b/data_steward/cdr_cleaner/cleaning_rules/create_aian_lookup.py new file mode 100644 index 0000000000..6ed59a690d --- /dev/null +++ b/data_steward/cdr_cleaner/cleaning_rules/create_aian_lookup.py @@ -0,0 +1,104 @@ +""" +Create a lookup table of AIAN participants. + +We create this lookup table in case we need to run AIAN-specific processes, retraction for example. +We run this cleaning rule early in the RDR data stage so that we can include all the +potential AIAN participants in our datasets. + +The criteria for the AIAN definition come from the existing cleaning rules and retractions from the past. +See DC-3402 and its related tickets and comments for more context. + +Original JIRA ticket: DC-3402 +""" +# Python imports +import logging + +# Third party imports + +# Project imports +import constants.cdr_cleaner.clean_cdr as cdr_consts +from common import AIAN_LIST, JINJA_ENV +from cdr_cleaner.cleaning_rules.base_cleaning_rule import BaseCleaningRule + +LOGGER = logging.getLogger(__name__) + +CREATE_AIAN_LIST = JINJA_ENV.from_string(""" +CREATE OR REPLACE TABLE `{{project_id}}.{{sandbox_dataset_id}}.{{storage_table_name}}` AS ( + SELECT DISTINCT person_id FROM `{{project_id}}.{{dataset_id}}.observation` + WHERE (observation_source_concept_id = 1586140 AND value_source_concept_id = 1586141) + OR observation_source_concept_id in (1586150, 1585599, 1586139, 1585604) +)""") + + +class CreateAIANLookup(BaseCleaningRule): + """ + Create a lookup table of AIAN participants. + """ + + def __init__(self, + project_id, + dataset_id, + sandbox_dataset_id, + table_namer=None): + desc = ('Create a lookup table of AIAN participants. 
' + 'We create it in case we need AIAN-specific ETL process ' + '(retraction, etc).') + + super().__init__(issue_numbers=['DC3402'], + description=desc, + affected_datasets=[], + project_id=project_id, + dataset_id=dataset_id, + sandbox_dataset_id=sandbox_dataset_id, + depends_on=[], + table_namer=table_namer) + + def get_query_specs(self): + """ + :return: a list of SQL strings to run + """ + create_sandbox_table = CREATE_AIAN_LIST.render( + project_id=self.project_id, + dataset_id=self.dataset_id, + sandbox_dataset_id=self.sandbox_dataset_id, + storage_table_name=self.sandbox_table_for(AIAN_LIST)) + + create_sandbox_table_dict = {cdr_consts.QUERY: create_sandbox_table} + + return [create_sandbox_table_dict] + + def get_sandbox_tablenames(self): + return [self.sandbox_table_for(AIAN_LIST)] + + def setup_rule(self, client): + pass + + def setup_validation(self): + pass + + def validate_rule(self): + pass + + +if __name__ == '__main__': + from utils import pipeline_logging + + import cdr_cleaner.args_parser as parser + import cdr_cleaner.clean_cdr_engine as clean_engine + + ARGS = parser.parse_args() + pipeline_logging.configure(level=logging.DEBUG, add_console_handler=True) + + if ARGS.list_queries: + clean_engine.add_console_logging() + query_list = clean_engine.get_query_list(ARGS.project_id, + ARGS.dataset_id, + ARGS.sandbox_dataset_id, + [(CreateAIANLookup,)]) + for query in query_list: + LOGGER.info(query) + else: + clean_engine.add_console_logging(ARGS.console_log) + clean_engine.clean_dataset(ARGS.project_id, ARGS.dataset_id, + ARGS.sandbox_dataset_id, + [(CreateAIANLookup,)]) diff --git a/data_steward/cdr_cleaner/cleaning_rules/deid/conflicting_hpo_state_generalization.py b/data_steward/cdr_cleaner/cleaning_rules/deid/conflicting_hpo_state_generalization.py index 6780425b40..b442a3958d 100644 --- a/data_steward/cdr_cleaner/cleaning_rules/deid/conflicting_hpo_state_generalization.py +++ b/data_steward/cdr_cleaner/cleaning_rules/deid/conflicting_hpo_state_generalization.py @@ -22,16 +22,11 @@ SCHEMA_MAP_TABLE = "person_src_hpos_ext" HPO_ID_NOT_RDR_QUERY = JINJA_ENV.from_string(""" - SELECT - DISTINCT person_id, src_id - FROM - `{{project_id}}.{{dataset_id}}.{{table}}_ext` - JOIN - `{{project_id}}.{{dataset_id}}.{{table}}` - USING - ({{table}}_id) - WHERE - NOT REGEXP_CONTAINS(src_id, r'(?i)(Portal)') + SELECT DISTINCT person_id, src_id + FROM `{{project_id}}.{{dataset_id}}.{{table}}_ext` + JOIN `{{project_id}}.{{dataset_id}}.{{table}}` + USING ({{table}}_id) + WHERE NOT REGEXP_CONTAINS(src_id, r'(?i)(Portal)') """) LIST_PERSON_ID_TABLES = JINJA_ENV.from_string(""" @@ -48,8 +43,7 @@ INSERT_TO_MAP_TABLE_NAME = JINJA_ENV.from_string(""" INSERT INTO `{{project_id}}.{{sandbox_dataset_id}}.{{table_name}}` - (person_id, - src_id) + (person_id, src_id) {{select_query}} """) diff --git a/data_steward/cdr_cleaner/cleaning_rules/deid/recent_concept_suppression.py b/data_steward/cdr_cleaner/cleaning_rules/deid/recent_concept_suppression.py index 72563a1c5e..c0a1397bbc 100644 --- a/data_steward/cdr_cleaner/cleaning_rules/deid/recent_concept_suppression.py +++ b/data_steward/cdr_cleaner/cleaning_rules/deid/recent_concept_suppression.py @@ -46,7 +46,7 @@ WITH concept_id_fields AS ( SELECT {{ table_info['concept_id_fields'] | join(', ') }}, - {{ table_info['primary_datefield'] }} + DATE({{ table_info['primary_datefield'] }}) AS {{ table_info['primary_datefield'] }} FROM `{{project_id}}.{{dataset_id}}.{{table_info['table_name']}}` c ), unpivoted_concept_id_fields AS ( diff --git 
a/data_steward/cdr_cleaner/cleaning_rules/deid/survey_version_info.py b/data_steward/cdr_cleaner/cleaning_rules/deid/survey_version_info.py index 62078d6b9b..00076abf8f 100644 --- a/data_steward/cdr_cleaner/cleaning_rules/deid/survey_version_info.py +++ b/data_steward/cdr_cleaner/cleaning_rules/deid/survey_version_info.py @@ -68,7 +68,7 @@ def __init__(self, project_id, dataset_id, sandbox_dataset_id, - clean_survey_dataset_id=None, + clean_survey_dataset_id, table_namer=None): """ Initialize the class with proper info. diff --git a/data_steward/cdr_cleaner/cleaning_rules/deid/year_of_birth_records_suppression.py b/data_steward/cdr_cleaner/cleaning_rules/deid/year_of_birth_records_suppression.py index 8561b67036..ba1fb8197c 100644 --- a/data_steward/cdr_cleaner/cleaning_rules/deid/year_of_birth_records_suppression.py +++ b/data_steward/cdr_cleaner/cleaning_rules/deid/year_of_birth_records_suppression.py @@ -17,11 +17,11 @@ # Project Imports import constants.cdr_cleaner.clean_cdr as cdr_consts from cdr_cleaner.cleaning_rules.base_cleaning_rule import BaseCleaningRule -from common import AOU_DEATH, AOU_REQUIRED, JINJA_ENV, DEATH, PERSON +from common import AOU_DEATH, AOU_REQUIRED, JINJA_ENV, DEATH, PERSON, FITBIT_TABLES, EHR_CONSENT_VALIDATION LOGGER = logging.getLogger(__name__) -ISSUE_NUMBERS = ['DC1977', 'DC2205'] +ISSUE_NUMBERS = ['DC1977', 'DC2205', 'DC3545'] BIRTH_DELIVERY_SUPPRESSION_CONCEPT_TABLE = '_birth_concepts' EXCLUDED_CONCEPTS = [4013886, 4135376, 4271761] @@ -105,7 +105,8 @@ def _get_time_columns(self, client): WHERE (lower(data_type) in ("date", "datetime", "time", "timestamp") and not REGEXP_CONTAINS(column_name, r'(?i)(partitiontime)')) -- tables we are not interested in cleaning -- - and not REGEXP_CONTAINS(table_name, '(?i)(_ext)|(person)|(activity_summary)|(steps_intraday)|(heart_rate)') + and not REGEXP_CONTAINS(table_name, '(?i)(_ext)|(person)|(consent)') + and lower(table_name) NOT IN ({{fitbit_tables_str}}) -- make sure table has a person_id column -- and table_name in ( @@ -120,7 +121,9 @@ def _get_time_columns(self, client): project=self.project_id, dataset=self.dataset_id, sandbox_dataset=self.sandbox_dataset_id, - lookup_table=self.sandbox_table_for(LOOKUP_TABLE)) + lookup_table=self.sandbox_table_for(LOOKUP_TABLE), + fitbit_tables_str='"' + '", "'.join(FITBIT_TABLES) + '"', + ) try: response = client.query(tables_columns_query, @@ -193,7 +196,8 @@ def get_sandbox_queries(self): {% if loop.index > 1 %} AND {% endif %} - {{column}} not in ({{exceptions}}) + ({{column}} not in ({{exceptions}}) + OR {{column}} is null) {% endfor %} ) {% endif %} @@ -230,7 +234,9 @@ def get_suppression_queries(self): suppression_queries = [] for table_name, _ in self.tables_and_columns.items(): - identifier = f'{table_name}_id' if table_name != DEATH else f'{PERSON}_id' + identifier = f'{table_name}_id' if table_name not in [ + DEATH, EHR_CONSENT_VALIDATION + ] else f'{PERSON}_id' suppression_record_query = suppression_record_query_template.render( project=self.project_id, dataset=self.dataset_id, diff --git a/data_steward/cdr_cleaner/cleaning_rules/generate_ext_tables.py b/data_steward/cdr_cleaner/cleaning_rules/generate_ext_tables.py index 9d500dc9ec..cc7c73e1e2 100644 --- a/data_steward/cdr_cleaner/cleaning_rules/generate_ext_tables.py +++ b/data_steward/cdr_cleaner/cleaning_rules/generate_ext_tables.py @@ -49,7 +49,7 @@ , null as {{field}} {% endif %} {% endfor %} -FROM `{{project_id}}.{{mapping_dataset_id}}.{{mapping_table_id}}` m +FROM 
`{{project_id}}.{{dataset_id}}.{{mapping_table_id}}` m JOIN `{{project_id}}.{{shared_sandbox_id}}.{{site_maskings_table_id}}` s ON m.src_hpo_id = s.hpo_id ) @@ -65,7 +65,6 @@ def __init__(self, project_id, dataset_id, sandbox_dataset_id, - mapping_dataset_id, table_namer=None): """ Initialize the class with proper information. @@ -79,15 +78,14 @@ def __init__(self, in this dataset :param sandbox_dataset_id: dataset identifier. this dataset will hold helper/logging/lookup tables - :param mapping_dataset_id: dataset identifier. identifies a dataset that + :param dataset_id: dataset identifier. identifies a dataset that contains mapping tables to convert to extension tables. """ desc = 'Generate extension tables and populate with masking data from the site_maskings table' super().__init__(issue_numbers=ISSUE_NUMBERS, description=desc, affected_datasets=[ - cdr_consts.REGISTERED_TIER_DEID, - cdr_consts.CONTROLLED_TIER_DEID + cdr_consts.COMBINED, ], affected_tables=[], project_id=project_id, @@ -95,7 +93,6 @@ def __init__(self, sandbox_dataset_id=sandbox_dataset_id, table_namer=table_namer) - self._mapping_dataset_id = mapping_dataset_id self.mapping_table_ids = [] def get_table_fields_str(self, table, ext_table_id): @@ -134,11 +131,10 @@ def get_mapping_table_ids(self, client): """ returns all the mapping table ids found in the dataset :param project_id: project_id containing the dataset - :param mapping_dataset_id: dataset_id containing mapping tables + :param dataset_id: dataset_id containing mapping tables :return: returns mapping table ids """ - dataset_ref = DatasetReference(self.project_id, - self._mapping_dataset_id) + dataset_ref = DatasetReference(self.project_id, self.dataset_id) table_objs = client.list_tables(dataset_ref) mapping_table_ids = [ table_obj.table_id @@ -184,7 +180,6 @@ def get_query_specs(self): ext_table_fields=ext_table_fields_str, additional_fields=additional_field_names, cdm_table_id=cdm_table_id, - mapping_dataset_id=self._mapping_dataset_id, mapping_table_id=mapping_table_id, shared_sandbox_id=self.sandbox_dataset_id, site_maskings_table_id=SITE_MASKING_TABLE_ID) @@ -234,31 +229,19 @@ def validate_rule(self, client, *args, **keyword_args): import cdr_cleaner.args_parser as parser import cdr_cleaner.clean_cdr_engine as clean_engine - mapping_dataset_arg = { - parser.SHORT_ARGUMENT: '-m', - parser.LONG_ARGUMENT: '--mapping_dataset_id', - parser.ACTION: 'store', - parser.DEST: 'mapping_dataset_id', - parser.HELP: 'Identifies the dataset containing the mapping tables', - parser.REQUIRED: True - } - - ARGS = parser.default_parse_args([mapping_dataset_arg]) + ARGS = parser.default_parse_args() pipeline_logging.configure(level=logging.DEBUG, add_console_handler=True) if ARGS.list_queries: clean_engine.add_console_logging() - query_list = clean_engine.get_query_list( - ARGS.project_id, - ARGS.dataset_id, - ARGS.sandbox_dataset_id, [(GenerateExtTables,)], - mapping_dataset_id=ARGS.mapping_dataset_id) + query_list = clean_engine.get_query_list(ARGS.project_id, + ARGS.dataset_id, + ARGS.sandbox_dataset_id, + [(GenerateExtTables,)]) for query_info in query_list: LOGGER.info(query_info) else: clean_engine.add_console_logging(ARGS.console_log) - clean_engine.clean_dataset(ARGS.project_id, - ARGS.dataset_id, + clean_engine.clean_dataset(ARGS.project_id, ARGS.dataset_id, ARGS.sandbox_dataset_id, - [(GenerateExtTables,)], - mapping_dataset_id=ARGS.mapping_dataset_id) + [(GenerateExtTables,)]) diff --git a/data_steward/cdr_cleaner/cleaning_rules/populate_survey_conduct_ext.py 
b/data_steward/cdr_cleaner/cleaning_rules/populate_survey_conduct_ext.py index 5d826fc621..dbc9fb21f3 100644 --- a/data_steward/cdr_cleaner/cleaning_rules/populate_survey_conduct_ext.py +++ b/data_steward/cdr_cleaner/cleaning_rules/populate_survey_conduct_ext.py @@ -34,7 +34,7 @@ def __init__(self, project_id, dataset_id, sandbox_dataset_id, - clean_survey_dataset_id=None, + clean_survey_dataset_id, table_namer=None): """ Initialize the class with proper information. diff --git a/data_steward/cdr_cleaner/cleaning_rules/remove_ehr_data_without_consent.py b/data_steward/cdr_cleaner/cleaning_rules/remove_ehr_data_without_consent.py index 4f93873f8f..778bbf6990 100644 --- a/data_steward/cdr_cleaner/cleaning_rules/remove_ehr_data_without_consent.py +++ b/data_steward/cdr_cleaner/cleaning_rules/remove_ehr_data_without_consent.py @@ -21,7 +21,7 @@ LOGGER = logging.getLogger(__name__) -JIRA_ISSUE_NUMBERS = ['DC1644', 'DC3355'] +JIRA_ISSUE_NUMBERS = ['DC1644', 'DC3355', 'DC3434'] EHR_UNCONSENTED_PARTICIPANTS_LOOKUP_TABLE = '_ehr_unconsented_pids' @@ -65,8 +65,13 @@ `{{project}}.{{dataset}}.consent_validation` WHERE consent_for_electronic_health_records IS NULL - OR + OR UPPER(consent_for_electronic_health_records) != 'SUBMITTED' +) + OR + person_id IN ( -- dup accounts -- + SELECT DISTINCT person_id FROM + `{{project}}.{{duplicates_dataset}}.{{duplicates_table}}` ) ) """) @@ -88,7 +93,7 @@ person_id FROM `{{project}}.{{sandbox_dataset}}.{{unconsented_lookup}}`) - AND src_dataset_id LIKE '%ehr%' + AND src_dataset_id LIKE '%ehr%' ) """) @@ -101,7 +106,7 @@ SELECT {{domain_table}}_id FROM - `{{project}}.{{sandbox_dataset}}.{{sandbox_table}}`) + `{{project}}.{{sandbox_dataset}}.{{sandbox_table}}`) """) @@ -111,7 +116,13 @@ class RemoveEhrDataWithoutConsent(BaseCleaningRule): sandboxed and dropped from the CDR. """ - def __init__(self, project_id, dataset_id, sandbox_dataset_id): + def __init__(self, + project_id, + dataset_id, + sandbox_dataset_id, + table_namer=None, + ehr_duplicates_dataset=None, + ehr_duplicates_table=None): """ Initialize the class with proper information. @@ -124,15 +135,24 @@ def __init__(self, project_id, dataset_id, sandbox_dataset_id): """ desc = ( 'All EHR data associated with a participant if their EHR consent is not present in the observation ' - 'table will be sandboxed and dropped from the CDR.') + 'table will be sandboxed and dropped from the CDR. 
This includes duplicate records.' + ) + + if not ehr_duplicates_dataset or not ehr_duplicates_table: + raise RuntimeError( + 'ehr_duplicates_dataset and ehr_duplicates_table must both be provided' + ) + + self.ehr_duplicates_dataset = ehr_duplicates_dataset + self.ehr_duplicates_table = ehr_duplicates_table - super().__init__(issue_numbers=JIRA_ISSUE_NUMBERS, - description=desc, - affected_datasets=[cdr_consts.COMBINED], - affected_tables=AFFECTED_TABLES, - project_id=project_id, - dataset_id=dataset_id, - sandbox_dataset_id=sandbox_dataset_id) + super().__init__( + issue_numbers=JIRA_ISSUE_NUMBERS, + description=desc, + affected_datasets=[cdr_consts.COMBINED], + affected_tables=AFFECTED_TABLES, + project_id=project_id, + dataset_id=dataset_id, + sandbox_dataset_id=sandbox_dataset_id, + ) def get_query_specs(self): """ @@ -151,7 +171,9 @@ project=self.project_id, dataset=self.dataset_id, sandbox_dataset=self.sandbox_dataset_id, - unconsented_lookup=EHR_UNCONSENTED_PARTICIPANTS_LOOKUP_TABLE + unconsented_lookup=EHR_UNCONSENTED_PARTICIPANTS_LOOKUP_TABLE, + duplicates_dataset=self.ehr_duplicates_dataset, + duplicates_table=self.ehr_duplicates_table, ) } lookup_queries.append(unconsented_lookup_query) @@ -222,17 +244,39 @@ def get_sandbox_tablenames(self): import cdr_cleaner.clean_cdr_engine as clean_engine ext_parser = parser.get_argument_parser() + ext_parser.add_argument('--ehr_duplicates_dataset', + action='store', + dest='ehr_duplicates_dataset', + required=True, + help='The dataset that includes duplicate records') + ext_parser.add_argument( + '--ehr_duplicates_table', + action='store', + dest='ehr_duplicates_table', + required=True, + help='The table (from the dataset) that includes duplicate PIDs') + ARGS = ext_parser.parse_args() if ARGS.list_queries: clean_engine.add_console_logging() query_list = clean_engine.get_query_list( - ARGS.project_id, ARGS.dataset_id, ARGS.sandbox_dataset_id, - [(RemoveEhrDataWithoutConsent,)]) + ARGS.project_id, + ARGS.dataset_id, + ARGS.sandbox_dataset_id, + [(RemoveEhrDataWithoutConsent,)], + ehr_duplicates_dataset=ARGS.ehr_duplicates_dataset, + ehr_duplicates_table=ARGS.ehr_duplicates_table, + ) + for query in query_list: LOGGER.info(query) else: clean_engine.add_console_logging(ARGS.console_log) - clean_engine.clean_dataset(ARGS.project_id, ARGS.dataset_id, - ARGS.sandbox_dataset_id, ARGS.cutoff_date, - [(RemoveEhrDataWithoutConsent,)]) + clean_engine.clean_dataset( + ARGS.project_id, + ARGS.dataset_id, + ARGS.sandbox_dataset_id, + ARGS.cutoff_date, [(RemoveEhrDataWithoutConsent,)], + ehr_duplicates_dataset=ARGS.ehr_duplicates_dataset, + ehr_duplicates_table=ARGS.ehr_duplicates_table) diff --git a/data_steward/cdr_cleaner/cleaning_rules/remove_extra_tables.py b/data_steward/cdr_cleaner/cleaning_rules/remove_extra_tables.py index 8b306d4439..7a17735762 100644 --- a/data_steward/cdr_cleaner/cleaning_rules/remove_extra_tables.py +++ b/data_steward/cdr_cleaner/cleaning_rules/remove_extra_tables.py @@ -16,7 +16,7 @@ # Project imports from cdr_cleaner.cleaning_rules.base_cleaning_rule import BaseCleaningRule from constants.cdr_cleaner import clean_cdr as cdr_consts -from common import JINJA_ENV +from common import JINJA_ENV, AOU_CUSTOM_TABLES, WEAR_STUDY from resources import cdm_schemas, has_domain_table_id from utils import pipeline_logging from utils.bq import list_tables @@ -67,7 +67,9 @@ def __init__(self, f'{table}_ext' for table in cdm_schemas().keys() if has_domain_table_id(table) } - {'person_ext'}) + ['person_src_hpos_ext'] - affected_tables = cdm_achilles_vocab_tables + 
extension_tables + # To Keep AOU_DEATH table + custom_tables = AOU_CUSTOM_TABLES + [WEAR_STUDY] + affected_tables = cdm_achilles_vocab_tables + extension_tables + custom_tables super().__init__(issue_numbers=['DC1441'], description=desc, affected_datasets=[cdr_consts.CONTROLLED_TIER_DEID], diff --git a/data_steward/cdr_cleaner/cleaning_rules/remove_participant_data_past_deactivation_date.py b/data_steward/cdr_cleaner/cleaning_rules/remove_participant_data_past_deactivation_date.py index bfe50536b2..a5af0591e5 100644 --- a/data_steward/cdr_cleaner/cleaning_rules/remove_participant_data_past_deactivation_date.py +++ b/data_steward/cdr_cleaner/cleaning_rules/remove_participant_data_past_deactivation_date.py @@ -67,7 +67,7 @@ OR (death_datetime IS NULL AND death_date >= DATE(d.deactivated_datetime)) {% elif table_ref.table_id in ['activity_summary', 'heart_rate_summary'] %} WHERE date >= DATE(d.deactivated_datetime) -{% elif table_ref.table_id in ['heart_rate_minute_level', 'steps_intraday'] %} +{% elif table_ref.table_id in ['heart_rate_intraday', 'steps_intraday'] %} WHERE datetime >= DATETIME(d.deactivated_datetime) {% elif table_ref.table_id in ['payer_plan_period', 'observation_period'] %} WHERE COALESCE({{table_ref.table_id + '_end_date'}}, @@ -302,7 +302,7 @@ def setup_rule(self, client): os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self.key_path # gets the deactivated participant dataset to ensure it's up-to-date - df = psr.get_deactivated_participants(self.api_project_id, + df = psr.get_deactivated_participants(client, self.api_project_id, DEACTIVATED_PARTICIPANTS_COLUMNS) LOGGER.info(f"Found '{len(df)}' deactivated participants via RDR API") diff --git a/data_steward/cdr_cleaner/cleaning_rules/sandbox_and_remove_pids.py b/data_steward/cdr_cleaner/cleaning_rules/sandbox_and_remove_pids.py index 688190af90..bf287aca45 100644 --- a/data_steward/cdr_cleaner/cleaning_rules/sandbox_and_remove_pids.py +++ b/data_steward/cdr_cleaner/cleaning_rules/sandbox_and_remove_pids.py @@ -11,7 +11,7 @@ SELECT t.* FROM `{{project}}.{{dataset}}.{{table}}` t {% if ehr_only and table not in ['death', 'aou_death'] %} JOIN `{{project}}.{{dataset}}._mapping_{{table}}` m -ON t.{{table}}_id = m.{{table}}_id AND LOWER(m.src_hpo_id) != 'rdr' +ON t.{{table}}_id = m.{{table}}_id AND LOWER(m.src_hpo_id) NOT IN ('ce', 'vibrent', 'healthpro') {% endif %} WHERE person_id IN ( SELECT person_id FROM `{{project}}.{{sandbox_dataset}}.{{lookup_table}}` diff --git a/data_steward/cdr_cleaner/cleaning_rules/sandbox_and_remove_withdrawn_pids.py b/data_steward/cdr_cleaner/cleaning_rules/sandbox_and_remove_withdrawn_pids.py new file mode 100644 index 0000000000..fbd0ff32f1 --- /dev/null +++ b/data_steward/cdr_cleaner/cleaning_rules/sandbox_and_remove_withdrawn_pids.py @@ -0,0 +1,155 @@ +""" +This cleaning rule uses a list of participants to remove all participant data from combined. 
+""" + +# Python imports +import logging + +# Project imports +from cdr_cleaner.cleaning_rules.sandbox_and_remove_pids import SandboxAndRemovePids, JINJA_ENV, PERSON_TABLE_QUERY, AOU_DEATH, CDM_TABLES +from constants.cdr_cleaner import clean_cdr as cdr_consts +from gcloud.bq import BigQueryClient + +LOGGER = logging.getLogger(__name__) + +ISSUE_NUMBERS = ['DC3442'] + +# Query template to copy withdrawn_dups_table from rdr dataset to combined sandbox dataset +COPY_WITHDRAWN_DUPS_TABLE_TEMPLATE = JINJA_ENV.from_string(""" +CREATE OR REPLACE TABLE + `{{project_id}}.{{sandbox_dataset_id}}.{{withdrawn_dups_table}}` AS + ( + SELECT + person_id, + hpo_id + src_id, + consent_for_study_enrollment_authored, + withdrawal_status + FROM + `{{project_id}}.{{dataset_id}}.{{withdrawn_dups_table}}` + ) +""") + + +class SandboxAndRemoveWithdrawnPids(SandboxAndRemovePids): + """ + Removes all participant data using a list of participants. + """ + + def __init__(self, project_id, dataset_id, sandbox_dataset_id, + withdrawn_dups_table): + """ + Initialize the class with proper information. + + Set the issue numbers, description and affected datasets. As other tickets may affect + this SQL, append them to the list of Jira Issues. + DO NOT REMOVE ORIGINAL JIRA ISSUE NUMBERS! + """ + + self.withdrawn_dups_table = withdrawn_dups_table + + desc = 'Sandbox and remove participant data from a list of participants.' + + super().__init__(issue_numbers=ISSUE_NUMBERS, + description=desc, + affected_datasets=[cdr_consts.RDR], + project_id=project_id, + dataset_id=dataset_id, + sandbox_dataset_id=sandbox_dataset_id, + affected_tables=[]) + + def setup_rule(self, client: BigQueryClient, ehr_only: bool = False): + """ + Get list of tables that have a person_id column, excluding mapping tables + :param ehr_only: For Combined dataset, True if removing only EHR records. False if removing both RDR and EHR records. + """ + + person_table_query = PERSON_TABLE_QUERY.render(project=self.project_id, + dataset=self.dataset_id, + ehr_only=ehr_only) + person_tables = client.query(person_table_query).result() + + self.affected_tables = [ + table.get('table_name') + for table in person_tables + if table.get('table_name') in CDM_TABLES + [AOU_DEATH] + ] + + # Copy withdrawn_dups_table from rdr dataset to combined sandbox dataset + copy_withdrawn_dups_table_query = COPY_WITHDRAWN_DUPS_TABLE_TEMPLATE.render( + project_id=self.project_id, + sandbox_dataset_id=self.sandbox_dataset_id, + dataset_id=self.dataset_id, + withdrawn_dups_table=self.withdrawn_dups_table) + + client.query(copy_withdrawn_dups_table_query).result() + + def get_query_specs(self) -> list: + sandbox_records_queries = self.get_sandbox_queries( + lookup_table=self.withdrawn_dups_table) + remove_pids_queries = self.get_remove_pids_queries( + lookup_table=self.withdrawn_dups_table) + + return sandbox_records_queries + remove_pids_queries + + def get_sandbox_tablenames(self): + """ + generates sandbox table names + """ + return [self.sandbox_table_for(table) for table in self.affected_tables] + + def setup_validation(self, client): + """ + Run required steps for validation setup. + """ + raise NotImplementedError("Please fix me.") + + def validate_rule(self, client): + """ + Validates the cleaning rule which deletes or updates the data from the tables. 
diff --git a/data_steward/common.py b/data_steward/common.py
index e3c2622b8f..f608b87479 100644
--- a/data_steward/common.py
+++ b/data_steward/common.py
@@ -84,6 +84,10 @@
 OBSERVATION_PERIOD = 'observation_period'
 NOTE_NLP = 'note_nlp'

+CDM_SOURCE = 'cdm_source'
+COHORT = 'cohort'
+COHORT_ATTRIBUTE = 'cohort_attribute'
+
 OTHER_CLINICAL_TABLES = [OBSERVATION_PERIOD, NOTE_NLP]

 OTHER_CDM_TABLES = [
@@ -113,7 +117,7 @@
 # Wearables
 ACTIVITY_SUMMARY = 'activity_summary'
-HEART_RATE_MINUTE_LEVEL = 'heart_rate_minute_level'
+HEART_RATE_INTRADAY = 'heart_rate_intraday'
 HEART_RATE_SUMMARY = 'heart_rate_summary'
 STEPS_INTRADAY = 'steps_intraday'
 SLEEP_DAILY_SUMMARY = 'sleep_daily_summary'
@@ -124,8 +128,8 @@
 WEAR_STUDY = 'wear_study'

 FITBIT_TABLES = [
-    ACTIVITY_SUMMARY, HEART_RATE_MINUTE_LEVEL, HEART_RATE_SUMMARY,
-    STEPS_INTRADAY, SLEEP_DAILY_SUMMARY, SLEEP_LEVEL, DEVICE
+    ACTIVITY_SUMMARY, HEART_RATE_INTRADAY, HEART_RATE_SUMMARY, STEPS_INTRADAY,
+    SLEEP_DAILY_SUMMARY, SLEEP_LEVEL, DEVICE
 ]

 # Vocabulary
@@ -252,6 +256,9 @@
 MAX_DEID_DATE_SHIFT = 364
 COPE_SURVEY_MAP = 'cope_survey_semantic_version_map'
 EHR_CONSENT_VALIDATION = 'consent_validation'
+WEAR_CONSENT = 'wear_consent'
+PDR_WITHDRAWALS_LIST = 'pdr_withdrawals_list'
+PDR_EHR_LIST = 'pdr_ehr_list'

 # pipeline_tables dataset and contents
 PIPELINE_TABLES = 'pipeline_tables'
@@ -268,6 +275,8 @@
 ZIP_CODE_AGGREGATION_MAP = 'zip_code_aggregation_map'
 DEID_QUESTIONNAIRE_RESPONSE_MAP = '_deid_questionnaire_response_map'

+AIAN_LIST = 'aian_list'
+
 # Participant Summary
 EHR_OPS = 'ehr_ops'
 DRC_OPS = 'drc_ops'
@@ -279,11 +288,11 @@
 # src_id tables from RDR
 SRC_ID_TABLES = [
     CARE_SITE, CONDITION_ERA, CONDITION_OCCURRENCE, COPE_SURVEY_MAP, COST,
-    DEATH, DEVICE_EXPOSURE, DOSE_ERA, DRUG_ERA, DRUG_EXPOSURE,
+    AOU_DEATH, DEVICE_EXPOSURE, DOSE_ERA, DRUG_ERA, DRUG_EXPOSURE,
     FACT_RELATIONSHIP, LOCATION, MEASUREMENT, METADATA, NOTE_NLP, OBSERVATION,
     OBSERVATION_PERIOD, PAYER_PLAN_PERIOD, PERSON, PID_RID_MAPPING,
     PROCEDURE_OCCURRENCE, PROVIDER, QUESTIONNAIRE_RESPONSE_ADDITIONAL_INFO,
-    SURVEY_CONDUCT, VISIT_DETAIL, VISIT_OCCURRENCE
+    SURVEY_CONDUCT, VISIT_DETAIL, VISIT_OCCURRENCE, EHR_CONSENT_VALIDATION
 ]

 # JINJA
@@ -314,3 +323,4 @@
 BIGQUERY_DATASET_ID = os.environ.get('BIGQUERY_DATASET_ID')
UNIONED_DATASET_ID = os.environ.get('UNIONED_DATASET_ID') RDR_DATASET_ID = os.environ.get('RDR_DATASET_ID') +COMBINED_DATASET_ID = os.environ.get('COMBINED_DATASET_ID') diff --git a/data_steward/constants/cdr_cleaner/clean_cdr.py b/data_steward/constants/cdr_cleaner/clean_cdr.py index c6ac31f167..95e6f25a07 100644 --- a/data_steward/constants/cdr_cleaner/clean_cdr.py +++ b/data_steward/constants/cdr_cleaner/clean_cdr.py @@ -11,13 +11,13 @@ REGISTERED_TIER_DEID = 'registered_tier_deid' REGISTERED_TIER_DEID_BASE = 'registered_tier_deid_base' REGISTERED_TIER_DEID_CLEAN = 'registered_tier_deid_clean' -REGISTERED_TIER_FITBIT = 'registered_tier_fitbit' +REGISTERED_TIER_FITBIT = 'registered_tier_fitbit_deid' # post deid Controlled tier datasets CONTROLLED_TIER_DEID = 'controlled_tier_deid' CONTROLLED_TIER_DEID_BASE = 'controlled_tier_deid_base' CONTROLLED_TIER_DEID_CLEAN = 'controlled_tier_deid_clean' -CONTROLLED_TIER_FITBIT = 'controlled_tier_fitbit' +CONTROLLED_TIER_FITBIT = 'controlled_tier_fitbit_deid' DATA_CONSISTENCY = 'data_consistency' CRON_RETRACTION = 'cron_retraction' diff --git a/data_steward/constants/tools/create_combined_backup_dataset.py b/data_steward/constants/tools/create_combined_backup_dataset.py index 280c46b702..570fa56d75 100644 --- a/data_steward/constants/tools/create_combined_backup_dataset.py +++ b/data_steward/constants/tools/create_combined_backup_dataset.py @@ -1,5 +1,6 @@ import cdm -from common import (DEATH, JINJA_ENV, PERSON, SURVEY_CONDUCT) +from common import (DEATH, JINJA_ENV, PERSON, SURVEY_CONDUCT, + EHR_CONSENT_VALIDATION) SOURCE_VALUE_EHR_CONSENT = 'EHRConsentPII_ConsentPermission' CONCEPT_ID_CONSENT_PERMISSION_YES = 1586100 # ConsentPermission_Yes @@ -10,7 +11,7 @@ 'visit_occurrence_id', 'location_id', 'care_site_id', 'provider_id', 'visit_detail_id' ] -RDR_TABLES_TO_COPY = [PERSON, SURVEY_CONDUCT] +RDR_TABLES_TO_COPY = [PERSON, SURVEY_CONDUCT, EHR_CONSENT_VALIDATION] DOMAIN_TABLES = list( set(cdm.tables_to_map()) - set(RDR_TABLES_TO_COPY) - set([DEATH])) TABLES_TO_PROCESS = RDR_TABLES_TO_COPY + DOMAIN_TABLES @@ -87,12 +88,12 @@ MAPPING_QUERY = JINJA_ENV.from_string(""" SELECT DISTINCT '{{rdr_dataset_id}}' AS src_dataset_id, - {{domain_table}}_id AS src_{{domain_table}}_id, + t.{{domain_table}}_id AS src_{{domain_table}}_id, v.src_id as src_hpo_id, {% if domain_table in ['survey_conduct', 'person'] %} - {{domain_table}}_id AS {{domain_table}}_id, + t.{{domain_table}}_id AS {{domain_table}}_id, {% else %} - {{domain_table}}_id + {{mapping_constant}} AS {{domain_table}}_id, + t.{{domain_table}}_id + {{mapping_constant}} AS {{domain_table}}_id, {% endif %} '{{domain_table}}' as src_table_id FROM `{{rdr_dataset_id}}.{{domain_table}}` AS t diff --git a/data_steward/deid/aou.py b/data_steward/deid/aou.py index e971b3ea29..d6248f2cf7 100644 --- a/data_steward/deid/aou.py +++ b/data_steward/deid/aou.py @@ -95,6 +95,8 @@ from constants.deid.deid import MAX_AGE from deid.parser import parse_args from deid.press import Press +from utils import auth +from common import CDR_SCOPES from resources import DEID_PATH from tools.concept_ids_suppression import get_all_concept_ids @@ -121,7 +123,7 @@ def create_concept_id_lookup_table(client, input_dataset, credentials): :param credentials: bigquery credentials """ - lookup_tablename = input_dataset + "._concept_ids_suppression" + lookup_tablename = f"{client.project}.{input_dataset}._concept_ids_suppression" columns = [ 'vocabulary_id', 'concept_code', 'concept_name', 'concept_id', 'domain_id', 'rule', 'question' @@ -131,7 
+133,10 @@ def create_concept_id_lookup_table(client, input_dataset, credentials): data = get_all_concept_ids(columns, input_dataset, client) # write this to bigquery. - data.to_gbq(lookup_tablename, credentials=credentials, if_exists='replace') + data.to_gbq(lookup_tablename, + project_id=client.project, + credentials=credentials, + if_exists='replace') class AOU(Press): @@ -139,13 +144,15 @@ class AOU(Press): def __init__(self, **args): args['store'] = 'bigquery' Press.__init__(self, **args) + self.run_as_email = args.get('run_as_email', '') self.private_key = args.get('private_key', '') - self.credentials = service_account.Credentials.from_service_account_file( - self.private_key) + self.credentials = auth.get_impersonation_credentials( + self.run_as_email, CDR_SCOPES) self.partition = args.get('cluster', False) self.priority = args.get('interactive', 'BATCH') self.project_id = app_identity.get_application_id() - self.bq_client = BigQueryClient(project_id=self.project_id) + self.bq_client = BigQueryClient(project_id=self.project_id, + credentials=self.credentials) if 'shift' in self.deid_rules: # @@ -221,11 +228,13 @@ def get_dataframe(self, sql=None, limit=None, query_config=None): try: if query_config: df = pd.read_gbq(sql, + project_id=self.bq_client.project, credentials=self.credentials, dialect='standard', configuration=query_config) else: df = pd.read_gbq(sql, + project_id=self.bq_client.project, credentials=self.credentials, dialect='standard') @@ -400,7 +409,7 @@ def submit(self, sql, create, dml=None): """ dml = False if dml is None else dml table_name = self.get_tablename() - client = bq.Client.from_service_account_json(self.private_key) + client = self.bq_client #bq.Client.from_service_account_json(self.private_key) # # Let's make sure the out dataset exists datasets = list(client.list_datasets()) diff --git a/data_steward/deid/config/ids/config.json b/data_steward/deid/config/ids/config.json index 056481b4fd..8228ba45df 100644 --- a/data_steward/deid/config/ids/config.json +++ b/data_steward/deid/config/ids/config.json @@ -1,220 +1,323 @@ [ - { - "_id":"generalize", - "RACE": [ - { - "comment": [ - "aggregate multi-race answers before generalizing ", - "single race answers. treat HLS as an ethnicity, ", - "not a race." - ], - "apply": "SQL", - "statement": [ - "(SELECT COUNT(obs.person_id) ", - "FROM :idataset.:table AS obs ", - "WHERE obs.person_id = :table.person_id ", - "AND obs.value_source_concept_id IN (1586141, 1586142, 1586143, 1586144, 1586145, 1586146) ", - "GROUP BY obs.person_id ) " - ], - "qualifier": " > 1 AND value_source_concept_id != 1586147 ", - "into": 2000000008, - "drop_duplicates": "True" - }, - { - "comment": "generalize single race values", - "values": [1586141, 1586144, 1586145], - "into": 2000000001, - "qualifier": "IN" - } - ], - "SEXUAL-ORIENTATION":[ - { - "comment": [ - "multi sexual orientation generalization rule, the values the count is ", - "limited to is provided by the 'on' key. 
it is best to ", - "perform aggregate functions before other generalizations" - ], - "apply": "COUNT", - "into": 2000000003, - "qualifier": "> 1", - "values": [903096, 903079, 1585901, 1585902, 1585903, 1585904, 1585900], - "drop_duplicates": "True" - }, - { - "comment": "generalize single response that is not straight", - "qualifier": "IN", - "values": [903096, 903079, 1585901, 1585902, 1585903, 1585904], - "into": 2000000003, - "drop_duplicates": "True" - } - ], - "SEX-AT-BIRTH":[ - { - "comment": "generalize any response that is not male and not female", - "qualifier": "IN", - "values": [903096, 903079, 1585848, 1585849], - "into": 2000000009 - } - ], - "GENDER":[ - { - "comment": [ - "multi gender generalization rule, the values the count is ", - "limited to is provided by the 'on' key. it is best to ", - "perform aggregate functions before other generalizations" - ], - "apply": "COUNT", - "into": 2000000002, - "qualifier": "> 1", - "values": [1585839, 1585840, 1585841, 1585842, 1585843], - "drop_duplicates": "True" - }, - { - "comment": [ - "transgender generalization rule. if sex-at-birth does ", - "not match the identified gender, generalize the gender. ", - "generalizes born female but gender is male, or born ", - "male but gender is female. all other gender choices are ", - "already generalized" - ], - "apply": "SQL", - "statement": [ - ":table_id IN (", - "SELECT gender.:table_id ", - "FROM :idataset.:table AS gender ", - "JOIN :idataset.:table AS sex_at_birth ", - "USING (person_id) ", - "WHERE gender.observation_source_concept_id = 1585838 ", - "AND ((gender.:fields = 1585839 AND sex_at_birth.:fields = 1585847) ", - "OR (gender.:fields = 1585840 AND sex_at_birth.:fields = 1585846)))" - ], - "into": 2000000002 - }, - { - "comment": "values that are not male or female will be generalized into other", - "qualifier": "IN", - "values": [903096, 903079, 1585843, 1585841,1585842], - "into": 2000000002 - } - ], - "EDUCATION":[ - { - "comment": "generalizing to no highschool degree", - "qualifier": "IN", - "values": [1585941, 1585942, 1585943, 1585944], - "into": 2000000007 - }, - { - "comment": "generalizing to above highschool degree", - "into": 2000000006, - "values": [1585947, 1585948], - "qualifier": "IN" - } - ], - "EMPLOYMENT":[ - { - "comment":"this will generalize values to unemployed or not employed. first row is basics survey answer values and the second row is cope survey answer values", - "qualifier":"IN", - "values":[1585955, 1585956, 1585957, 1585958, 1585959, 1585960, - 1333224, 1333132, 1332926, 1332757, 1332716, 1333197], - "into":2000000005, - "drop_duplicates": "True" - }, - { - "comment":"generalizing to currently employed. first row is basics survey answer values and the second row is cope survey answer values.", - "qualifier":"IN", - "values":[1585953, 1585954, - 1333341, 1333321], - "into":2000000004, - "drop_duplicates": "True" - } + { + "_id": "generalize", + "RACE": [ + { + "comment": [ + "aggregate multi-race answers before generalizing ", + "single race answers. treat HLS as an ethnicity, ", + "not a race." 
+ ], + "apply": "SQL", + "statement": [ + "(SELECT COUNT(obs.person_id) ", + "FROM :idataset.:table AS obs ", + "WHERE obs.person_id = :table.person_id ", + "AND obs.value_source_concept_id IN (1586141, 1586142, 1586143, 1586144, 1586145, 1586146) ", + "GROUP BY obs.person_id ) " + ], + "qualifier": " > 1 AND value_source_concept_id != 1586147 ", + "into": 2000000008, + "drop_duplicates": "True" + }, + { + "comment": "generalize single race values", + "values": [ + 1586141, + 1586144, + 1586145 + ], + "into": 2000000001, + "qualifier": "IN" + } + ], + "SEXUAL-ORIENTATION": [ + { + "comment": [ + "multi sexual orientation generalization rule, the values the count is ", + "limited to is provided by the 'on' key. it is best to ", + "perform aggregate functions before other generalizations" + ], + "apply": "COUNT", + "into": 2000000003, + "qualifier": "> 1", + "values": [ + 903096, + 903079, + 1585901, + 1585902, + 1585903, + 1585904, + 1585900 + ], + "drop_duplicates": "True" + }, + { + "comment": "generalize single response that is not straight", + "qualifier": "IN", + "values": [ + 903096, + 903079, + 1585901, + 1585902, + 1585903, + 1585904 + ], + "into": 2000000003, + "drop_duplicates": "True" + } + ], + "SEX-AT-BIRTH": [ + { + "comment": "generalize any response that is not male and not female", + "qualifier": "IN", + "values": [ + 903096, + 903079, + 1585848, + 1585849 + ], + "into": 2000000009 + } + ], + "GENDER": [ + { + "comment": [ + "multi gender generalization rule, the values the count is ", + "limited to is provided by the 'on' key. it is best to ", + "perform aggregate functions before other generalizations" + ], + "apply": "COUNT", + "into": 2000000002, + "qualifier": "> 1", + "values": [ + 1585839, + 1585840, + 1585841, + 1585842, + 1585843 + ], + "drop_duplicates": "True" + }, + { + "comment": [ + "transgender generalization rule. if sex-at-birth does ", + "not match the identified gender, generalize the gender. ", + "generalizes born female but gender is male, or born ", + "male but gender is female. all other gender choices are ", + "already generalized" + ], + "apply": "SQL", + "statement": [ + ":table_id IN (", + "SELECT gender.:table_id ", + "FROM :idataset.:table AS gender ", + "JOIN :idataset.:table AS sex_at_birth ", + "USING (person_id) ", + "WHERE gender.observation_source_concept_id = 1585838 ", + "AND ((gender.:fields = 1585839 AND sex_at_birth.:fields = 1585847) ", + "OR (gender.:fields = 1585840 AND sex_at_birth.:fields = 1585846)))" + ], + "into": 2000000002 + }, + { + "comment": "values that are not male or female will be generalized into other", + "qualifier": "IN", + "values": [ + 903096, + 903079, + 1585843, + 1585841, + 1585842 + ], + "into": 2000000002 + } + ], + "EDUCATION": [ + { + "comment": "generalizing to no highschool degree", + "qualifier": "IN", + "values": [ + 1585941, + 1585942, + 1585943, + 1585944 + ], + "into": 2000000007 + }, + { + "comment": "generalizing to above highschool degree", + "into": 2000000006, + "values": [ + 1585947, + 1585948 + ], + "qualifier": "IN" + } + ], + "EMPLOYMENT": [ + { + "comment": "this will generalize values to unemployed or not employed. first row is basics survey answer values and the second row is cope survey answer values", + "qualifier": "IN", + "values": [ + 1585955, + 1585956, + 1585957, + 1585958, + 1585959, + 1585960, + 1333224, + 1333132, + 1332926, + 1332757, + 1332716, + 1333197 + ], + "into": 2000000005, + "drop_duplicates": "True" + }, + { + "comment": "generalizing to currently employed. 
first row is basics survey answer values and the second row is cope survey answer values.", + "qualifier": "IN", + "values": [ + 1585953, + 1585954, + 1333341, + 1333321 + ], + "into": 2000000004, + "drop_duplicates": "True" + } + ] + }, + { + "_id": "suppress", + "DEMOGRAPHICS-COLUMNS": [ + { + "comment": "list of columns to be suppressed regardless of the table", + "values": [ + "year_of_birth", + "month_of_birth", + "day_of_birth", + "race_concept_id", + "ethnicity_concept_id", + "location_id", + "provider_id", + "care_site_id", + "person_source_value", + "gender_concept_id", + "gender_source_value", + "gender_source_concept_id", + "race_source_value", + "race_source_concept_id", + "ethnicity_source_value", + "ethnicity_source_concept_id", + "cause_concept_id", + "cause_source_value", + "cause_source_concept_id", + "value_source_value", + "value_as_string" ] - - }, - { - "_id":"suppress", - "DEMOGRAPHICS-COLUMNS":[ - { - "comment":"list of columns to be suppressed regardless of the table", - "values": [ - "year_of_birth","month_of_birth","day_of_birth", - "race_concept_id","ethnicity_concept_id","location_id", - "provider_id","care_site_id","person_source_value", - "gender_concept_id","gender_source_value","gender_source_concept_id", - "race_source_value","race_source_concept_id", - "ethnicity_source_value","ethnicity_source_concept_id", - "cause_concept_id","cause_source_value","cause_source_concept_id", - "value_source_value","value_as_string" - ] - } - ], - "FILTERS":[ - {"filter":"person_id IN (SELECT DISTINCT person_id FROM :idataset._deid_map)"} - ], - "ICD-9":[ - {"apply":"REGEXP", "values":["^E8[0-4"],"description":"rare accidents"}, - {"apply":"REGEXP", "values":["^E8[0-4]"],"description":"rare accidents"}, - { - "apply":"REGEXP", - "values":[ - "^E8[0-4].*","^E91[0,3].*","^E9([9,7,6,5]|28.0).*", - "^79[8,9].*","^V3.*","^(76[4-9]|77[0-9])\\\\.([0-9]){0,2}.*", - "^P[0-9]{2}\\\\.[0-9]{1}.*","^Z38.*","^R99.*", - "^Y3[5,6,7,8].*","^x52.*","^(W6[5-9]|W7[0-4][0-9]).*", - "^(X92[0-9]|Y0[1-9]).*","^V[0-9]{2}.*" - ] - } + } + ], + "FILTERS": [ + { + "filter": "person_id IN (SELECT DISTINCT person_id FROM :idataset._deid_map)" + } + ], + "ICD-9": [ + { + "apply": "REGEXP", + "values": [ + "^E8[0-4" + ], + "description": "rare accidents" + }, + { + "apply": "REGEXP", + "values": [ + "^E8[0-4]" + ], + "description": "rare accidents" + }, + { + "apply": "REGEXP", + "values": [ + "^E8[0-4].*", + "^E91[0,3].*", + "^E9([9,7,6,5]|28.0).*", + "^79[8,9].*", + "^V3.*", + "^(76[4-9]|77[0-9])\\\\.([0-9]){0,2}.*", + "^P[0-9]{2}\\\\.[0-9]{1}.*", + "^Z38.*", + "^R99.*", + "^Y3[5,6,7,8].*", + "^x52.*", + "^(W6[5-9]|W7[0-4][0-9]).*", + "^(X92[0-9]|Y0[1-9]).*", + "^V[0-9]{2}.*" ] - }, - { - "_id": "shift", - "day": "SELECT date_shift from :table where :key_field=:key_value ", - "date": "DATE_SUB( CAST(:FIELD AS DATE), INTERVAL (:SHIFT) DAY) AS :FIELD", - "datetime": "TIMESTAMP_SUB( CAST(:FIELD AS TIMESTAMP), INTERVAL (:SHIFT) DAY) AS :FIELD" - }, - { - "_id": "compute", - "id": ["(SELECT research_id FROM :idataset._deid_map WHERE _deid_map.person_id = :value_field) as :FIELD"], - "month": ["EXTRACT (MONTH FROM :value_field) AS :FIELD"], - "day": ["EXTRACT (DAY FROM :value_field) AS :FIELD"], - "year": ["EXTRACT (YEAR FROM :value_field) AS :FIELD"] - }, - { - "_id": "dml_statements", + } + ] + }, + { + "_id": "shift", + "day": "SELECT date_shift from :table where :key_field=:key_value ", + "date": "DATE_SUB( CAST(:FIELD AS DATE), INTERVAL (:SHIFT) DAY) AS :FIELD", + "datetime": "TIMESTAMP_SUB( CAST(:FIELD 
AS TIMESTAMP), INTERVAL (:SHIFT) DAY) AS :FIELD" + }, + { + "_id": "compute", + "id": [ + "(SELECT research_id FROM :idataset._deid_map WHERE _deid_map.person_id = :value_field) as :FIELD" + ], + "month": [ + "EXTRACT (MONTH FROM :value_field) AS :FIELD" + ], + "day": [ + "EXTRACT (DAY FROM :value_field) AS :FIELD" + ], + "year": [ + "EXTRACT (YEAR FROM :value_field) AS :FIELD" + ] + }, + { + "_id": "dml_statements", + "comment": [ + "A place to define Data Manipulation Statments to execute after creating ", + "a de-identified table. The statements are defined as a list of dictionaries ", + "containing a comment about the intent of each statement and an SQL DML statement." + ], + "observation": [ + { "comment": [ - "A place to define Data Manipulation Statments to execute after creating ", - "a de-identified table. The statements are defined as a list of dictionaries ", - "containing a comment about the intent of each statement and an SQL DML statement." - ], - "observation": [ - { - "comment": [ - "SQL to delete extra rows created by generalization rules. ", - "This can happen when questions with multiple possible answers are ", - "generalized. The exact delete statement to use will depend on the ", - "table receiving the generalization and its available fields." - ], - "name": "drop_generalized_duplicates", - "label": "cleaning_query", - "statement": [ - "DELETE FROM", - ":odataset.observation AS o ", - "WHERE observation_id in (", - "SELECT observation_id FROM (", - "SELECT DENSE_RANK() OVER(PARTITION BY person_id, observation_source_concept_id, value_source_concept_id ", - "ORDER BY observation_datetime DESC, observation_id DESC) AS rank_order, ", - "observation_id ", - "FROM :odataset.observation ", - "JOIN :idataset._mapping_observation AS map ", - "USING (observation_id) ", - "WHERE observation_source_concept_id IN (:key_values) ", - "AND value_source_concept_id IN (:generalized_values) ", - "AND map.src_hpo_id like \"rdr\"", - ") o ", - "WHERE o.rank_order <> 1 ", - ")" - ] - } + "SQL to delete extra rows created by generalization rules. ", + "This can happen when questions with multiple possible answers are ", + "generalized. The exact delete statement to use will depend on the ", + "table receiving the generalization and its available fields." 
+ ], + "name": "drop_generalized_duplicates", + "label": "cleaning_query", + "statement": [ + "DELETE FROM", + ":odataset.observation AS o ", + "WHERE observation_id in (", + "SELECT observation_id FROM (", + "SELECT DENSE_RANK() OVER(PARTITION BY person_id, observation_source_concept_id, value_source_concept_id ", + "ORDER BY observation_datetime DESC, observation_id DESC) AS rank_order, ", + "observation_id ", + "FROM :odataset.observation ", + "JOIN :idataset._mapping_observation AS map ", + "USING (observation_id) ", + "WHERE observation_source_concept_id IN (:key_values) ", + "AND value_source_concept_id IN (:generalized_values) ", + "AND map.src_hpo_id in (\"ce\", \"healthpro\", \"vibrent\")", + ") o ", + "WHERE o.rank_order <> 1 ", + ")" ] - } + } + ] + } ] diff --git a/data_steward/deid/parser.py b/data_steward/deid/parser.py index 353c2aa872..ce73149441 100644 --- a/data_steward/deid/parser.py +++ b/data_steward/deid/parser.py @@ -135,6 +135,11 @@ def parse_args(raw_args=None): dest='rules', help='Filepath to the JSON file containing rules', required=True) + parser.add_argument('--run_as', + action='store', + dest='run_as_email', + help='Service account email address to impersonate', + required=True) parser.add_argument('--idataset', action='store', dest='idataset', diff --git a/data_steward/deid/press.py b/data_steward/deid/press.py index 152bcde488..6b1964f77b 100644 --- a/data_steward/deid/press.py +++ b/data_steward/deid/press.py @@ -18,6 +18,8 @@ import app_identity from resources import fields_for from deid.rules import Deid, create_on_string +from utils import auth +from common import CDR_SCOPES LOGGER = logging.getLogger(__name__) @@ -70,10 +72,16 @@ def __init__(self, **args): self.idataset = args.get('idataset', '') self.odataset = args.get('odataset', '') self.tablepath = args.get('table') + self.run_as_email = args.get('run_as_email', '') + self.credentials = None + if self.run_as_email: + self.credentials = auth.get_impersonation_credentials( + self.run_as_email, CDR_SCOPES) self.tablename = os.path.basename( self.tablepath).split('.json')[0].strip() self.project_id = app_identity.get_application_id() - self.bq_client = BigQueryClient(project_id=self.project_id) + self.bq_client = BigQueryClient(project_id=self.project_id, + credentials=self.credentials) self.logpath = args.get('logs', 'logs') set_up_logging(self.logpath, self.idataset) diff --git a/data_steward/gcloud/bq/__init__.py b/data_steward/gcloud/bq/__init__.py index 2454ffc464..b9fd9ce517 100644 --- a/data_steward/gcloud/bq/__init__.py +++ b/data_steward/gcloud/bq/__init__.py @@ -24,7 +24,8 @@ # Project imports from utils import auth -from resources import fields_for, get_and_validate_schema_fields, replace_special_characters_for_labels +from resources import fields_for, get_and_validate_schema_fields, replace_special_characters_for_labels, \ + is_rdr_dataset, is_mapping_table from constants.utils import bq as consts from common import JINJA_ENV, IDENTITY_MATCH, PARTICIPANT_MATCH @@ -481,6 +482,9 @@ def build_and_copy_contents(self, src_dataset: str, dest_dataset: str): for table_item in table_list: # create empty schemaed table with client object try: + if is_rdr_dataset(src_dataset) and is_mapping_table( + table_item.table_id): + raise RuntimeError schema_list = self.get_table_schema(table_item.table_id) except RuntimeError as re: schema_list = None diff --git a/data_steward/resource_files/schemas/aou_custom/duplicates_table.json b/data_steward/resource_files/schemas/aou_custom/duplicates_table.json new 
file mode 100644 index 0000000000..e364c54c9f --- /dev/null +++ b/data_steward/resource_files/schemas/aou_custom/duplicates_table.json @@ -0,0 +1,32 @@ +[ + { + "type": "integer", + "name": "participant_id", + "mode": "nullable", + "description": "" + }, + { + "name": "hpo_id", + "type": "integer", + "mode": "nullable", + "description": "" + }, + { + "name": "src_id", + "type": "string", + "mode": "nullable", + "description": "" + }, + { + "name": "consent_for_study_enrollment_authored", + "type": "date", + "mode": "nullable", + "description": "" + }, + { + "name": "withdrawal_status", + "type": "string", + "mode": "nullable", + "description": "" + } +] \ No newline at end of file diff --git a/data_steward/resource_files/schemas/cdm/clinical/survey_conduct.json b/data_steward/resource_files/schemas/cdm/clinical/survey_conduct.json index ee1eb8bccf..de99ea5397 100644 --- a/data_steward/resource_files/schemas/cdm/clinical/survey_conduct.json +++ b/data_steward/resource_files/schemas/cdm/clinical/survey_conduct.json @@ -122,7 +122,7 @@ { "name": "validated_survey_source_value", "description": "Source value representing the validation status of the survey.", - "type": "integer", + "type": "string", "mode": "nullable" }, { diff --git a/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_attribute_definition.json b/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_attribute_definition.json new file mode 100644 index 0000000000..7da0b65d01 --- /dev/null +++ b/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_attribute_definition.json @@ -0,0 +1,38 @@ +[ + { + "type": "integer", + "name": "attribute_definition_id", + "mode": "required", + "description": "A unique identifier for each Attribute." + }, + { + "type": "string", + "name": "attribute_name", + "mode": "required", + "description": "A short description of the Attribute." + }, + { + "type": "string", + "name": "attribute_description", + "mode": "nullable", + "description": "A complete description of the Attribute definition." + }, + { + "type": "integer", + "name": "attribute_type_concept_id", + "mode": "required", + "description": "Type defining what kind of Attribute Definition the record represents and how the syntax may be executed." + }, + { + "type": "string", + "name": "attribute_syntax", + "mode": "nullable", + "description": "Syntax or code to operationalize the Attribute definition." + }, + { + "type": "string", + "name": "src_id", + "mode": "required", + "description": "The source of the record." + } +] \ No newline at end of file diff --git a/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_cohort_definition.json b/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_cohort_definition.json new file mode 100644 index 0000000000..c6dba2558b --- /dev/null +++ b/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_cohort_definition.json @@ -0,0 +1,50 @@ +[ + { + "type": "integer", + "name": "cohort_definition_id", + "mode": "required", + "description": "A unique identifier for each Cohort." + }, + { + "type": "string", + "name": "cohort_definition_name", + "mode": "required", + "description": "A short description of the Cohort." 
+ }, + { + "type": "string", + "name": "cohort_definition_description", + "mode": "nullable", + "description": "A complete description of the Cohort definition" + }, + { + "type": "integer", + "name": "definition_type_concept_id", + "mode": "required", + "description": "Type defining what kind of Cohort Definition the record represents and how the syntax may be executed" + }, + { + "type": "string", + "name": "cohort_definition_syntax", + "mode": "nullable", + "description": "Syntax or code to operationalize the Cohort definition" + }, + { + "type": "integer", + "name": "subject_concept_id", + "mode": "required", + "description": "A foreign key to the Concept to which defines the domain of subjects that are members of the cohort (e.g., Person, Provider, Visit)." + }, + { + "type": "date", + "name": "cohort_initiation_date", + "mode": "nullable", + "description": "A date to indicate when the Cohort was initiated in the COHORT table." + }, + { + "type": "string", + "name": "src_id", + "mode": "required", + "description": "The source of the record." + } +] \ No newline at end of file diff --git a/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_device_cost.json b/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_device_cost.json new file mode 100644 index 0000000000..a4ca523ba4 --- /dev/null +++ b/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_device_cost.json @@ -0,0 +1,63 @@ +[ + { + "type": "integer", + "name": "device_cost_id", + "mode": "required" + }, + { + "type": "integer", + "name": "device_exposure_id", + "mode": "required" + }, + { + "type": "integer", + "name": "currency_concept_id", + "mode": "nullable" + }, + { + "type": "float", + "name": "paid_copay", + "mode": "nullable" + }, + { + "type": "float", + "name": "paid_coinsurance", + "mode": "nullable" + }, + { + "type": "float", + "name": "paid_toward_deductible", + "mode": "nullable" + }, + { + "type": "float", + "name": "paid_by_payer", + "mode": "nullable" + }, + { + "type": "float", + "name": "paid_by_coordination_benefits", + "mode": "nullable" + }, + { + "type": "float", + "name": "total_out_of_pocket", + "mode": "nullable" + }, + { + "type": "float", + "name": "total_paid", + "mode": "nullable" + }, + { + "type": "integer", + "name": "payer_plan_period_id", + "mode": "nullable" + }, + { + "type": "string", + "name": "src_id", + "mode": "required", + "description": "The source of the record." 
+ } +] \ No newline at end of file diff --git a/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_drug_cost.json b/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_drug_cost.json new file mode 100644 index 0000000000..73cf256d27 --- /dev/null +++ b/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_drug_cost.json @@ -0,0 +1,78 @@ +[ + { + "type": "integer", + "name": "drug_cost_id", + "mode": "required" + }, + { + "type": "integer", + "name": "drug_exposure_id", + "mode": "required" + }, + { + "type": "integer", + "name": "currency_concept_id", + "mode": "nullable" + }, + { + "type": "float", + "name": "paid_copay", + "mode": "nullable" + }, + { + "type": "float", + "name": "paid_coinsurance", + "mode": "nullable" + }, + { + "type": "float", + "name": "paid_toward_deductible", + "mode": "nullable" + }, + { + "type": "float", + "name": "paid_by_payer", + "mode": "nullable" + }, + { + "type": "float", + "name": "paid_by_coordination_benefits", + "mode": "nullable" + }, + { + "type": "float", + "name": "total_out_of_pocket", + "mode": "nullable" + }, + { + "type": "float", + "name": "total_paid", + "mode": "nullable" + }, + { + "type": "float", + "name": "ingredient_cost", + "mode": "nullable" + }, + { + "type": "float", + "name": "dispensing_fee", + "mode": "nullable" + }, + { + "type": "float", + "name": "average_wholesale_price", + "mode": "nullable" + }, + { + "type": "integer", + "name": "payer_plan_period_id", + "mode": "nullable" + }, + { + "type": "string", + "name": "src_id", + "mode": "required", + "description": "The source of the record." + } +] \ No newline at end of file diff --git a/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_note.json b/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_note.json new file mode 100644 index 0000000000..0b2b8678fb --- /dev/null +++ b/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_note.json @@ -0,0 +1,92 @@ +[ + { + "type": "integer", + "name": "note_id", + "mode": "required", + "description": "A unique identifier for each note." + }, + { + "type": "integer", + "name": "person_id", + "mode": "required", + "description": "A foreign key identifier to the Person about whom the Note was recorded. The demographic details of that Person are stored in the PERSON table." + }, + { + "type": "date", + "name": "note_date", + "mode": "required", + "description": "The date the note was recorded." + }, + { + "type": "timestamp", + "name": "note_datetime", + "mode": "required", + "description": "The date and time the note was recorded." + }, + { + "type": "integer", + "name": "note_type_concept_id", + "mode": "required", + "description": "A foreign key to the predefined Concept in the Standardized Vocabularies reflecting the type, origin or provenance of the Note." + }, + { + "type": "integer", + "name": "note_class_concept_id", + "mode": "required", + "description": "A foreign key to the predefined Concept in the Standardized Vocabularies reflecting the HL7 LOINC Document Type Vocabulary classification of the note." + }, + { + "type": "string", + "name": "note_title", + "mode": "required", + "description": "The title of the Note as it appears in the source." + }, + { + "type": "string", + "name": "note_text", + "mode": "required", + "description": "The content of the Note." 
+  },
+  {
+    "type": "integer",
+    "name": "encoding_concept_id",
+    "mode": "required",
+    "description": "A foreign key to the predefined Concept in the Standardized Vocabularies reflecting the note character encoding type"
+  },
+  {
+    "type": "integer",
+    "name": "language_concept_id",
+    "mode": "required",
+    "description": "A foreign key to the predefined Concept in the Standardized Vocabularies reflecting the language of the note"
+  },
+  {
+    "type": "integer",
+    "name": "provider_id",
+    "mode": "nullable",
+    "description": "A foreign key to the Provider in the PROVIDER table who took the Note."
+  },
+  {
+    "type": "integer",
+    "name": "visit_occurrence_id",
+    "mode": "nullable",
+    "description": "A foreign key to the Visit in the VISIT_OCCURRENCE table when the Note was taken."
+  },
+  {
+    "type": "integer",
+    "name": "visit_detail_id",
+    "mode": "nullable",
+    "description": "The Visit Detail during which the note was written."
+  },
+  {
+    "type": "string",
+    "name": "note_source_value",
+    "mode": "nullable",
+    "description": "The source value associated with the origin of the Note"
+  },
+  {
+    "type": "string",
+    "name": "src_id",
+    "mode": "required",
+    "description": "The source of the record."
+  }
+]
\ No newline at end of file
diff --git a/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_pdr_ehr_list.json b/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_pdr_ehr_list.json
new file mode 100644
index 0000000000..fc7c5ddaa2
--- /dev/null
+++ b/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_pdr_ehr_list.json
@@ -0,0 +1,32 @@
+[
+  {
+    "type": "integer",
+    "name": "person_id",
+    "mode": "nullable",
+    "description": "A unique identifier for each participant."
+  },
+  {
+    "type": "integer",
+    "name": "hpo_id",
+    "mode": "nullable",
+    "description": "Identifier for the HPO site the participant is paired with."
+  },
+  {
+    "type": "string",
+    "name": "participant_origin",
+    "mode": "nullable",
+    "description": "The participant portal that this record originated from."
+  },
+  {
+    "type": "string",
+    "name": "consent_for_study_enrollment_authored",
+    "mode": "nullable",
+    "description": "The date and time the study enrollment consent was authored."
+  },
+  {
+    "type": "string",
+    "name": "consent_for_electronic_health_records",
+    "mode": "nullable",
+    "description": "The participant's EHR consent status."
+  }
+]
\ No newline at end of file
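The `pdr_*` lookup tables are copied as exported from the RDR, so the consent fields arrive as strings rather than timestamps. A hedged sketch of reading the EHR-consent list, reusing the `JINJA_ENV` convention from the cleaning rules earlier in this PR; project and dataset names are illustrative:

```python
# Sketch: pull participants with an EHR consent record from the new lookup.
from common import JINJA_ENV

tpl = JINJA_ENV.from_string('''
SELECT person_id, participant_origin
FROM `{{project_id}}.{{rdr_dataset_id}}.pdr_ehr_list`
WHERE consent_for_electronic_health_records IS NOT NULL
''')
query = tpl.render(project_id='my-project', rdr_dataset_id='my_rdr')
```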
diff --git a/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_pdr_withdrawals_list.json b/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_pdr_withdrawals_list.json
new file mode 100644
index 0000000000..a9a8cf1483
--- /dev/null
+++ b/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_pdr_withdrawals_list.json
@@ -0,0 +1,32 @@
+[
+  {
+    "type": "integer",
+    "name": "person_id",
+    "mode": "nullable",
+    "description": "A unique identifier for each participant."
+  },
+  {
+    "type": "integer",
+    "name": "hpo_id",
+    "mode": "nullable",
+    "description": "Identifier for the HPO site the participant is paired with."
+  },
+  {
+    "type": "string",
+    "name": "participant_origin",
+    "mode": "nullable",
+    "description": "The participant portal that this record originated from."
+  },
+  {
+    "type": "string",
+    "name": "consent_for_study_enrollment_authored",
+    "mode": "nullable",
+    "description": "The date and time the study enrollment consent was authored."
+  },
+  {
+    "type": "string",
+    "name": "withdrawal_status",
+    "mode": "nullable",
+    "description": "The participant's withdrawal status."
+  }
+]
\ No newline at end of file
diff --git a/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_procedure_cost.json b/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_procedure_cost.json
new file mode 100644
index 0000000000..ee8fb07013
--- /dev/null
+++ b/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_procedure_cost.json
@@ -0,0 +1,73 @@
+[
+  {
+    "type": "integer",
+    "name": "procedure_cost_id",
+    "mode": "required"
+  },
+  {
+    "type": "integer",
+    "name": "procedure_occurrence_id",
+    "mode": "required"
+  },
+  {
+    "type": "integer",
+    "name": "currency_concept_id",
+    "mode": "nullable"
+  },
+  {
+    "type": "float",
+    "name": "paid_copay",
+    "mode": "nullable"
+  },
+  {
+    "type": "float",
+    "name": "paid_coinsurance",
+    "mode": "nullable"
+  },
+  {
+    "type": "float",
+    "name": "paid_toward_deductible",
+    "mode": "nullable"
+  },
+  {
+    "type": "float",
+    "name": "paid_by_payer",
+    "mode": "nullable"
+  },
+  {
+    "type": "float",
+    "name": "paid_by_coordination_benefits",
+    "mode": "nullable"
+  },
+  {
+    "type": "float",
+    "name": "total_out_of_pocket",
+    "mode": "nullable"
+  },
+  {
+    "type": "float",
+    "name": "total_paid",
+    "mode": "nullable"
+  },
+  {
+    "type": "integer",
+    "name": "revenue_code_concept_id",
+    "mode": "nullable"
+  },
+  {
+    "type": "integer",
+    "name": "payer_plan_period_id",
+    "mode": "nullable"
+  },
+  {
+    "type": "string",
+    "name": "revenue_code_source_value",
+    "mode": "nullable"
+  },
+  {
+    "type": "string",
+    "name": "src_id",
+    "mode": "required",
+    "description": "The source of the record."
+  }
+]
\ No newline at end of file
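Every schema under `rdr_src_id/tables` has to load cleanly into BigQuery, so a cheap lint is to confirm each field object carries the keys the schema loader expects. A sketch, assuming it runs from the repository root; the path comes from the file headers above:

```python
import json
from pathlib import Path

# Lint the new rdr_src_id schemas: every field needs name, type, and mode.
TABLES_DIR = Path('data_steward/resource_files/schemas/rdr_src_id/tables')

for schema_file in sorted(TABLES_DIR.glob('rdr_*.json')):
    for field in json.loads(schema_file.read_text()):
        missing = {'name', 'type', 'mode'} - field.keys()
        assert not missing, f'{schema_file.name}: {field.get("name")} missing {missing}'
```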
diff --git a/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_specimen.json b/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_specimen.json
new file mode 100644
index 0000000000..e8e2a8cac7
--- /dev/null
+++ b/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_specimen.json
@@ -0,0 +1,98 @@
+[
+  {
+    "type": "integer",
+    "name": "specimen_id",
+    "mode": "required",
+    "description": "A unique identifier for each specimen."
+  },
+  {
+    "type": "integer",
+    "name": "person_id",
+    "mode": "required",
+    "description": "A foreign key identifier to the Person for whom the Specimen is recorded."
+  },
+  {
+    "type": "integer",
+    "name": "specimen_concept_id",
+    "mode": "required",
+    "description": "A foreign key referring to a Standard Concept identifier in the Standardized Vocabularies for the Specimen."
+  },
+  {
+    "type": "integer",
+    "name": "specimen_type_concept_id",
+    "mode": "required",
+    "description": "A foreign key referring to the Concept identifier in the Standardized Vocabularies reflecting the system of record from which the Specimen was represented in the source data."
+  },
+  {
+    "type": "date",
+    "name": "specimen_date",
+    "mode": "required",
+    "description": "The date the specimen was obtained from the Person."
+  },
+  {
+    "type": "timestamp",
+    "name": "specimen_datetime",
+    "mode": "nullable",
+    "description": "The date and time on the date when the Specimen was obtained from the person."
+  },
+  {
+    "type": "float",
+    "name": "quantity",
+    "mode": "nullable",
+    "description": "The amount of specimen collected from the person during the sampling procedure."
+  },
+  {
+    "type": "integer",
+    "name": "unit_concept_id",
+    "mode": "nullable",
+    "description": "A foreign key to a Standard Concept identifier for the Unit associated with the numeric quantity of the Specimen collection."
+  },
+  {
+    "type": "integer",
+    "name": "anatomic_site_concept_id",
+    "mode": "nullable",
+    "description": "A foreign key to a Standard Concept identifier for the anatomic location of specimen collection."
+  },
+  {
+    "type": "integer",
+    "name": "disease_status_concept_id",
+    "mode": "nullable",
+    "description": "A foreign key to a Standard Concept identifier for the Disease Status of specimen collection."
+  },
+  {
+    "type": "string",
+    "name": "specimen_source_id",
+    "mode": "nullable",
+    "description": "The Specimen identifier as it appears in the source data."
+  },
+  {
+    "type": "string",
+    "name": "specimen_source_value",
+    "mode": "nullable",
+    "description": "The Specimen value as it appears in the source data. This value is mapped to a Standard Concept in the Standardized Vocabularies and the original code is stored here for reference."
+  },
+  {
+    "type": "string",
+    "name": "unit_source_value",
+    "mode": "nullable",
+    "description": "The information about the Unit as detailed in the source."
+  },
+  {
+    "type": "string",
+    "name": "anatomic_site_source_value",
+    "mode": "nullable",
+    "description": "The information about the anatomic site as detailed in the source."
+  },
+  {
+    "type": "string",
+    "name": "disease_status_source_value",
+    "mode": "nullable",
+    "description": "The information about the disease status as detailed in the source."
+  },
+  {
+    "type": "string",
+    "name": "src_id",
+    "mode": "required",
+    "description": "The source of the record."
+  }
+]
\ No newline at end of file
diff --git a/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_survey_conduct.json b/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_survey_conduct.json
index 37fd06bfe8..5d58f0912e 100644
--- a/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_survey_conduct.json
+++ b/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_survey_conduct.json
@@ -122,7 +122,7 @@
 {
     "name": "validated_survey_source_value",
     "description": "Source value representing the validation status of the survey.",
-    "type": "integer",
+    "type": "string",
     "mode": "nullable"
 },
 {
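The `validated_survey_source_value` retype from integer to string is made in both the CDM copy and this rdr_src_id copy of `survey_conduct`. A spot check against the packaged CDM schema, assuming `data_steward` is on `PYTHONPATH` (`fields_for` is the schema loader in `resources.py`):

```python
from resources import fields_for

# fields_for returns the list of field dicts for a table's schema JSON.
types = {field['name']: field['type'] for field in fields_for('survey_conduct')}
assert types['validated_survey_source_value'] == 'string'
```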
diff --git a/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_visit_cost.json b/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_visit_cost.json
new file mode 100644
index 0000000000..b17b0e496c
--- /dev/null
+++ b/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_visit_cost.json
@@ -0,0 +1,63 @@
+[
+  {
+    "type": "integer",
+    "name": "visit_cost_id",
+    "mode": "required"
+  },
+  {
+    "type": "integer",
+    "name": "visit_occurrence_id",
+    "mode": "required"
+  },
+  {
+    "type": "integer",
+    "name": "currency_concept_id",
+    "mode": "nullable"
+  },
+  {
+    "type": "float",
+    "name": "paid_copay",
+    "mode": "nullable"
+  },
+  {
+    "type": "float",
+    "name": "paid_coinsurance",
+    "mode": "nullable"
+  },
+  {
+    "type": "float",
+    "name": "paid_toward_deductible",
+    "mode": "nullable"
+  },
+  {
+    "type": "float",
+    "name": "paid_by_payer",
+    "mode": "nullable"
+  },
+  {
+    "type": "float",
+    "name": "paid_by_coordination_benefits",
+    "mode": "nullable"
+  },
+  {
+    "type": "float",
+    "name": "total_out_of_pocket",
+    "mode": "nullable"
+  },
+  {
+    "type": "float",
+    "name": "total_paid",
+    "mode": "nullable"
+  },
+  {
+    "type": "integer",
+    "name": "payer_plan_period_id",
+    "mode": "nullable"
+  },
+  {
+    "type": "string",
+    "name": "src_id",
+    "mode": "required",
+    "description": "The source of the record."
+  }
+]
\ No newline at end of file
diff --git a/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_wear_consent.json b/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_wear_consent.json
new file mode 100644
index 0000000000..b5809ef956
--- /dev/null
+++ b/data_steward/resource_files/schemas/rdr_src_id/tables/rdr_wear_consent.json
@@ -0,0 +1,32 @@
+[
+  {
+    "type": "integer",
+    "name": "person_id",
+    "mode": "nullable",
+    "description": "A unique identifier for each participant."
+  },
+  {
+    "type": "integer",
+    "name": "research_id",
+    "mode": "nullable",
+    "description": "The de-identified research identifier mapped to the participant."
+  },
+  {
+    "type": "timestamp",
+    "name": "authored",
+    "mode": "nullable",
+    "description": "The date and time the WEAR consent was authored."
+  },
+  {
+    "type": "string",
+    "name": "consent_status",
+    "mode": "nullable",
+    "description": "The participant's WEAR consent status."
+  },
+  {
+    "type": "string",
+    "name": "src_id",
+    "mode": "nullable",
+    "description": "The source of the record."
+  }
+]
\ No newline at end of file
diff --git a/data_steward/resource_files/schemas/wearables/fitbit/activity_summary.json b/data_steward/resource_files/schemas/wearables/fitbit/activity_summary.json
index b26e49520d..e3e9ea2909 100644
--- a/data_steward/resource_files/schemas/wearables/fitbit/activity_summary.json
+++ b/data_steward/resource_files/schemas/wearables/fitbit/activity_summary.json
@@ -27,7 +27,7 @@
     "type": "float",
     "name": "elevation",
     "mode": "nullable",
-    "description": "The elevation traveled for the day displayed in the units defined by the data source. When src_id is 'vibrent', the unit is feet. When src_id is 'ce', the unit is meters."
+    "description": "The elevation traveled for the day displayed in the units defined by the data source. When src_id is 'PTSC', the unit is feet. When src_id is 'TPC', the unit is meters."
   },
   {
     "type": "float",
@@ -83,4 +83,4 @@
     "mode": "nullable",
     "description": "Flag indicating the participant portal that this record originated from."
   }
-]
+]
\ No newline at end of file
diff --git a/data_steward/resource_files/schemas/wearables/fitbit/heart_rate_minute_level.json b/data_steward/resource_files/schemas/wearables/fitbit/heart_rate_intraday.json
similarity index 82%
rename from data_steward/resource_files/schemas/wearables/fitbit/heart_rate_minute_level.json
rename to data_steward/resource_files/schemas/wearables/fitbit/heart_rate_intraday.json
index c1452062fa..6c7f88c116 100644
--- a/data_steward/resource_files/schemas/wearables/fitbit/heart_rate_minute_level.json
+++ b/data_steward/resource_files/schemas/wearables/fitbit/heart_rate_intraday.json
@@ -9,7 +9,7 @@
     "type": "integer",
     "name": "heart_rate_value",
     "mode": "nullable",
-    "description": "Heart rate value on a minute level."
+    "description": "Heart rate value. The value is at minute level for PTSC records and at second level for TPC records."
   },
   {
     "type": "integer",
@@ -23,4 +23,4 @@
     "mode": "nullable",
     "description": "Flag indicating the participant portal that this record originated from."
   }
-]
+]
\ No newline at end of file
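With the `vibrent`/`ce` flags replaced by `PTSC`/`TPC` in the wearables descriptions, per-source units still differ: elevation arrives in feet for PTSC and meters for TPC. A small normalization sketch; the helper and constant are illustrative, not part of this PR:

```python
FEET_TO_METERS = 0.3048

def elevation_in_meters(elevation: float, src_id: str) -> float:
    """Normalize activity_summary.elevation; PTSC reports feet, TPC meters."""
    return elevation * FEET_TO_METERS if src_id == 'PTSC' else elevation

assert elevation_in_meters(100.0, 'TPC') == 100.0
assert round(elevation_in_meters(100.0, 'PTSC'), 2) == 30.48
```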
diff --git a/data_steward/resources.py b/data_steward/resources.py
index 47392757d6..c9fa716356 100644
--- a/data_steward/resources.py
+++ b/data_steward/resources.py
@@ -648,6 +648,16 @@ def get_primary_key(table: str) -> List[str]:
     return f'{table}_id' if has_primary_key(table) else None


+def is_rdr_dataset(dataset_id):
+    """
+    Returns boolean indicating if a dataset is an rdr dataset using the dataset_id
+    :param dataset_id: Identifies the dataset
+    :return: Boolean indicating if the dataset is an rdr dataset
+    NOTE It returns True for rdr_xyz datasets too (e.g. rdr_sandbox)
+    """
+    return 'rdr' in dataset_id
+
+
 def get_git_tag():
     """
     gets latest git tag.
diff --git a/data_steward/tools/add_hpo.py b/data_steward/tools/add_hpo.py
index ffade9efa1..4d5f4426b2 100644
--- a/data_steward/tools/add_hpo.py
+++ b/data_steward/tools/add_hpo.py
@@ -172,6 +172,8 @@ def add_src_hpos_allowed_state_file_df(hpo_id, us_state,
     :param src_hpos_allowed_state_path: path to csv file containing hpo site information
     :raises ValueError if hpo_id already exists in the lookup table
     """
+    hpo_id = hpo_id.lower()
+
     hpo_table = bq_utils.get_hpo_site_state_info()
     hpo_table_df = pd.DataFrame(hpo_table)
     if hpo_id in set(hpo_table_df['hpo_id']) and us_state in set(
@@ -244,8 +246,9 @@ def add_src_hpos_allowed_state_csv(hpo_id, us_state, value_source_concept_id,
     hpo_file_df = add_src_hpos_allowed_state_file_df(
         hpo_id, us_state, value_source_concept_id, src_hpos_allowed_state_path)
     hpo_file_df.to_csv(src_hpos_allowed_state_path,
-                       quoting=csv.QUOTE_ALL,
-                       index=False)
+                       quoting=csv.QUOTE_NONE,
+                       index=False,
+                       float_format=lambda x: '%d' % x)


 def add_hpo_site_to_csv_files(hpo_id,
diff --git a/data_steward/tools/create_combined_backup_dataset.py b/data_steward/tools/create_combined_backup_dataset.py
index 39fffa42b2..40bf1a3e76 100644
--- a/data_steward/tools/create_combined_backup_dataset.py
+++ b/data_steward/tools/create_combined_backup_dataset.py
@@ -48,7 +48,8 @@
     MEASUREMENT_DOMAIN_CONCEPT_ID, OBSERVATION_DOMAIN_CONCEPT_ID, PERSON,
     PIPELINE_TABLES, RDR_ID_CONSTANT, SITE_MASKING_TABLE_ID, SURVEY_CONDUCT,
-    VISIT_DETAIL)
+    VISIT_DETAIL, EHR_CONSENT_VALIDATION)
+
 from resources import (fields_for, get_git_tag, has_person_id,
                        mapping_table_for, CDM_TABLES)
 from utils import auth, pipeline_logging
@@ -72,6 +73,8 @@ def assert_tables_in(client: BigQueryClient, dataset_id: str):
     table_ids = set([table.table_id for table in tables])
     LOGGER.info(f'Confirming dataset, {dataset_id}, has tables: {table_ids}')
     for table in combine_consts.TABLES_TO_PROCESS:
+        if 'unioned' in dataset_id and table == EHR_CONSENT_VALIDATION:
+            continue
         if table not in table_ids:
             raise RuntimeError(
                 f'Dataset {dataset_id} is missing table {table}. 
Aborting.') @@ -503,8 +506,8 @@ def main(raw_args=None): LOGGER.info('EHR + RDR combine started') LOGGER.info('Verifying all CDM tables in EHR and RDR datasets...') - assert_ehr_and_rdr_tables(client, args.rdr_dataset, - args.unioned_ehr_dataset) + assert_ehr_and_rdr_tables(client, args.unioned_ehr_dataset, + args.rdr_dataset) combined_sandbox = create_dataset(client, args.release_tag, 'sandbox') ehr_consent(client, args.rdr_dataset, combined_sandbox) diff --git a/data_steward/tools/create_combined_dataset.py b/data_steward/tools/create_combined_dataset.py index f8c6df2e78..1a911681d9 100644 --- a/data_steward/tools/create_combined_dataset.py +++ b/data_steward/tools/create_combined_dataset.py @@ -76,6 +76,18 @@ def parse_combined_args(raw_args=None): required=True, help='The EHR snapshot dataset ID') + parser.add_argument('--ehr_duplicates_dataset', + action='store', + dest='ehr_duplicates_dataset', + required=True, + help='The dataset that includes duplicate records') + parser.add_argument( + '--ehr_duplicates_table', + action='store', + dest='ehr_duplicates_table', + required=True, + help='The table (from the dataset) that includes duplicates PIDs') + common_args, unknown_args = parser.parse_known_args(raw_args) custom_args = clean_cdr._get_kwargs(unknown_args) return common_args, custom_args @@ -122,11 +134,29 @@ def main(raw_args=None): # clean the combined staging dataset cleaning_args = [ - '-p', args.curation_project_id, '-d', combined_staging, '-b', - combined_sandbox, '--data_stage', 'combined', "--cutoff_date", - args.cutoff_date, '--validation_dataset_id', args.validation_dataset_id, - '--ehr_dataset_id', args.ehr_dataset_id, '--api_project_id', - args.api_project_id, '--run_as', args.run_as_email, '-s' + '-p', + args.curation_project_id, + '-d', + combined_staging, + '-b', + combined_sandbox, + '--data_stage', + 'combined', + "--cutoff_date", + args.cutoff_date, + '--validation_dataset_id', + args.validation_dataset_id, + '--ehr_dataset_id', + args.ehr_dataset_id, + '--api_project_id', + args.api_project_id, + '--ehr_duplicates_dataset', + args.ehr_duplicates_dataset, + '--ehr_duplicates_table', + args.ehr_duplicates_table, + '--run_as', + args.run_as_email, + '-s', ] all_cleaning_args = add_kwargs_to_args(cleaning_args, kwargs) @@ -159,7 +189,7 @@ def main(raw_args=None): def create_dataset(client, release_tag, dataset_type) -> str: """ - Create a dataset for the specified dataset type in the combined stage. + Create a dataset for the specified dataset type in the combined stage. 
:param client: a BigQueryClient :param release_tag: the release tag for this CDR diff --git a/data_steward/tools/create_ehr_snapshot.sh b/data_steward/tools/create_ehr_snapshot.sh index 0101e9f51a..93122e807c 100755 --- a/data_steward/tools/create_ehr_snapshot.sh +++ b/data_steward/tools/create_ehr_snapshot.sh @@ -36,7 +36,7 @@ if [[ -z "${key_file}" ]] || [[ -z "${ehr_dataset}" ]] || [[ -z "${dataset_relea exit 1 fi -app_id=$(python -c 'import json,sys;obj=json.load(sys.stdin);print(obj["project_id"]);' < "${key_file}") +app_id=$(python -c 'import json,sys;obj=json.load(sys.stdin);print(obj["quota_project_id"]);' < "${key_file}") echo "ehr_dataset --> ${ehr_dataset}" echo "app_id --> ${app_id}" @@ -51,7 +51,6 @@ export GOOGLE_APPLICATION_CREDENTIALS="${key_file}" export GOOGLE_CLOUD_PROJECT="${app_id}" #set application environment (ie dev, test, prod) -gcloud auth activate-service-account --key-file=${key_file} gcloud config set project ${app_id} # shellcheck source=src/set_path.sh diff --git a/data_steward/tools/create_rdr_snapshot.py b/data_steward/tools/create_rdr_snapshot.py index 40516b0af6..0530d0dd6e 100755 --- a/data_steward/tools/create_rdr_snapshot.py +++ b/data_steward/tools/create_rdr_snapshot.py @@ -12,16 +12,14 @@ from cdr_cleaner import clean_cdr from cdr_cleaner.args_parser import add_kwargs_to_args from gcloud.bq import BigQueryClient -from utils import auth, pipeline_logging import app_identity from resources import mapping_table_for from utils import auth, pipeline_logging from common import (AOU_DEATH, CDR_SCOPES, DEATH, METADATA, PID_RID_MAPPING, QUESTIONNAIRE_RESPONSE_ADDITIONAL_INFO, FACT_RELATIONSHIP, - COPE_SURVEY_MAP, BIGQUERY_DATASET_ID) -from utils import auth -from utils import pipeline_logging -from common import CDR_SCOPES, FACT_RELATIONSHIP, METADATA, DEATH + COPE_SURVEY_MAP, VOCABULARY_TABLES, BIGQUERY_DATASET_ID, + EHR_CONSENT_VALIDATION, WEAR_CONSENT, CDM_SOURCE, COHORT, + COHORT_ATTRIBUTE, PDR_WITHDRAWALS_LIST, PDR_EHR_LIST) LOGGER = logging.getLogger(__name__) @@ -124,14 +122,16 @@ def main(raw_args=None): f'{bq_client.project}.{datasets.get("staging")}') ] skip_tables = [ - AOU_DEATH, COPE_SURVEY_MAP, PID_RID_MAPPING, - QUESTIONNAIRE_RESPONSE_ADDITIONAL_INFO - ] + AOU_DEATH, COPE_SURVEY_MAP, PID_RID_MAPPING, WEAR_CONSENT, + QUESTIONNAIRE_RESPONSE_ADDITIONAL_INFO, EHR_CONSENT_VALIDATION, + CDM_SOURCE, COHORT, COHORT_ATTRIBUTE, PDR_WITHDRAWALS_LIST, PDR_EHR_LIST + ] + VOCABULARY_TABLES + for domain_table in domain_tables: if domain_table in skip_tables: continue else: - if domain_table not in [METADATA, FACT_RELATIONSHIP]: + if domain_table != METADATA: logging.info(f'Mapping {domain_table}...') mapping(bq_client, datasets.get("staging"), domain_table) drop_src_id(bq_client, datasets.get("staging"), domain_table) @@ -145,16 +145,16 @@ def main(raw_args=None): args.export_date, '--run_as', args.run_as_email ] + # Create an empty DEATH for clean RDR. Actual data is in AOU_DEATH. + _ = bq_client.create_tables( + [f"{bq_client.project}.{datasets.get('staging', 'UNSET')}.{DEATH}"]) + all_cleaning_args = add_kwargs_to_args(cleaning_args, kwargs) clean_cdr.main(args=all_cleaning_args) bq_client.build_and_copy_contents(datasets.get('staging', 'UNSET'), datasets.get('clean', 'UNSET')) - # Create an empty DEATH for clean RDR. Actual data is in AOU_DEATH. 
- _ = bq_client.create_tables( - [f"{bq_client.project}.{datasets.get('clean', 'UNSET')}.{DEATH}"]) - # update sandbox description and labels sandbox_dataset = bq_client.get_dataset(datasets.get( 'sandbox', 'UNSET')) # Make an API request. @@ -257,15 +257,25 @@ def mapping_query(table_name, dataset_id=None, project_id=None): project_id = app_identity.get_application_id() domain_id = f'{table_name}_id' - return f''' - CREATE OR REPLACE TABLE `{project_id}.{dataset_id}.{mapping_table_for(table_name)}` AS ( - SELECT - dt.{domain_id}, dt.src_id - FROM - `{project_id}.{dataset_id}.{table_name}` as dt - ) - ''' + if table_name == FACT_RELATIONSHIP: + query = f''' + CREATE OR REPLACE TABLE `{project_id}.{dataset_id}.{mapping_table_for(table_name)}` ( + {domain_id} INTEGER, + src_id INTEGER + ) + ''' + + else: + query = f''' + CREATE OR REPLACE TABLE `{project_id}.{dataset_id}.{mapping_table_for(table_name)}` AS ( + SELECT + dt.{domain_id}, dt.src_id + FROM + `{project_id}.{dataset_id}.{table_name}` as dt + ) + ''' + return query if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/data_steward/tools/create_tier.py b/data_steward/tools/create_tier.py index 9ad8988514..2bd1c257bb 100644 --- a/data_steward/tools/create_tier.py +++ b/data_steward/tools/create_tier.py @@ -20,7 +20,7 @@ LOGGER = logging.getLogger(__name__) TIER_LIST = ['controlled', 'registered'] -DEID_STAGE_LIST = ['deid', 'deid_base', 'deid_clean', 'fitbit'] +DEID_STAGE_LIST = ['deid', 'deid_base', 'deid_clean', 'fitbit_deid'] def validate_tier_param(tier): diff --git a/data_steward/tools/deid_runner.sh b/data_steward/tools/deid_runner.sh index b950c4d1fd..c1b56ce173 100755 --- a/data_steward/tools/deid_runner.sh +++ b/data_steward/tools/deid_runner.sh @@ -13,6 +13,7 @@ Usage: deid_runner.sh --vocab_dataset --dataset_release_tag --deid_max_age + --clean_survey_dataset_id " while true; do @@ -57,9 +58,9 @@ while true; do esac done -if [[ -z "${key_file}" ]] || [[ -z "${cdr_id}" ]] || [[ -z "${run_as}" ]] || [[ -z "${pmi_email}" ]] || \ - [[ -z "${deid_questionnaire_response_map_dataset}" ]] || [[ -z "${vocab_dataset}" ]] || \ - [[ -z "${dataset_release_tag}" ]] || [[ -z "${deid_max_age}" ]]; then +if [[ -z "${key_file}" ]] || [[ -z "${cdr_id}" ]] || [[ -z "${run_as}" ]] || [[ -z "${pmi_email}" ]] || + [[ -z "${deid_questionnaire_response_map_dataset}" ]] || [[ -z "${vocab_dataset}" ]] || + [[ -z "${dataset_release_tag}" ]] || [[ -z "${deid_max_age}" ]]; then echo "${USAGE}" exit 1 fi @@ -68,12 +69,12 @@ echo "key_file --> ${key_file}" echo "cdr_id --> ${cdr_id}" echo "vocab_dataset --> ${vocab_dataset}" -APP_ID=$(python -c 'import json,sys;obj=json.load(sys.stdin);print(obj["project_id"]);' < "${key_file}") +APP_ID=$(python -c 'import json,sys;obj=json.load(sys.stdin);print(obj["quota_project_id"]);' <"${key_file}") +export PROJECT_ID="${APP_ID}" export GOOGLE_APPLICATION_CREDENTIALS="${key_file}" export GOOGLE_CLOUD_PROJECT="${APP_ID}" #set application environment (ie dev, test, prod) -gcloud auth activate-service-account --key-file="${key_file}" gcloud config set project "${APP_ID}" registered_cdr_deid="R${dataset_release_tag}_deid" @@ -85,6 +86,7 @@ DEID_DIR="${DATA_STEWARD_DIR}/deid" CLEANER_DIR="${DATA_STEWARD_DIR}/cdr_cleaner" HANDOFF_DATE="$(date -v +2d +'%Y-%m-%d')" data_stage="registered_tier_deid" +combined_dataset="${dataset_release_tag}_combined" export BIGQUERY_DATASET_ID="${registered_cdr_deid}" export PYTHONPATH="${PYTHONPATH}:${DEID_DIR}:${DATA_STEWARD_DIR}" @@ -92,18 +94,24 @@ export 
PYTHONPATH="${PYTHONPATH}:${DEID_DIR}:${DATA_STEWARD_DIR}" # Version is the most recent tag accessible from the current branch version=$(git describe --abbrev=0 --tags) +# create empty dataset for reg_{combined dataset} +bq mk --dataset --description "Copy of ${combined_dataset}" --label "owner:curation" --label "phase:clean" --label "data_tier:registered" --label "release_tag:${dataset_release_tag}" --label "de_identified:false" "${APP_ID}":"${cdr_id}" + # create empty de-id dataset bq mk --dataset --description "${version} deidentified version of ${cdr_id}" --label "owner:curation" --label "phase:clean" --label "data_tier:registered" --label "release_tag:${dataset_release_tag}" --label "de_identified:true" "${APP_ID}":"${registered_cdr_deid}" # create the clinical tables python "${DATA_STEWARD_DIR}/cdm.py" "${registered_cdr_deid}" +# copy tables +"${TOOLS_DIR}"/table_copy.sh --source_app_id ${APP_ID} --target_app_id ${APP_ID} --source_dataset ${combined_dataset} --target_dataset ${cdr_id} --sync false + # copy OMOP vocabulary python "${DATA_STEWARD_DIR}/cdm.py" --component vocabulary "${registered_cdr_deid}" "${TOOLS_DIR}"/table_copy.sh --source_app_id "${APP_ID}" --target_app_id "${APP_ID}" --source_dataset "${vocab_dataset}" --target_dataset "${registered_cdr_deid}" # apply de-identification on registered tier dataset -python "${TOOLS_DIR}/run_deid.py" --idataset "${cdr_id}" --private_key "${key_file}" --action submit --interactive --console-log --age_limit "${deid_max_age}" --odataset "${registered_cdr_deid}" 2>&1 | tee deid_run.txt +python "${TOOLS_DIR}/run_deid.py" --idataset "${cdr_id}" --private_key "${key_file}" --action submit --interactive --console-log --age_limit "${deid_max_age}" --odataset "${registered_cdr_deid}" --run_as "${run_as}" 2>&1 | tee deid_run.txt # create empty sandbox dataset for the deid bq mk --dataset --force --description "${version} sandbox dataset to apply cleaning rules on ${registered_cdr_deid}" --label "owner:curation" --label "phase:sandbox" --label "data_tier:registered" --label "release_tag:${dataset_release_tag}" --label "de_identified:true" "${APP_ID}":"${registered_cdr_deid_sandbox}" @@ -117,7 +125,6 @@ python "${CLEANER_DIR}/clean_cdr.py" --project_id "${APP_ID}" --dataset_id "${re # Add GOOGLE_APPLICATION_CREDENTIALS environment variable export GOOGLE_APPLICATION_CREDENTIALS="${key_file}" -gcloud auth activate-service-account --key-file="${key_file}" # Copy cdr_metadata table python "${TOOLS_DIR}/add_cdr_metadata.py" --component "copy" --project_id "${APP_ID}" --target_dataset "${registered_cdr_deid}" --source_dataset "${cdr_id}" diff --git a/data_steward/tools/generate_unioned_ehr_dataset.sh b/data_steward/tools/generate_unioned_ehr_dataset.sh index 661b7aca3f..3e99862da7 100755 --- a/data_steward/tools/generate_unioned_ehr_dataset.sh +++ b/data_steward/tools/generate_unioned_ehr_dataset.sh @@ -61,7 +61,7 @@ if [[ -z "${key_file}" ]] || [[ -z "${run_as}" ]] || [[ -z "${pmi_email}" ]] || exit 1 fi -app_id=$(python -c 'import json,sys;obj=json.load(sys.stdin);print(obj["project_id"]);' < "${key_file}") +app_id=$(python -c 'import json,sys;obj=json.load(sys.stdin);print(obj["quota_project_id"]);' < "${key_file}") tag=$(git describe --abbrev=0 --tags) version=${tag} @@ -82,7 +82,6 @@ export GOOGLE_APPLICATION_CREDENTIALS="${key_file}" export GOOGLE_CLOUD_PROJECT="${app_id}" #set application environment (ie dev, test, prod) -gcloud auth activate-service-account --key-file=${key_file} gcloud config set project ${app_id} source 
"${TOOLS_DIR}/set_path.sh" @@ -114,7 +113,7 @@ python "${DATA_STEWARD_DIR}/cdm.py" --component vocabulary ${unioned_ehr_dataset #---------------------------------------------------------------------- # Step 5 copy mapping tables tables -"${TOOLS_DIR}/table_copy.sh" --source_app_id ${app_id} --target_app_id ${app_id} --source_dataset ${ehr_snapshot} --source_prefix _mapping_ --target_dataset ${unioned_ehr_dataset_backup} --target_prefix _mapping_ --sync false +"${TOOLS_DIR}/table_copy.sh" --source_app_id ${app_id} --target_app_id ${app_id} --source_dataset ${ehr_snapshot} --source_prefix _mapping_ --target_dataset ${unioned_ehr_dataset_backup} --target_prefix _mapping_ echo "removing tables copies unintentionally" bq rm -f ${unioned_ehr_dataset_backup}._mapping_ipmc_nu_condition_occurrence @@ -149,7 +148,6 @@ gcloud config set account "${pmi_email}" python "${CLEANER_DIR}/clean_cdr.py" --project_id "${app_id}" --run_as "${run_as}" --dataset_id "${unioned_ehr_dataset_staging}" --sandbox_dataset_id "${unioned_ehr_dataset_sandbox}" --data_stage "${data_stage}" -s --cutoff_date "${ehr_cutoff_date}" 2>&1 | tee unioned_cleaning_log_"${unioned_ehr_dataset_staging}".txt export GOOGLE_APPLICATION_CREDENTIALS="${key_file}" -gcloud auth activate-service-account --key-file=${key_file} # Create a snapshot dataset with the result python "${TOOLS_DIR}/snapshot_by_query.py" --project_id "${app_id}" --dataset_id "${unioned_ehr_dataset_staging}" --snapshot_dataset_id "${unioned_ehr_dataset}" diff --git a/data_steward/tools/import_rdr_dataset.py b/data_steward/tools/import_rdr_dataset.py index 24cf0bc471..537b4fcc2b 100644 --- a/data_steward/tools/import_rdr_dataset.py +++ b/data_steward/tools/import_rdr_dataset.py @@ -19,7 +19,7 @@ from common import CDR_SCOPES, AOU_DEATH, DEATH from resources import (replace_special_characters_for_labels, validate_date_string, rdr_src_id_schemas, cdm_schemas, - fields_for) + fields_for, rdr_specific_schemas) from tools.snapshot_by_query import BIGQUERY_DATA_TYPES from tools.import_rdr_omop import copy_vocab_tables @@ -91,6 +91,7 @@ def get_destination_schemas() -> Dict[str, List[Dict[str, str]]]: (3) rdr_xyz schema is used where possible. """ schema_dict = cdm_schemas() + schema_dict.update(rdr_specific_schemas()) schema_dict.update(rdr_src_id_schemas()) # Use aou_death instead of death for destination @@ -107,11 +108,11 @@ def create_rdr_tables(client, destination_dataset, rdr_project, Uses the client to load data directly from the dataset into a table. - + NOTE: Death records are loaded to AOU_DEATH table. We do not create DEATH table here because RDR's death records contain NULL death_date records, which violates CDM's DEATH definition. - We assign `aou_death_id` using UUID on the fly. + We assign `aou_death_id` using UUID on the fly. `primary_death_record` is set to FALSE here. The CR CalculatePrimaryDeathRecord will update it to the right values later in the RDR data stage. 
@@ -146,21 +147,23 @@ def create_rdr_tables(client, destination_dataset, rdr_project, destination_table.time_partitioning = bigquery.table.TimePartitioning( type_='DAY') + LOGGER.info(f'Creating empty CDM table, `{destination_table_id}`') + dest_table_ref = client.create_table(destination_table) + LOGGER.info( f'Loading `{source_table_id}` into `{destination_table_id}`') try: LOGGER.info(f'Get table `{source_table_id}` in RDR') - client.get_table(source_table_id) - - LOGGER.info(f'Creating empty CDM table, `{table}`') - destination_table = client.create_table( - destination_table) # Make an API request. + table_ref = client.get_table(source_table_id) LOGGER.info( f'Copying source table `{source_table_id}` to destination table `{destination_table_id}`' ) + if table_ref.num_rows == 0: + raise NotFound(f'`{source_table_id}` has no data to copy from') + sc_list = [] for item in schema_list: if item.name == 'aou_death_id': @@ -174,14 +177,17 @@ def create_rdr_tables(client, destination_dataset, rdr_project, fields_name_str = ',\n'.join(sc_list) # copy contents from source dataset to destination dataset - sql = tpl.render(schema_list=schema_list, - BIGQUERY_DATA_TYPES=BIGQUERY_DATA_TYPES, - source_table_id=source_table_id) + if table == 'cope_survey_semantic_version_map': + sql = (f'SELECT {fields_name_str} ' + f'FROM `{source_table_id}` ' + f'WHERE participant_id IS NOT NULL') + else: + sql = (f'SELECT {fields_name_str} ' f'FROM `{source_table_id}`') job_config = bigquery.job.QueryJobConfig( write_disposition=bigquery.job.WriteDisposition.WRITE_EMPTY, priority=bigquery.job.QueryPriority.BATCH, - destination=destination_table, + destination=dest_table_ref, labels={ 'table_name': table.lower(), @@ -195,19 +201,16 @@ def create_rdr_tables(client, destination_dataset, rdr_project, f'{datetime.now().strftime("%Y%m%d_%H%M%S")}') job = client.query(sql, job_config=job_config, job_id=job_id) job.result() # Wait for the job to complete. + except NotFound: LOGGER.info( - f'{table} not provided by RDR team. Creating empty table ' - f'in dataset: `{destination_dataset}`') + f'`{destination_table_id}` is left empty because either ' + f'`{source_table_id}` does not exist or has no records.') - LOGGER.info(f'Creating empty CDM table, `{table}`') - destination_table = client.create_table(destination_table) - LOGGER.info(f'Created empty table `{destination_table.table_id}`') else: - destination_table = client.get_table( - destination_table_id) # Make an API request. 
- LOGGER.info(f'Loaded {destination_table.num_rows} rows into ' - f'`{destination_table.table_id}`.') + dest_table_ref = client.get_table(destination_table_id) + LOGGER.info(f'Loaded {dest_table_ref.num_rows} rows into ' + f'`{dest_table_ref.table_id}`.') LOGGER.info( f"Finished RDR table LOAD from dataset {rdr_project}.{rdr_source_dataset}" diff --git a/data_steward/tools/run_deid.py b/data_steward/tools/run_deid.py index 904ca59ae2..f058bc8e71 100644 --- a/data_steward/tools/run_deid.py +++ b/data_steward/tools/run_deid.py @@ -182,6 +182,11 @@ def parse_args(raw_args=None): dest='input_dataset', help='Name of the input dataset', required=True) + parser.add_argument('--run_as', + action='store', + dest='run_as_email', + help='Service account email address to impersonate', + required=True) parser.add_argument('-p', '--private_key', dest='private_key', @@ -353,10 +358,11 @@ def main(raw_args=None): parameter_list = [ '--rules', - os.path.join(DEID_PATH, 'config', 'ids', 'config.json'), - '--private_key', args.private_key, '--table', tablepath, '--action', - args.action, '--idataset', args.input_dataset, '--log', LOGS_PATH, - '--odataset', args.odataset, '--age-limit', args.age_limit + os.path.join(DEID_PATH, 'config', 'ids', + 'config.json'), '--private_key', args.private_key, + '--table', tablepath, '--action', args.action, '--idataset', + args.input_dataset, '--log', LOGS_PATH, '--odataset', args.odataset, + '--age-limit', args.age_limit, '--run_as', args.run_as_email ] if args.interactive_mode: diff --git a/data_steward/utils/participant_summary_requests.py b/data_steward/utils/participant_summary_requests.py index 1dbfc53efa..56496e9b42 100644 --- a/data_steward/utils/participant_summary_requests.py +++ b/data_steward/utils/participant_summary_requests.py @@ -61,7 +61,7 @@ MAX_TIMEOUT = 62 -def get_access_token(): +def get_access_token(client): """ Obtains GCP Bearer token @@ -72,8 +72,13 @@ def get_access_token(): 'https://www.googleapis.com/auth/cloud-platform', 'email', 'profile' ] - credentials, _ = default() - credentials = auth.delegated_credentials(credentials, scopes=scopes) + if client is None: + credentials, _ = default() + credentials = auth.delegated_credentials(credentials, scopes=scopes) + + else: + credentials = auth.get_impersonation_credentials( + client._credentials.service_account_email, target_scopes=scopes) request = req.Request() credentials.refresh(request) @@ -82,12 +87,14 @@ def get_access_token(): return access_token -def get_participant_data(api_project_id: str, +def get_participant_data(client, + api_project_id: str, params: Dict, expected_fields: List[str] = None) -> List[Dict]: """ Fetches participant data via ParticipantSummary API + :param client: BigQuery client object :param api_project_id: RDR project id when PS API rests :param params: the fields and their values :param expected_fields: filter participants not containing any of the fields. 
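The `get_access_token(client)` change above selects between two credential paths. Below is a self-contained sketch using google-auth's public API; the repo's `utils.auth` helpers are assumed to wrap something close to this, and `get_access_token_sketch` is a hypothetical name.

```python
# Sketch of the token flow in the diff above, using google-auth directly.
# The repo's `utils.auth` helpers are assumed to behave roughly like this.
import google.auth
from google.auth import impersonated_credentials
import google.auth.transport.requests as req

SCOPES = ['https://www.googleapis.com/auth/cloud-platform', 'email', 'profile']


def get_access_token_sketch(client=None) -> str:
    if client is None:
        # Fall back to application-default credentials.
        credentials, _ = google.auth.default(scopes=SCOPES)
    else:
        # Impersonate the service account attached to the BigQuery client.
        source_credentials, _ = google.auth.default()
        credentials = impersonated_credentials.Credentials(
            source_credentials=source_credentials,
            target_principal=client._credentials.service_account_email,
            target_scopes=SCOPES)
    credentials.refresh(req.Request())  # populates credentials.token
    return credentials.token
```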
@@ -101,7 +108,7 @@ def get_participant_data(api_project_id: str, done = False participant_data = [] - token = get_access_token() + token = get_access_token(client) headers = { 'content-type': 'application/json', @@ -172,7 +179,7 @@ def get_paginated_participant_data(api_project_id: str, else: url = BASE_URL.format(api_project_id=api_project_id) - token = get_access_token() + token = get_access_token(None) headers = { 'content-type': 'application/json', @@ -306,11 +313,12 @@ def process_digital_health_data_to_json(api_data: List[Dict], return participant_records -def get_deactivated_participants(api_project_id: str, +def get_deactivated_participants(client, api_project_id: str, columns: List[str]) -> pandas.DataFrame: """ Fetches all deactivated participants via API if suspensionStatus = 'NO_CONTACT' and stores all the deactivated participants in a BigQuery dataset table + :param client: BigQuery client object :param api_project_id: The RDR project that contains participant summary data :param columns: columns to be pushed to a table in BigQuery in the form of a list of strings :return: returns dataframe of deactivated participants @@ -328,7 +336,9 @@ def get_deactivated_participants(api_project_id: str, # See https://github.com/all-of-us/raw-data-repository/blob/master/opsdataAPI.md for documentation of this api. params = {'_sort': 'lastModified', 'suspensionStatus': 'NO_CONTACT'} - participant_data = get_participant_data(api_project_id, params=params) + participant_data = get_participant_data(client, + api_project_id, + params=params) column_map = { 'participant_id': 'person_id', @@ -371,7 +381,7 @@ def get_site_participant_information(project_id: str, hpo_id: str): '_count': '1000' } - participant_data = get_participant_data(project_id, params=params) + participant_data = get_participant_data(None, project_id, params=params) column_map = {'participant_id': 'person_id'} @@ -415,7 +425,7 @@ def get_org_participant_information(project_id: str, '_count': '1000' } - participant_data = get_participant_data(project_id, params=params) + participant_data = get_participant_data(None, project_id, params=params) column_map = {'participant_id': 'person_id'} @@ -453,7 +463,7 @@ def get_all_participant_information(project_id: str) -> pandas.DataFrame: '_count': '10000' } - participant_data = get_participant_data(project_id, params=params) + participant_data = get_participant_data(None, project_id, params=params) column_map = {'participant_id': 'person_id'} @@ -485,6 +495,7 @@ def get_digital_health_information(project_id: str) -> List[Dict]: } participant_data = get_participant_data( + None, project_id, params=params, expected_fields=FIELDS_OF_INTEREST_FOR_DIGITAL_HEALTH) diff --git a/data_steward/validation/ehr_union.py b/data_steward/validation/ehr_union.py index d176bee984..e32a251a1e 100644 --- a/data_steward/validation/ehr_union.py +++ b/data_steward/validation/ehr_union.py @@ -954,15 +954,20 @@ def main(input_dataset_id, dataset_id=output_dataset_id) # Create mapping tables. AOU_DEATH and DEATH are not included here. + # SURVEY_CONDUCT's mapping table is created empty here b/c HPO sites do not submit survey_conduct records. 
for domain_table in cdm.tables_to_map(): if domain_table == SURVEY_CONDUCT: - continue + bq_client.create_tables([ + f'{project_id}.{output_dataset_id}.{mapping_table_for(SURVEY_CONDUCT)}' + ], + exists_ok=True) logging.info(f'Mapping {domain_table}...') mapping(domain_table, hpo_ids, input_dataset_id, output_dataset_id, project_id, bq_client) # Load all tables with union of submitted tables # AOU_DEATH and DEATH are not loaded here. + # SURVEY_CONDUCT is skipped here b/c HPO sites do not submit survey_conduct records. for table_name in CDM_TABLES: if table_name in [DEATH, SURVEY_CONDUCT]: continue diff --git a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/backfill_lifestyle_test.py b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/backfill_lifestyle_test.py index d969be4b50..705f4c8f70 100644 --- a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/backfill_lifestyle_test.py +++ b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/backfill_lifestyle_test.py @@ -10,7 +10,7 @@ # Project imports from app_identity import PROJECT_ID from cdr_cleaner.cleaning_rules.backfill_lifestyle import BackfillLifestyle -from common import JINJA_ENV, OBSERVATION, PERSON +from common import JINJA_ENV, OBSERVATION, PERSON, MAPPING_PREFIX from tests.integration_tests.data_steward.cdr_cleaner.cleaning_rules.bigquery_tests_base import BaseTest OBSERVATION_TMPL = JINJA_ENV.from_string(""" @@ -43,6 +43,26 @@ (3, 8532, 2001, 999, 99999) """) +MAPPING_TMPL = JINJA_ENV.from_string(""" +CREATE OR REPLACE TABLE `{{project}}.{{dataset}}._mapping_observation` + (observation_id INT64, src_id STRING) + ; +INSERT INTO `{{project}}.{{dataset}}._mapping_observation` +(observation_id, src_id) +VALUES + (101, 'src_1'), + (102, 'src_1'), + (103, 'src_1'), + (104, 'src_1'), + (105, 'src_1'), + (106, 'src_1'), + (107, 'src_1'), + (201, 'src_1'), + (301, 'src_2'), + (302, 'src_2'), + (303, 'src_2') +""") + class BackfillLifestyleTest(BaseTest.CleaningRulesTestBase): @@ -61,11 +81,19 @@ def setUpClass(cls): cls.rule_instance = BackfillLifestyle(cls.project_id, cls.dataset_id, cls.sandbox_id) - cls.fq_sandbox_table_names = [] + # NOTE _mapping_observation is not in cls.fq_table_names because its columns are different from the ones + # defined in the resource_files folder. It has the columns defined in `create_rdr_snapshot.py` instead. 
+ cls.fq_mapping_table_name = f'{cls.project_id}.{cls.dataset_id}.{MAPPING_PREFIX}{OBSERVATION}' + + # Generate sandbox table names + sandbox_table_names = cls.rule_instance.get_sandbox_tablenames() + for table_name in sandbox_table_names: + cls.fq_sandbox_table_names.append( + f'{cls.project_id}.{cls.sandbox_id}.{table_name}') cls.fq_table_names = [ f'{cls.project_id}.{cls.dataset_id}.{OBSERVATION}', - f'{cls.project_id}.{cls.dataset_id}.{PERSON}', + f'{cls.project_id}.{cls.dataset_id}.{PERSON}' ] super().setUpClass() @@ -81,8 +109,10 @@ def setUp(self): dataset=self.dataset_id) insert_person = PERSON_TMPL.render(project=self.project_id, dataset=self.dataset_id) + insert_mapping = MAPPING_TMPL.render(project=self.project_id, + dataset=self.dataset_id) - queries = [insert_observation, insert_person] + queries = [insert_observation, insert_person, insert_mapping] self.load_test_data(queries) def test_backfill_lifestyle(self): @@ -99,13 +129,13 @@ def test_backfill_lifestyle(self): """ tables_and_counts = [{ 'fq_table_name': - f'{self.project_id}.{self.dataset_id}.{OBSERVATION}', + self.fq_table_names[0], 'fq_sandbox_table_name': - None, + self.fq_sandbox_table_names[0], 'loaded_ids': [ 101, 102, 103, 104, 105, 106, 107, 201, 301, 302, 303 ], - 'sandboxed_ids': [], + 'sandboxed_ids': [304, 305, 306, 307], 'fields': [ 'observation_id', 'person_id', 'observation_concept_id', 'observation_date', 'observation_type_concept_id', @@ -128,6 +158,22 @@ def test_backfill_lifestyle(self): (306, 3, 1586190, self.date_2022, 45905771, 1586190, None), (307, 3, 40766357, self.date_2022, 45905771, 1586198, None) ] + }, { + 'fq_table_name': + self.fq_mapping_table_name, + 'loaded_ids': [ + 101, 102, 103, 104, 105, 106, 107, 201, 301, 302, 303 + ], + 'fields': ['observation_id', 'src_id'], + 'cleaned_values': [(101, 'src_1'), (102, 'src_1'), (103, 'src_1'), + (104, 'src_1'), (105, 'src_1'), (106, 'src_1'), + (107, 'src_1'), (201, 'src_1'), (301, 'src_2'), + (302, 'src_2'), (303, 'src_2'), (304, 'src_2'), + (305, 'src_2'), (306, 'src_2'), (307, 'src_2')] }] - self.default_test(tables_and_counts) \ No newline at end of file + self.default_test(tables_and_counts) + + def tearDown(self): + self.client.delete_table(self.fq_mapping_table_name, not_found_ok=True) + super().tearDown() diff --git a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/backfill_overall_health_test.py b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/backfill_overall_health_test.py index 17632b3238..7bb115c308 100644 --- a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/backfill_overall_health_test.py +++ b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/backfill_overall_health_test.py @@ -10,7 +10,7 @@ # Project imports from app_identity import PROJECT_ID from cdr_cleaner.cleaning_rules.backfill_overall_health import BackfillOverallHealth -from common import JINJA_ENV, OBSERVATION, PERSON +from common import JINJA_ENV, OBSERVATION, PERSON, MAPPING_PREFIX from tests.integration_tests.data_steward.cdr_cleaner.cleaning_rules.bigquery_tests_base import BaseTest OBSERVATION_TMPL = JINJA_ENV.from_string(""" @@ -60,6 +60,37 @@ (4, 8507, 2001, 999, 99999) """) +MAPPING_TMPL = JINJA_ENV.from_string(""" +CREATE OR REPLACE TABLE `{{project}}.{{dataset}}._mapping_observation` + (observation_id INT64, src_id STRING) + ; +INSERT INTO `{{project}}.{{dataset}}._mapping_observation` +(observation_id, src_id) +VALUES + (101, 'src_1'), + (102, 'src_1'), + (103, 'src_1'), + (104, 'src_1'), + (105, 
'src_1'), + (106, 'src_1'), + (107, 'src_1'), + (108, 'src_1'), + (109, 'src_1'), + (110, 'src_1'), + (111, 'src_1'), + (112, 'src_1'), + (113, 'src_1'), + (114, 'src_1'), + (115, 'src_1'), + (116, 'src_1'), + (301, 'src_2'), + (302, 'src_2'), + (303, 'src_2'), + (401, 'src_2'), + (402, 'src_2'), + (403, 'src_2') +""") + class BackfillOverallHealthTest(BaseTest.CleaningRulesTestBase): @@ -81,6 +112,16 @@ def setUpClass(cls): cls.fq_sandbox_table_names = [] + # NOTE _mapping_observation is not in cls.fq_table_names because its columns are different from the ones + # defined in the resource_files folder. It has the columns defined in `create_rdr_snapshot.py` instead. + cls.fq_mapping_table_name = f'{cls.project_id}.{cls.dataset_id}.{MAPPING_PREFIX}{OBSERVATION}' + + # Generate sandbox table names + sandbox_table_names = cls.rule_instance.get_sandbox_tablenames() + for table_name in sandbox_table_names: + cls.fq_sandbox_table_names.append( + f'{cls.project_id}.{cls.sandbox_id}.{table_name}') + cls.fq_table_names = [ f'{cls.project_id}.{cls.dataset_id}.{OBSERVATION}', f'{cls.project_id}.{cls.dataset_id}.{PERSON}', @@ -99,8 +140,10 @@ def setUp(self): dataset=self.dataset_id) insert_person = PERSON_TMPL.render(project=self.project_id, dataset=self.dataset_id) + insert_mapping = MAPPING_TMPL.render(project=self.project_id, + dataset=self.dataset_id) - queries = [insert_observation, insert_person] + queries = [insert_observation, insert_person, insert_mapping] self.load_test_data(queries) def test_backfill_overall_health(self): @@ -123,14 +166,17 @@ def test_backfill_overall_health(self): """ tables_and_counts = [{ 'fq_table_name': - f'{self.project_id}.{self.dataset_id}.{OBSERVATION}', + self.fq_table_names[0], 'fq_sandbox_table_name': - None, + self.fq_sandbox_table_names[0], 'loaded_ids': [ 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 201, 301, 302, 303, 401, 402, 403 ], - 'sandboxed_ids': [], + 'sandboxed_ids': [ + 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, + 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428 + ], 'fields': [ 'observation_id', 'person_id', 'observation_concept_id', 'observation_date', 'observation_type_concept_id', @@ -186,6 +232,34 @@ def test_backfill_overall_health(self): (427, 4, 1585803, self.date_2021, 45905771, 1585803, None), (428, 4, 1585815, self.date_2021, 45905771, 1585815, None) ] + }, { + 'fq_table_name': + self.fq_mapping_table_name, + 'loaded_ids': [ + 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, + 114, 115, 116, 301, 302, 303, 401, 402, 403 + ], + 'fields': ['observation_id', 'src_id'], + 'cleaned_values': [(101, 'src_1'), (102, 'src_1'), (103, 'src_1'), + (104, 'src_1'), (105, 'src_1'), (106, 'src_1'), + (107, 'src_1'), (108, 'src_1'), (109, 'src_1'), + (110, 'src_1'), (111, 'src_1'), (112, 'src_1'), + (113, 'src_1'), (114, 'src_1'), (115, 'src_1'), + (116, 'src_1'), (301, 'src_2'), (302, 'src_2'), + (303, 'src_2'), (401, 'src_2'), (402, 'src_2'), + (403, 'src_2'), (404, 'src_2'), (405, 'src_2'), + (406, 'src_2'), (407, 'src_2'), (408, 'src_2'), + (409, 'src_2'), (410, 'src_2'), (411, 'src_2'), + (412, 'src_2'), (413, 'src_2'), (414, 'src_2'), + (415, 'src_2'), (416, 'src_2'), (417, 'src_2'), + (418, 'src_2'), (419, 'src_2'), (420, 'src_2'), + (421, 'src_2'), (422, 'src_2'), (423, 'src_2'), + (424, 'src_2'), (425, 'src_2'), (426, 'src_2'), + (427, 'src_2'), (428, 'src_2')] }] - self.default_test(tables_and_counts) \ No newline at end of file + 
self.default_test(tables_and_counts) + + def tearDown(self): + self.client.delete_table(self.fq_mapping_table_name, not_found_ok=True) + super().tearDown() diff --git a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/backfill_the_basics_test.py b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/backfill_the_basics_test.py index 5cda495bf1..12425fc15c 100644 --- a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/backfill_the_basics_test.py +++ b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/backfill_the_basics_test.py @@ -10,7 +10,7 @@ # Project imports from app_identity import PROJECT_ID from cdr_cleaner.cleaning_rules.backfill_the_basics import BackfillTheBasics -from common import JINJA_ENV, OBSERVATION, PERSON +from common import JINJA_ENV, OBSERVATION, PERSON, MAPPING_PREFIX from tests.integration_tests.data_steward.cdr_cleaner.cleaning_rules.bigquery_tests_base import BaseTest OBSERVATION_TMPL = JINJA_ENV.from_string(""" @@ -51,6 +51,33 @@ (3, 8532, 2001, 999, 99999) """) +MAPPING_TMPL = JINJA_ENV.from_string(""" +CREATE OR REPLACE TABLE `{{project}}.{{dataset}}._mapping_observation` + (observation_id INT64, src_id STRING) + ; +INSERT INTO `{{project}}.{{dataset}}._mapping_observation` +(observation_id, src_id) +VALUES + (101, 'src_1'), + (102, 'src_1'), + (103, 'src_1'), + (104, 'src_1'), + (105, 'src_1'), + (106, 'src_1'), + (107, 'src_1'), + (108, 'src_1'), + (109, 'src_1'), + (110, 'src_1'), + (111, 'src_1'), + (112, 'src_1'), + (113, 'src_1'), + (114, 'src_1'), + (115, 'src_1'), + (301, 'src_2'), + (302, 'src_2'), + (303, 'src_2') +""") + class BackfillTheBasicsTest(BaseTest.CleaningRulesTestBase): @@ -71,6 +98,16 @@ def setUpClass(cls): cls.fq_sandbox_table_names = [] + # NOTE _mapping_observation is not in cls.fq_table_names because its columns are different from the ones + # defined in the resource_files folder. It has the columns defined in `create_rdr_snapshot.py` instead. 
+ cls.fq_mapping_table_name = f'{cls.project_id}.{cls.dataset_id}.{MAPPING_PREFIX}{OBSERVATION}' + + # Generate sandbox table names + sandbox_table_names = cls.rule_instance.get_sandbox_tablenames() + for table_name in sandbox_table_names: + cls.fq_sandbox_table_names.append( + f'{cls.project_id}.{cls.sandbox_id}.{table_name}') + cls.fq_table_names = [ f'{cls.project_id}.{cls.dataset_id}.{OBSERVATION}', f'{cls.project_id}.{cls.dataset_id}.{PERSON}', @@ -89,8 +126,10 @@ def setUp(self): dataset=self.dataset_id) insert_person = PERSON_TMPL.render(project=self.project_id, dataset=self.dataset_id) + insert_mapping = MAPPING_TMPL.render(project=self.project_id, + dataset=self.dataset_id) - queries = [insert_observation, insert_person] + queries = [insert_observation, insert_person, insert_mapping] self.load_test_data(queries) def test_backfill_the_basics(self): @@ -107,14 +146,16 @@ def test_backfill_the_basics(self): """ tables_and_counts = [{ 'fq_table_name': - f'{self.project_id}.{self.dataset_id}.{OBSERVATION}', + self.fq_table_names[0], 'fq_sandbox_table_name': - None, + self.fq_sandbox_table_names[0], 'loaded_ids': [ 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 201, 301, 302, 303 ], - 'sandboxed_ids': [], + 'sandboxed_ids': [ + 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315 + ], 'fields': [ 'observation_id', 'person_id', 'observation_concept_id', 'observation_date', 'observation_type_concept_id', @@ -153,6 +194,50 @@ def test_backfill_the_basics(self): (314, 3, 3005917, self.date_2022, 45905771, 1586135), (315, 3, 1586140, self.date_2022, 45905771, 1586140) ] + }, { + 'fq_table_name': + self.fq_mapping_table_name, + 'loaded_ids': [ + 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, + 114, 115, 301, 302, 303 + ], + 'fields': ['observation_id', 'src_id'], + 'cleaned_values': [ + (101, 'src_1'), + (102, 'src_1'), + (103, 'src_1'), + (104, 'src_1'), + (105, 'src_1'), + (106, 'src_1'), + (107, 'src_1'), + (108, 'src_1'), + (109, 'src_1'), + (110, 'src_1'), + (111, 'src_1'), + (112, 'src_1'), + (113, 'src_1'), + (114, 'src_1'), + (115, 'src_1'), + (301, 'src_2'), + (302, 'src_2'), + (303, 'src_2'), + (304, 'src_2'), + (305, 'src_2'), + (306, 'src_2'), + (307, 'src_2'), + (308, 'src_2'), + (309, 'src_2'), + (310, 'src_2'), + (311, 'src_2'), + (312, 'src_2'), + (313, 'src_2'), + (314, 'src_2'), + (315, 'src_2'), + ] }] - self.default_test(tables_and_counts) \ No newline at end of file + self.default_test(tables_and_counts) + + def tearDown(self): + self.client.delete_table(self.fq_mapping_table_name, not_found_ok=True) + super().tearDown() diff --git a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/clean_by_birth_year_test.py b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/clean_by_birth_year_test.py index d8b2972632..8ed64305ba 100644 --- a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/clean_by_birth_year_test.py +++ b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/clean_by_birth_year_test.py @@ -26,10 +26,10 @@ ) VALUES -- records to sandbox -- - (1,0,1799,0,0), + (1,0,1899,0,0), (4,0,2020,0,0), -- records to keep -- - (2,0,1800,0,0), + (2,0,1900,0,0), (3,0,1975,0,0); INSERT INTO `{{project_id}}.{{dataset_id}}.observation` ( @@ -126,7 +126,7 @@ def test_setting_concept_identifiers(self): 'person_id', 'gender_concept_id', 'year_of_birth', 'race_concept_id', 'ethnicity_concept_id' ], - 'cleaned_values': [(2, 0, 1800, 0, 0), (3, 0, 1975, 0, 0)] + 'cleaned_values': [(2, 0, 
1900, 0, 0), (3, 0, 1975, 0, 0)] }] self.default_test(tables_and_counts) diff --git a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/clean_digital_health_data_test.py b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/clean_digital_health_data_test.py index 4642e90004..39f87ee2e3 100644 --- a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/clean_digital_health_data_test.py +++ b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/clean_digital_health_data_test.py @@ -13,7 +13,7 @@ # Project Imports from app_identity import PROJECT_ID -from common import FITBIT_TABLES, ACTIVITY_SUMMARY, HEART_RATE_MINUTE_LEVEL, HEART_RATE_SUMMARY, STEPS_INTRADAY +from common import FITBIT_TABLES, ACTIVITY_SUMMARY, HEART_RATE_INTRADAY, HEART_RATE_SUMMARY, STEPS_INTRADAY from tests.integration_tests.data_steward.cdr_cleaner.cleaning_rules.bigquery_tests_base import BaseTest import cdr_cleaner.cleaning_rules.clean_digital_health_data as clean_dhd @@ -90,7 +90,7 @@ def setUpClass(cls): # Set the expected test datasets dataset_id = os.environ.get('COMBINED_DATASET_ID') cls.dataset_id = dataset_id - sandbox_id = dataset_id + '_sandbox' + sandbox_id = f'{dataset_id}_sandbox' cls.sandbox_id = sandbox_id cls.kwargs = {'api_project_id': 'rdr_project_id'} @@ -156,7 +156,7 @@ def test_clean_digital_health_data(self, mock_get_digital_health): (333, '2020-11-26 00:00:00')""").render( project_id=self.project_id, dataset_id=self.dataset_id, - fitbit_table=HEART_RATE_MINUTE_LEVEL) + fitbit_table=HEART_RATE_INTRADAY) queries.append(hr_query) hrs_query = self.jinja_env.from_string(""" @@ -189,8 +189,10 @@ def test_clean_digital_health_data(self, mock_get_digital_health): tables_and_counts = [{ 'fq_table_name': '.'.join([self.dataset_id, ACTIVITY_SUMMARY]), - 'fq_sandbox_table_name': - self.fq_sandbox_table_names[0], + 'fq_sandbox_table_name': [ + sb_name for sb_name in self.fq_sandbox_table_names + if ACTIVITY_SUMMARY in sb_name + ][0], 'fields': ['person_id', 'date'], 'loaded_ids': [111, 222, 333], 'sandboxed_ids': [333], @@ -200,9 +202,11 @@ def test_clean_digital_health_data(self, mock_get_digital_health): ] }, { 'fq_table_name': - '.'.join([self.dataset_id, HEART_RATE_MINUTE_LEVEL]), - 'fq_sandbox_table_name': - self.fq_sandbox_table_names[2], + '.'.join([self.dataset_id, HEART_RATE_INTRADAY]), + 'fq_sandbox_table_name': [ + sb_name for sb_name in self.fq_sandbox_table_names + if HEART_RATE_INTRADAY in sb_name + ][0], 'fields': ['person_id', 'datetime'], 'loaded_ids': [111, 222, 333], 'sandboxed_ids': [333], @@ -213,8 +217,10 @@ def test_clean_digital_health_data(self, mock_get_digital_health): }, { 'fq_table_name': '.'.join([self.dataset_id, HEART_RATE_SUMMARY]), - 'fq_sandbox_table_name': - self.fq_sandbox_table_names[1], + 'fq_sandbox_table_name': [ + sb_name for sb_name in self.fq_sandbox_table_names + if HEART_RATE_SUMMARY in sb_name + ][0], 'fields': ['person_id', 'date'], 'loaded_ids': [111, 222, 333], 'sandboxed_ids': [333], diff --git a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/convert_pre_post_coordinated_concepts_test.py b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/convert_pre_post_coordinated_concepts_test.py index 98ed6058a7..c1b10fd34f 100644 --- a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/convert_pre_post_coordinated_concepts_test.py +++ b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/convert_pre_post_coordinated_concepts_test.py @@ -7,7 +7,7 @@ # Project Imports 
from app_identity import PROJECT_ID -from common import JINJA_ENV, OBSERVATION, VOCABULARY_TABLES +from common import JINJA_ENV, MAPPING_PREFIX, OBSERVATION, VOCABULARY_TABLES from cdr_cleaner.cleaning_rules.convert_pre_post_coordinated_concepts import ConvertPrePostCoordinatedConcepts from tests.integration_tests.data_steward.cdr_cleaner.cleaning_rules.bigquery_tests_base import BaseTest @@ -32,6 +32,22 @@ (7, 17, 43528355, 1740608, 141095, date('2000-01-01'), 0, 'OtherConditions_Acne') """) +LOAD_MAPPING_QUERY = JINJA_ENV.from_string(""" + CREATE OR REPLACE TABLE `{{project_id}}.{{dataset_id}}._mapping_observation` + (observation_id INT64, src_id STRING) + ; + INSERT INTO `{{project_id}}.{{dataset_id}}._mapping_observation` + (observation_id, src_id) + VALUES + (1, 'src_1'), + (2, 'src_2'), + (3, 'src_3'), + (4, 'src_4'), + (5, 'src_5'), + (6, 'src_6'), + (7, 'src_7') +""") + class ConvertPrePostCoordinatedConceptsTest(BaseTest.CleaningRulesTestBase): @@ -45,7 +61,7 @@ def setUpClass(cls): cls.project_id = os.environ.get(PROJECT_ID) cls.dataset_id = os.environ.get('RDR_DATASET_ID') - cls.sandbox_id = cls.dataset_id + '_sandbox' + cls.sandbox_id = f'{cls.dataset_id}_sandbox' cls.vocabulary_id = os.environ.get('VOCABULARY_DATASET') cls.rule_instance = ConvertPrePostCoordinatedConcepts( cls.project_id, cls.dataset_id, cls.sandbox_id) @@ -55,6 +71,10 @@ def setUpClass(cls): for table in [OBSERVATION] + VOCABULARY_TABLES ] + # NOTE _mapping_observation is not in cls.fq_table_names because its columns are different from the ones + # defined in the resource_files folder. It has the columns defined in `create_rdr_snapshot.py` instead. + cls.fq_mapping_table_name = f'{cls.project_id}.{cls.dataset_id}.{MAPPING_PREFIX}{OBSERVATION}' + cls.fq_sandbox_table_names = [ f'{cls.project_id}.{cls.sandbox_id}.{cls.rule_instance.sandbox_table_for(OBSERVATION)}' ] @@ -69,7 +89,9 @@ def setUp(self): self.load_test_data([ LOAD_QUERY.render(project_id=self.project_id, - dataset_id=self.dataset_id) + dataset_id=self.dataset_id), + LOAD_MAPPING_QUERY.render(project_id=self.project_id, + dataset_id=self.dataset_id), ]) self.copy_vocab_tables(self.vocabulary_id) @@ -114,6 +136,28 @@ def test_convert_pre_post_coordinated_concepts(self): (400000000006, 43528574, 43528630, 45883358), (7, 43528355, 1740608, 141095), ] + }, { + 'fq_table_name': + self.fq_mapping_table_name, + 'loaded_ids': [1, 2, 3, 4, 5, 6, 7], + 'fields': ['observation_id', 'src_id'], + 'cleaned_values': [ + (1, 'src_1'), + (2, 'src_2'), + (100000000003, 'src_3'), + (100000000004, 'src_4'), + (200000000004, 'src_4'), + (100000000005, 'src_5'), + (200000000005, 'src_5'), + (100000000006, 'src_6'), + (200000000006, 'src_6'), + (300000000006, 'src_6'), + (400000000006, 'src_6'), + (7, 'src_7'), + ] }] - self.default_test(tables_and_counts) + self.default_test(tables_and_counts) + + def tearDown(self): + self.client.delete_table(self.fq_mapping_table_name, not_found_ok=True) + super().tearDown() \ No newline at end of file diff --git a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/create_aian_lookup_test.py b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/create_aian_lookup_test.py new file mode 100644 index 0000000000..8f277219eb --- /dev/null +++ b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/create_aian_lookup_test.py @@ -0,0 +1,69 @@ +""" +Integration test for CreateAIANLookup. 
+""" +import os + +from app_identity import get_application_id +from common import OBSERVATION +from tests.integration_tests.data_steward.cdr_cleaner.cleaning_rules.bigquery_tests_base import BaseTest +from cdr_cleaner.cleaning_rules.create_aian_lookup import CreateAIANLookup + + +class CreateAIANLookupTest(BaseTest.CleaningRulesTestBase): + + @classmethod + def setUpClass(cls): + print('**************************************************************') + print(cls.__name__) + print('**************************************************************') + + super().initialize_class_vars() + + cls.project_id = get_application_id() + cls.dataset_id = os.getenv('RDR_DATASET_ID') + cls.sandbox_id = f'{cls.dataset_id}_sandbox' + + cls.rule_instance = CreateAIANLookup(cls.project_id, cls.dataset_id, + cls.sandbox_id) + + sb_table_names = cls.rule_instance.get_sandbox_tablenames() + for table_name in sb_table_names: + cls.fq_sandbox_table_names.append( + f'{cls.project_id}.{cls.sandbox_id}.{table_name}') + + cls.fq_table_names = [ + f'{cls.project_id}.{cls.dataset_id}.{OBSERVATION}' + ] + + super().setUpClass() + + def setUp(self): + super().setUp() + + observation_tmpl = self.jinja_env.from_string(""" + INSERT INTO `{{project}}.{{dataset}}.{{table}}` + (observation_id, person_id, observation_source_concept_id, value_source_concept_id, + observation_concept_id, observation_date, observation_type_concept_id) + VALUES + -- Meets the AIAN criteria -- + (101, 11, 1586140, 1586141, 0, '2000-01-01', 0), + (102, 12, 1586150, 0, 0, '2000-01-01', 0), + (103, 13, 1585599, 0, 0, '2000-01-01', 0), + (104, 14, 1586139, 0, 0, '2000-01-01', 0), + (105, 15, 1585604, 0, 0, '2000-01-01', 0), + -- Not meet the AIAN criteria -- + (201, 21, 1586140, 0, 0, '2000-01-01', 0), + (202, 22, 0, 1586141, 0, '2000-01-01', 0), + (203, 23, 0, 0, 0, '2000-01-01', 0) + """).render(project=self.project_id, + dataset=self.dataset_id, + table=OBSERVATION) + + queries = [observation_tmpl] + self.load_test_data(queries) + + def test_create_aian_list(self): + self.default_test([]) + self.assertTableValuesMatch(self.fq_sandbox_table_names[0], + ['person_id'], [(11,), (12,), (13,), (14,), + (15,)]) diff --git a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/deid/conflicting_hpo_state_generalization_test.py b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/deid/conflicting_hpo_state_generalization_test.py index 6ded88da71..d72a8de153 100644 --- a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/deid/conflicting_hpo_state_generalization_test.py +++ b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/deid/conflicting_hpo_state_generalization_test.py @@ -12,52 +12,74 @@ # Project Imports from app_identity import PROJECT_ID -from cdr_cleaner.cleaning_rules.deid.conflicting_hpo_state_generalization import ConflictingHpoStateGeneralize -from common import JINJA_ENV, OBSERVATION, PIPELINE_TABLES +from cdr_cleaner.cleaning_rules.deid.conflicting_hpo_state_generalization import ( + ConflictingHpoStateGeneralize, MAP_TABLE_NAME) +from common import EXT_SUFFIX, JINJA_ENV, OBSERVATION, SITE_MASKING_TABLE_ID from tests.integration_tests.data_steward.cdr_cleaner.cleaning_rules.bigquery_tests_base import BaseTest INSERT_RAW_DATA_OBS = JINJA_ENV.from_string(""" - INSERT INTO `{{project_id}}.{{dataset_id}}.observation` ( - observation_id, - person_id, - observation_concept_id, - observation_date, - observation_type_concept_id, - value_as_number, - value_as_string, - value_as_concept_id, - 
observation_source_concept_id, - value_source_concept_id, - value_source_value + INSERT INTO `{{project_id}}.{{dataset_id}}.observation` ( + observation_id, + person_id, + observation_concept_id, + observation_date, + observation_type_concept_id, + value_as_concept_id, + observation_source_concept_id, + value_source_concept_id, + value_source_value ) VALUES - (1,101,0,'2020-01-01',1,2,'',100,1585249,100,'Generalize This Value'), - (2,101,0,'2020-01-01',1,2,'',100,1500000,100,'Test Value'), - (3,103,0,'2020-01-01',1,2,'',100,1585249,1585261,'Do Not Generalize This Value'), - (4,103,0,'2020-01-01',1,2,'',100,1585248,100,'Test Value') + -- person_id 1 answered that she lives in Alabama. -- + -- And all the HPO records come from an HPO site in Alabama. -- + -- Nothing happens to person_id 1 -- + (101, 1, 0, '2020-01-01', 1, 999, 1585249, 1585261, 'PIIState_AL'), + (102, 1, 0, '2020-01-01', 1, 999, 1500000, 9999999, 'Dummy'), + (103, 1, 0, '2020-01-01', 1, 999, 1500000, 9999999, 'Dummy'), + -- person_id 2 answered that she lives in Alabama. -- + -- And all the HPO records come from HPO sites in Alabama. -- + -- Nothing happens to person_id 2 -- + (201, 2, 0, '2020-01-01', 1, 999, 1585249, 1585261, 'PIIState_AL'), + (202, 2, 0, '2020-01-01', 1, 999, 1500000, 9999999, 'Dummy'), + (203, 2, 0, '2020-01-01', 1, 999, 1500000, 9999999, 'Dummy'), + -- person_id 3 answered that she lives in Alabama. -- + -- But all the HPO records come from an HPO site in Arizona. -- + -- State info will be generalized for person_id 3 -- + (301, 3, 0, '2020-01-01', 1, 999, 1585249, 1585261, 'PIIState_AL'), + (302, 3, 0, '2020-01-01', 1, 999, 1500000, 9999999, 'Dummy'), + (303, 3, 0, '2020-01-01', 1, 999, 1500000, 9999999, 'Dummy'), + -- person_id 4 answered that she lives in Alabama. -- + -- But one of the HPO records comes from an HPO site in Arizona. -- + -- State info will be generalized for person_id 4 -- + (401, 4, 0, '2020-01-01', 1, 999, 1585249, 1585261, 'PIIState_AL'), + (402, 4, 0, '2020-01-01', 1, 999, 1500000, 9999999, 'Dummy'), + (403, 4, 0, '2020-01-01', 1, 999, 1500000, 9999999, 'Dummy'), + -- person_id 5 answered that she lives in Alabama. -- + -- And she only has RDR records, no HPO records exist. 
-- + -- Nothing happens to person_id 5 -- + (501, 5, 0, '2020-01-01', 1, 999, 1585249, 1585261, 'PIIState_AL'), + (502, 5, 0, '2020-01-01', 1, 999, 1500000, 9999999, 'Dummy'), + (503, 5, 0, '2020-01-01', 1, 999, 1500000, 9999999, 'Dummy') """) INSERT_RAW_DATA_EXT = JINJA_ENV.from_string(""" - INSERT INTO `{{project_id}}.{{dataset_id}}.observation_ext`( - observation_id, - src_id - ) - VALUES - (1,'Portal 1'), - (2, 'bar 000'), - (3, 'Portal 2'), - (4, 'bar 123') + INSERT INTO `{{project_id}}.{{dataset_id}}.observation_ext` + (observation_id, src_id) + VALUES + (101, 'Portal1'), (102, 'bar 001'), (103, 'bar 001'), + (201, 'Portal2'), (202, 'bar 001'), (203, 'bar 002'), + (301, 'Portal3'), (302, 'bar 003'), (303, 'bar 003'), + (401, 'Portal4'), (402, 'bar 001'), (403, 'bar 003'), + (501, 'Portal5'), (502, 'Portal6'), (503, 'Portal7') """) INSERT_TEMP_MASK_TABLE = JINJA_ENV.from_string(""" - INSERT INTO `{{project_id}}.{{dataset_id}}.site_maskings` ( - hpo_id, - src_id, - state, - value_source_concept_id) + INSERT INTO `{{project_id}}.{{dataset_id}}.site_maskings` + (hpo_id, src_id, state, value_source_concept_id) VALUES - ('foo', 'bar 123', 'PIIState_AL', 1585261), - ('bar', 'bar 000', 'PIIState_CA', 1585266) + ('hpo site in Alabama 1', 'bar 001', 'PIIState_AL', 1585261), + ('hpo site in Alabama 2', 'bar 002', 'PIIState_AL', 1585261), + ('hpo site in Arizona 3', 'bar 003', 'PIIState_AZ', 1585264) """) @@ -82,7 +104,9 @@ def setUpClass(cls) -> None: cls.project_id, cls.dataset_id, cls.sandbox_id) # Generates list of fully qualified table names and their corresponding sandbox table names - for table in [OBSERVATION, f'{OBSERVATION}_ext', 'site_maskings']: + for table in [ + OBSERVATION, f'{OBSERVATION}{EXT_SUFFIX}', SITE_MASKING_TABLE_ID + ]: cls.fq_table_names.append( f'{cls.project_id}.{cls.dataset_id}.{table}') @@ -100,8 +124,8 @@ def setUp(self): raw_data_load_query_obs = INSERT_RAW_DATA_OBS.render( project_id=self.project_id, dataset_id=self.dataset_id) - raw_data_load_query_mapping = INSERT_RAW_DATA_EXT. \ - render(project_id=self.project_id, dataset_id=self.dataset_id) + raw_data_load_query_mapping = INSERT_RAW_DATA_EXT.render( + project_id=self.project_id, dataset_id=self.dataset_id) # The location of the table will be mocked in the test temp_mask_query = INSERT_TEMP_MASK_TABLE.render( @@ -124,28 +148,39 @@ def test_conflicting_hpo_id(self): 'fq_table_name': f'{self.project_id}.{self.dataset_id}.{OBSERVATION}', 'fq_sandbox_table_name': - f'{self.project_id}.{self.sandbox_id}.{self.rule_instance.get_sandbox_tablenames()[0]}', + f'{self.project_id}.{self.sandbox_id}.{self.rule_instance.sandbox_table_for(OBSERVATION)}', # The following tables are created when `setup_rule` runs, # so this will break the sandboxing check that runs in 'default_test()' # We get around the check by declaring these tables are created before # the rule runs and this is expected. 
'tables_created_on_setup': [ - f'{self.project_id}.{self.sandbox_id}.{self.rule_instance.get_sandbox_tablenames()[-1]}' + f'{self.project_id}.{self.sandbox_id}.{MAP_TABLE_NAME}' ], - 'loaded_ids': [1, 2, 3, 4], - 'sandboxed_ids': [1], + 'loaded_ids': [ + 101, 102, 103, 201, 202, 203, 301, 302, 303, 401, 402, 403, 501, + 502, 503 + ], + 'sandboxed_ids': [301, 401], 'fields': [ - 'observation_id', 'person_id', 'observation_date', - 'value_as_concept_id', 'observation_source_concept_id', - 'value_source_concept_id', 'value_source_value' + 'observation_id', 'person_id', 'value_as_concept_id', + 'observation_source_concept_id', 'value_source_concept_id' ], 'cleaned_values': [ - (1, 101, self.date, 2000000011, 1585249, 2000000011, - 'Generalize This Value'), - (2, 101, self.date, 100, 1500000, 100, 'Test Value'), - (3, 103, self.date, 100, 1585249, 1585261, - 'Do Not Generalize This Value'), - (4, 103, self.date, 100, 1585248, 100, 'Test Value') + (101, 1, 999, 1585249, 1585261), + (102, 1, 999, 1500000, 9999999), + (103, 1, 999, 1500000, 9999999), + (201, 2, 999, 1585249, 1585261), + (202, 2, 999, 1500000, 9999999), + (203, 2, 999, 1500000, 9999999), + (301, 3, 2000000011, 1585249, 2000000011), + (302, 3, 999, 1500000, 9999999), + (303, 3, 999, 1500000, 9999999), + (401, 4, 2000000011, 1585249, 2000000011), + (402, 4, 999, 1500000, 9999999), + (403, 4, 999, 1500000, 9999999), + (501, 5, 999, 1585249, 1585261), + (502, 5, 999, 1500000, 9999999), + (503, 5, 999, 1500000, 9999999), ] }] @@ -155,3 +190,16 @@ def test_conflicting_hpo_id(self): 'cdr_cleaner.cleaning_rules.deid.conflicting_hpo_state_generalization.PIPELINE_TABLES', self.dataset_id): self.default_test(tables_and_counts) + + # Checking if the sandboxed records are also expected. + self.assertTableValuesMatch( + f'{self.project_id}.{self.sandbox_id}.{MAP_TABLE_NAME}', + ['person_id', 'src_id'], [(1, 'bar 001'), (2, 'bar 001'), + (2, 'bar 002'), (3, 'bar 003'), + (4, 'bar 001'), (4, 'bar 003')]) + self.assertTableValuesMatch( + f'{self.project_id}.{self.sandbox_id}.{self.rule_instance.sandbox_table_for(f"{OBSERVATION}_identifier")}', + [ + 'observation_id', 'person_id', 'src_id', + 'value_source_concept_id' + ], [(301, 3, 'bar 003', 1585261), (401, 4, 'bar 003', 1585261)]) diff --git a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/deid/fitbit_deid_src_id_test.py b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/deid/fitbit_deid_src_id_test.py index 80a4d2b6fc..13d11e1d09 100644 --- a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/deid/fitbit_deid_src_id_test.py +++ b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/deid/fitbit_deid_src_id_test.py @@ -15,11 +15,16 @@ # Project Imports from app_identity import PROJECT_ID -from common import JINJA_ENV, FITBIT_TABLES, SITE_MASKING_TABLE_ID +from common import (ACTIVITY_SUMMARY, DEVICE, FITBIT_TABLES, + HEART_RATE_INTRADAY, HEART_RATE_SUMMARY, JINJA_ENV, + SITE_MASKING_TABLE_ID, SLEEP_DAILY_SUMMARY, SLEEP_LEVEL, + STEPS_INTRADAY) from cdr_cleaner.cleaning_rules.deid.fitbit_deid_src_id import FitbitDeidSrcID from tests.integration_tests.data_steward.cdr_cleaner.cleaning_rules.bigquery_tests_base import BaseTest -ACTIVITY_SUMMARY_TEMPLATE = JINJA_ENV.from_string(""" +TEMPLATE_DICT = { + ACTIVITY_SUMMARY: + JINJA_ENV.from_string(""" INSERT INTO `{{project_id}}.{{dataset_id}}.{{fitbit_table}}` (person_id, activity_calories, date, src_id) @@ -29,9 +34,9 @@ (2345, 500, date('2020-08-17'), 'pt'), (6789, 800, date('2020-08-17'), 'tp'), 
(3456, 1000, date('2020-08-17'), 'pt') -""") - -HEART_RATE_MINUTE_LEVEL_TEMPLATE = JINJA_ENV.from_string(""" +"""), + HEART_RATE_INTRADAY: + JINJA_ENV.from_string(""" INSERT INTO `{{project_id}}.{{dataset_id}}.{{fitbit_table}}` (person_id, heart_rate_value, datetime, src_id) @@ -41,9 +46,9 @@ (2345, 55, (DATETIME '2020-08-17 16:00:00'), 'pt'), (6789, 40, (DATETIME '2020-08-17 16:30:00'), 'tp'), (3456, 65, (DATETIME '2020-08-17 17:00:00'), 'pt') -""") - -HEART_RATE_SUMMARY_TEMPLATE = JINJA_ENV.from_string(""" +"""), + HEART_RATE_SUMMARY: + JINJA_ENV.from_string(""" INSERT INTO `{{project_id}}.{{dataset_id}}.{{fitbit_table}}` (person_id, date, calorie_count, src_id) @@ -53,9 +58,9 @@ (2345, date('2020-08-17'), 500, 'pt'), (6789, date('2020-08-17'), 800, 'tp'), (3456, date('2020-08-17'), 1000, 'pt') -""") - -STEPS_INTRADAY_TEMPLATE = JINJA_ENV.from_string(""" +"""), + STEPS_INTRADAY: + JINJA_ENV.from_string(""" INSERT INTO `{{project_id}}.{{dataset_id}}.{{fitbit_table}}` (person_id, steps, datetime, src_id) @@ -65,9 +70,9 @@ (2345, 55, (DATETIME '2020-08-17 16:00:00'), 'pt'), (6789, 40, (DATETIME '2020-08-17 16:30:00'), 'tp'), (3456, 65, (DATETIME '2020-08-17 17:00:00'), 'pt') -""") - -SLEEP_DAILY_SUMMARY_TEMPLATE = JINJA_ENV.from_string(""" +"""), + SLEEP_DAILY_SUMMARY: + JINJA_ENV.from_string(""" INSERT INTO `{{project_id}}.{{dataset_id}}.{{fitbit_table}}` (person_id, sleep_date, minute_in_bed, src_id) @@ -77,9 +82,9 @@ (2345, date('2020-08-17'), 745, 'pt'), (6789, date('2020-08-17'), 605, 'tp'), (3456, date('2020-08-17'), 578, 'pt') -""") - -SLEEP_LEVEL_TEMPLATE = JINJA_ENV.from_string(""" +"""), + SLEEP_LEVEL: + JINJA_ENV.from_string(""" INSERT INTO `{{project_id}}.{{dataset_id}}.{{fitbit_table}}` (person_id, sleep_date, duration_in_min, src_id) @@ -89,9 +94,9 @@ (2345, date('2020-08-17'), 22, 'pt'), (6789, date('2020-08-17'), 56, 'tp'), (3456, date('2020-08-17'), 12, 'pt') -""") - -DEVICE_TEMPLATE = JINJA_ENV.from_string(""" +"""), + DEVICE: + JINJA_ENV.from_string(""" INSERT INTO `{{project_id}}.{{dataset_id}}.{{fitbit_table}}` (person_id, device_date, battery, src_id) @@ -102,6 +107,7 @@ (6789, date('2020-08-17'), "Medium", 'tp'), (3456, date('2020-08-17'), "Medium", 'pt') """) +} SITE_MASKINGS_TEMPLATE = JINJA_ENV.from_string(""" INSERT INTO @@ -164,16 +170,13 @@ def setUp(self): # Insert test records into fitbit tables fitbit_test_queries = [] - TEMPLATES = [ - ACTIVITY_SUMMARY_TEMPLATE, HEART_RATE_MINUTE_LEVEL_TEMPLATE, - HEART_RATE_SUMMARY_TEMPLATE, STEPS_INTRADAY_TEMPLATE, - SLEEP_DAILY_SUMMARY_TEMPLATE, SLEEP_LEVEL_TEMPLATE, DEVICE_TEMPLATE - ] - for table, template in zip(FITBIT_TABLES, TEMPLATES): - test_data_query = template.render(project_id=self.project_id, - dataset_id=self.dataset_id, - fitbit_table=table) - fitbit_test_queries.append(test_data_query) + for table in FITBIT_TABLES: + template = TEMPLATE_DICT.get(table) + if template: + test_data_query = template.render(project_id=self.project_id, + dataset_id=self.dataset_id, + fitbit_table=table) + fitbit_test_queries.append(test_data_query) # Load test data self.load_test_data([site_maskings_query] + fitbit_test_queries) @@ -184,161 +187,147 @@ def test_field_cleaning(self): """ # Expected results list - tables_and_counts = [ - { - 'fq_table_name': - self.fq_table_names[0], # ACTIVITY_SUMMARY - 'fq_sandbox_table_name': - None, - 'fields': ['person_id', 'activity_calories', 'date', 'src_id'], - 'loaded_ids': [1234, 5678, 2345, 6789, 3456], - 'sandboxed_ids': [], - 'cleaned_values': [ - (1234, 100, 
datetime.fromisoformat('2020-08-17').date(), - 'Participant Portal 2'), - (5678, 200, datetime.fromisoformat('2020-08-17').date(), - 'Participant Portal 1'), - (2345, 500, datetime.fromisoformat('2020-08-17').date(), - 'Participant Portal 2'), - (6789, 800, datetime.fromisoformat('2020-08-17').date(), - 'Participant Portal 1'), - (3456, 1000, datetime.fromisoformat('2020-08-17').date(), - 'Participant Portal 2') - ] - }, - { - 'fq_table_name': - self.fq_table_names[1], # HEART_RATE_MINUTE_LEVEL - 'fq_sandbox_table_name': - None, - 'fields': [ - 'person_id', 'heart_rate_value', 'datetime', 'src_id' - ], - 'loaded_ids': [1234, 5678, 2345, 6789, 3456], - 'sandboxed_ids': [], - 'cleaned_values': [ - (1234, 60, datetime.fromisoformat('2020-08-17 15:00:00'), - 'Participant Portal 2'), - (5678, 50, datetime.fromisoformat('2020-08-17 15:30:00'), - 'Participant Portal 1'), - (2345, 55, datetime.fromisoformat('2020-08-17 16:00:00'), - 'Participant Portal 2'), - (6789, 40, datetime.fromisoformat('2020-08-17 16:30:00'), - 'Participant Portal 1'), - (3456, 65, datetime.fromisoformat('2020-08-17 17:00:00'), - 'Participant Portal 2') - ] - }, - { - 'fq_table_name': - self.fq_table_names[2], # HEART_RATE_SUMMARY - 'fq_sandbox_table_name': - None, - 'fields': ['person_id', 'date', 'calorie_count', 'src_id'], - 'loaded_ids': [1234, 5678, 2345, 6789, 3456], - 'sandboxed_ids': [], - 'cleaned_values': [ - (1234, datetime.fromisoformat('2020-08-17').date(), 100, - 'Participant Portal 2'), - (5678, datetime.fromisoformat('2020-08-17').date(), 200, - 'Participant Portal 1'), - (2345, datetime.fromisoformat('2020-08-17').date(), 500, - 'Participant Portal 2'), - (6789, datetime.fromisoformat('2020-08-17').date(), 800, - 'Participant Portal 1'), - (3456, datetime.fromisoformat('2020-08-17').date(), 1000, - 'Participant Portal 2') - ] - }, - { - 'fq_table_name': - self.fq_table_names[3], # STEPS_INTRADAY - 'fq_sandbox_table_name': - None, - 'fields': ['person_id', 'steps', 'datetime', 'src_id'], - 'loaded_ids': [1234, 5678, 2345, 6789, 3456], - 'sandboxed_ids': [], - 'cleaned_values': [ - (1234, 60, datetime.fromisoformat('2020-08-17 15:00:00'), - 'Participant Portal 2'), - (5678, 50, datetime.fromisoformat('2020-08-17 15:30:00'), - 'Participant Portal 1'), - (2345, 55, datetime.fromisoformat('2020-08-17 16:00:00'), - 'Participant Portal 2'), - (6789, 40, datetime.fromisoformat('2020-08-17 16:30:00'), - 'Participant Portal 1'), - (3456, 65, datetime.fromisoformat('2020-08-17 17:00:00'), - 'Participant Portal 2') - ] - }, - { - 'fq_table_name': - self.fq_table_names[4], # SLEEP_DAILY_SUMMARY - 'fq_sandbox_table_name': - None, - 'fields': [ - 'person_id', 'sleep_date', 'minute_in_bed', 'src_id' - ], - 'loaded_ids': [1234, 5678, 2345, 6789, 3456], - 'sandboxed_ids': [], - 'cleaned_values': [ - (1234, datetime.fromisoformat('2020-08-17').date(), 502, - 'Participant Portal 2'), - (5678, datetime.fromisoformat('2020-08-17').date(), 443, - 'Participant Portal 1'), - (2345, datetime.fromisoformat('2020-08-17').date(), 745, - 'Participant Portal 2'), - (6789, datetime.fromisoformat('2020-08-17').date(), 605, - 'Participant Portal 1'), - (3456, datetime.fromisoformat('2020-08-17').date(), 578, - 'Participant Portal 2') - ] - }, - { - 'fq_table_name': - self.fq_table_names[5], # SLEEP_LEVEL - 'fq_sandbox_table_name': - None, - 'fields': [ - 'person_id', 'sleep_date', 'duration_in_min', 'src_id' - ], - 'loaded_ids': [1234, 5678, 2345, 6789, 3456], - 'sandboxed_ids': [], - 'cleaned_values': [ - (1234, 
datetime.fromisoformat('2020-08-17').date(), 42, - 'Participant Portal 2'), - (5678, datetime.fromisoformat('2020-08-17').date(), 15, - 'Participant Portal 1'), - (2345, datetime.fromisoformat('2020-08-17').date(), 22, - 'Participant Portal 2'), - (6789, datetime.fromisoformat('2020-08-17').date(), 56, - 'Participant Portal 1'), - (3456, datetime.fromisoformat('2020-08-17').date(), 12, - 'Participant Portal 2') - ] - }, - { - 'fq_table_name': - self.fq_table_names[6], # DEVICE - 'fq_sandbox_table_name': - None, - 'fields': ['person_id', 'device_date', 'battery', 'src_id'], - 'loaded_ids': [1234, 5678, 2345, 6789, 3456], - 'sandboxed_ids': [], - 'cleaned_values': [ - (1234, datetime.fromisoformat('2020-08-17').date(), - "Medium", 'Participant Portal 2'), - (5678, datetime.fromisoformat('2020-08-17').date(), - "Medium", 'Participant Portal 1'), - (2345, datetime.fromisoformat('2020-08-17').date(), - "Medium", 'Participant Portal 2'), - (6789, datetime.fromisoformat('2020-08-17').date(), - "Medium", 'Participant Portal 1'), - (3456, datetime.fromisoformat('2020-08-17').date(), - "Medium", 'Participant Portal 2') - ] - } - ] + tables_and_counts = [{ + 'fq_table_name': + '.'.join([self.dataset_id, ACTIVITY_SUMMARY]), + 'fq_sandbox_table_name': + None, + 'fields': ['person_id', 'activity_calories', 'date', 'src_id'], + 'loaded_ids': [1234, 5678, 2345, 6789, 3456], + 'sandboxed_ids': [], + 'cleaned_values': [ + (1234, 100, datetime.fromisoformat('2020-08-17').date(), + 'Participant Portal 2'), + (5678, 200, datetime.fromisoformat('2020-08-17').date(), + 'Participant Portal 1'), + (2345, 500, datetime.fromisoformat('2020-08-17').date(), + 'Participant Portal 2'), + (6789, 800, datetime.fromisoformat('2020-08-17').date(), + 'Participant Portal 1'), + (3456, 1000, datetime.fromisoformat('2020-08-17').date(), + 'Participant Portal 2') + ] + }, { + 'fq_table_name': + '.'.join([self.dataset_id, HEART_RATE_INTRADAY]), + 'fq_sandbox_table_name': + None, + 'fields': ['person_id', 'heart_rate_value', 'datetime', 'src_id'], + 'loaded_ids': [1234, 5678, 2345, 6789, 3456], + 'sandboxed_ids': [], + 'cleaned_values': [ + (1234, 60, datetime.fromisoformat('2020-08-17 15:00:00'), + 'Participant Portal 2'), + (5678, 50, datetime.fromisoformat('2020-08-17 15:30:00'), + 'Participant Portal 1'), + (2345, 55, datetime.fromisoformat('2020-08-17 16:00:00'), + 'Participant Portal 2'), + (6789, 40, datetime.fromisoformat('2020-08-17 16:30:00'), + 'Participant Portal 1'), + (3456, 65, datetime.fromisoformat('2020-08-17 17:00:00'), + 'Participant Portal 2') + ] + }, { + 'fq_table_name': + '.'.join([self.dataset_id, HEART_RATE_SUMMARY]), + 'fq_sandbox_table_name': + None, + 'fields': ['person_id', 'date', 'calorie_count', 'src_id'], + 'loaded_ids': [1234, 5678, 2345, 6789, 3456], + 'sandboxed_ids': [], + 'cleaned_values': [ + (1234, datetime.fromisoformat('2020-08-17').date(), 100, + 'Participant Portal 2'), + (5678, datetime.fromisoformat('2020-08-17').date(), 200, + 'Participant Portal 1'), + (2345, datetime.fromisoformat('2020-08-17').date(), 500, + 'Participant Portal 2'), + (6789, datetime.fromisoformat('2020-08-17').date(), 800, + 'Participant Portal 1'), + (3456, datetime.fromisoformat('2020-08-17').date(), 1000, + 'Participant Portal 2') + ] + }, { + 'fq_table_name': + '.'.join([self.dataset_id, STEPS_INTRADAY]), + 'fq_sandbox_table_name': + None, + 'fields': ['person_id', 'steps', 'datetime', 'src_id'], + 'loaded_ids': [1234, 5678, 2345, 6789, 3456], + 'sandboxed_ids': [], + 'cleaned_values': [ + (1234, 60, 
datetime.fromisoformat('2020-08-17 15:00:00'), + 'Participant Portal 2'), + (5678, 50, datetime.fromisoformat('2020-08-17 15:30:00'), + 'Participant Portal 1'), + (2345, 55, datetime.fromisoformat('2020-08-17 16:00:00'), + 'Participant Portal 2'), + (6789, 40, datetime.fromisoformat('2020-08-17 16:30:00'), + 'Participant Portal 1'), + (3456, 65, datetime.fromisoformat('2020-08-17 17:00:00'), + 'Participant Portal 2') + ] + }, { + 'fq_table_name': + '.'.join([self.dataset_id, SLEEP_DAILY_SUMMARY]), + 'fq_sandbox_table_name': + None, + 'fields': ['person_id', 'sleep_date', 'minute_in_bed', 'src_id'], + 'loaded_ids': [1234, 5678, 2345, 6789, 3456], + 'sandboxed_ids': [], + 'cleaned_values': [ + (1234, datetime.fromisoformat('2020-08-17').date(), 502, + 'Participant Portal 2'), + (5678, datetime.fromisoformat('2020-08-17').date(), 443, + 'Participant Portal 1'), + (2345, datetime.fromisoformat('2020-08-17').date(), 745, + 'Participant Portal 2'), + (6789, datetime.fromisoformat('2020-08-17').date(), 605, + 'Participant Portal 1'), + (3456, datetime.fromisoformat('2020-08-17').date(), 578, + 'Participant Portal 2') + ] + }, { + 'fq_table_name': + '.'.join([self.dataset_id, SLEEP_LEVEL]), + 'fq_sandbox_table_name': + None, + 'fields': ['person_id', 'sleep_date', 'duration_in_min', 'src_id'], + 'loaded_ids': [1234, 5678, 2345, 6789, 3456], + 'sandboxed_ids': [], + 'cleaned_values': [ + (1234, datetime.fromisoformat('2020-08-17').date(), 42, + 'Participant Portal 2'), + (5678, datetime.fromisoformat('2020-08-17').date(), 15, + 'Participant Portal 1'), + (2345, datetime.fromisoformat('2020-08-17').date(), 22, + 'Participant Portal 2'), + (6789, datetime.fromisoformat('2020-08-17').date(), 56, + 'Participant Portal 1'), + (3456, datetime.fromisoformat('2020-08-17').date(), 12, + 'Participant Portal 2') + ] + }, { + 'fq_table_name': + '.'.join([self.dataset_id, DEVICE]), + 'fq_sandbox_table_name': + None, + 'fields': ['person_id', 'device_date', 'battery', 'src_id'], + 'loaded_ids': [1234, 5678, 2345, 6789, 3456], + 'sandboxed_ids': [], + 'cleaned_values': [ + (1234, datetime.fromisoformat('2020-08-17').date(), "Medium", + 'Participant Portal 2'), + (5678, datetime.fromisoformat('2020-08-17').date(), "Medium", + 'Participant Portal 1'), + (2345, datetime.fromisoformat('2020-08-17').date(), "Medium", + 'Participant Portal 2'), + (6789, datetime.fromisoformat('2020-08-17').date(), "Medium", + 'Participant Portal 1'), + (3456, datetime.fromisoformat('2020-08-17').date(), "Medium", + 'Participant Portal 2') + ] + }] # mock the PIPELINE_TABLES with mock.patch( diff --git a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/deid/fitbit_pid_rid_map_test.py b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/deid/fitbit_pid_rid_map_test.py index 39e333ee58..07d49ce75e 100644 --- a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/deid/fitbit_pid_rid_map_test.py +++ b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/deid/fitbit_pid_rid_map_test.py @@ -6,7 +6,7 @@ from app_identity import PROJECT_ID import cdr_cleaner.cleaning_rules.deid.fitbit_pid_rid_map as pr from tests.integration_tests.data_steward.cdr_cleaner.cleaning_rules.bigquery_tests_base import BaseTest -from common import ACTIVITY_SUMMARY, HEART_RATE_SUMMARY, HEART_RATE_MINUTE_LEVEL, STEPS_INTRADAY, SLEEP_DAILY_SUMMARY, SLEEP_LEVEL, DEVICE, DEID_MAP +from common import ACTIVITY_SUMMARY, HEART_RATE_SUMMARY, HEART_RATE_INTRADAY, STEPS_INTRADAY, SLEEP_DAILY_SUMMARY, SLEEP_LEVEL, DEVICE, DEID_MAP 
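Throughout the fitbit test updates, the assertions stop hard-coding positions into fq_sandbox_table_names and instead pick the sandbox table whose name contains the table under test, so the expectations survive any reordering of the sandbox list. A minimal sketch of that pattern factored into a helper (the helper name is illustrative, not part of the codebase):

def sandbox_table_named(fq_sandbox_table_names, table):
    """Return the first fully qualified sandbox table mentioning `table`.

    Substring matching keeps the expectation stable even when sandbox
    tables are generated in a different order than the table list.
    """
    matches = [name for name in fq_sandbox_table_names if table in name]
    if not matches:
        raise LookupError(f'no sandbox table found for {table}')
    return matches[0]

This is the same `[sb_name for sb_name in self.fq_sandbox_table_names if TABLE in sb_name][0]` comprehension repeated in the hunks below.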
class FitbitPIDtoRIDTest(BaseTest.CleaningRulesTestBase): @@ -105,7 +105,7 @@ def test_field_cleaning(self): (3456, 65, (DATETIME '2020-08-17 17:00:00')), (3456, 70, (DATETIME '2020-08-18 17:00:00'))""").render( fq_dataset_name=self.fq_dataset_name, - fitbit_table=HEART_RATE_MINUTE_LEVEL) + fitbit_table=HEART_RATE_INTRADAY) queries.append(hr_query) hrs_query = self.jinja_env.from_string(""" @@ -198,8 +198,10 @@ def test_field_cleaning(self): tables_and_counts = [{ 'fq_table_name': '.'.join([self.fq_dataset_name, ACTIVITY_SUMMARY]), - 'fq_sandbox_table_name': - self.fq_sandbox_table_names[0], + 'fq_sandbox_table_name': [ + sb_name for sb_name in self.fq_sandbox_table_names + if ACTIVITY_SUMMARY in sb_name + ][0], 'fields': ['person_id', 'activity_calories', 'date'], 'loaded_ids': [1234, 5678, 2345, 6789, 3456, 3456], 'sandboxed_ids': [3456], @@ -211,9 +213,11 @@ def test_field_cleaning(self): ] }, { 'fq_table_name': - '.'.join([self.fq_dataset_name, HEART_RATE_MINUTE_LEVEL]), - 'fq_sandbox_table_name': - self.fq_sandbox_table_names[1], + '.'.join([self.fq_dataset_name, HEART_RATE_INTRADAY]), + 'fq_sandbox_table_name': [ + sb_name for sb_name in self.fq_sandbox_table_names + if HEART_RATE_INTRADAY in sb_name + ][0], 'fields': ['person_id', 'heart_rate_value', 'datetime'], 'loaded_ids': [1234, 5678, 2345, 6789, 3456, 3456], 'sandboxed_ids': [3456], @@ -226,8 +230,10 @@ def test_field_cleaning(self): }, { 'fq_table_name': '.'.join([self.fq_dataset_name, HEART_RATE_SUMMARY]), - 'fq_sandbox_table_name': - self.fq_sandbox_table_names[2], + 'fq_sandbox_table_name': [ + sb_name for sb_name in self.fq_sandbox_table_names + if HEART_RATE_SUMMARY in sb_name + ][0], 'fields': ['person_id', 'date', 'calorie_count'], 'loaded_ids': [1234, 5678, 2345, 6789, 3456], 'sandboxed_ids': [3456], @@ -240,8 +246,10 @@ def test_field_cleaning(self): }, { 'fq_table_name': '.'.join([self.fq_dataset_name, STEPS_INTRADAY]), - 'fq_sandbox_table_name': - self.fq_sandbox_table_names[3], + 'fq_sandbox_table_name': [ + sb_name for sb_name in self.fq_sandbox_table_names + if STEPS_INTRADAY in sb_name + ][0], 'fields': ['person_id', 'datetime', 'steps'], 'loaded_ids': [1234, 5678, 2345, 6789, 3456], 'sandboxed_ids': [3456], @@ -254,8 +262,10 @@ def test_field_cleaning(self): }, { 'fq_table_name': '.'.join([self.fq_dataset_name, SLEEP_DAILY_SUMMARY]), - 'fq_sandbox_table_name': - self.fq_sandbox_table_names[4], + 'fq_sandbox_table_name': [ + sb_name for sb_name in self.fq_sandbox_table_names + if SLEEP_DAILY_SUMMARY in sb_name + ][0], 'fields': ['person_id', 'sleep_date', 'minute_in_bed'], 'loaded_ids': [1234, 5678, 2345, 6789, 3456], 'sandboxed_ids': [3456], @@ -268,8 +278,10 @@ def test_field_cleaning(self): }, { 'fq_table_name': '.'.join([self.fq_dataset_name, SLEEP_LEVEL]), - 'fq_sandbox_table_name': - self.fq_sandbox_table_names[5], + 'fq_sandbox_table_name': [ + sb_name for sb_name in self.fq_sandbox_table_names + if SLEEP_LEVEL in sb_name + ][0], 'fields': ['person_id', 'sleep_date', 'duration_in_min'], 'loaded_ids': [1234, 5678, 2345, 6789, 3456], 'sandboxed_ids': [3456], @@ -282,8 +294,10 @@ def test_field_cleaning(self): }, { 'fq_table_name': '.'.join([self.fq_dataset_name, DEVICE]), - 'fq_sandbox_table_name': - self.fq_sandbox_table_names[6], + 'fq_sandbox_table_name': [ + sb_name for sb_name in self.fq_sandbox_table_names + if DEVICE in sb_name + ][0], 'fields': ['person_id', 'device_date', 'battery'], 'loaded_ids': [1234, 5678, 2345, 6789, 3456], 'sandboxed_ids': [3456], diff --git 
a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/deid/suppress_year_of_birth_records_test.py b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/deid/suppress_year_of_birth_records_test.py index b28a5215ca..b373bb0a52 100644 --- a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/deid/suppress_year_of_birth_records_test.py +++ b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/deid/suppress_year_of_birth_records_test.py @@ -53,12 +53,13 @@ /* For this data, observation_id 2 and 5 should be dropped.*/ /* observation_id 2 is within the same year as participant's year of birth.*/ /* observation_id 5 is the last possible date that is less than participant's birth year + 2 (2005-01-01) */ + /* The CR should capture records regardless of nulls in nullable concept_id fields. */ VALUES (1, 1, '2020-06-01', '2020-06-01 00:00:00 UTC', 0, 0, 0, 0, 0, 0, 0), (2, 2, '2002-06-01', '2002-06-01 00:00:00 UTC', 0, 0, 0, 0, 0, 0, 0), (3, 3, '2020-03-01', '2020-03-01 00:00:00 UTC', 0, 0, 0, 0, 0, 0, 0), - (4, 4, '2020-01-05', '2020-01-05 00:00:00 UTC', 0, 0, 0, 0, 0, 0, 0), - (5, 3, '2004-12-31', '2004-12-31 00:00:00 UTC', 0, 0, 0, 0, 0, 0, 0) + (4, 4, '2020-01-05', '2020-01-05 00:00:00 UTC', 0, 0, 0, 0, 0, 0, NULL), + (5, 3, '2004-12-31', '2004-12-31 00:00:00 UTC', 0, 0, NULL, NULL, NULL, NULL, NULL) """) DEATH_DATA_TEMPLATE = JINJA_ENV.from_string(""" INSERT INTO `{{project_id}}.{{dataset_id}}.death` diff --git a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/remove_ehr_data_without_consent_test.py b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/remove_ehr_data_without_consent_test.py index f132633147..d2201ff9f8 100644 --- a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/remove_ehr_data_without_consent_test.py +++ b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/remove_ehr_data_without_consent_test.py @@ -28,7 +28,7 @@ """) VISIT_OCCURRENCE_DATA_TEMPLATE = JINJA_ENV.from_string(""" -insert into `{{project_id}}.{{dataset_id}}.visit_occurrence` +insert into `{{project_id}}.{{dataset_id}}.visit_occurrence` (visit_occurrence_id, person_id, visit_concept_id, @@ -59,7 +59,7 @@ """) OBSERVATION_DATA_TEMPLATE = JINJA_ENV.from_string(""" -insert into `{{project_id}}.{{dataset_id}}.observation` +INSERT INTO `{{project_id}}.{{dataset_id}}.observation` (observation_id, person_id, observation_concept_id, @@ -67,7 +67,7 @@ observation_datetime, observation_type_concept_id, value_source_concept_id, - observation_source_value ) + observation_source_value) VALUES (1, 1, 0, '2020-01-01', '2020-01-01 00:00:00 UTC', 0, 1586100, 'EHRConsentPII_ConsentPermission'), (2, 1, 0, '2021-01-02', '2021-01-02 00:00:00 UTC', 0, 1586100, 'EHRConsentPII_ConsentPermission'), (3, 1, 0, '2020-05-01', '2020-05-01 00:00:00 UTC', 0, 123, 'test_value_0'), @@ -75,11 +75,13 @@ (5, 2, 0, '2020-01-05', '2020-01-05 00:00:00 UTC', 0, 345, 'test_value_2'), (6, 2, 0, '2020-05-05', '2020-05-05 00:00:00 UTC', 0, 456, 'test_value_3'), (7, 3, 0, '2020-01-01', '2020-01-01 00:00:00 UTC', 0, 1586100, 'EHRConsentPII_ConsentPermission'), - (8, 4, 0, '2021-01-02', '2021-01-02 00:00:00 UTC', 0, 1586100, 'EHRConsentPII_ConsentPermission') + (8, 4, 0, '2021-01-02', '2021-01-02 00:00:00 UTC', 0, 1586100, 'EHRConsentPII_ConsentPermission'), + (9, 5, 0, '2023-09-29', '2023-09-29 09:34:13 UTC', 0, 123, 'test_value_3'), + (10, 5, 0, '2023-09-29', '2023-09-29 09:34:13 UTC', 0, 1586100, 'EHRConsentPII_ConsentPermission') """) 
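The two rows just added (observation_id 9 and 10) seed person_id 5, whose consent_validation entries further down carry a NULL status; the participant is nonetheless expected to keep their EHR rows because they appear in the new duplicates lookup table. A rough sketch of the exclusion this implies, in the same JINJA_ENV style as the surrounding templates (illustrative only, not the cleaning rule's actual SQL; the table-name parameters are placeholders):

DUPLICATES_EXEMPTION_SKETCH = JINJA_ENV.from_string("""
-- participants flagged as known duplicates are exempt from the
-- unconsented-EHR cleanup, which is why person_id 5 is not sandboxed
SELECT person_id
FROM `{{project_id}}.{{sandbox_id}}.{{unconsented_lookup}}`
WHERE person_id NOT IN (
    SELECT person_id
    FROM `{{project_id}}.{{duplicates_dataset}}.{{duplicates_table}}`)
""")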
MAPPING_OBSERVATION_TEMPLATE = JINJA_ENV.from_string(""" -insert into `{{project_id}}.{{dataset_id}}._mapping_observation` +INSERT INTO `{{project_id}}.{{dataset_id}}._mapping_observation` (observation_id, src_dataset_id) VALUES (1, 'rdr2021'), (2, 'rdr2021'), @@ -88,13 +90,33 @@ (5, 'unioned_ehr'), (6, 'rdr2021'), (7, 'unioned_ehr'), - (8, 'unioned_ehr') + (8, 'unioned_ehr'), + (9, 'unioned_ehr'), + (10, 'rdr2023') +""") + +DUPLICATE_DATASET_CREATION = JINJA_ENV.from_string(""" +CREATE OR REPLACE TABLE `{{project_id}}.{{duplicates_dataset}}.{{duplicates_table}}` ( + person_id INT, + hpo_id INT, + src_id STRING, + consent_for_study_enrollment_authored DATE, + withdrawal_status STRING +) +""") + +DUPLICATE_RECORDS_TEMPLATE = JINJA_ENV.from_string(""" +INSERT INTO `{{project_id}}.{{duplicates_dataset}}.{{duplicates_table}}` +(person_id, hpo_id, src_id, consent_for_study_enrollment_authored, withdrawal_status) +VALUES ( + 5, 0, '42', (DATE '2023-09-27'), 'UNKNOWN' +) """) CONSENT_VALIDATION_TEMPLATE = JINJA_ENV.from_string(""" -insert into `{{project_id}}.{{dataset_id}}.consent_validation` +INSERT INTO `{{project_id}}.{{dataset_id}}.consent_validation` (person_id, research_id, consent_for_electronic_health_records, consent_for_electronic_health_records_authored, src_id) -VALUES +VALUES -- validated consent with varying casing, not cleaned -- (1, 0, 'Submitted', (DATETIME '2018-11-26 00:00:00'), 'rdr'), -- validated consent but no consent record in observation, cleaned -- @@ -103,7 +125,10 @@ (3, 0, 'Submitted_No', (DATETIME '2018-11-26 00:00:00'), 'rdr'), (3, 0, 'Submitted', (DATETIME '2018-11-26 00:00:00'), 'rdr'), -- null status. invalid consent, cleaned -- - (4, 0, NULL, (DATETIME '2018-11-26 00:00:00'), 'rdr') + (4, 0, NULL, (DATETIME '2018-11-26 00:00:00'), 'rdr'), + -- duplicate records -- + (5, 0, NULL, (DATETIME '2023-07-30 09:41:24'), 'rdr'), + (5, 0, NULL, (DATETIME '2023-07-30 09:41:24'), 'rdr') """) @@ -123,10 +148,17 @@ def setUpClass(cls): # Set the expected test datasets cls.dataset_id = os.environ.get('COMBINED_DATASET_ID') cls.sandbox_id = cls.dataset_id + '_sandbox' + cls.duplicates_dataset = 'duplicates_dataset' + cls.duplicates_table = 'duplicates_table' - cls.rule_instance = RemoveEhrDataWithoutConsent(cls.project_id, - cls.dataset_id, - cls.sandbox_id) + cls.rule_instance = RemoveEhrDataWithoutConsent( + cls.project_id, + cls.dataset_id, + cls.sandbox_id, + table_namer=None, + ehr_duplicates_dataset=cls.duplicates_dataset, + ehr_duplicates_table=cls.duplicates_table, + ) # Generates list of fully qualified table names and their corresponding sandbox table names cls.fq_table_names.extend([ @@ -136,6 +168,7 @@ def setUpClass(cls): f'{cls.project_id}.{cls.dataset_id}._mapping_{OBSERVATION}', f'{cls.project_id}.{cls.dataset_id}._mapping_{VISIT_OCCURRENCE}', f'{cls.project_id}.{cls.dataset_id}.{EHR_CONSENT_VALIDATION}', + f'{cls.project_id}.{cls.duplicates_dataset}.{cls.duplicates_table}' ]) cls.fq_sandbox_table_names.extend([ f'{cls.project_id}.{cls.sandbox_id}.{cls.rule_instance.issue_numbers[0].lower()}_{OBSERVATION}', @@ -143,6 +176,9 @@ def setUpClass(cls): f'{cls.project_id}.{cls.sandbox_id}.{EHR_UNCONSENTED_PARTICIPANTS_LOOKUP_TABLE}' ]) + cls.kwargs['ehr_duplicates_dataset'] = cls.duplicates_dataset + cls.kwargs['ehr_duplicates_table'] = cls.duplicates_table + # call super to set up the client, create datasets cls.up_class = super().setUpClass() @@ -167,11 +203,25 @@ def setUp(self): consent_validation_query = CONSENT_VALIDATION_TEMPLATE.render( 
            project_id=self.project_id, dataset_id=self.dataset_id)

+        duplicates_creation_query = DUPLICATE_DATASET_CREATION.render(
+            project_id=self.project_id,
+            duplicates_dataset=self.duplicates_dataset,
+            duplicates_table=self.duplicates_table)
+        duplicates_data_query = DUPLICATE_RECORDS_TEMPLATE.render(
+            project_id=self.project_id,
+            duplicates_dataset=self.duplicates_dataset,
+            duplicates_table=self.duplicates_table)
+
         # Load test data
         self.load_test_data([
-            person_data_query, visit_occurrence_data_query,
-            observation_data_query, mapping_observation_query,
-            mapping_visit_query, consent_validation_query
+            person_data_query,
+            visit_occurrence_data_query,
+            observation_data_query,
+            mapping_observation_query,
+            mapping_visit_query,
+            consent_validation_query,
+            duplicates_creation_query,  # create before insert
+            duplicates_data_query,
         ])

     def test_remove_ehr_data_without_consent(self):
@@ -183,6 +233,8 @@ def test_remove_ehr_data_without_consent(self):
         3. person_id=3. has a invalid, affirmative consent record.

         4. person_id=4. has a invalid(null status), affirmative consent record.
+
+        5. person_id=5. has duplicate consent_validation records and is listed in the EHR duplicates table, so their EHR records are retained
         """

         # Expected results list
@@ -200,7 +252,7 @@
             f'{self.project_id}.{self.dataset_id}.{OBSERVATION}',
             'fq_sandbox_table_name':
             f'{self.project_id}.{self.sandbox_id}.{self.rule_instance.sandbox_table_for(OBSERVATION)}',
-            'loaded_ids': [1, 2, 3, 4, 5, 6, 7, 8],
+            'loaded_ids': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
             'sandboxed_ids': [4, 5, 7, 8],
             'fields': [
                 'observation_id', 'person_id', 'value_source_concept_id',
@@ -209,7 +261,9 @@
             'cleaned_values': [
                 (1, 1, 1586100, 'EHRConsentPII_ConsentPermission'),
                 (2, 1, 1586100, 'EHRConsentPII_ConsentPermission'),
-                (3, 1, 123, 'test_value_0'), (6, 2, 456, 'test_value_3')
+                (3, 1, 123, 'test_value_0'), (6, 2, 456, 'test_value_3'),
+                (9, 5, 123, 'test_value_3'),
+                (10, 5, 1586100, 'EHRConsentPII_ConsentPermission')
             ]
         }]

diff --git a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/remove_non_matching_participant_test.py b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/remove_non_matching_participant_test.py
index 7338818fe4..adda74cca2 100644
--- a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/remove_non_matching_participant_test.py
+++ b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/remove_non_matching_participant_test.py
@@ -82,37 +82,37 @@
     INSERT INTO `{{fq_table_name}}`
     (observation_id, src_dataset_id, src_observation_id, src_hpo_id, src_table_id)
     VALUES
-        (10101, '{{rdr_dataset_id}}', 111, 'rdr', 'observation'),
+        (10101, '{{rdr_dataset_id}}', 111, 'ce', 'observation'),
         (10102, '{{ehr_dataset_id}}', 112, '{{hpo_1}}', 'observation'),
-        (10201, '{{rdr_dataset_id}}', 121, 'rdr', 'observation'),
+        (10201, '{{rdr_dataset_id}}', 121, 'ce', 'observation'),
         (10202, '{{ehr_dataset_id}}', 122, '{{hpo_1}}', 'observation'),
-        (10301, '{{rdr_dataset_id}}', 131, 'rdr', 'observation'),
+        (10301, '{{rdr_dataset_id}}', 131, 'vibrent', 'observation'),
         (10302, '{{ehr_dataset_id}}', 132, '{{hpo_1}}', 'observation'),
-        (10401, '{{rdr_dataset_id}}', 141, 'rdr', 'observation'),
+        (10401, '{{rdr_dataset_id}}', 141, 'vibrent', 'observation'),
         (10402, '{{ehr_dataset_id}}', 142, '{{hpo_1}}', 'observation'),
-        (20101, '{{rdr_dataset_id}}', 211, 'rdr', 'observation'),
+        (20101, '{{rdr_dataset_id}}', 211, 'vibrent', 'observation'),
         (20102, '{{ehr_dataset_id}}', 212, '{{hpo_2}}',
'observation'), - (20201, '{{rdr_dataset_id}}', 221, 'rdr', 'observation'), + (20201, '{{rdr_dataset_id}}', 221, 'ce', 'observation'), (20202, '{{ehr_dataset_id}}', 222, '{{hpo_2}}', 'observation'), - (20301, '{{rdr_dataset_id}}', 231, 'rdr', 'observation'), + (20301, '{{rdr_dataset_id}}', 231, 'ce', 'observation'), (20302, '{{ehr_dataset_id}}', 232, '{{hpo_2}}', 'observation'), - (20401, '{{rdr_dataset_id}}', 241, 'rdr', 'observation'), + (20401, '{{rdr_dataset_id}}', 241, 'healthpro', 'observation'), (20402, '{{ehr_dataset_id}}', 242, '{{hpo_2}}', 'observation'), - (30101, '{{rdr_dataset_id}}', 311, 'rdr', 'observation'), + (30101, '{{rdr_dataset_id}}', 311, 'healthpro', 'observation'), (30102, '{{ehr_dataset_id}}', 312, '{{hpo_3}}', 'observation'), - (30201, '{{rdr_dataset_id}}', 321, 'rdr', 'observation'), + (30201, '{{rdr_dataset_id}}', 321, 'vibrent', 'observation'), (30202, '{{ehr_dataset_id}}', 322, '{{hpo_3}}', 'observation'), - (30301, '{{rdr_dataset_id}}', 331, 'rdr', 'observation'), + (30301, '{{rdr_dataset_id}}', 331, 'vibrent', 'observation'), (30302, '{{ehr_dataset_id}}', 332, '{{hpo_3}}', 'observation'), - (30401, '{{rdr_dataset_id}}', 341, 'rdr', 'observation'), + (30401, '{{rdr_dataset_id}}', 341, 'ce', 'observation'), (30402, '{{ehr_dataset_id}}', 342, '{{hpo_3}}', 'observation'), - (40101, '{{rdr_dataset_id}}', 411, 'rdr', 'observation'), + (40101, '{{rdr_dataset_id}}', 411, 'healthpro', 'observation'), (40102, '{{ehr_dataset_id}}', 412, '{{hpo_4}}', 'observation'), - (40201, '{{rdr_dataset_id}}', 421, 'rdr', 'observation'), + (40201, '{{rdr_dataset_id}}', 421, 'healthpro', 'observation'), (40202, '{{ehr_dataset_id}}', 422, '{{hpo_4}}', 'observation'), - (40301, '{{rdr_dataset_id}}', 431, 'rdr', 'observation'), + (40301, '{{rdr_dataset_id}}', 431, 'vibrent', 'observation'), (40302, '{{ehr_dataset_id}}', 432, '{{hpo_4}}', 'observation'), - (40401, '{{rdr_dataset_id}}', 441, 'rdr', 'observation'), + (40401, '{{rdr_dataset_id}}', 441, 'vibrent', 'observation'), (40402, '{{ehr_dataset_id}}', 442, '{{hpo_4}}', 'observation') """), f'{HPO_1}_{IDENTITY_MATCH}': diff --git a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/remove_participant_data_past_deactivation_date_test.py b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/remove_participant_data_past_deactivation_date_test.py index 63cd7483b3..41ad9390a2 100644 --- a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/remove_participant_data_past_deactivation_date_test.py +++ b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/remove_participant_data_past_deactivation_date_test.py @@ -18,8 +18,8 @@ # Project imports from common import (AOU_DEATH, JINJA_ENV, OBSERVATION, DRUG_EXPOSURE, DEATH, - PERSON, SURVEY_CONDUCT, HEART_RATE_MINUTE_LEVEL, - SLEEP_LEVEL, STEPS_INTRADAY, DEVICE) + PERSON, SURVEY_CONDUCT, HEART_RATE_INTRADAY, SLEEP_LEVEL, + STEPS_INTRADAY, DEVICE) from app_identity import PROJECT_ID from cdr_cleaner.cleaning_rules.remove_participant_data_past_deactivation_date import ( RemoveParticipantDataPastDeactivationDate, DEACTIVATED_PARTICIPANTS, DATE, @@ -165,7 +165,7 @@ def setUp(self): (4, 3, 0, '2009-08-30 19:33:53 UTC', 0, 0, 0, 0, 0, 0), (5, 4, 0, '2009-08-30 19:33:53 UTC', 0, 0, 0, 0, 0, 0) """), - HEART_RATE_MINUTE_LEVEL: + HEART_RATE_INTRADAY: JINJA_ENV.from_string(""" INSERT INTO `{{table.project}}.{{table.dataset_id}}.{{table.table_id}}` (person_id, datetime, heart_rate_value) @@ -350,11 +350,11 @@ def test_removing_data_past_deactivated_date(self, 
mock_get_deact):
            'cleaned_values': [('a2',), ('a4',), ('b5',)]
        }, {
            'name':
-                HEART_RATE_MINUTE_LEVEL,
+                HEART_RATE_INTRADAY,
            'fq_table_name':
-                f'{self.project_id}.{self.dataset_id}.{HEART_RATE_MINUTE_LEVEL}',
+                f'{self.project_id}.{self.dataset_id}.{HEART_RATE_INTRADAY}',
            'fq_sandbox_table_name':
-                f'{self.project_id}.{self.sandbox_id}.{self.rule_instance.sandbox_table_for(HEART_RATE_MINUTE_LEVEL)}',
+                f'{self.project_id}.{self.sandbox_id}.{self.rule_instance.sandbox_table_for(HEART_RATE_INTRADAY)}',
            'fields': ['person_id', 'heart_rate_value'],
            'loaded_ids': [1, 1],
            'sandboxed_ids': [1],
diff --git a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/sandbox_and_remove_withdrawn_pids_test.py b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/sandbox_and_remove_withdrawn_pids_test.py
new file mode 100644
index 0000000000..7feee45712
--- /dev/null
+++ b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/sandbox_and_remove_withdrawn_pids_test.py
@@ -0,0 +1,316 @@
+"""
+Integration test for the SandboxAndRemoveWithdrawnPids module.
+"""
+# Python imports
+import os
+from datetime import datetime
+
+# Third party imports
+from google.cloud.bigquery import Table
+
+# Project Imports
+from app_identity import PROJECT_ID
+from common import JINJA_ENV, RDR_DATASET_ID, OBSERVATION, PERSON, AOU_DEATH
+from cdr_cleaner.cleaning_rules.sandbox_and_remove_withdrawn_pids import SandboxAndRemoveWithdrawnPids
+from tests.integration_tests.data_steward.cdr_cleaner.cleaning_rules.bigquery_tests_base import BaseTest
+
+OBSERVATION_TABLE_TEMPLATE = JINJA_ENV.from_string("""
+    INSERT INTO `{{project_id}}.{{dataset_id}}.observation`
+    (observation_id, person_id, observation_concept_id, observation_date, observation_type_concept_id)
+    VALUES
+    (10101, 101, 0, date('2022-01-01'), 0),
+    (10102, 101, 0, date('2022-01-01'), 0),
+    (10201, 102, 0, date('2022-01-02'), 0),
+    (10202, 102, 0, date('2022-01-02'), 0),
+    (10301, 103, 0, date('2022-01-03'), 0),
+    (10302, 103, 0, date('2022-01-03'), 0),
+    (10401, 104, 0, date('2022-01-04'), 0),
+    (10402, 104, 0, date('2022-01-04'), 0),
+    (20101, 201, 0, date('2022-01-01'), 0),
+    (20102, 201, 0, date('2022-01-01'), 0),
+    (20201, 202, 0, date('2022-01-02'), 0),
+    (20202, 202, 0, date('2022-01-02'), 0),
+    (20301, 203, 0, date('2022-01-03'), 0),
+    (20302, 203, 0, date('2022-01-03'), 0),
+    (20401, 204, 0, date('2022-01-04'), 0),
+    (20402, 204, 0, date('2022-01-04'), 0),
+    (30101, 301, 0, date('2022-01-01'), 0),
+    (30102, 301, 0, date('2022-01-01'), 0),
+    (30201, 302, 0, date('2022-01-02'), 0),
+    (30202, 302, 0, date('2022-01-02'), 0),
+    (30301, 303, 0, date('2022-01-03'), 0),
+    (30302, 303, 0, date('2022-01-03'), 0),
+    (30401, 304, 0, date('2022-01-04'), 0),
+    (30402, 304, 0, date('2022-01-04'), 0),
+    (40101, 401, 0, date('2022-01-01'), 0),
+    (40102, 401, 0, date('2022-01-01'), 0),
+    (40201, 402, 0, date('2022-01-02'), 0),
+    (40202, 402, 0, date('2022-01-02'), 0),
+    (40301, 403, 0, date('2022-01-03'), 0),
+    (40302, 403, 0, date('2022-01-03'), 0),
+    (40401, 404, 0, date('2022-01-04'), 0),
+    (40402, 404, 0, date('2022-01-04'), 0)
+""")
+
+PERSON_DATA_TEMPLATE = JINJA_ENV.from_string("""
+    INSERT INTO
+    `{{project_id}}.{{dataset_id}}.person`
+    (person_id, gender_concept_id, year_of_birth, race_concept_id, ethnicity_concept_id)
+    VALUES
+    (101, 0, 1991, 0, 0),
+    (102, 0, 1992, 0, 0),
+    (103, 0, 1993, 0, 0),
+    (104, 0, 1994, 0, 0),
+    (201, 0, 1991, 0, 0),
+    (202, 0, 1992, 0, 0),
+    (203, 0, 1993, 0, 0),
+    (204, 0, 1994, 0, 0),
+    (301, 0, 1991, 0, 0),
+
(302, 0, 1992, 0, 0), + (303, 0, 1993, 0, 0), + (304, 0, 1994, 0, 0), + (401, 0, 1991, 0, 0), + (402, 0, 1992, 0, 0), + (403, 0, 1993, 0, 0), + (404, 0, 1994, 0, 0) +""") + +AOU_DEATH_TEMPLATE = JINJA_ENV.from_string(""" + INSERT INTO + `{{project_id}}.{{dataset_id}}.aou_death` + (aou_death_id, person_id, death_date, death_type_concept_id, cause_concept_id, cause_source_concept_id, src_id, primary_death_record) + VALUES + ('a10101', 101, date('2020-05-05'), 0, 0, 0, 'Staff Portal: HealthPro', False), + ('a10202', 102, date('2020-05-05'), 0, 0, 0, 'Participant Portal 1', False), + ('a10301', 103, date('2020-05-05'), 0, 0, 0, 'Staff Portal: HealthPro', False), + ('a10402', 104, date('2020-05-05'), 0, 0, 0, 'Participant Portal 1', False), + ('a20102', 201, date('2020-05-05'), 0, 0, 0, 'Participant Portal 2', False), + ('a20202', 202, date('2020-05-05'), 0, 0, 0, 'Participant Portal 2', False), + ('a20302', 203, date('2020-05-05'), 0, 0, 0, 'Participant Portal 2', False), + ('a20401', 204, date('2020-05-05'), 0, 0, 0, 'Staff Portal: HealthPro', False), + ('a30101', 301, date('2020-05-05'), 0, 0, 0, 'Staff Portal: HealthPro', False), + ('a30202', 302, date('2020-05-05'), 0, 0, 0, 'Participant Portal 3', False), + ('a30302', 303, date('2020-05-05'), 0, 0, 0, 'Participant Portal 3', False), + ('a30401', 304, date('2020-05-05'), 0, 0, 0, 'Staff Portal: HealthPro', False), + ('a40101', 401, date('2020-05-05'), 0, 0, 0, 'Staff Portal: HealthPro', False), + ('a40202', 402, date('2020-05-05'), 0, 0, 0, 'Participant Portal 4', False), + ('a40301', 403, date('2020-05-05'), 0, 0, 0, 'Staff Portal: HealthPro', False), + ('a40401', 404, date('2020-05-05'), 0, 0, 0, 'Staff Portal: HealthPro', False) +""") + +LOOKUP_TABLE_TEMPLATE = JINJA_ENV.from_string(""" + INSERT INTO `{{project_id}}.{{dataset_id}}.{{lookup_table}}` + (person_id) + VALUES + (104), + (202), + (204), + (301), + (401), + (403) +""") + +LOOKUP_TABLE_SCHEMA = [{ + "type": "integer", + "name": "person_id", + "mode": "nullable" +}, { + "type": "integer", + "name": "hpo_id", + "mode": "nullable" +}, { + "type": "string", + "name": "src_id", + "mode": "nullable" +}, { + "type": "DATE", + "name": "consent_for_study_enrollment_authored", + "mode": "nullable" +}, { + "type": "string", + "name": "withdrawal_status", + "mode": "nullable" +}] + + +class SandboxAndRemovePidsListTest(BaseTest.CleaningRulesTestBase): + + @classmethod + def setUpClass(cls): + print('**************************************************************') + print(cls.__name__) + print('**************************************************************') + + super().initialize_class_vars() + + # Set the test project identifier + cls.project_id = os.environ.get(PROJECT_ID) + + # Set the expected test datasets + cls.dataset_id = RDR_DATASET_ID + cls.withdrawn_dups_table = 'pdr_withdrawals_list' + cls.sandbox_id = f'{cls.dataset_id}_sandbox' + + cls.kwargs = {'withdrawn_dups_table': cls.withdrawn_dups_table} + + # Instantiate class + cls.rule_instance = SandboxAndRemoveWithdrawnPids( + project_id=cls.project_id, + dataset_id=cls.dataset_id, + sandbox_dataset_id=cls.sandbox_id, + withdrawn_dups_table=cls.withdrawn_dups_table) + + # Generates list of fully qualified table names + affected_table_names = ['observation', 'person', 'aou_death'] + for table_name in affected_table_names: + cls.fq_table_names.append( + f'{cls.project_id}.{cls.dataset_id}.{table_name}') + + # Generates list of sandbox table names + for table_name in affected_table_names: + cls.fq_sandbox_table_names.append( + 
f'{cls.project_id}.{cls.sandbox_id}.{cls.rule_instance.sandbox_table_for(table_name)}' + ) + + # call super to set up the client, create datasets + cls.up_class = super().setUpClass() + + def setUp(self): + """ + Create tables and test data + """ + super().setUp() + + # Create a temp lookup_table in rdr dataset for testing + lookup_table_name = f'{self.project_id}.{self.dataset_id}.{self.withdrawn_dups_table}' + self.client.create_table(Table(lookup_table_name, LOOKUP_TABLE_SCHEMA)) + self.fq_table_names.append(lookup_table_name) + + # Build temp records lookup table query + lookup_table_query = LOOKUP_TABLE_TEMPLATE.render( + project_id=self.project_id, + dataset_id=self.dataset_id, + lookup_table=self.withdrawn_dups_table) + + # Build test data queries + observation_records_query = OBSERVATION_TABLE_TEMPLATE.render( + project_id=self.project_id, dataset_id=self.dataset_id) + person_records_query = PERSON_DATA_TEMPLATE.render( + project_id=self.project_id, dataset_id=self.dataset_id) + aou_death_records_query = AOU_DEATH_TEMPLATE.render( + project_id=self.project_id, dataset_id=self.dataset_id) + + table_test_queries = [ + observation_records_query, person_records_query, + aou_death_records_query + ] + + # Load test data + self.load_test_data([lookup_table_query] + table_test_queries) + + def test_sandbox_and_remove_pids_list(self): + """ + Validates that the data for participants in the lookup table has been removed. + """ + tables_and_counts = [{ + 'fq_table_name': + f'{self.project_id}.{self.dataset_id}.{OBSERVATION}', + 'fq_sandbox_table_name': + self.fq_sandbox_table_names[0], + 'fields': [ + 'observation_id', 'person_id', 'observation_concept_id', + 'observation_date', 'observation_type_concept_id' + ], + 'loaded_ids': [ + 10101, 10102, 10201, 10202, 10301, 10302, 10401, 10402, 20101, + 20102, 20201, 20202, 20301, 20302, 20401, 20402, 30101, 30102, + 30201, 30202, 30301, 30302, 30401, 30402, 40101, 40102, 40201, + 40202, 40301, 40302, 40401, 40402 + ], + 'sandboxed_ids': [ + 10401, 10402, 20201, 20202, 20401, 20402, 30101, 30102, 40101, + 40102, 40301, 40302 + ], + 'cleaned_values': [ + (10101, 101, 0, datetime.fromisoformat('2022-01-01').date(), 0), + (10102, 101, 0, datetime.fromisoformat('2022-01-01').date(), 0), + (10201, 102, 0, datetime.fromisoformat('2022-01-02').date(), 0), + (10202, 102, 0, datetime.fromisoformat('2022-01-02').date(), 0), + (10301, 103, 0, datetime.fromisoformat('2022-01-03').date(), 0), + (10302, 103, 0, datetime.fromisoformat('2022-01-03').date(), 0), + (20101, 201, 0, datetime.fromisoformat('2022-01-01').date(), 0), + (20102, 201, 0, datetime.fromisoformat('2022-01-01').date(), 0), + (20301, 203, 0, datetime.fromisoformat('2022-01-03').date(), 0), + (20302, 203, 0, datetime.fromisoformat('2022-01-03').date(), 0), + (30201, 302, 0, datetime.fromisoformat('2022-01-02').date(), 0), + (30202, 302, 0, datetime.fromisoformat('2022-01-02').date(), 0), + (30301, 303, 0, datetime.fromisoformat('2022-01-03').date(), 0), + (30302, 303, 0, datetime.fromisoformat('2022-01-03').date(), 0), + (30401, 304, 0, datetime.fromisoformat('2022-01-04').date(), 0), + (30402, 304, 0, datetime.fromisoformat('2022-01-04').date(), 0), + (40201, 402, 0, datetime.fromisoformat('2022-01-02').date(), 0), + (40202, 402, 0, datetime.fromisoformat('2022-01-02').date(), 0), + (40401, 404, 0, datetime.fromisoformat('2022-01-04').date(), 0), + (40402, 404, 0, datetime.fromisoformat('2022-01-04').date(), 0) + ] + }, { + 'fq_table_name': + f'{self.project_id}.{self.dataset_id}.{PERSON}', + 
'fq_sandbox_table_name': + self.fq_sandbox_table_names[1], + 'fields': [ + 'person_id', 'gender_concept_id', 'year_of_birth', + 'race_concept_id', 'ethnicity_concept_id' + ], + 'loaded_ids': [ + 101, 102, 103, 104, 201, 202, 203, 204, 301, 302, 303, 304, 401, + 402, 403, 404 + ], + 'sandboxed_ids': [104, 202, 204, 301, 401, 403], + 'cleaned_values': [(101, 0, 1991, 0, 0), (102, 0, 1992, 0, 0), + (103, 0, 1993, 0, 0), (201, 0, 1991, 0, 0), + (203, 0, 1993, 0, 0), (302, 0, 1992, 0, 0), + (303, 0, 1993, 0, 0), (304, 0, 1994, 0, 0), + (402, 0, 1992, 0, 0), (404, 0, 1994, 0, 0)] + }, { + 'fq_table_name': + f'{self.project_id}.{self.dataset_id}.{AOU_DEATH}', + 'fq_sandbox_table_name': + self.fq_sandbox_table_names[2], + 'fields': [ + 'aou_death_id', 'person_id', 'death_date', + 'death_type_concept_id', 'cause_concept_id', + 'cause_source_concept_id', 'src_id', 'primary_death_record' + ], + 'loaded_ids': [ + 'a10101', 'a10202', 'a10301', 'a10402', 'a20102', 'a20202', + 'a20302', 'a20401', 'a30101', 'a30202', 'a30302', 'a30401', + 'a40101', 'a40202', 'a40301', 'a40401' + ], + 'sandboxed_ids': [ + 'a10402', 'a20202', 'a20401', 'a30101', 'a40101', 'a40301' + ], + 'cleaned_values': [ + ('a10101', 101, datetime.fromisoformat('2020-05-05').date(), 0, + 0, 0, 'Staff Portal: HealthPro', False), + ('a10202', 102, datetime.fromisoformat('2020-05-05').date(), 0, + 0, 0, 'Participant Portal 1', False), + ('a10301', 103, datetime.fromisoformat('2020-05-05').date(), 0, + 0, 0, 'Staff Portal: HealthPro', False), + ('a20102', 201, datetime.fromisoformat('2020-05-05').date(), 0, + 0, 0, 'Participant Portal 2', False), + ('a20302', 203, datetime.fromisoformat('2020-05-05').date(), 0, + 0, 0, 'Participant Portal 2', False), + ('a30202', 302, datetime.fromisoformat('2020-05-05').date(), 0, + 0, 0, 'Participant Portal 3', False), + ('a30302', 303, datetime.fromisoformat('2020-05-05').date(), 0, + 0, 0, 'Participant Portal 3', False), + ('a30401', 304, datetime.fromisoformat('2020-05-05').date(), 0, + 0, 0, 'Staff Portal: HealthPro', False), + ('a40202', 402, datetime.fromisoformat('2020-05-05').date(), 0, + 0, 0, 'Participant Portal 4', False), + ('a40401', 404, datetime.fromisoformat('2020-05-05').date(), 0, + 0, 0, 'Staff Portal: HealthPro', False), + ] + }] + self.default_test(tables_and_counts) diff --git a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/truncate_fitbit_data_test.py b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/truncate_fitbit_data_test.py index b3af12a061..769e8b9887 100644 --- a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/truncate_fitbit_data_test.py +++ b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/truncate_fitbit_data_test.py @@ -14,9 +14,8 @@ from dateutil import parser # Project imports -from common import FITBIT_TABLES, ACTIVITY_SUMMARY,\ - HEART_RATE_SUMMARY, SLEEP_LEVEL, SLEEP_DAILY_SUMMARY,\ - HEART_RATE_MINUTE_LEVEL, STEPS_INTRADAY, DEVICE +from common import (FITBIT_TABLES, ACTIVITY_SUMMARY, HEART_RATE_SUMMARY, + HEART_RATE_INTRADAY, STEPS_INTRADAY, DEVICE) from app_identity import PROJECT_ID from tests.integration_tests.data_steward.cdr_cleaner.cleaning_rules.bigquery_tests_base import BaseTest from cdr_cleaner.cleaning_rules.truncate_fitbit_data import TruncateFitbitData @@ -105,7 +104,7 @@ def test_truncate_data(self): (333, (DATETIME '2020-11-26 00:00:00')), (444, (DATETIME '2021-11-26 00:00:00'))""").render( fq_dataset_name=self.fq_dataset_name, - fitbit_table=HEART_RATE_MINUTE_LEVEL) + 
fitbit_table=HEART_RATE_INTRADAY) queries.append(hr_query) hrs_query = self.jinja_env.from_string(""" @@ -167,10 +166,10 @@ def test_truncate_data(self): (222, parser.parse('2019-11-26').date())] }, { 'fq_table_name': - '.'.join([self.fq_dataset_name, HEART_RATE_MINUTE_LEVEL]), + '.'.join([self.fq_dataset_name, HEART_RATE_INTRADAY]), 'fq_sandbox_table_name': [ table for table in self.fq_sandbox_table_names - if HEART_RATE_MINUTE_LEVEL in table + if HEART_RATE_INTRADAY in table ][0], 'fields': ['person_id', 'datetime'], 'loaded_ids': [111, 222, 333, 444], diff --git a/tests/integration_tests/data_steward/tools/create_combined_backup_dataset_test.py b/tests/integration_tests/data_steward/tools/create_combined_backup_dataset_test.py index 1b46a53269..6c8c718f53 100644 --- a/tests/integration_tests/data_steward/tools/create_combined_backup_dataset_test.py +++ b/tests/integration_tests/data_steward/tools/create_combined_backup_dataset_test.py @@ -9,7 +9,8 @@ import bq_utils import resources from app_identity import get_application_id, PROJECT_ID -from common import AOU_DEATH, SITE_MASKING_TABLE_ID, BIGQUERY_DATASET_ID, RDR_DATASET_ID +from common import (AOU_DEATH, SITE_MASKING_TABLE_ID, BIGQUERY_DATASET_ID, + RDR_DATASET_ID, COMBINED_DATASET_ID, EHR_CONSENT_VALIDATION) from gcloud.gcs import StorageClient from gcloud.bq import BigQueryClient from tests.integration_tests.data_steward.cdr_cleaner.cleaning_rules.bigquery_tests_base import BaseTest @@ -60,7 +61,7 @@ def load_dataset_from_files(cls, dataset_id, path, mappings=False): hpo_bucket = cls.storage_client.get_hpo_bucket(test_util.FAKE_HPO_ID) cls.storage_client.empty_bucket(hpo_bucket) job_ids: list = [] - for table in resources.CDM_TABLES: + for table in resources.CDM_TABLES + [EHR_CONSENT_VALIDATION]: job_ids.append( cls._upload_file_to_bucket(hpo_bucket, dataset_id, path, table)) if mappings and table in DOMAIN_TABLES: @@ -96,7 +97,7 @@ def _upload_file_to_bucket(cls, bucket, dataset_id: str, path: str, return job_id def setUp(self): - self.combined_dataset_id = bq_utils.get_combined_dataset_id() + self.combined_dataset_id = COMBINED_DATASET_ID test_util.delete_all_tables(self.bq_client, self.combined_dataset_id) def test_consented_person_id(self): @@ -262,7 +263,7 @@ def _all_rdr_records_included(self): ' WHERE t.{domain_table}_id = m.{domain_table}_id)').format( domain_table=domain_table, rdr_dataset_id=self.rdr_dataset_id, - combined_dataset_id=bq_utils.get_combined_dataset_id(), + combined_dataset_id=self.combined_dataset_id, mapping_table=mapping_table) response = bq_utils.query(query) rows = bq_utils.response2rows(response) @@ -315,7 +316,7 @@ def setUpClass(cls): super().initialize_class_vars() cls.project_id = os.environ.get(PROJECT_ID) - cls.dataset_id = os.environ.get('COMBINED_DATASET_ID') + cls.dataset_id = COMBINED_DATASET_ID cls.sandbox_id = BIGQUERY_DATASET_ID cls.rdr_id = RDR_DATASET_ID cls.unioned_id = os.environ.get('UNIONED_DATASET_ID') diff --git a/tests/integration_tests/data_steward/tools/import_rdr_dataset_test.py b/tests/integration_tests/data_steward/tools/import_rdr_dataset_test.py index a4039b5c2a..ee3bfa9ad8 100644 --- a/tests/integration_tests/data_steward/tools/import_rdr_dataset_test.py +++ b/tests/integration_tests/data_steward/tools/import_rdr_dataset_test.py @@ -6,7 +6,8 @@ # Project imports from app_identity import get_application_id, PROJECT_ID -from common import AOU_DEATH, CARE_SITE, METADATA, DEATH, DRUG_ERA, PERSON, PID_RID_MAPPING, VISIT_COST +from common import (AOU_DEATH, CARE_SITE, METADATA, 
DEATH, DRUG_ERA, + OBSERVATION, PERSON, PID_RID_MAPPING, VISIT_COST) from resources import cdm_schemas, fields_for, rdr_src_id_schemas from tests.integration_tests.data_steward.cdr_cleaner.cleaning_rules.bigquery_tests_base import BaseTest from tools.import_rdr_dataset import create_rdr_tables, get_destination_schemas @@ -97,8 +98,8 @@ def test_get_destination_schemas(self): # AOU_DEATH does not have RDR specific schema definition. self.assertEqual(schema_dict[AOU_DEATH], fields_for(AOU_DEATH)) - def test_create_rdr_tables(self): - """Test create_rdr_tables + def test_create_rdr_tables_aou_death(self): + """Test create_rdr_tables for aou_death creation. Confirm the following: (1) RDR's death records are loaded to our Raw RDR AOU_DEATH table, (2) NULL death_date records do not fail the process, and @@ -116,3 +117,47 @@ def test_create_rdr_tables(self): self.assertTableDoesNotExist( f'{self.project_id}.{self.dataset_id}.{DEATH}') + + def test_create_rdr_tables(self): + """Test create_rdr_tables for table creation. + Confirm the following: + (1) Records from RDR are copied to the corresponding tables in Curation. + (2) When the RDR's CDM table is empty, an empty table is created in Curation too. + (3) Even when the CDM table does not exist in RDR, an empty table is created in Curation. + """ + + # Adding records for the test case (1) + insert_obs = self.jinja_env.from_string(""" + INSERT INTO `{{project}}.{{dataset}}.{{obs}}` + (observation_id, person_id, observation_concept_id, observation_date, + observation_type_concept_id, src_id) + VALUES + (101, 1, 0, date('2022-01-01'), 0, 'src_a'), + (102, 1, 0, date('2022-01-01'), 0, 'src_b'), + (103, 1, 0, date('2022-01-01'), 0, 'src_c') + """).render(project=self.project_id, + dataset=self.rdr_dataset_id, + obs=OBSERVATION) + + self.load_test_data([insert_obs]) + + # Deleting a table for the test case (3) + self.client.delete_table( + f'{self.project_id}.{self.rdr_dataset_id}.{VISIT_COST}') + + create_rdr_tables(client=self.client, + destination_dataset=self.dataset_id, + rdr_project=self.project_id, + rdr_source_dataset=self.rdr_dataset_id) + + # (1) Records from RDR are copied to the corresponding tables in Curation. + self.assertTableValuesMatch( + f'{self.project_id}.{self.dataset_id}.{OBSERVATION}', [ + 'observation_id', + ], [(101,), (102,), (103,)]) + + # (2) When the RDR's CDM table is empty, an empty table is created in Curation too. + self.assertTrue(self.client.table_exists(DRUG_ERA, self.dataset_id)) + + # (3) Even when the CDM table does not exist in RDR, an empty table is created in Curation. 
+ self.assertTrue(self.client.table_exists(VISIT_COST, self.dataset_id)) diff --git a/tests/integration_tests/data_steward/utils/participant_summary_requests_test.py b/tests/integration_tests/data_steward/utils/participant_summary_requests_test.py index deef575d93..4dcb6714d0 100644 --- a/tests/integration_tests/data_steward/utils/participant_summary_requests_test.py +++ b/tests/integration_tests/data_steward/utils/participant_summary_requests_test.py @@ -160,7 +160,8 @@ def test_get_participant_data(self, mock_get_session, mock_token): mock_resp.json.return_value = self.json_response_entry # test - expected_response = psr.get_participant_data(self.url, self.headers) + expected_response = psr.get_participant_data(self.client, self.url, + self.headers) # post conditions self.assertEqual(expected_response, self.participant_data) @@ -178,7 +179,8 @@ def test_get_deactivated_participants(self, mock_get_session, mock_token): mock_resp.json.return_value = self.json_response_entry # Tests - df = psr.get_deactivated_participants(self.project_id, self.columns) + df = psr.get_deactivated_participants(self.client, self.project_id, + self.columns) # Parameter check test self.assertRaises(RuntimeError, psr.store_participant_data, diff --git a/tests/integration_tests/data_steward/validation/ehr_union_test.py b/tests/integration_tests/data_steward/validation/ehr_union_test.py index c089bbcbbb..7cb8bf1657 100644 --- a/tests/integration_tests/data_steward/validation/ehr_union_test.py +++ b/tests/integration_tests/data_steward/validation/ehr_union_test.py @@ -228,7 +228,6 @@ def test_union_ehr(self, mock_hpo_info, mock_aou_death): mapping_tables = [ ehr_union.mapping_table_for(table) for table in cdm.tables_to_map() + [PERSON] - if not table == SURVEY_CONDUCT ] output_cdm_tables = [ ehr_union.output_table_for(table) @@ -280,35 +279,34 @@ def test_union_ehr(self, mock_hpo_info, mock_aou_death): # mapping tables tables_to_map = cdm.tables_to_map() for table_to_map in tables_to_map: - if not table_to_map == SURVEY_CONDUCT: - mapping_table = ehr_union.mapping_table_for(table_to_map) - expected_fields = { - 'src_table_id', - 'src_%s_id' % table_to_map, - '%s_id' % table_to_map, 'src_hpo_id', 'src_dataset_id' - } - mapping_table_obj = self.bq_client.get_table( - f'{self.output_dataset_id}.{mapping_table}') - actual_fields = set( - [field.name for field in mapping_table_obj.schema]) - message = 'Table %s has fields %s when %s expected' % ( - mapping_table, actual_fields, expected_fields) - self.assertSetEqual(expected_fields, actual_fields, message) - - if table_to_map == VISIT_DETAIL: - expected_num_rows = len(self.expected_tables[mapping_table]) - else: - result_table = ehr_union.output_table_for(table_to_map) - expected_num_rows = len(self.expected_tables[result_table]) - - actual_num_rows = int(mapping_table_obj.num_rows) - message = 'Table %s has %s rows when %s expected' % ( - mapping_table, actual_num_rows, expected_num_rows) - self.assertEqual(expected_num_rows, actual_num_rows, message) + mapping_table = ehr_union.mapping_table_for(table_to_map) + expected_fields = { + 'src_table_id', + 'src_%s_id' % table_to_map, + '%s_id' % table_to_map, 'src_hpo_id', 'src_dataset_id' + } + mapping_table_obj = self.bq_client.get_table( + f'{self.output_dataset_id}.{mapping_table}') + actual_fields = set( + [field.name for field in mapping_table_obj.schema]) + message = 'Table %s has fields %s when %s expected' % ( + mapping_table, actual_fields, expected_fields) + self.assertSetEqual(expected_fields, actual_fields, 
message) + + if table_to_map == VISIT_DETAIL: + expected_num_rows = len(self.expected_tables[mapping_table]) + else: + result_table = ehr_union.output_table_for(table_to_map) + expected_num_rows = len(self.expected_tables[result_table]) + + actual_num_rows = int(mapping_table_obj.num_rows) + message = 'Table %s has %s rows when %s expected' % ( + mapping_table, actual_num_rows, expected_num_rows) + self.assertEqual(expected_num_rows, actual_num_rows, message) # check for each output table for table_name in resources.CDM_TABLES: - if not table_name in [SURVEY_CONDUCT, DEATH]: + if table_name != DEATH: # output table exists and row count is sum of those submitted by hpos result_table = ehr_union.output_table_for(table_name) expected_rows = self.expected_tables[result_table] @@ -316,8 +314,7 @@ def test_union_ehr(self, mock_hpo_info, mock_aou_death): table_obj = self.bq_client.get_table( f'{self.output_dataset_id}.{result_table}') actual_count = int(table_obj.num_rows) - msg = 'Unexpected row count in table {result_table} after ehr union'.format( - result_table=result_table) + msg = f'Unexpected row count in table {result_table} after ehr union' self.assertEqual(expected_count, actual_count, msg) # TODO Compare table rows to expected accounting for the new ids and ignoring field types # q = 'SELECT * FROM {dataset}.{table}'.format(dataset=self.output_dataset_id, table=result_table) @@ -334,9 +331,8 @@ def test_union_ehr(self, mock_hpo_info, mock_aou_death): self.assertSetEqual(expected_output, actual_output) # explicit check that output person_ids are same as input - nyc_person_table_id = resources.get_table_id('person', - hpo_id=NYC_HPO_ID) - pitt_person_table_id = resources.get_table_id('person', + nyc_person_table_id = resources.get_table_id(PERSON, hpo_id=NYC_HPO_ID) + pitt_person_table_id = resources.get_table_id(PERSON, hpo_id=PITT_HPO_ID) q = '''SELECT DISTINCT person_id FROM ( SELECT person_id FROM {dataset_id}.{nyc_person_table_id} diff --git a/tests/unit_tests/data_steward/analytics/cdr_ops/report_runner_test.py b/tests/unit_tests/data_steward/analytics/cdr_ops/report_runner_test.py index 02c2f73e1e..1737103d28 100644 --- a/tests/unit_tests/data_steward/analytics/cdr_ops/report_runner_test.py +++ b/tests/unit_tests/data_steward/analytics/cdr_ops/report_runner_test.py @@ -81,13 +81,17 @@ def test_create_html_from_ipynb(self, mock_pure_path, mock_html_exporter, runner.create_html_from_ipynb(self.notebook_ipynb_path) # Assertions in reading the notebook - mock_open.assert_any_call(self.notebook_ipynb_path, 'r') + mock_open.assert_any_call(self.notebook_ipynb_path, + 'r', + encoding='utf-8') mock_nbformat_reads.assert_any_call('fake_data', as_version=4) mock_html_exporter.return_value.from_notebook_node.assert_any_call( mock_nbformat_reads.return_value) # Assertions in writing the notebook to a html page - mock_open.assert_any_call(with_suffix_returned_value, 'w') + mock_open.assert_any_call(with_suffix_returned_value, + 'w', + encoding='utf-8') mock_open.return_value.write.assert_any_call('return fake_data') def test_infer_required(self): diff --git a/tests/unit_tests/data_steward/cdr_cleaner/cleaning_rules/remove_extra_tables_test.py b/tests/unit_tests/data_steward/cdr_cleaner/cleaning_rules/remove_extra_tables_test.py index 02a4d8497d..9a76c7183b 100644 --- a/tests/unit_tests/data_steward/cdr_cleaner/cleaning_rules/remove_extra_tables_test.py +++ b/tests/unit_tests/data_steward/cdr_cleaner/cleaning_rules/remove_extra_tables_test.py @@ -57,7 +57,8 @@ def test_get_query_specs(self): 
include_vocabulary=True).keys()) | {'_cdr_metadata'} | { f'{table}_ext' for table in cdm_schemas().keys() if has_domain_table_id(table) - } - {'person_ext'} | {'person_src_hpos_ext'} + } - {'person_ext'} | {'person_src_hpos_ext' + } | {'aou_death'} | {'wear_study'} self.assertCountEqual(self.rule_instance.affected_tables, final_tables) expected_list = [{ diff --git a/tests/unit_tests/data_steward/deid/parser_test.py b/tests/unit_tests/data_steward/deid/parser_test.py index 16e0008fd7..ee63b28be2 100644 --- a/tests/unit_tests/data_steward/deid/parser_test.py +++ b/tests/unit_tests/data_steward/deid/parser_test.py @@ -36,13 +36,15 @@ def setUp(self): self.rules = os.path.join(DEID_PATH, 'config', 'ids', 'config.json') self.pipeline = ['generalize', 'suppress', 'shift', 'compute'] self.interactive = 'BATCH' + self.run_as_email = 'test@test.com' self.correct_parameter_list = [ '--rules', self.rules, '--private_key', self.private_key, '--table', self.tablename, '--action', self.action, '--idataset', self.input_dataset, '--odataset', self.output_dataset, '--log', self.log_path, '--pipeline', self.pipeline, '--interactive', - self.interactive, '--interactive', self.interactive + self.interactive, '--interactive', self.interactive, '--run_as', + self.run_as_email ] self.incorrect_parameter_list = [ '--rules', self.rules, '--private_key', self.private_key, '--table', @@ -65,6 +67,8 @@ def test_parse_args(self): # setting correct_parameter_dict values not set in setUp function correct_parameter_dict['cluster'] = False correct_parameter_dict['age_limit'] = MAX_AGE + correct_parameter_dict['run_as_email'] = correct_parameter_dict.pop( + 'run_as') # Test if correct parameters are given results_dict = parse_args(self.correct_parameter_list) diff --git a/tests/unit_tests/data_steward/tools/create_combined_backup_dataset_test.py b/tests/unit_tests/data_steward/tools/create_combined_backup_dataset_test.py index 129c79193d..f0951bfbe5 100644 --- a/tests/unit_tests/data_steward/tools/create_combined_backup_dataset_test.py +++ b/tests/unit_tests/data_steward/tools/create_combined_backup_dataset_test.py @@ -9,12 +9,12 @@ EXPECTED_MAPPING_QUERY = common.JINJA_ENV.from_string(""" SELECT DISTINCT '{{rdr_dataset_id}}' AS src_dataset_id, - {{domain_table}}_id AS src_{{domain_table}}_id, + t.{{domain_table}}_id AS src_{{domain_table}}_id, v.src_id as src_hpo_id, {% if domain_table in ['survey_conduct', 'person'] %} - {{domain_table}}_id AS {{domain_table}}_id, + t.{{domain_table}}_id AS {{domain_table}}_id, {% else %} - {{domain_table}}_id + {{mapping_constant}} AS {{domain_table}}_id, + t.{{domain_table}}_id + {{mapping_constant}} AS {{domain_table}}_id, {% endif %} '{{domain_table}}' as src_table_id FROM `{{rdr_dataset_id}}.{{domain_table}}` AS t @@ -39,9 +39,9 @@ EXPECTED_SURVEY_CONDUCT_MAPPING_QUERY = common.JINJA_ENV.from_string(""" SELECT DISTINCT '{{rdr_dataset_id}}' AS src_dataset_id, - {{domain_table}}_id AS src_{{domain_table}}_id, + t.{{domain_table}}_id AS src_{{domain_table}}_id, v.src_id as src_hpo_id, - {{domain_table}}_id AS {{domain_table}}_id, + t.{{domain_table}}_id AS {{domain_table}}_id, '{{domain_table}}' as src_table_id FROM `{{rdr_dataset_id}}.{{domain_table}}` AS t JOIN `{{rdr_dataset_id}}._mapping_{{domain_table}}` AS v diff --git a/tests/unit_tests/data_steward/tools/run_deid_test.py b/tests/unit_tests/data_steward/tools/run_deid_test.py index a3b8c9e7e8..49125b0055 100644 --- a/tests/unit_tests/data_steward/tools/run_deid_test.py +++ b/tests/unit_tests/data_steward/tools/run_deid_test.py 
@@ -39,12 +39,13 @@ def setUp(self): self.skip_tables = 'foo_table' self.tablename = 'bar_table' self.max_age = '89' + self.run_as_email = 'test@test.com' self.correct_parameter_list = [ '--idataset', self.input_dataset, '--private_key', self.private_key, '--odataset', self.output_dataset, '--action', self.action, '--skip-tables', self.skip_tables, '--tables', self.tablename, - '--age_limit', self.max_age + '--age_limit', self.max_age, '--run_as', self.run_as_email ] self.incorrect_parameter_list = [ @@ -81,6 +82,8 @@ def test_parse_args(self): correct_parameter_dict['console_log'] = False correct_parameter_dict['interactive_mode'] = False correct_parameter_dict['input_dataset'] = self.input_dataset + correct_parameter_dict['run_as_email'] = correct_parameter_dict.pop( + 'run_as') # need to delete idataset argument from correct_parameter_dict because input_dataset argument is returned # when self.correct_parameter_list is supplied to parse_args @@ -121,7 +124,8 @@ def test_main(self, mock_tables, mock_load, mock_copy, mock_main, os.path.join(DEID_PATH, 'config', 'ids', 'config.json'), '--private_key', self.private_key, '--table', 'fake1', '--action', self.action, '--idataset', self.input_dataset, '--log', 'LOGS', - '--odataset', self.output_dataset, '--age-limit', self.max_age + '--odataset', self.output_dataset, '--age-limit', self.max_age, + '--run_as', self.run_as_email ]) self.assertEqual(mock_main.call_count, 1) self.assertEqual(mock_copy_ext_tables.call_count, 1) diff --git a/tests/unit_tests/data_steward/utils/participant_summary_requests_test.py b/tests/unit_tests/data_steward/utils/participant_summary_requests_test.py index a1e8f033d6..5bd4244949 100644 --- a/tests/unit_tests/data_steward/utils/participant_summary_requests_test.py +++ b/tests/unit_tests/data_steward/utils/participant_summary_requests_test.py @@ -43,6 +43,7 @@ def setUp(self): self.tablename = 'baz_table' self.fake_hpo = 'foo_hpo' self.destination_table = 'bar_dataset._deactivated_participants' + self.client = 'test_client' self.fake_token = 'fake_token' self.fake_url = 'www.fake_site.com' @@ -225,31 +226,32 @@ def setUp(self): 'authored_time': '2021-02-01T12:01:01Z' }] - @patch('utils.participant_summary_requests.default') @patch('utils.participant_summary_requests.auth') @patch('utils.participant_summary_requests.req') - def test_get_access_token(self, mock_req, mock_auth, mock_default): + def test_get_access_token(self, mock_req, mock_auth): # pre conditions scopes = [ 'https://www.googleapis.com/auth/cloud-platform', 'email', 'profile' ] - creds = MagicMock() - mock_default.return_value = (creds, None) req = MagicMock() + client = MagicMock() mock_req.Request.return_value = req + mock_email = 'test@test.com' # test - actual_token = psr.get_access_token() + client._credentials.service_account_email = mock_email + actual_token = psr.get_access_token(client) # post conditions - mock_default.assert_called_once_with() - mock_auth.delegated_credentials.assert_called_once_with(creds, - scopes=scopes) + mock_auth.get_impersonation_credentials.assert_called_once_with( + mock_email, target_scopes=scopes) mock_req.Request.assert_called_once_with() # assert the credential refresh still happens - mock_auth.delegated_credentials().refresh.assert_called_once_with(req) + mock_auth.get_impersonation_credentials( + ).refresh.assert_called_once_with(req) - self.assertEqual(mock_auth.delegated_credentials().token, actual_token) + self.assertEqual(mock_auth.get_impersonation_credentials().token, + actual_token) 
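Read together, the reworked mock assertions above pin down the flow the updated get_access_token is expected to follow: take the target service account from the client's credentials, mint impersonated credentials for the cloud-platform/email/profile scopes, refresh them against a transport request, and return the bearer token. A sketch of the implied implementation reconstructed from the mocks (not the module's verbatim source; `auth` and `req` are the module's own imports that the test patches):

def get_access_token(client):
    """Sketch: mint a token by impersonating the client's service account."""
    scopes = [
        'https://www.googleapis.com/auth/cloud-platform', 'email', 'profile'
    ]
    email = client._credentials.service_account_email
    credentials = auth.get_impersonation_credentials(email,
                                                     target_scopes=scopes)
    request = req.Request()
    # impersonated credentials start without a token; refresh() populates it
    credentials.refresh(request)
    return credentials.token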
@patch('utils.participant_summary_requests.BASE_URL', 'www.fake_site.appspot.com') @@ -265,7 +267,8 @@ def test_fake_website(self, mock_get, mock_token): error_msg = 'Error: API request failed because ' mock_get.return_value = FakeHTTPResponse(status_code=status_code) with self.assertRaises(RuntimeError) as e: - _ = psr.get_participant_data(self.fake_url, self.fake_headers) + _ = psr.get_participant_data(self.client, self.fake_url, + self.fake_headers) self.assertEqual(str(e.exception), error_msg.format(status_code=status_code)) self.assertEqual(mock_get.call_count, 1) @@ -273,7 +276,8 @@ def test_fake_website(self, mock_get, mock_token): status_code = 404 mock_get.return_value = FakeHTTPResponse(status_code=status_code) with self.assertRaises(RuntimeError) as e: - _ = psr.get_participant_data(self.fake_url, self.fake_headers) + _ = psr.get_participant_data(self.client, self.fake_url, + self.fake_headers) self.assertEqual(str(e.exception), error_msg.format(status_code=status_code)) self.assertEqual(mock_get.call_count, 2) @@ -287,7 +291,7 @@ def test_get_participant_data(self, mock_get_session, mock_token): mock_session.get.return_value.status_code = 200 mock_session.get.return_value.json.return_value = self.json_response_entry - actual_response = psr.get_participant_data(self.fake_url, + actual_response = psr.get_participant_data(self.client, self.fake_url, self.fake_headers) self.assertEqual(actual_response, self.participant_data) @@ -320,7 +324,7 @@ def test_get_deactivated_participants(self, # tests dataframe_response = psr.get_deactivated_participants( - self.project_id, self.columns) + self.client, self.project_id, self.columns) dataset_response = psr.store_participant_data(dataframe_response, mock_bq_client, @@ -482,10 +486,10 @@ def test_get_deactivated_participants_parameters(self, mock_data, Ensures error checking is working. """ # Parameter check tests - self.assertRaises(RuntimeError, psr.get_deactivated_participants, None, - self.columns) self.assertRaises(RuntimeError, psr.get_deactivated_participants, - self.project_id, None) + self.client, None, self.columns) + self.assertRaises(RuntimeError, psr.get_deactivated_participants, + self.client, self.project_id, None) def test_process_digital_health_data_to_df(self): column_map = {'participant_id': 'person_id'} diff --git a/tests/unit_tests/data_steward/validation/ehr_union_test.py b/tests/unit_tests/data_steward/validation/ehr_union_test.py index 8f851e4baa..fa0d0b8f2f 100644 --- a/tests/unit_tests/data_steward/validation/ehr_union_test.py +++ b/tests/unit_tests/data_steward/validation/ehr_union_test.py @@ -309,14 +309,13 @@ def test_excluded_hpo_ids(self, mock_hpo_info, mock_create_std_tbl, mock_hpo_info.return_value = [{ 'hpo_id': hpo_id } for hpo_id in self.hpo_ids] - self.mock_bq_client.return_value = 'client' eu.main("input_dataset_id", "output_dataset_id", "project_id", hpo_ids_ex=[self.FAKE_SITE_2]) mock_mapping.assert_called_with(ANY, [self.FAKE_SITE_1], "input_dataset_id", "output_dataset_id", - "project_id", 'client') + "project_id", ANY) def tearDown(self): pass
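The final hunk drops the canned 'client' return value and loosens the last positional assertion to ANY, since the exact client instance flowing through eu.main is no longer the patched sentinel and cannot be compared directly. unittest.mock.ANY compares equal to any value, which is what makes the relaxed assertion pass; a self-contained illustration of the idiom:

from unittest.mock import ANY, MagicMock

mock_mapping = MagicMock()
mock_mapping('input_dataset_id', 'output_dataset_id', 'project_id', object())
# ANY is equal to everything, so the opaque client argument still matches
mock_mapping.assert_called_with('input_dataset_id', 'output_dataset_id',
                                'project_id', ANY)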