From 8f57d02b80e91761bb083ae77393e7b05588be20 Mon Sep 17 00:00:00 2001 From: brendagutman <77469967+brendagutman@users.noreply.github.com> Date: Mon, 11 Sep 2023 09:03:20 -0500 Subject: [PATCH] [DC-3407] Update value_as_string check in clean_rdr notebooks to remove false positives (#1720) * [DC-3407] Update raw rdr query * [DC-3407] Update clean rdr query * [DC-3407] Update the expected_strings list * [DC-3407] Update the check description and list order * [DC-3407] Update the check description and list order in clean notebook --- .../analytics/cdr_ops/clean_rdr_export_qc.py | 248 +++++++++++++++++- .../analytics/cdr_ops/raw_rdr_export_qc.py | 239 ++++++++++++++++- 2 files changed, 475 insertions(+), 12 deletions(-) diff --git a/data_steward/analytics/cdr_ops/clean_rdr_export_qc.py b/data_steward/analytics/cdr_ops/clean_rdr_export_qc.py index 9d18aa5cda..14248b11d9 100644 --- a/data_steward/analytics/cdr_ops/clean_rdr_export_qc.py +++ b/data_steward/analytics/cdr_ops/clean_rdr_export_qc.py @@ -31,15 +31,232 @@ from gcloud.bq import BigQueryClient from analytics.cdr_ops.notebook_utils import execute, IMPERSONATION_SCOPES, render_message -# # Table comparison -# The export should generally contain the same tables from month to month. -# Tables found only in the old or the new export are listed below. - impersonation_creds = auth.get_impersonation_credentials( run_as, target_scopes=IMPERSONATION_SCOPES) client = BigQueryClient(project_id, credentials=impersonation_creds) +# This list is created by querying the redcap surveys. 
In case of needed update, query provided in the comments of DC3407 +expected_strings = ['cidi5_20', 'cidi5_24', 'cidi5_28', 'cidi5_31', 'mhqukb_48_age', + 'mhqukb_50_number', 'mhqukb_51_number', 'mhqukb_52_number', + 'mhqukb_53_number', 'record_id', 'helpmewithconsent_name', + 'other_concerns', 'other_reasons', 'resultsconsent_emailmecopy', + 'resultsconsent_signaturedate', 'consentpii_helpwithconsentsignature', + 'extraconsent_signature_type', 'extraconsent_todaysdate', + 'piiaddress_streetaddress', 'piiaddress_streetaddress2', + 'piibirthinformation_birthdate', 'piicontactinformation_phone', + 'piiname_first', 'piiname_last', 'piiname_middle', + 'streetaddress_piicity', 'streetaddress_piizip', 'basics_11a_cope_a_33', + 'basics_xx', 'basics_xx20', 'cdc_covid_19_7_xx22_date', 'cope_a_126', + 'cope_a_160', 'cope_a_85', 'copect_50_xx19_cope_a_152', + 'copect_50_xx19_cope_a_198', 'copect_50_xx19_cope_a_57', + 'cu_covid_cope_a_204', 'eds_follow_up_1_xx', 'ipaq_1_cope_a_24', + 'ipaq_2_cope_a_160', 'ipaq_2_cope_a_85', 'ipaq_3_cope_a_24', + 'ipaq_4_cope_a_160', 'ipaq_4_cope_a_85', 'ipaq_5_cope_a_24', + 'ipaq_6_cope_a_160', 'ipaq_6_cope_a_85', 'lifestyle_2_xx12_cope_a_152', + 'lifestyle_2_xx12_cope_a_198', 'lifestyle_2_xx12_cope_a_57', + 'tsu_ds5_13_xx42_cope_a_226', 'cdc_covid_19_7_xx23_other_cope_a_204', + 'cdc_covid_19_n_a2', 'cdc_covid_19_n_a4', 'cdc_covid_19_n_a8', + 'cope_aou_xx_2_a', 'dmfs_29a', 'msds_17_c', + 'nhs_covid_fhc17b_cope_a_226', 'ehrconsentpii_helpwithconsentsignature', + 'ehrconsentpii_todaysdate', 'ehrconsentpii_todaysdateilhippawitness', + 'sensitivetype2_domesticviolence', 'sensitivetype2_genetictesting', + 'sensitivetype2_hivaids', 'sensitivetype2_mentalhealth', + 'sensitivetype2_substanceuse', 'signature_type', 'cidi5_15', + 'mhqukb_25_number', 'mhqukb_26_age', 'mhqukb_28_age', 'ss_2_age', + 'ss_3_age_1', 'ss_3_age_2', 'ss_3_number', + 'english_exploring_the_mind_consent_form', 'etm_help_name', + 'cdc_covid_xx_a_date1', 'cdc_covid_xx_a_date2', + 
'cdc_covid_xx_b_firstdose_other', 'cdc_covid_xx_b_seconddose_other', + 'cdc_covid_xx_symptom_cope_350', + 'cdc_covid_xx_symptom_seconddose_cope_350', 'dmfs_29_seconddose_other', + 'othercancer_daughterfreetextbox', 'othercancer_fatherfreetextbox', + 'othercancer_grandparentfreetextbox', 'othercancer_motherfreetextbox', + 'othercancer_siblingfreetextbox', 'othercancer_sonfreetextbox', + 'othercondition_daughterfreetextbox', 'othercondition_fatherfreetextbox', + 'othercondition_grandparentfreetextbox', + 'othercondition_motherfreetextbox', 'othercondition_siblingfreetextbox', + 'othercondition_sonfreetextbox', 'cdc_covid_xx_b_other', + 'otherdelayedmedicalcare_freetext', + 'attemptquitsmoking_completelyquitage', 'otherspecify_otherdrugstextbox', + 'smoking_averagedailycigarettenumber', + 'smoking_currentdailycigarettenumber', + 'smoking_dailysmokestartingagenumber', 'smoking_numberofyearsnumber', + 'cdc_covid_xx_a_date10', 'cdc_covid_xx_a_date11', + 'cdc_covid_xx_a_date12', 'cdc_covid_xx_a_date13', + 'cdc_covid_xx_a_date14', 'cdc_covid_xx_a_date15', + 'cdc_covid_xx_a_date16', 'cdc_covid_xx_a_date17', 'cdc_covid_xx_a_date3', + 'cdc_covid_xx_a_date4', 'cdc_covid_xx_a_date5', 'cdc_covid_xx_a_date6', + 'cdc_covid_xx_a_date7', 'cdc_covid_xx_a_date8', 'cdc_covid_xx_a_date9', + 'cdc_covid_xx_b_dose10_other', 'cdc_covid_xx_b_dose11_other', + 'cdc_covid_xx_b_dose12_other', 'cdc_covid_xx_b_dose13_other', + 'cdc_covid_xx_b_dose14_other', 'cdc_covid_xx_b_dose15_other', + 'cdc_covid_xx_b_dose16_other', 'cdc_covid_xx_b_dose17_other', + 'cdc_covid_xx_b_dose3_other', 'cdc_covid_xx_b_dose4_other', + 'cdc_covid_xx_b_dose5_other', 'cdc_covid_xx_b_dose6_other', + 'cdc_covid_xx_b_dose7_other', 'cdc_covid_xx_b_dose8_other', + 'cdc_covid_xx_b_dose9_other', 'cdc_covid_xx_symptom_cope_350_dose10', + 'cdc_covid_xx_symptom_cope_350_dose11', + 'cdc_covid_xx_symptom_cope_350_dose12', + 'cdc_covid_xx_symptom_cope_350_dose13', + 'cdc_covid_xx_symptom_cope_350_dose14', + 
'cdc_covid_xx_symptom_cope_350_dose15', + 'cdc_covid_xx_symptom_cope_350_dose16', + 'cdc_covid_xx_symptom_cope_350_dose17', + 'cdc_covid_xx_symptom_cope_350_dose3', + 'cdc_covid_xx_symptom_cope_350_dose4', + 'cdc_covid_xx_symptom_cope_350_dose5', + 'cdc_covid_xx_symptom_cope_350_dose6', + 'cdc_covid_xx_symptom_cope_350_dose7', + 'cdc_covid_xx_symptom_cope_350_dose8', + 'cdc_covid_xx_symptom_cope_350_dose9', 'cdc_covid_xx_type_dose10_other', + 'cdc_covid_xx_type_dose11_other', 'cdc_covid_xx_type_dose12_other', + 'cdc_covid_xx_type_dose13_other', 'cdc_covid_xx_type_dose14_other', + 'cdc_covid_xx_type_dose15_other', 'cdc_covid_xx_type_dose16_other', + 'cdc_covid_xx_type_dose17_other', 'cdc_covid_xx_type_dose3_other', + 'cdc_covid_xx_type_dose4_other', 'cdc_covid_xx_type_dose5_other', + 'cdc_covid_xx_type_dose6_other', 'cdc_covid_xx_type_dose7_other', + 'cdc_covid_xx_type_dose8_other', 'cdc_covid_xx_type_dose9_other', + 'dmfs_29_additionaldose_other', + 'organtransplant_bloodvesseltransplantdate', + 'organtransplant_bonetransplantdate', + 'organtransplant_corneatransplantdate', + 'organtransplant_hearttransplantdate', + 'organtransplant_intestinetransplantdate', + 'organtransplant_kidneytransplantdate', + 'organtransplant_livertransplantdate', + 'organtransplant_lungtransplantdate', + 'organtransplant_otherorgantransplantdate', + 'organtransplant_othertissuetransplantdate', + 'organtransplant_pancreastransplantdate', + 'organtransplant_skintransplantdate', + 'organtransplant_valvetransplantdate', 'otherorgan_freetextbox', + 'othertissue_freetextbox', + 'outsidetravel6month_outsidetravel6monthhowlong', + 'outsidetravel6month_outsidetravel6monthwheretraveled', + 'overallhealth_hysterectomyhistoryage', + 'overallhealthovaryremovalhistoryage', + 'otherarthritis_daughterfreetextbox', 'otherarthritis_fatherfreetextbox', + 'otherarthritis_freetextbox', 'otherarthritis_grandparentfreetextbox', + 'otherarthritis_motherfreetextbox', 'otherarthritis_siblingfreetextbox', + 
'otherarthritis_sonfreetextbox', + 'otherbonejointmuscle_daughterfreetextbox', + 'otherbonejointmuscle_fatherfreetextbox', + 'otherbonejointmuscle_freetextbox', + 'otherbonejointmuscle_grandparentfreetextbox', + 'otherbonejointmuscle_motherfreetextbox', + 'otherbonejointmuscle_siblingfreetextbox', + 'otherbonejointmuscle_sonfreetextbox', + 'otherbrainnervoussystem_daughterfreetextbox', + 'otherbrainnervoussystem_fatherfreetextbox', + 'otherbrainnervoussystem_freetextbox', + 'otherbrainnervoussystem_grandparentfreetextbox', + 'otherbrainnervoussystem_motherfreetextbox', + 'otherbrainnervoussystem_siblingfreetextbox', + 'otherbrainnervoussystem_sonfreetextbox', 'othercancer_freetextbox', + 'otherdiabetes_daughterfreetextbox', 'otherdiabetes_fatherfreetextbox', + 'otherdiabetes_freetextbox', 'otherdiabetes_grandparentfreetextbox', + 'otherdiabetes_motherfreetextbox', 'otherdiabetes_siblingfreetextbox', + 'otherdiabetes_sonfreetextbox', 'otherdiagnosis_daughterfreetextbox', + 'otherdiagnosis_fatherfreetextbox', 'otherdiagnosis_freetextbox', + 'otherdiagnosis_grandparentfreetextbox', + 'otherdiagnosis_motherfreetextbox', 'otherdiagnosis_siblingfreetextbox', + 'otherdiagnosis_sonfreetextbox', + 'otherdigestivecondition_daughterfreetextbox', + 'otherdigestivecondition_fatherfreetextbox', + 'otherdigestivecondition_freetextbox', + 'otherdigestivecondition_grandparentfreetextbox', + 'otherdigestivecondition_motherfreetextbox', + 'otherdigestivecondition_siblingfreetextbox', + 'otherdigestivecondition_sonfreetextbox', + 'otherhearingeye_daughterfreetextbox', + 'otherhearingeye_fatherfreetextbox', 'otherhearingeye_freetextbox', + 'otherhearingeye_grandparentfreetextbox', + 'otherhearingeye_motherfreetextbox', + 'otherhearingeye_siblingfreetextbox', 'otherhearingeye_sonfreetextbox', + 'otherheartorbloodcondition_daughterfreetextbox', + 'otherheartorbloodcondition_fatherfreetextbox', + 'otherheartorbloodcondition_freetextbox', + 
'otherheartorbloodcondition_grandparentfreetextbox', + 'otherheartorbloodcondition_motherfreetextbox', + 'otherheartorbloodcondition_siblingfreetextbox', + 'otherheartorbloodcondition_sonfreetextbox', + 'otherhormoneendocrine_daughterfreetextbox', + 'otherhormoneendocrine_fatherfreetextbox', + 'otherhormoneendocrine_freetextbox', + 'otherhormoneendocrine_grandparentfreetextbox', + 'otherhormoneendocrine_motherfreetextbox', + 'otherhormoneendocrine_siblingfreetextbox', + 'otherhormoneendocrine_sonfreetextbox', + 'otherinfectiousdisease_freetextbox', + 'otherkidneycondition_daughterfreetextbox', + 'otherkidneycondition_fatherfreetextbox', + 'otherkidneycondition_freetextbox', + 'otherkidneycondition_grandparentfreetextbox', + 'otherkidneycondition_motherfreetextbox', + 'otherkidneycondition_siblingfreetextbox', + 'otherkidneycondition_sonfreetextbox', + 'othermentalhealthsubstanceuse_daughterfreetextbox', + 'othermentalhealthsubstanceuse_fatherfreetextbox', + 'othermentalhealthsubstanceuse_freetextbox', + 'othermentalhealthsubstanceuse_grandparentfreetextb', + 'othermentalhealthsubstanceuse_motherfreetextbox', + 'othermentalhealthsubstanceuse_siblingfreetextbox', + 'othermentalhealthsubstanceuse_sonfreetextbox', + 'otherrespiratory_daughterfreetextbox', + 'otherrespiratory_fatherfreetextbox', 'otherrespiratory_freetextbox', + 'otherrespiratory_grandparentfreetextbox', + 'otherrespiratory_motherfreetextbox', + 'otherrespiratory_siblingfreetextbox', 'otherrespiratory_sonfreetextbox', + 'otherthyroid_daughterfreetextbox', 'otherthyroid_fatherfreetextbox', + 'otherthyroid_freetextbox', 'otherthyroid_grandparentfreetextbox', + 'otherthyroid_motherfreetextbox', 'otherthyroid_siblingfreetextbox', + 'otherthyroid_sonfreetextbox', 'self_reported_height_cm', + 'self_reported_height_ft', 'self_reported_height_in', + 'self_reported_weight_kg', 'self_reported_weight_pounds', + 'sdoh_eds_follow_up_1_xx', 'urs_8c', 'aian_tribe', + 'aiannoneofthesedescribeme_aianfreetext', + 
'blacknoneofthesedescribeme_blackfreetext', + 'employmentworkaddress_addresslineone', + 'employmentworkaddress_addresslinetwo', 'employmentworkaddress_city', + 'employmentworkaddress_country', 'employmentworkaddress_zipcode', + 'hispanicnoneofthesedescribeme_hispanicfreetext', + 'livingsituation_howmanypeople', + 'livingsituation_livingsituationfreetext', + 'livingsituation_peopleunder18', + 'menanoneofthesedescribeme_menafreetext', + 'nhpinoneofthesedescribeme_nhpifreetext', + 'noneofthesedescribeme_asianfreetext', 'otherhealthplan_freetext', + 'persononeaddress_persononeaddresscity', + 'persononeaddress_persononeaddresszipcode', + 'secondarycontactinfo_persononeaddressone', + 'secondarycontactinfo_persononeaddresstwo', + 'secondarycontactinfo_persononeemail', + 'secondarycontactinfo_persononefirstname', + 'secondarycontactinfo_persononelastname', + 'secondarycontactinfo_persononemiddleinitial', + 'secondarycontactinfo_persononetelephone', + 'secondarycontactinfo_secondcontactsaddressone', + 'secondarycontactinfo_secondcontactsaddresstwo', + 'secondarycontactinfo_secondcontactsemail', + 'secondarycontactinfo_secondcontactsfirstname', + 'secondarycontactinfo_secondcontactslastname', + 'secondarycontactinfo_secondcontactsmiddleinitial', + 'secondarycontactinfo_secondcontactsnumber', + 'secondcontactsaddress_secondcontactcity', + 'secondcontactsaddress_secondcontactzipcode', + 'sexatbirthnoneofthese_sexatbirthtextbox', + 'socialsecurity_socialsecuritynumber', + 'somethingelse_sexualitysomethingelsetextbox', + 'specifiedgender_specifiedgendertextbox', 'thebasics_countryborntextbox', + 'whatraceethnicity_raceethnicitynoneofthese', + 'whitenoneofthesedescribeme_whitefreetext', 'timeofday', + 'wearconsent_todaysdate'] + +# # Table comparison +# The export should generally contain the same tables from month to month. +# Tables found only in the old or the new export are listed below. 
+ tpl = JINJA_ENV.from_string(''' SELECT COALESCE(curr.table_id, prev.table_id) AS table_id @@ -187,20 +404,34 @@ query = tpl.render(new_rdr=new_rdr, project_id=project_id) execute(client, query) -# # Check if numeric data in value_as_string +# # Check numeric data in value_as_string # Some numeric data is expected in value_as_string. For example, zip codes or other contact specific information. - +# +# **If the check fails, manually review the results.**
+# False positives are possible. The suggested first step of investigation is to run the query in the comments of DC-3407. This will surface any new text-type survey questions that can be added to the `expected_strings` list. +# + tpl = JINJA_ENV.from_string(""" SELECT observation_source_value ,COUNT(1) AS n FROM `{{project_id}}.{{new_rdr}}.observation` WHERE SAFE_CAST(value_as_string AS INT64) IS NOT NULL +AND value_source_concept_id = 0 +AND LOWER(observation_source_value) NOT IN UNNEST ({{expected_strings}}) +AND NOT REGEXP_CONTAINS(LOWER(observation_source_value), '(?i)snap|signature|address|email|number|cohortgroup') GROUP BY 1 ORDER BY 2 DESC """) -query = tpl.render(new_rdr=new_rdr, project_id=project_id) -execute(client, query) +query = tpl.render(new_rdr=new_rdr, project_id=project_id,expected_strings=expected_strings) +df = execute(client, query) + +success_msg = 'All records with a number in value_as_string are expected to be text.' +failure_msg = 'Some records that have a number value_as_string might not be expected. See description.' + +render_message(df, + success_msg, + failure_msg) +# - # # All COPE `questionnaire_response_id`s are in COPE version map # Any `questionnaire_response_id`s missing from the map will be listed below. @@ -1007,4 +1238,3 @@ success_msg, failure_msg, failure_msg_args={'code_count': len(df)}) -# - \ No newline at end of file diff --git a/data_steward/analytics/cdr_ops/raw_rdr_export_qc.py b/data_steward/analytics/cdr_ops/raw_rdr_export_qc.py index 3504dae56f..2553091687 100644 --- a/data_steward/analytics/cdr_ops/raw_rdr_export_qc.py +++ b/data_steward/analytics/cdr_ops/raw_rdr_export_qc.py @@ -54,6 +54,224 @@ 'wear_consent_ptsc' ] +# This list is created by querying the redcap surveys. 
In case of needed update, query provided in the comments of DC3407 +expected_strings = ['cidi5_20', 'cidi5_24', 'cidi5_28', 'cidi5_31', 'mhqukb_48_age', + 'mhqukb_50_number', 'mhqukb_51_number', 'mhqukb_52_number', + 'mhqukb_53_number', 'record_id', 'helpmewithconsent_name', + 'other_concerns', 'other_reasons', 'resultsconsent_emailmecopy', + 'resultsconsent_signaturedate', 'consentpii_helpwithconsentsignature', + 'extraconsent_signature_type', 'extraconsent_todaysdate', + 'piiaddress_streetaddress', 'piiaddress_streetaddress2', + 'piibirthinformation_birthdate', 'piicontactinformation_phone', + 'piiname_first', 'piiname_last', 'piiname_middle', + 'streetaddress_piicity', 'streetaddress_piizip', 'basics_11a_cope_a_33', + 'basics_xx', 'basics_xx20', 'cdc_covid_19_7_xx22_date', 'cope_a_126', + 'cope_a_160', 'cope_a_85', 'copect_50_xx19_cope_a_152', + 'copect_50_xx19_cope_a_198', 'copect_50_xx19_cope_a_57', + 'cu_covid_cope_a_204', 'eds_follow_up_1_xx', 'ipaq_1_cope_a_24', + 'ipaq_2_cope_a_160', 'ipaq_2_cope_a_85', 'ipaq_3_cope_a_24', + 'ipaq_4_cope_a_160', 'ipaq_4_cope_a_85', 'ipaq_5_cope_a_24', + 'ipaq_6_cope_a_160', 'ipaq_6_cope_a_85', 'lifestyle_2_xx12_cope_a_152', + 'lifestyle_2_xx12_cope_a_198', 'lifestyle_2_xx12_cope_a_57', + 'tsu_ds5_13_xx42_cope_a_226', 'cdc_covid_19_7_xx23_other_cope_a_204', + 'cdc_covid_19_n_a2', 'cdc_covid_19_n_a4', 'cdc_covid_19_n_a8', + 'cope_aou_xx_2_a', 'dmfs_29a', 'msds_17_c', + 'nhs_covid_fhc17b_cope_a_226', 'ehrconsentpii_helpwithconsentsignature', + 'ehrconsentpii_todaysdate', 'ehrconsentpii_todaysdateilhippawitness', + 'sensitivetype2_domesticviolence', 'sensitivetype2_genetictesting', + 'sensitivetype2_hivaids', 'sensitivetype2_mentalhealth', + 'sensitivetype2_substanceuse', 'signature_type', 'cidi5_15', + 'mhqukb_25_number', 'mhqukb_26_age', 'mhqukb_28_age', 'ss_2_age', + 'ss_3_age_1', 'ss_3_age_2', 'ss_3_number', + 'english_exploring_the_mind_consent_form', 'etm_help_name', + 'cdc_covid_xx_a_date1', 'cdc_covid_xx_a_date2', + 
'cdc_covid_xx_b_firstdose_other', 'cdc_covid_xx_b_seconddose_other', + 'cdc_covid_xx_symptom_cope_350', + 'cdc_covid_xx_symptom_seconddose_cope_350', 'dmfs_29_seconddose_other', + 'othercancer_daughterfreetextbox', 'othercancer_fatherfreetextbox', + 'othercancer_grandparentfreetextbox', 'othercancer_motherfreetextbox', + 'othercancer_siblingfreetextbox', 'othercancer_sonfreetextbox', + 'othercondition_daughterfreetextbox', 'othercondition_fatherfreetextbox', + 'othercondition_grandparentfreetextbox', + 'othercondition_motherfreetextbox', 'othercondition_siblingfreetextbox', + 'othercondition_sonfreetextbox', 'cdc_covid_xx_b_other', + 'otherdelayedmedicalcare_freetext', + 'attemptquitsmoking_completelyquitage', 'otherspecify_otherdrugstextbox', + 'smoking_averagedailycigarettenumber', + 'smoking_currentdailycigarettenumber', + 'smoking_dailysmokestartingagenumber', 'smoking_numberofyearsnumber', + 'cdc_covid_xx_a_date10', 'cdc_covid_xx_a_date11', + 'cdc_covid_xx_a_date12', 'cdc_covid_xx_a_date13', + 'cdc_covid_xx_a_date14', 'cdc_covid_xx_a_date15', + 'cdc_covid_xx_a_date16', 'cdc_covid_xx_a_date17', 'cdc_covid_xx_a_date3', + 'cdc_covid_xx_a_date4', 'cdc_covid_xx_a_date5', 'cdc_covid_xx_a_date6', + 'cdc_covid_xx_a_date7', 'cdc_covid_xx_a_date8', 'cdc_covid_xx_a_date9', + 'cdc_covid_xx_b_dose10_other', 'cdc_covid_xx_b_dose11_other', + 'cdc_covid_xx_b_dose12_other', 'cdc_covid_xx_b_dose13_other', + 'cdc_covid_xx_b_dose14_other', 'cdc_covid_xx_b_dose15_other', + 'cdc_covid_xx_b_dose16_other', 'cdc_covid_xx_b_dose17_other', + 'cdc_covid_xx_b_dose3_other', 'cdc_covid_xx_b_dose4_other', + 'cdc_covid_xx_b_dose5_other', 'cdc_covid_xx_b_dose6_other', + 'cdc_covid_xx_b_dose7_other', 'cdc_covid_xx_b_dose8_other', + 'cdc_covid_xx_b_dose9_other', 'cdc_covid_xx_symptom_cope_350_dose10', + 'cdc_covid_xx_symptom_cope_350_dose11', + 'cdc_covid_xx_symptom_cope_350_dose12', + 'cdc_covid_xx_symptom_cope_350_dose13', + 'cdc_covid_xx_symptom_cope_350_dose14', + 
'cdc_covid_xx_symptom_cope_350_dose15', + 'cdc_covid_xx_symptom_cope_350_dose16', + 'cdc_covid_xx_symptom_cope_350_dose17', + 'cdc_covid_xx_symptom_cope_350_dose3', + 'cdc_covid_xx_symptom_cope_350_dose4', + 'cdc_covid_xx_symptom_cope_350_dose5', + 'cdc_covid_xx_symptom_cope_350_dose6', + 'cdc_covid_xx_symptom_cope_350_dose7', + 'cdc_covid_xx_symptom_cope_350_dose8', + 'cdc_covid_xx_symptom_cope_350_dose9', 'cdc_covid_xx_type_dose10_other', + 'cdc_covid_xx_type_dose11_other', 'cdc_covid_xx_type_dose12_other', + 'cdc_covid_xx_type_dose13_other', 'cdc_covid_xx_type_dose14_other', + 'cdc_covid_xx_type_dose15_other', 'cdc_covid_xx_type_dose16_other', + 'cdc_covid_xx_type_dose17_other', 'cdc_covid_xx_type_dose3_other', + 'cdc_covid_xx_type_dose4_other', 'cdc_covid_xx_type_dose5_other', + 'cdc_covid_xx_type_dose6_other', 'cdc_covid_xx_type_dose7_other', + 'cdc_covid_xx_type_dose8_other', 'cdc_covid_xx_type_dose9_other', + 'dmfs_29_additionaldose_other', + 'organtransplant_bloodvesseltransplantdate', + 'organtransplant_bonetransplantdate', + 'organtransplant_corneatransplantdate', + 'organtransplant_hearttransplantdate', + 'organtransplant_intestinetransplantdate', + 'organtransplant_kidneytransplantdate', + 'organtransplant_livertransplantdate', + 'organtransplant_lungtransplantdate', + 'organtransplant_otherorgantransplantdate', + 'organtransplant_othertissuetransplantdate', + 'organtransplant_pancreastransplantdate', + 'organtransplant_skintransplantdate', + 'organtransplant_valvetransplantdate', 'otherorgan_freetextbox', + 'othertissue_freetextbox', + 'outsidetravel6month_outsidetravel6monthhowlong', + 'outsidetravel6month_outsidetravel6monthwheretraveled', + 'overallhealth_hysterectomyhistoryage', + 'overallhealthovaryremovalhistoryage', + 'otherarthritis_daughterfreetextbox', 'otherarthritis_fatherfreetextbox', + 'otherarthritis_freetextbox', 'otherarthritis_grandparentfreetextbox', + 'otherarthritis_motherfreetextbox', 'otherarthritis_siblingfreetextbox', + 
'otherarthritis_sonfreetextbox', + 'otherbonejointmuscle_daughterfreetextbox', + 'otherbonejointmuscle_fatherfreetextbox', + 'otherbonejointmuscle_freetextbox', + 'otherbonejointmuscle_grandparentfreetextbox', + 'otherbonejointmuscle_motherfreetextbox', + 'otherbonejointmuscle_siblingfreetextbox', + 'otherbonejointmuscle_sonfreetextbox', + 'otherbrainnervoussystem_daughterfreetextbox', + 'otherbrainnervoussystem_fatherfreetextbox', + 'otherbrainnervoussystem_freetextbox', + 'otherbrainnervoussystem_grandparentfreetextbox', + 'otherbrainnervoussystem_motherfreetextbox', + 'otherbrainnervoussystem_siblingfreetextbox', + 'otherbrainnervoussystem_sonfreetextbox', 'othercancer_freetextbox', + 'otherdiabetes_daughterfreetextbox', 'otherdiabetes_fatherfreetextbox', + 'otherdiabetes_freetextbox', 'otherdiabetes_grandparentfreetextbox', + 'otherdiabetes_motherfreetextbox', 'otherdiabetes_siblingfreetextbox', + 'otherdiabetes_sonfreetextbox', 'otherdiagnosis_daughterfreetextbox', + 'otherdiagnosis_fatherfreetextbox', 'otherdiagnosis_freetextbox', + 'otherdiagnosis_grandparentfreetextbox', + 'otherdiagnosis_motherfreetextbox', 'otherdiagnosis_siblingfreetextbox', + 'otherdiagnosis_sonfreetextbox', + 'otherdigestivecondition_daughterfreetextbox', + 'otherdigestivecondition_fatherfreetextbox', + 'otherdigestivecondition_freetextbox', + 'otherdigestivecondition_grandparentfreetextbox', + 'otherdigestivecondition_motherfreetextbox', + 'otherdigestivecondition_siblingfreetextbox', + 'otherdigestivecondition_sonfreetextbox', + 'otherhearingeye_daughterfreetextbox', + 'otherhearingeye_fatherfreetextbox', 'otherhearingeye_freetextbox', + 'otherhearingeye_grandparentfreetextbox', + 'otherhearingeye_motherfreetextbox', + 'otherhearingeye_siblingfreetextbox', 'otherhearingeye_sonfreetextbox', + 'otherheartorbloodcondition_daughterfreetextbox', + 'otherheartorbloodcondition_fatherfreetextbox', + 'otherheartorbloodcondition_freetextbox', + 
'otherheartorbloodcondition_grandparentfreetextbox', + 'otherheartorbloodcondition_motherfreetextbox', + 'otherheartorbloodcondition_siblingfreetextbox', + 'otherheartorbloodcondition_sonfreetextbox', + 'otherhormoneendocrine_daughterfreetextbox', + 'otherhormoneendocrine_fatherfreetextbox', + 'otherhormoneendocrine_freetextbox', + 'otherhormoneendocrine_grandparentfreetextbox', + 'otherhormoneendocrine_motherfreetextbox', + 'otherhormoneendocrine_siblingfreetextbox', + 'otherhormoneendocrine_sonfreetextbox', + 'otherinfectiousdisease_freetextbox', + 'otherkidneycondition_daughterfreetextbox', + 'otherkidneycondition_fatherfreetextbox', + 'otherkidneycondition_freetextbox', + 'otherkidneycondition_grandparentfreetextbox', + 'otherkidneycondition_motherfreetextbox', + 'otherkidneycondition_siblingfreetextbox', + 'otherkidneycondition_sonfreetextbox', + 'othermentalhealthsubstanceuse_daughterfreetextbox', + 'othermentalhealthsubstanceuse_fatherfreetextbox', + 'othermentalhealthsubstanceuse_freetextbox', + 'othermentalhealthsubstanceuse_grandparentfreetextb', + 'othermentalhealthsubstanceuse_motherfreetextbox', + 'othermentalhealthsubstanceuse_siblingfreetextbox', + 'othermentalhealthsubstanceuse_sonfreetextbox', + 'otherrespiratory_daughterfreetextbox', + 'otherrespiratory_fatherfreetextbox', 'otherrespiratory_freetextbox', + 'otherrespiratory_grandparentfreetextbox', + 'otherrespiratory_motherfreetextbox', + 'otherrespiratory_siblingfreetextbox', 'otherrespiratory_sonfreetextbox', + 'otherthyroid_daughterfreetextbox', 'otherthyroid_fatherfreetextbox', + 'otherthyroid_freetextbox', 'otherthyroid_grandparentfreetextbox', + 'otherthyroid_motherfreetextbox', 'otherthyroid_siblingfreetextbox', + 'otherthyroid_sonfreetextbox', 'self_reported_height_cm', + 'self_reported_height_ft', 'self_reported_height_in', + 'self_reported_weight_kg', 'self_reported_weight_pounds', + 'sdoh_eds_follow_up_1_xx', 'urs_8c', 'aian_tribe', + 'aiannoneofthesedescribeme_aianfreetext', + 
'blacknoneofthesedescribeme_blackfreetext', + 'employmentworkaddress_addresslineone', + 'employmentworkaddress_addresslinetwo', 'employmentworkaddress_city', + 'employmentworkaddress_country', 'employmentworkaddress_zipcode', + 'hispanicnoneofthesedescribeme_hispanicfreetext', + 'livingsituation_howmanypeople', + 'livingsituation_livingsituationfreetext', + 'livingsituation_peopleunder18', + 'menanoneofthesedescribeme_menafreetext', + 'nhpinoneofthesedescribeme_nhpifreetext', + 'noneofthesedescribeme_asianfreetext', 'otherhealthplan_freetext', + 'persononeaddress_persononeaddresscity', + 'persononeaddress_persononeaddresszipcode', + 'secondarycontactinfo_persononeaddressone', + 'secondarycontactinfo_persononeaddresstwo', + 'secondarycontactinfo_persononeemail', + 'secondarycontactinfo_persononefirstname', + 'secondarycontactinfo_persononelastname', + 'secondarycontactinfo_persononemiddleinitial', + 'secondarycontactinfo_persononetelephone', + 'secondarycontactinfo_secondcontactsaddressone', + 'secondarycontactinfo_secondcontactsaddresstwo', + 'secondarycontactinfo_secondcontactsemail', + 'secondarycontactinfo_secondcontactsfirstname', + 'secondarycontactinfo_secondcontactslastname', + 'secondarycontactinfo_secondcontactsmiddleinitial', + 'secondarycontactinfo_secondcontactsnumber', + 'secondcontactsaddress_secondcontactcity', + 'secondcontactsaddress_secondcontactzipcode', + 'sexatbirthnoneofthese_sexatbirthtextbox', + 'socialsecurity_socialsecuritynumber', + 'somethingelse_sexualitysomethingelsetextbox', + 'specifiedgender_specifiedgendertextbox', 'thebasics_countryborntextbox', + 'whatraceethnicity_raceethnicitynoneofthese', + 'whitenoneofthesedescribeme_whitefreetext', 'timeofday', + 'wearconsent_todaysdate'] + + # # Table comparison # The export should generally contain the same tables from month to month. # Tables found only in the old or the new export are listed below. 
@@ -365,20 +583,35 @@ query = tpl.render(new_rdr=new_rdr, project_id=project_id) execute(client, query) -# # Check if numeric data in value_as_string +# # Check numeric data in value_as_string # Some numeric data is expected in value_as_string. For example, zip codes or other contact specific information. +# +# **If the check fails, manually review the results.**
+# False positives are possible. The suggested first step of investigation is to run the query in the comments of DC-3407. This will surface any new text-type survey questions that can be added to the `expected_strings` list. +# + tpl = JINJA_ENV.from_string(""" SELECT observation_source_value ,COUNT(1) AS n FROM `{{project_id}}.{{new_rdr}}.observation` WHERE SAFE_CAST(value_as_string AS INT64) IS NOT NULL +AND value_source_concept_id = 0 +AND LOWER(observation_source_value) NOT IN UNNEST ({{expected_strings}}) +AND NOT REGEXP_CONTAINS(LOWER(observation_source_value), '(?i)snap|signature|address|email|number|cohortgroup') GROUP BY 1 ORDER BY 2 DESC """) -query = tpl.render(new_rdr=new_rdr, project_id=project_id) -execute(client, query) +query = tpl.render(new_rdr=new_rdr, project_id=project_id,expected_strings=expected_strings) +df = execute(client, query) + +success_msg = 'All records with a number in value_as_string are expected to be text.' +failure_msg = 'Some records that have a number value_as_string might not be expected. See description.' + +render_message(df, + success_msg, + failure_msg) +# - # # All COPE `questionnaire_response_id`s are in COPE version map # Any `questionnaire_response_id`s missing from the map will be listed below.