Skip to content

Commit

Permalink
[DC-3407] Update value_as_string check in clean_rdr notebooks to remo…
Browse files Browse the repository at this point in the history
…ve false positives (#1720)

* [DC-3407] Update raw rdr query

* [DC-3407] Update clean rdr query

* [DC-3407] Update the expected_strings list

* [DC-3407] Update the check description and list order

* [DC-3407] Update the check description and list order in clean notebook
  • Loading branch information
brendagutman authored Sep 11, 2023
1 parent 6365bbc commit 8f57d02
Show file tree
Hide file tree
Showing 2 changed files with 475 additions and 12 deletions.
248 changes: 239 additions & 9 deletions data_steward/analytics/cdr_ops/clean_rdr_export_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,232 @@
from gcloud.bq import BigQueryClient
from analytics.cdr_ops.notebook_utils import execute, IMPERSONATION_SCOPES, render_message

# # Table comparison
# The export should generally contain the same tables from month to month.
# Tables found only in the old or the new export are listed below.

# Request impersonation credentials for the `run_as` account, restricted to
# the scopes listed in IMPERSONATION_SCOPES.
impersonation_creds = auth.get_impersonation_credentials(
run_as, target_scopes=IMPERSONATION_SCOPES)

# BigQuery client for `project_id` built with the impersonation credentials;
# the checks in this notebook pass it to `execute(client, query)`.
client = BigQueryClient(project_id, credentials=impersonation_creds)

# This list was created by querying the REDCap surveys. If it needs to be updated, use the query provided in the comments of DC-3407.
expected_strings = ['cidi5_20', 'cidi5_24', 'cidi5_28', 'cidi5_31', 'mhqukb_48_age',
'mhqukb_50_number', 'mhqukb_51_number', 'mhqukb_52_number',
'mhqukb_53_number', 'record_id', 'helpmewithconsent_name',
'other_concerns', 'other_reasons', 'resultsconsent_emailmecopy',
'resultsconsent_signaturedate', 'consentpii_helpwithconsentsignature',
'extraconsent_signature_type', 'extraconsent_todaysdate',
'piiaddress_streetaddress', 'piiaddress_streetaddress2',
'piibirthinformation_birthdate', 'piicontactinformation_phone',
'piiname_first', 'piiname_last', 'piiname_middle',
'streetaddress_piicity', 'streetaddress_piizip', 'basics_11a_cope_a_33',
'basics_xx', 'basics_xx20', 'cdc_covid_19_7_xx22_date', 'cope_a_126',
'cope_a_160', 'cope_a_85', 'copect_50_xx19_cope_a_152',
'copect_50_xx19_cope_a_198', 'copect_50_xx19_cope_a_57',
'cu_covid_cope_a_204', 'eds_follow_up_1_xx', 'ipaq_1_cope_a_24',
'ipaq_2_cope_a_160', 'ipaq_2_cope_a_85', 'ipaq_3_cope_a_24',
'ipaq_4_cope_a_160', 'ipaq_4_cope_a_85', 'ipaq_5_cope_a_24',
'ipaq_6_cope_a_160', 'ipaq_6_cope_a_85', 'lifestyle_2_xx12_cope_a_152',
'lifestyle_2_xx12_cope_a_198', 'lifestyle_2_xx12_cope_a_57',
'tsu_ds5_13_xx42_cope_a_226', 'cdc_covid_19_7_xx23_other_cope_a_204',
'cdc_covid_19_n_a2', 'cdc_covid_19_n_a4', 'cdc_covid_19_n_a8',
'cope_aou_xx_2_a', 'dmfs_29a', 'msds_17_c',
'nhs_covid_fhc17b_cope_a_226', 'ehrconsentpii_helpwithconsentsignature',
'ehrconsentpii_todaysdate', 'ehrconsentpii_todaysdateilhippawitness',
'sensitivetype2_domesticviolence', 'sensitivetype2_genetictesting',
'sensitivetype2_hivaids', 'sensitivetype2_mentalhealth',
'sensitivetype2_substanceuse', 'signature_type', 'cidi5_15',
'mhqukb_25_number', 'mhqukb_26_age', 'mhqukb_28_age', 'ss_2_age',
'ss_3_age_1', 'ss_3_age_2', 'ss_3_number',
'english_exploring_the_mind_consent_form', 'etm_help_name',
'cdc_covid_xx_a_date1', 'cdc_covid_xx_a_date2',
'cdc_covid_xx_b_firstdose_other', 'cdc_covid_xx_b_seconddose_other',
'cdc_covid_xx_symptom_cope_350',
'cdc_covid_xx_symptom_seconddose_cope_350', 'dmfs_29_seconddose_other',
'othercancer_daughterfreetextbox', 'othercancer_fatherfreetextbox',
'othercancer_grandparentfreetextbox', 'othercancer_motherfreetextbox',
'othercancer_siblingfreetextbox', 'othercancer_sonfreetextbox',
'othercondition_daughterfreetextbox', 'othercondition_fatherfreetextbox',
'othercondition_grandparentfreetextbox',
'othercondition_motherfreetextbox', 'othercondition_siblingfreetextbox',
'othercondition_sonfreetextbox', 'cdc_covid_xx_b_other',
'otherdelayedmedicalcare_freetext',
'attemptquitsmoking_completelyquitage', 'otherspecify_otherdrugstextbox',
'smoking_averagedailycigarettenumber',
'smoking_currentdailycigarettenumber',
'smoking_dailysmokestartingagenumber', 'smoking_numberofyearsnumber',
'cdc_covid_xx_a_date10', 'cdc_covid_xx_a_date11',
'cdc_covid_xx_a_date12', 'cdc_covid_xx_a_date13',
'cdc_covid_xx_a_date14', 'cdc_covid_xx_a_date15',
'cdc_covid_xx_a_date16', 'cdc_covid_xx_a_date17', 'cdc_covid_xx_a_date3',
'cdc_covid_xx_a_date4', 'cdc_covid_xx_a_date5', 'cdc_covid_xx_a_date6',
'cdc_covid_xx_a_date7', 'cdc_covid_xx_a_date8', 'cdc_covid_xx_a_date9',
'cdc_covid_xx_b_dose10_other', 'cdc_covid_xx_b_dose11_other',
'cdc_covid_xx_b_dose12_other', 'cdc_covid_xx_b_dose13_other',
'cdc_covid_xx_b_dose14_other', 'cdc_covid_xx_b_dose15_other',
'cdc_covid_xx_b_dose16_other', 'cdc_covid_xx_b_dose17_other',
'cdc_covid_xx_b_dose3_other', 'cdc_covid_xx_b_dose4_other',
'cdc_covid_xx_b_dose5_other', 'cdc_covid_xx_b_dose6_other',
'cdc_covid_xx_b_dose7_other', 'cdc_covid_xx_b_dose8_other',
'cdc_covid_xx_b_dose9_other', 'cdc_covid_xx_symptom_cope_350_dose10',
'cdc_covid_xx_symptom_cope_350_dose11',
'cdc_covid_xx_symptom_cope_350_dose12',
'cdc_covid_xx_symptom_cope_350_dose13',
'cdc_covid_xx_symptom_cope_350_dose14',
'cdc_covid_xx_symptom_cope_350_dose15',
'cdc_covid_xx_symptom_cope_350_dose16',
'cdc_covid_xx_symptom_cope_350_dose17',
'cdc_covid_xx_symptom_cope_350_dose3',
'cdc_covid_xx_symptom_cope_350_dose4',
'cdc_covid_xx_symptom_cope_350_dose5',
'cdc_covid_xx_symptom_cope_350_dose6',
'cdc_covid_xx_symptom_cope_350_dose7',
'cdc_covid_xx_symptom_cope_350_dose8',
'cdc_covid_xx_symptom_cope_350_dose9', 'cdc_covid_xx_type_dose10_other',
'cdc_covid_xx_type_dose11_other', 'cdc_covid_xx_type_dose12_other',
'cdc_covid_xx_type_dose13_other', 'cdc_covid_xx_type_dose14_other',
'cdc_covid_xx_type_dose15_other', 'cdc_covid_xx_type_dose16_other',
'cdc_covid_xx_type_dose17_other', 'cdc_covid_xx_type_dose3_other',
'cdc_covid_xx_type_dose4_other', 'cdc_covid_xx_type_dose5_other',
'cdc_covid_xx_type_dose6_other', 'cdc_covid_xx_type_dose7_other',
'cdc_covid_xx_type_dose8_other', 'cdc_covid_xx_type_dose9_other',
'dmfs_29_additionaldose_other',
'organtransplant_bloodvesseltransplantdate',
'organtransplant_bonetransplantdate',
'organtransplant_corneatransplantdate',
'organtransplant_hearttransplantdate',
'organtransplant_intestinetransplantdate',
'organtransplant_kidneytransplantdate',
'organtransplant_livertransplantdate',
'organtransplant_lungtransplantdate',
'organtransplant_otherorgantransplantdate',
'organtransplant_othertissuetransplantdate',
'organtransplant_pancreastransplantdate',
'organtransplant_skintransplantdate',
'organtransplant_valvetransplantdate', 'otherorgan_freetextbox',
'othertissue_freetextbox',
'outsidetravel6month_outsidetravel6monthhowlong',
'outsidetravel6month_outsidetravel6monthwheretraveled',
'overallhealth_hysterectomyhistoryage',
'overallhealthovaryremovalhistoryage',
'otherarthritis_daughterfreetextbox', 'otherarthritis_fatherfreetextbox',
'otherarthritis_freetextbox', 'otherarthritis_grandparentfreetextbox',
'otherarthritis_motherfreetextbox', 'otherarthritis_siblingfreetextbox',
'otherarthritis_sonfreetextbox',
'otherbonejointmuscle_daughterfreetextbox',
'otherbonejointmuscle_fatherfreetextbox',
'otherbonejointmuscle_freetextbox',
'otherbonejointmuscle_grandparentfreetextbox',
'otherbonejointmuscle_motherfreetextbox',
'otherbonejointmuscle_siblingfreetextbox',
'otherbonejointmuscle_sonfreetextbox',
'otherbrainnervoussystem_daughterfreetextbox',
'otherbrainnervoussystem_fatherfreetextbox',
'otherbrainnervoussystem_freetextbox',
'otherbrainnervoussystem_grandparentfreetextbox',
'otherbrainnervoussystem_motherfreetextbox',
'otherbrainnervoussystem_siblingfreetextbox',
'otherbrainnervoussystem_sonfreetextbox', 'othercancer_freetextbox',
'otherdiabetes_daughterfreetextbox', 'otherdiabetes_fatherfreetextbox',
'otherdiabetes_freetextbox', 'otherdiabetes_grandparentfreetextbox',
'otherdiabetes_motherfreetextbox', 'otherdiabetes_siblingfreetextbox',
'otherdiabetes_sonfreetextbox', 'otherdiagnosis_daughterfreetextbox',
'otherdiagnosis_fatherfreetextbox', 'otherdiagnosis_freetextbox',
'otherdiagnosis_grandparentfreetextbox',
'otherdiagnosis_motherfreetextbox', 'otherdiagnosis_siblingfreetextbox',
'otherdiagnosis_sonfreetextbox',
'otherdigestivecondition_daughterfreetextbox',
'otherdigestivecondition_fatherfreetextbox',
'otherdigestivecondition_freetextbox',
'otherdigestivecondition_grandparentfreetextbox',
'otherdigestivecondition_motherfreetextbox',
'otherdigestivecondition_siblingfreetextbox',
'otherdigestivecondition_sonfreetextbox',
'otherhearingeye_daughterfreetextbox',
'otherhearingeye_fatherfreetextbox', 'otherhearingeye_freetextbox',
'otherhearingeye_grandparentfreetextbox',
'otherhearingeye_motherfreetextbox',
'otherhearingeye_siblingfreetextbox', 'otherhearingeye_sonfreetextbox',
'otherheartorbloodcondition_daughterfreetextbox',
'otherheartorbloodcondition_fatherfreetextbox',
'otherheartorbloodcondition_freetextbox',
'otherheartorbloodcondition_grandparentfreetextbox',
'otherheartorbloodcondition_motherfreetextbox',
'otherheartorbloodcondition_siblingfreetextbox',
'otherheartorbloodcondition_sonfreetextbox',
'otherhormoneendocrine_daughterfreetextbox',
'otherhormoneendocrine_fatherfreetextbox',
'otherhormoneendocrine_freetextbox',
'otherhormoneendocrine_grandparentfreetextbox',
'otherhormoneendocrine_motherfreetextbox',
'otherhormoneendocrine_siblingfreetextbox',
'otherhormoneendocrine_sonfreetextbox',
'otherinfectiousdisease_freetextbox',
'otherkidneycondition_daughterfreetextbox',
'otherkidneycondition_fatherfreetextbox',
'otherkidneycondition_freetextbox',
'otherkidneycondition_grandparentfreetextbox',
'otherkidneycondition_motherfreetextbox',
'otherkidneycondition_siblingfreetextbox',
'otherkidneycondition_sonfreetextbox',
'othermentalhealthsubstanceuse_daughterfreetextbox',
'othermentalhealthsubstanceuse_fatherfreetextbox',
'othermentalhealthsubstanceuse_freetextbox',
'othermentalhealthsubstanceuse_grandparentfreetextb',
'othermentalhealthsubstanceuse_motherfreetextbox',
'othermentalhealthsubstanceuse_siblingfreetextbox',
'othermentalhealthsubstanceuse_sonfreetextbox',
'otherrespiratory_daughterfreetextbox',
'otherrespiratory_fatherfreetextbox', 'otherrespiratory_freetextbox',
'otherrespiratory_grandparentfreetextbox',
'otherrespiratory_motherfreetextbox',
'otherrespiratory_siblingfreetextbox', 'otherrespiratory_sonfreetextbox',
'otherthyroid_daughterfreetextbox', 'otherthyroid_fatherfreetextbox',
'otherthyroid_freetextbox', 'otherthyroid_grandparentfreetextbox',
'otherthyroid_motherfreetextbox', 'otherthyroid_siblingfreetextbox',
'otherthyroid_sonfreetextbox', 'self_reported_height_cm',
'self_reported_height_ft', 'self_reported_height_in',
'self_reported_weight_kg', 'self_reported_weight_pounds',
'sdoh_eds_follow_up_1_xx', 'urs_8c', 'aian_tribe',
'aiannoneofthesedescribeme_aianfreetext',
'blacknoneofthesedescribeme_blackfreetext',
'employmentworkaddress_addresslineone',
'employmentworkaddress_addresslinetwo', 'employmentworkaddress_city',
'employmentworkaddress_country', 'employmentworkaddress_zipcode',
'hispanicnoneofthesedescribeme_hispanicfreetext',
'livingsituation_howmanypeople',
'livingsituation_livingsituationfreetext',
'livingsituation_peopleunder18',
'menanoneofthesedescribeme_menafreetext',
'nhpinoneofthesedescribeme_nhpifreetext',
'noneofthesedescribeme_asianfreetext', 'otherhealthplan_freetext',
'persononeaddress_persononeaddresscity',
'persononeaddress_persononeaddresszipcode',
'secondarycontactinfo_persononeaddressone',
'secondarycontactinfo_persononeaddresstwo',
'secondarycontactinfo_persononeemail',
'secondarycontactinfo_persononefirstname',
'secondarycontactinfo_persononelastname',
'secondarycontactinfo_persononemiddleinitial',
'secondarycontactinfo_persononetelephone',
'secondarycontactinfo_secondcontactsaddressone',
'secondarycontactinfo_secondcontactsaddresstwo',
'secondarycontactinfo_secondcontactsemail',
'secondarycontactinfo_secondcontactsfirstname',
'secondarycontactinfo_secondcontactslastname',
'secondarycontactinfo_secondcontactsmiddleinitial',
'secondarycontactinfo_secondcontactsnumber',
'secondcontactsaddress_secondcontactcity',
'secondcontactsaddress_secondcontactzipcode',
'sexatbirthnoneofthese_sexatbirthtextbox',
'socialsecurity_socialsecuritynumber',
'somethingelse_sexualitysomethingelsetextbox',
'specifiedgender_specifiedgendertextbox', 'thebasics_countryborntextbox',
'whatraceethnicity_raceethnicitynoneofthese',
'whitenoneofthesedescribeme_whitefreetext', 'timeofday',
'wearconsent_todaysdate']

# # Table comparison
# The export should generally contain the same tables from month to month.
# Tables found only in the old or the new export are listed below.

tpl = JINJA_ENV.from_string('''
SELECT
COALESCE(curr.table_id, prev.table_id) AS table_id
Expand Down Expand Up @@ -187,20 +404,34 @@
query = tpl.render(new_rdr=new_rdr, project_id=project_id)
execute(client, query)

# # Check numeric data in value_as_string
# Some numeric data is expected in value_as_string. For example, zip codes or other contact specific information.
#
# **If the check fails, manually review the results.** <br>
# False positives are possible. The suggested first step of investigation is to run the query in the comments of DC3407.
# That query lists any new text-type questions from the surveys; these can be added to the list `expected_strings`.
# +
# Count, per observation_source_value, the rows whose value_as_string parses
# as an integer even though no value concept is mapped
# (value_source_concept_id = 0). Rows are excluded when their source value is
# in `expected_strings` or matches the contact-information regex.
tpl = JINJA_ENV.from_string("""
SELECT
observation_source_value
,COUNT(1) AS n
FROM `{{project_id}}.{{new_rdr}}.observation`
WHERE SAFE_CAST(value_as_string AS INT64) IS NOT NULL
AND value_source_concept_id = 0
AND LOWER(observation_source_value) NOT IN UNNEST ({{expected_strings}})
AND NOT REGEXP_CONTAINS(LOWER(observation_source_value), '(?i)snap|signature|address|email|number|cohortgroup')
GROUP BY 1
ORDER BY 2 DESC
""")

# `expected_strings` must be passed to render(): without it the placeholder
# renders as nothing and the query contains `NOT IN UNNEST ()`.
# NOTE(review): the Python list is interpolated via its string form
# (['a', 'b', ...]), which is presumably accepted by BigQuery as an array
# literal -- confirm if the list ever holds values containing quotes.
query = tpl.render(new_rdr=new_rdr,
                   project_id=project_id,
                   expected_strings=expected_strings)
df = execute(client, query)

success_msg = 'All records with a number in value_as_string are expected to be text.'
failure_msg = 'Some records that have a number value_as_string might not be expected. See description.'

# Presumably shows `success_msg` when `df` has no rows and `failure_msg`
# otherwise -- TODO confirm against notebook_utils.render_message.
render_message(df, success_msg, failure_msg)
# -

# # All COPE `questionnaire_response_id`s are in COPE version map
# Any `questionnaire_response_id`s missing from the map will be listed below.
Expand Down Expand Up @@ -1007,4 +1238,3 @@
success_msg,
failure_msg,
failure_msg_args={'code_count': len(df)})
# -
Loading

0 comments on commit 8f57d02

Please sign in to comment.