Merge branch 'develop' into ms/dc-3271

all-of-us · Sep 11, 2023 · 0fd38e3 · 0fd38e3
2 parents 5038819 + 8f57d02
commit 0fd38e3
Show file tree

Hide file tree

Showing 86 changed files with 4,000 additions and 1,790 deletions.
diff --git a/data_steward/admin/key_rotation.py b/data_steward/admin/key_rotation.py
@@ -8,7 +8,7 @@
 
 LOGGER = logging.getLogger(__name__)
 
-KEY_EXPIRE_DAYS = 180
+KEY_EXPIRE_DAYS = 150
 KEY_EXPIRE_ALERT_DAYS = 7
 GCP_DTM_FMT = '%Y-%m-%dT%H:%M:%SZ'
 

diff --git a/data_steward/analytics/cdr_ops/clean_rdr_export_qc.py b/data_steward/analytics/cdr_ops/clean_rdr_export_qc.py
diff --git a/data_steward/analytics/cdr_ops/controlled_tier_qc/check_controlled_tier_part2.py b/data_steward/analytics/cdr_ops/controlled_tier_qc/check_controlled_tier_part2.py
@@ -75,6 +75,25 @@
 # summary will have a summary in the end
 df = pd.DataFrame(columns=['query', 'result'])
 
+# wear_consent and wear_consent_ptsc question and module concepts that are not found in multiple surveys.
+# The concepts found in multiple surveys are: 'resultsconsent_helpmewithconsent' and 'helpmewithconsent_name'
+WEAR_SURVEY_CODES = ['havesmartphone',
+                      'wearwatch',
+                      'usetracker',
+                      'wear12months',
+                      'receivesms',
+                      'frequency',
+                      'agreetoshare',
+                      'onlyparticipantinhousehold',
+                      'haveaddress',
+                      'resultsconsent_wear',
+                      'email_help_consent',
+                      'timeofday',
+                      'wearconsent_signature',
+                      'wearconsent_todaysdate',
+                      'wear_consent',
+                      'wear_consent_ptsc']
+
 # # Query1: all the birthdates are set to 15th June of the birth year in person table
 #
 
@@ -1225,6 +1244,126 @@ def query_template(table_era):
 res2
 
 # # final summary result
+# -
+
+
+# # Q17 Wear study table
+#
+# DC-3340
+#
+# This check confirms that, for the wear_study table:
+# 1. There is only one row per participant.
+# 2. Wear study participants are also found in the CDR person table.
+# 3. Wear study participants have primary consent records in observation.
+#
+# **If check fails:**<br> 
+# * The issue `participant with multiple records` means that those participants have multiple rows in the wear_study table, which should not be possible. Investigate the issue. Start with the CR that creates the wear_study table. <br>
+# * The issue `not in person table` means that participants exist in the wear_study table that aren't in the person table, which should not be possible. Investigate the issue. Start with the CR that creates the wear_study table.<br>
+# * The issue `no primary consent` means that participants exist in the wear_study table that do not have proper primary consent. Investigate the issue. It is possible that there is another way to determine primary consent. <br>
+
+# +
+query = JINJA_ENV.from_string("""
+
+WITH latest_primary_consent_records AS ( -- most current consent record per person --
+    SELECT person_id, observation_source_value, MAX(observation_date) AS latest_date,
+    FROM `{{project_id}}.{{ct_dataset}}.observation` o
+    WHERE REGEXP_CONTAINS(observation_source_value, '(?i)extraconsent_agreetoconsent')
+    GROUP BY person_id, observation_source_value
+)
+
+SELECT
+  'participant with multiple records' as issue,
+  COUNT(person_id) as bad_rows
+FROM `{{project_id}}.{{ct_dataset}}.wear_study` ws
+GROUP BY person_id
+HAVING COUNT(person_id)>1
+
+UNION ALL
+
+SELECT
+  'not in person table' as issue,
+  COUNT(person_id) as bad_rows
+FROM `{{project_id}}.{{ct_dataset}}.wear_study` ws
+WHERE person_id not in ( -- person table --
+  SELECT person_id
+  FROM `{{project_id}}.{{ct_dataset}}.person` o
+  )
+  
+UNION ALL
+
+SELECT
+  'no primary consent' as issue,
+  COUNT(person_id) as bad_rows
+FROM `{{project_id}}.{{ct_dataset}}.wear_study` ws
+WHERE person_id not in (  -- aou consenting participants --
+  SELECT cte.person_id
+  FROM latest_primary_consent_records cte
+    LEFT JOIN ( -- any positive primary consent --
+      SELECT *
+      FROM `{{project_id}}.{{ct_dataset}}.observation` o
+      WHERE REGEXP_CONTAINS(o.observation_source_value, '(?i)extraconsent_agreetoconsent')
+      AND o.value_as_concept_id = 45877994
+    ON cte.person_id = o.person_id
+    AND cte.latest_consent_date = o.observation_date
+  WHERE o.person_id IS NOT NULL
+  )
+
+""")
+q = query.render(project_id=project_id,
+                 ct_dataset=ct_dataset)
+df1 = execute(client, q)
+
+if df1['bad_rows'].sum() == 0:
+    df = df.append(
+        {
+            'query': 'Query17  wear_study table is as expected.',
+            'result': 'PASS'
+        },
+        ignore_index=True)
+else:
+    df = df.append(
+        {
+            'query': 'Query17 wear_study table is not as expected. See notes in the description.',
+            'result': 'Failure'
+        },
+        ignore_index=True)
+
+
+# +
+# Query 18:  Check that wear_consent records are suppressed in the 'observation' and 'survey_conduct' tables
+# -
+
+query = JINJA_ENV.from_string("""
+SELECT
+  'observation' as table,
+  COUNT(*) AS bad_rows
+FROM
+  `{{project_id}}.{{ct_dataset}}.observation` o
+  LEFT JOIN   `{{project_id}}.{{ct_dataset}}.survey_conduct` sc
+  ON sc.survey_conduct_id = o.questionnaire_response_id
+WHERE sc.survey_concept_id IN (2100000011,2100000012) -- captures questions asked in multiple surveys --
+OR LOWER(observation_source_value) IN UNNEST ({{wear_codes}}) -- captures those that might be missing from survey_conduct --
+GROUP BY 1
+UNION ALL
+SELECT
+  'survey_conduct' as table,
+  COUNT(*) AS bad_rows
+FROM
+  `{{project_id}}.{{ct_dataset}}.survey_conduct` sc
+WHERE sc.survey_concept_id IN (2100000011,2100000012) 
+GROUP BY 1
+""")
+q = query.render(project_id=project_id,
+            ct_dataset=ct_dataset,
+            wear_codes=WEAR_SURVEY_CODES)
+df1=execute(client, q) 
+if df1['bad_rows'].sum()==0:
+ df = df.append({'query' : 'Query18 wear_consent records are cleaned as expected.', 'result' : 'PASS'},  
+                ignore_index = True) 
+else:
+ df = df.append({'query' : 'Query18 wear_consent records have not been cleaned as expected.', 'result' : 'Failure'},
+                ignore_index = True) 
+df1
 
 
 # +
@@ -1233,4 +1372,4 @@ def highlight_cells(val):
     return f'background-color: {color}'
 
 
-df.style.applymap(highlight_cells).set_properties(**{'text-align': 'left'})
+df.style.applymap(highlight_cells).set_properties(**{'text-align': 'left'})
diff --git a/data_steward/analytics/cdr_ops/cross_ct_rt/CDR_QC_RT_vs_CT.py b/data_steward/analytics/cdr_ops/cross_ct_rt/CDR_QC_RT_vs_CT.py
@@ -600,6 +600,61 @@
 
 # -
 
+# # Query8: Verify the wear_study dateshift
+#
+# RT dates should have been shifted back by the number of days designated to each 
+# participant via the primary_pid_rid_mapping table.
+#
+# The following query will find any rows in the wear_study tables where the RT date plus the date shift is not equal to the 
+# CT date. If there are resulting rows, make sure the pipeline dateshift ran properly.
+
+# +
+query = JINJA_ENV.from_string("""
+
+SELECT
+ 'date shift is off' as issue,
+ COUNT(*) as bad_rows
+FROM
+  `{{project_id}}.{{rt_dataset}}.wear_study` rtws
+JOIN
+  `{{project_id}}.{{ct_dataset}}.wear_study` ctws
+USING(person_id)
+JOIN
+  `{{project_id}}.{{pipeline_tables}}.primary_pid_rid_mapping` pprm
+ON rtws.person_id = pprm.research_id
+WHERE DATE_ADD(rtws.wear_consent_start_date, INTERVAL shift DAY) <> ctws.wear_consent_start_date
+OR DATE_ADD(rtws.wear_consent_end_date, INTERVAL shift DAY) <> ctws.wear_consent_end_date
+""")
+
+q = query.render(project_id=project_id,
+                 rt_dataset=rt_dataset,
+                 ct_dataset=ct_dataset,
+                 pipeline_tables=PIPELINE_TABLES)
+
+df1 = execute(client, q)
+
+if df1['bad_rows'].sum() == 0:
+    df = df.append(
+        {
+            'query':
+                'Query8 Wear_study dates are as expected.',
+            'result':
+                'PASS'
+        },
+        ignore_index=True)
+else:
+    df = df.append(
+        {
+            'query':
+                'Query8 Wear_study dates are not aligned properly. See description.',
+            'result':
+                'FAIL'
+        },
+        ignore_index=True
+                  )
+    display(df1)
+# -
+
 # # Summary_CDR_QC_RT_vs_CT_comparison
 
 # if not pass, will be highlighted in red