Skip to content

Commit

Permalink
Merge branch 'develop' into ms/dc-3271
Browse files Browse the repository at this point in the history
  • Loading branch information
Michael Schmidt authored Sep 11, 2023
2 parents 5038819 + 8f57d02 commit 0fd38e3
Show file tree
Hide file tree
Showing 86 changed files with 4,000 additions and 1,790 deletions.
2 changes: 1 addition & 1 deletion data_steward/admin/key_rotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

LOGGER = logging.getLogger(__name__)

KEY_EXPIRE_DAYS = 180
KEY_EXPIRE_DAYS = 150
KEY_EXPIRE_ALERT_DAYS = 7
GCP_DTM_FMT = '%Y-%m-%dT%H:%M:%SZ'

Expand Down
322 changes: 248 additions & 74 deletions data_steward/analytics/cdr_ops/clean_rdr_export_qc.py

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,25 @@
# summary will have a summary in the end
df = pd.DataFrame(columns=['query', 'result'])

# Question and module concept codes of the wear_consent and wear_consent_ptsc
# surveys that do not appear in any other survey.
# Codes that are shared with other surveys are deliberately left out of this list:
# 'resultsconsent_helpmewithconsent' and 'helpmewithconsent_name'.
# NOTE: keep this a list -- it is rendered into a query as an array literal.
WEAR_SURVEY_CODES = [
    'havesmartphone',
    'wearwatch',
    'usetracker',
    'wear12months',
    'receivesms',
    'frequency',
    'agreetoshare',
    'onlyparticipantinhousehold',
    'haveaddress',
    'resultsconsent_wear',
    'email_help_consent',
    'timeofday',
    'wearconsent_signature',
    'wearconsent_todaysdate',
    'wear_consent',
    'wear_consent_ptsc',
]

# # Query1: all the birthdates are set to 15th June of the birth year in person table
#

Expand Down Expand Up @@ -1225,6 +1244,126 @@ def query_template(table_era):
res2

# # final summary result
# -


# # Q17 Wear study table
#
# DC-3340
#
# This check confirms the following about the wear_study table:
# 1. Only one row per participant
# 2. Wear study participants are also found in the CDR person table.
# 3. Wear study participants have primary consent records in observation.
#
# **If check fails:**<br>
# * The issue `participant with multiple records` means that those participants have multiple rows in the wear_study table, which should not be possible. Investigate the issue. Start with the CR that creates the wear_study table. <br>
# * The issue `not in person table` means that participants exist in the wear_study table that aren't in the person table, which should not be possible. Investigate the issue. Start with the CR that creates the wear_study table.<br>
# * The issue `no primary consent` means that participants exist in the wear_study table that do not have proper primary consent. Investigate the issue. It is possible that there is another way to determine primary consent. <br>

# +
# Q17: one UNION ALL query that reports three kinds of problems in wear_study:
#   * participants with more than one row,
#   * participants missing from the person table,
#   * participants without a positive primary consent on their latest consent date.
# Fixes relative to the previous version of this query:
#   * the CTE column is now named `latest_consent_date`, which is the name the
#     join condition below refers to (it was aliased `latest_date` before);
#   * the derived table in the LEFT JOIN is now closed and aliased (`) o`)
#     before its ON clause, so the parentheses balance and `o.` resolves.
query = JINJA_ENV.from_string("""
WITH latest_primary_consent_records AS ( -- most current consent record per person --
    SELECT person_id, observation_source_value, MAX(observation_date) AS latest_consent_date
    FROM `{{project_id}}.{{ct_dataset}}.observation` o
    WHERE REGEXP_CONTAINS(observation_source_value, '(?i)extraconsent_agreetoconsent')
    GROUP BY person_id, observation_source_value
)
SELECT
    'participant with multiple records' as issue,
    COUNT(person_id) as bad_rows
FROM `{{project_id}}.{{ct_dataset}}.wear_study` ws
GROUP BY person_id
HAVING COUNT(person_id)>1
UNION ALL
SELECT
    'not in person table' as issue,
    COUNT(person_id) as bad_rows
FROM `{{project_id}}.{{ct_dataset}}.wear_study` ws
WHERE person_id not in ( -- person table --
    SELECT person_id
    FROM `{{project_id}}.{{ct_dataset}}.person` o
)
UNION ALL
SELECT
    'no primary consent' as issue,
    COUNT(person_id) as bad_rows
FROM `{{project_id}}.{{ct_dataset}}.wear_study` ws
WHERE person_id not in ( -- aou consenting participants --
    SELECT cte.person_id
    FROM latest_primary_consent_records cte
    LEFT JOIN ( -- any positive primary consent --
        SELECT *
        FROM `{{project_id}}.{{ct_dataset}}.observation` o
        WHERE REGEXP_CONTAINS(o.observation_source_value, '(?i)extraconsent_agreetoconsent')
        AND o.value_as_concept_id = 45877994
    ) o
    ON cte.person_id = o.person_id
    AND cte.latest_consent_date = o.observation_date
    WHERE o.person_id IS NOT NULL
)
""")
q = query.render(project_id=project_id,
                 ct_dataset=ct_dataset)
df1 = execute(client, q)

# Every branch of the query reports its offenders in `bad_rows`; a total of
# zero therefore means none of the three issues was found.
if df1['bad_rows'].sum() == 0:
    df = df.append(
        {
            'query': 'Query17 wear_study table is as expected.',
            'result': 'PASS'
        },
        ignore_index=True)
else:
    df = df.append(
        {
            'query': 'Query17 wear_study table is not as expected. See notes in the description.',
            'result': 'Failure'
        },
        ignore_index=True)


# +
# Query 18: Check that wear_consent records are suppressed in the 'observation' and 'survey_conduct' tables
# -

# Q18: count rows that should have been suppressed.
#   * observation rows tied to the listed survey concept ids through survey_conduct,
#     or whose lower-cased observation_source_value is one of WEAR_SURVEY_CODES;
#   * survey_conduct rows carrying the listed survey concept ids.
query = JINJA_ENV.from_string("""
SELECT
'observation' as table,
COUNT(*) AS bad_rows
FROM
`{{project_id}}.{{ct_dataset}}.observation` o
LEFT JOIN `{{project_id}}.{{ct_dataset}}.survey_conduct` sc
ON sc.survey_conduct_id = o.questionnaire_response_id
WHERE sc.survey_concept_id IN (2100000011,2100000012) -- captures questions asked in multiple surveys --
OR LOWER(observation_source_value) IN UNNEST ({{wear_codes}}) -- captures those that might be missing from survey_conduct --
GROUP BY 1
UNION ALL
SELECT
'survey_conduct' as table,
COUNT(*) AS bad_rows
FROM
`{{project_id}}.{{ct_dataset}}.survey_conduct` sc
WHERE sc.survey_concept_id IN (2100000011,2100000012)
GROUP BY 1
""")
q = query.render(project_id=project_id,
                 ct_dataset=ct_dataset,
                 wear_codes=WEAR_SURVEY_CODES)
df1 = execute(client, q)

# Pick the summary row first, then record it with a single append.
# A zero total of bad_rows means no wear_consent rows were left behind.
if df1['bad_rows'].sum() == 0:
    q18_summary = {
        'query': 'Query18 wear_consent records are cleaned as expected.',
        'result': 'PASS',
    }
else:
    q18_summary = {
        'query': 'Query18 wear_consent records have not been cleaned as expected.',
        'result': 'Failure',
    }
df = df.append(q18_summary, ignore_index=True)
df1


# +
Expand All @@ -1233,4 +1372,4 @@ def highlight_cells(val):
return f'background-color: {color}'


df.style.applymap(highlight_cells).set_properties(**{'text-align': 'left'})
df.style.applymap(highlight_cells).set_properties(**{'text-align': 'left'})
55 changes: 55 additions & 0 deletions data_steward/analytics/cdr_ops/cross_ct_rt/CDR_QC_RT_vs_CT.py
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,61 @@

# -

# # Query8: Verify the wear_study dateshift
#
# RT dates should have been shifted back by the number of days designated to each
# participant via the primary_pid_rid_mapping table.
#
# The following query will find any rows in the wear_study tables where the RT date plus the date shift is not equal to the
# CT date. If there are resulting rows, make sure the pipeline dateshift ran properly.

# +
# Count wear_study rows whose RT dates, once the per-participant shift is added
# back, do not match the CT dates. RT and CT rows are paired on person_id, and
# the mapping table is joined on research_id to obtain `shift`.
# NOTE(review): `shift` is unqualified in the SQL; presumably it is a column of
# primary_pid_rid_mapping -- confirm against the table schema.
query = JINJA_ENV.from_string("""
SELECT
'date shift is off' as issue,
COUNT(*) as bad_rows
FROM
`{{project_id}}.{{rt_dataset}}.wear_study` rtws
JOIN
`{{project_id}}.{{ct_dataset}}.wear_study` ctws
USING(person_id)
JOIN
`{{project_id}}.{{pipeline_tables}}.primary_pid_rid_mapping` pprm
ON rtws.person_id = pprm.research_id
WHERE DATE_ADD(rtws.wear_consent_start_date, INTERVAL shift DAY) <> ctws.wear_consent_start_date
OR DATE_ADD(rtws.wear_consent_end_date, INTERVAL shift DAY) <> ctws.wear_consent_end_date
""")

# Fill in the project, both datasets and the pipeline-tables dataset name.
q = query.render(project_id=project_id,
rt_dataset=rt_dataset,
ct_dataset=ct_dataset,
pipeline_tables=PIPELINE_TABLES)

df1 = execute(client, q)

# A bad_rows total of zero means no mismatching row was counted -> PASS;
# anything else is recorded as FAIL in the summary dataframe `df`.
if df1['bad_rows'].sum() == 0:
df = df.append(
{
'query':
'Query8 Wear_study dates are as expected.',
'result':
'PASS'
},
ignore_index=True)
else:
df = df.append(
{
'query':
'Query8 Wear_study dates are not aligned properly. See description.',
'result':
'FAIL'
},
ignore_index=True
)
# NOTE(review): indentation is not visible in this view; this display call
# presumably runs unconditionally after the if/else -- confirm in the file.
display(df1)
# -

# # Summary_CDR_QC_RT_vs_CT_comparison

# if not pass, will be highlighted in red
Expand Down
Loading

0 comments on commit 0fd38e3

Please sign in to comment.