From a8317d4783103d4569e89222eb6db4e83db0e65e Mon Sep 17 00:00:00 2001 From: Michael Schmidt Date: Wed, 2 Aug 2023 14:24:08 -0500 Subject: [PATCH 01/19] [DC-3271] saving WIP --- data_steward/tools/import_rdr_dataset.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/data_steward/tools/import_rdr_dataset.py b/data_steward/tools/import_rdr_dataset.py index 969fa1330b..d40551941a 100644 --- a/data_steward/tools/import_rdr_dataset.py +++ b/data_steward/tools/import_rdr_dataset.py @@ -6,7 +6,6 @@ # Python imports from argparse import ArgumentParser -from datetime import datetime import logging # Third party imports @@ -16,7 +15,7 @@ # Project imports from utils import auth, pipeline_logging from gcloud.bq import BigQueryClient -from common import CDR_SCOPES +from common import CDR_SCOPES, JINJA_ENV from resources import replace_special_characters_for_labels, validate_date_string, rdr_src_id_schemas, cdm_schemas from tools.snapshot_by_query import BIGQUERY_DATA_TYPES from tools.import_rdr_omop import copy_vocab_tables @@ -131,6 +130,18 @@ def create_rdr_tables(client, destination_dataset, rdr_project, # copy contents from source dataset to destination dataset sql = (f'SELECT {fields_name_str} ' f'FROM `{source_table_id}`') + print(sql) + + tpl = JINJA_ENV.from_string(""" + {% for item in schema_list %} + CAST({{ item.name }} AS {{ BIGQUERY_DATA_TYPES[item.field_type.lower()]}}) AS {{ item.name }} + {% if not loop.last %} + , + {% else %} + {% endif %} + {% endfor %}""").render(schema_list=schema_list) + print(tpl) + return job_config = bigquery.job.QueryJobConfig( write_disposition=bigquery.job.WriteDisposition.WRITE_EMPTY, @@ -191,9 +202,9 @@ def main(raw_args=None): bq_client = BigQueryClient(args.curation_project_id, credentials=impersonation_creds) - dataset_object = bq_client.define_dataset(new_dataset_name, description, - {'export_date': args.export_date}) - bq_client.create_dataset(dataset_object) + #dataset_object = bq_client.define_dataset(new_dataset_name, description, + # {'export_date': args.export_date}) + #bq_client.create_dataset(dataset_object) create_rdr_tables(bq_client, new_dataset_name, args.rdr_project_id, args.rdr_dataset) From d73c59fcfe66cca56ef681d42097b91be19d9546 Mon Sep 17 00:00:00 2001 From: Michael Schmidt Date: Thu, 3 Aug 2023 10:26:43 -0500 Subject: [PATCH 02/19] [DC-3271] Add new template, use new template --- data_steward/tools/import_rdr_dataset.py | 38 ++++++++++-------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/data_steward/tools/import_rdr_dataset.py b/data_steward/tools/import_rdr_dataset.py index d40551941a..917f6d6dd6 100644 --- a/data_steward/tools/import_rdr_dataset.py +++ b/data_steward/tools/import_rdr_dataset.py @@ -22,6 +22,16 @@ LOGGER = logging.getLogger(__name__) +tpl = JINJA_ENV.from_string(""" +SELECT +{% for item in schema_list %} +CAST({{ item.name }} AS {{ BIGQUERY_DATA_TYPES[item.field_type.lower()]}}) AS {{ item.name }}{% if not loop.last %}, +{% else %} + +{% endif %} +{% endfor %} +FROM `{{source_table_id}}`""") + def parse_rdr_args(raw_args=None): parser = ArgumentParser( @@ -111,6 +121,7 @@ def create_rdr_tables(client, destination_dataset, rdr_project, try: LOGGER.info(f'Get table `{source_table_id}` in RDR') + source_table_id = "aou-res-curation-test.2019q4r1_combined.observation_period" client.get_table(source_table_id) LOGGER.info(f'Creating empty CDM table, `{table}`') @@ -121,27 +132,10 @@ def create_rdr_tables(client, destination_dataset, rdr_project, f'Copying 
source table `{source_table_id}` to destination table `{destination_table_id}`' ) - sc_list = [] - for item in schema_list: - field_cast = f'CAST({item.name} AS {BIGQUERY_DATA_TYPES[item.field_type.lower()]}) AS {item.name}' - sc_list.append(field_cast) - - fields_name_str = ',\n'.join(sc_list) - # copy contents from source dataset to destination dataset - sql = (f'SELECT {fields_name_str} ' f'FROM `{source_table_id}`') - print(sql) - - tpl = JINJA_ENV.from_string(""" - {% for item in schema_list %} - CAST({{ item.name }} AS {{ BIGQUERY_DATA_TYPES[item.field_type.lower()]}}) AS {{ item.name }} - {% if not loop.last %} - , - {% else %} - {% endif %} - {% endfor %}""").render(schema_list=schema_list) - print(tpl) - return + sql = tpl.render(schema_list=schema_list, + BIGQUERY_DATA_TYPES=BIGQUERY_DATA_TYPES, + source_table_id=source_table_id) job_config = bigquery.job.QueryJobConfig( write_disposition=bigquery.job.WriteDisposition.WRITE_EMPTY, @@ -202,9 +196,9 @@ def main(raw_args=None): bq_client = BigQueryClient(args.curation_project_id, credentials=impersonation_creds) - #dataset_object = bq_client.define_dataset(new_dataset_name, description, + # dataset_object = bq_client.define_dataset(new_dataset_name, description, # {'export_date': args.export_date}) - #bq_client.create_dataset(dataset_object) + # bq_client.create_dataset(dataset_object) create_rdr_tables(bq_client, new_dataset_name, args.rdr_project_id, args.rdr_dataset) From 50388192e73802557748913c49f5b35143a92bd0 Mon Sep 17 00:00:00 2001 From: Michael Schmidt Date: Thu, 3 Aug 2023 10:36:06 -0500 Subject: [PATCH 03/19] [DC-3271] Remove testing setup --- data_steward/tools/import_rdr_dataset.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/data_steward/tools/import_rdr_dataset.py b/data_steward/tools/import_rdr_dataset.py index 917f6d6dd6..c921abfff7 100644 --- a/data_steward/tools/import_rdr_dataset.py +++ b/data_steward/tools/import_rdr_dataset.py @@ -121,7 +121,6 @@ def create_rdr_tables(client, destination_dataset, rdr_project, try: LOGGER.info(f'Get table `{source_table_id}` in RDR') - source_table_id = "aou-res-curation-test.2019q4r1_combined.observation_period" client.get_table(source_table_id) LOGGER.info(f'Creating empty CDM table, `{table}`') @@ -196,9 +195,9 @@ def main(raw_args=None): bq_client = BigQueryClient(args.curation_project_id, credentials=impersonation_creds) - # dataset_object = bq_client.define_dataset(new_dataset_name, description, - # {'export_date': args.export_date}) - # bq_client.create_dataset(dataset_object) + dataset_object = bq_client.define_dataset(new_dataset_name, description, + {'export_date': args.export_date}) + bq_client.create_dataset(dataset_object) create_rdr_tables(bq_client, new_dataset_name, args.rdr_project_id, args.rdr_dataset) From 469ffec2915d822e96a5e648c94cb9f684e12f2a Mon Sep 17 00:00:00 2001 From: Michael Schmidt Date: Thu, 9 Nov 2023 12:12:20 -0600 Subject: [PATCH 04/19] [DC-3271] update #! top-level statement, bring in initial Jinja template --- data_steward/tools/import_rdr_dataset.py | 38 ++++++++++-------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/data_steward/tools/import_rdr_dataset.py b/data_steward/tools/import_rdr_dataset.py index 537b4fcc2b..aa2d41fab8 100644 --- a/data_steward/tools/import_rdr_dataset.py +++ b/data_steward/tools/import_rdr_dataset.py @@ -1,10 +1,11 @@ -#!/usr/bin/env bash +#!/usr/bin/env python # Imports RDR ETL results into a dataset in BigQuery. 
# Assumes you have already activated a service account that is able to # access the dataset in BigQuery. # Python imports +from argparse import ArgumentParser from datetime import datetime from typing import List, Dict import logging @@ -16,7 +17,7 @@ # Project imports from utils import auth, pipeline_logging from gcloud.bq import BigQueryClient -from common import CDR_SCOPES, AOU_DEATH, DEATH +from common import CDR_SCOPES, AOU_DEATH, DEATH, JINJA_ENV from resources import (replace_special_characters_for_labels, validate_date_string, rdr_src_id_schemas, cdm_schemas, fields_for, rdr_specific_schemas) @@ -25,15 +26,6 @@ LOGGER = logging.getLogger(__name__) -tpl = JINJA_ENV.from_string(""" -SELECT -{% for item in schema_list %} -CAST({{ item.name }} AS {{ BIGQUERY_DATA_TYPES[item.field_type.lower()]}}) AS {{ item.name }}{% if not loop.last %}, -{% else %} - -{% endif %} -{% endfor %} -FROM `{{source_table_id}}`""") def parse_rdr_args(raw_args=None): @@ -164,17 +156,19 @@ def create_rdr_tables(client, destination_dataset, rdr_project, if table_ref.num_rows == 0: raise NotFound(f'`{source_table_id}` has No data To copy from') - sc_list = [] - for item in schema_list: - if item.name == 'aou_death_id': - field = 'GENERATE_UUID() AS aou_death_id' - elif item.name == 'primary_death_record': - field = 'FALSE AS primary_death_record' - else: - field = f'CAST({item.name} AS {BIGQUERY_DATA_TYPES[item.field_type.lower()]}) AS {item.name}' - sc_list.append(field) - - fields_name_str = ',\n'.join(sc_list) + + fields_name_str = JINJA_ENV.from_string(""" + {% for item in schema_list %} + {% set name = item.name %} + {% set field_type = item.field_type %} + {% if name == 'aou_death_id' %} + GENERATE_UUID() AS aou_death_id, + {% elif name == 'primary_death_record' %} + FALSE AS primary_death_record, + {% else %} + CAST({{ name }} AS {{ BIGQUERY_DATA_TYPES[field_type.lower()]}}){{", " if not loop.last else "" }} + {% endif %} + {% endfor %}""").render(schema_list=schema_list, BIGQUERY_DATA_TYPES=BIGQUERY_DATA_TYPES) # copy contents from source dataset to destination dataset if table == 'cope_survey_semantic_version_map': From 9d2831ea82fba3deb6987d025a4546f8fdc5fb82 Mon Sep 17 00:00:00 2001 From: Michael Schmidt Date: Thu, 9 Nov 2023 14:58:08 -0600 Subject: [PATCH 05/19] [DC-3271] Add "AS" clause, formatting --- data_steward/tools/import_rdr_dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/data_steward/tools/import_rdr_dataset.py b/data_steward/tools/import_rdr_dataset.py index aa2d41fab8..0af0d2cb01 100644 --- a/data_steward/tools/import_rdr_dataset.py +++ b/data_steward/tools/import_rdr_dataset.py @@ -159,14 +159,14 @@ def create_rdr_tables(client, destination_dataset, rdr_project, fields_name_str = JINJA_ENV.from_string(""" {% for item in schema_list %} - {% set name = item.name %} - {% set field_type = item.field_type %} + {% set name = item.name %} + {% set field_type = item.field_type %} {% if name == 'aou_death_id' %} GENERATE_UUID() AS aou_death_id, {% elif name == 'primary_death_record' %} FALSE AS primary_death_record, {% else %} - CAST({{ name }} AS {{ BIGQUERY_DATA_TYPES[field_type.lower()]}}){{", " if not loop.last else "" }} + CAST({{ name }} AS {{BIGQUERY_DATA_TYPES[field_type.lower()]}}) AS {{ name }}{{", " if not loop.last else "" }} {% endif %} {% endfor %}""").render(schema_list=schema_list, BIGQUERY_DATA_TYPES=BIGQUERY_DATA_TYPES) From 2630fd2e2b312d0c8fe240750551127a6535ce3a Mon Sep 17 00:00:00 2001 From: Michael Schmidt Date: Thu, 9 Nov 2023 15:05:16 
-0600 Subject: [PATCH 06/19] [DC-3271] YAPF --- data_steward/tools/import_rdr_dataset.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/data_steward/tools/import_rdr_dataset.py b/data_steward/tools/import_rdr_dataset.py index 0af0d2cb01..8e11c826a1 100644 --- a/data_steward/tools/import_rdr_dataset.py +++ b/data_steward/tools/import_rdr_dataset.py @@ -27,7 +27,6 @@ LOGGER = logging.getLogger(__name__) - def parse_rdr_args(raw_args=None): parser = ArgumentParser( description='Arguments pertaining to an RDR raw load') @@ -156,7 +155,6 @@ def create_rdr_tables(client, destination_dataset, rdr_project, if table_ref.num_rows == 0: raise NotFound(f'`{source_table_id}` has No data To copy from') - fields_name_str = JINJA_ENV.from_string(""" {% for item in schema_list %} {% set name = item.name %} @@ -168,7 +166,8 @@ def create_rdr_tables(client, destination_dataset, rdr_project, {% else %} CAST({{ name }} AS {{BIGQUERY_DATA_TYPES[field_type.lower()]}}) AS {{ name }}{{", " if not loop.last else "" }} {% endif %} - {% endfor %}""").render(schema_list=schema_list, BIGQUERY_DATA_TYPES=BIGQUERY_DATA_TYPES) + {% endfor %}""").render(schema_list=schema_list, + BIGQUERY_DATA_TYPES=BIGQUERY_DATA_TYPES) # copy contents from source dataset to destination dataset if table == 'cope_survey_semantic_version_map': From a583bb9eb94e5739b56292cb1d2762bd288577ec Mon Sep 17 00:00:00 2001 From: brendagutman <77469967+brendagutman@users.noreply.github.com> Date: Wed, 20 Dec 2023 11:17:13 -0600 Subject: [PATCH 07/19] [DC-3629] Add the wear_study percentage with fitbit data check (#1833) --- data_steward/analytics/cdr_ops/fitbit_qc.py | 68 ++++++++++++++++++++- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/data_steward/analytics/cdr_ops/fitbit_qc.py b/data_steward/analytics/cdr_ops/fitbit_qc.py index 9027fa3492..dd1eb6d4e3 100644 --- a/data_steward/analytics/cdr_ops/fitbit_qc.py +++ b/data_steward/analytics/cdr_ops/fitbit_qc.py @@ -17,6 +17,7 @@ fitbit_dataset: str = "" # identifies the name of the new fitbit dataset sandbox_dataset: str = "" # the pipeline tables sandbox source_dataset: str = "" # identifies the name of the rdr dataset +deid_dataset: str = "" # dataset contains wear_study table cutoff_date: str = "" # CDR cutoff date in YYYY--MM-DD format run_as: str = "" # service account email to impersonate # - @@ -24,7 +25,7 @@ from common import JINJA_ENV, FITBIT_TABLES, PIPELINE_TABLES, SITE_MASKING_TABLE_ID from utils import auth from gcloud.bq import BigQueryClient -from analytics.cdr_ops.notebook_utils import execute, IMPERSONATION_SCOPES +from analytics.cdr_ops.notebook_utils import execute, IMPERSONATION_SCOPES, render_message impersonation_creds = auth.get_impersonation_credentials( run_as, target_scopes=IMPERSONATION_SCOPES) @@ -71,7 +72,8 @@ } # ## Identify person_ids that are not in the person table -# This check verifies that person_ids are valid. That they exist in the CDM person table and are not null. There should be no bad rows. +# This check verifies that person_ids are valid. That they exist in the CDM person table and are not null. +# There should be no bad rows. # # In case of failure: # - If the person_id is not in the CDM person table. Check that `RemoveNonExistingPids` was applied. @@ -201,7 +203,8 @@ # ## Check for rows without a valid date field # Fitbit table records must have at least one valid date in order to be deemed valid. -# This is a preleminary check as this circumstance(lacking a date) should not be possible. 
No CR currently exists to remove data of this type. +# This is a preleminary check as this circumstance(lacking a date) should not be possible. No CR currently exists to +# remove data of this type. # # If bad rows are found a new CR may be required. Notify and recieve guidance from the DST. @@ -263,4 +266,63 @@ execute(client, union_all_query) # - +# # Check percentage of wear_study participants lacking fitbit data +# +# This check requires a deid dataset containing the generated wear_study table. +# +# If the check fails - If one of the data sources is missing or if the percentage of wear_study participants lacking +# fitbit data is more than 40% for vibrent participants or 10% for ce participants, the data analytics team should be +# notified. +# See DC-3629 for more information. + +# + +query = JINJA_ENV.from_string(""" +WITH fb_person_ids AS ( -- identify pids with fitbit data -- +SELECT DISTINCT person_id +FROM {{project_id}}.{{dataset}}.activity_summary +) +, consenting_ws_ids AS ( -- identify consenting pids -- +SELECT person_id,research_id, +FROM {{project_id}}.{{pipeline}}.primary_pid_rid_mapping dm +WHERE research_id IN (SELECT person_id + FROM {{project_id}}.{{deid_dataset}}.wear_study + WHERE wear_consent_end_date IS NULL) +) +SELECT +src_id, +ROUND(COUNT(CASE WHEN fb.person_id IS NULL THEN 1 ELSE NULL END) * 100 / COUNT(c_ws),1) AS percent_without_fb, +FROM (SELECT * FROM {{project_id}}.{{raw_rdr}}.observation WHERE observation_source_concept_id = 2100000010) o +JOIN consenting_ws_ids c_ws USING(person_id) +LEFT JOIN fb_person_ids fb ON o.person_id = fb.person_id +GROUP BY 1 +""").render(project_id=project_id, + dataset=fitbit_dataset, + raw_rdr=source_dataset, + pipeline=sandbox_dataset, + deid_dataset=deid_dataset) + +df = execute(client, query) + +# conditions for a passing check +cond_vibrent_percentage = df.loc[df['src_id'] == 'vibrent', 'percent_without_fb'].iloc[0] < 40 +cond_ce_percentage = df.loc[df['src_id'] == 'ce', 'percent_without_fb'].iloc[0] < 10 +is_success = cond_vibrent_percentage and cond_ce_percentage + +success_msg = "Conditions Pass" +failure_msg = ( + """ + One of the following checks failed. Confirm failure, and notify the proper team(Data Analytics)
    (1) The percentage of wear_study participants lacking fitbit data should be less than 40% for vibrent.
    (2) The percentage of wear_study participants lacking fitbit data should be less than 10% for ce.
+ """ + ) + +render_message(df, + success_msg, + failure_msg, + is_success=is_success) +# - + + + From 4baf50d73e03951561a14613dda730f3f7cf99b4 Mon Sep 17 00:00:00 2001 From: Michael Schmidt Date: Thu, 21 Dec 2023 13:37:27 -0600 Subject: [PATCH 08/19] [DC-3635] Remove portion of query checking primary consent (#1834) * [DC-3635] Remove portion of query checking primary consent * [DC-3635] Remove fitbit check in CT --- .../check_controlled_tier_part2.py | 24 ++----------------- .../rt_cdr_qc/cdr_deid_qa_report10_extra.py | 20 ---------------- 2 files changed, 2 insertions(+), 42 deletions(-) diff --git a/data_steward/analytics/cdr_ops/controlled_tier_qc/check_controlled_tier_part2.py b/data_steward/analytics/cdr_ops/controlled_tier_qc/check_controlled_tier_part2.py index 628c878e11..a29a16aaa1 100644 --- a/data_steward/analytics/cdr_ops/controlled_tier_qc/check_controlled_tier_part2.py +++ b/data_steward/analytics/cdr_ops/controlled_tier_qc/check_controlled_tier_part2.py @@ -1300,7 +1300,6 @@ def query_template(table_era): # **If check fails:**
# * The issue `participant with multiple records` means that those participants have multiple rows in the wear_study table, which should not be possible. Investigate the issue. Start with the CR that creates the wear_study table.
# * The issue `not in person table` means that participants exist in the wear_study table that aren't in the person table, which should not be possible. Investigate the issue. Start with the CR that creates the wear_study table.
-# * The issue `no primary consent` means that participants exist in the wear_study table that do not have proper primary consent. Investigate the issue. It is possible that there is another way to determine primary consent.
# + query = JINJA_ENV.from_string(""" @@ -1329,26 +1328,6 @@ def query_template(table_era): SELECT person_id FROM `{{project_id}}.{{ct_dataset}}.person` o ) - -UNION ALL - -SELECT - 'no primary consent' as issue, - COUNT(person_id) as bad_rows -FROM `{{project_id}}.{{ct_dataset}}.wear_study` ws -WHERE person_id not in ( -- aou consenting participants -- - SELECT cte.person_id - FROM latest_primary_consent_records cte - LEFT JOIN ( -- any positive primary consent -- - SELECT * - FROM `{{project_id}}.{{ct_dataset}}.observation` - WHERE REGEXP_CONTAINS(observation_source_value, '(?i)extraconsent_agreetoconsent') - AND value_as_concept_id = 45877994) o - ON cte.person_id = o.person_id - AND cte.latest_date = o.observation_date - WHERE o.person_id IS NOT NULL - ) - """) q = query.render(project_id=project_id, ct_dataset=ct_dataset) df1 = execute(client, q) @@ -1371,6 +1350,7 @@ def query_template(table_era): ignore_index=True) # - + df1 # + @@ -1530,4 +1510,4 @@ def highlight_cells(val): return f'background-color: {color}' -df.style.applymap(highlight_cells).set_properties(**{'text-align': 'left'}) \ No newline at end of file +df.style.applymap(highlight_cells).set_properties(**{'text-align': 'left'}) diff --git a/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report10_extra.py b/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report10_extra.py index b54ac33449..c510ef1310 100644 --- a/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report10_extra.py +++ b/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report10_extra.py @@ -864,26 +864,6 @@ def my_sql(table_name, column_name): SELECT person_id FROM `{{project_id}}.{{rt_cdr_deid}}.person` o ) - -UNION ALL - -SELECT - 'no primary consent' as issue, - COUNT(person_id) as bad_rows -FROM `{{project_id}}.{{rt_cdr_deid}}.wear_study` ws -WHERE person_id not in ( -- aou consenting participants -- - SELECT cte.person_id - FROM latest_primary_consent_records cte - LEFT JOIN ( -- any positive primary consent -- - SELECT * - FROM `{{project_id}}.{{rt_cdr_deid}}.observation` - WHERE REGEXP_CONTAINS(observation_source_value, '(?i)extraconsent_agreetoconsent') - AND value_as_concept_id = 45877994) o - ON cte.person_id = o.person_id - AND cte.latest_date = o.observation_date - WHERE o.person_id IS NOT NULL - ) - """) q = query.render(project_id=project_id, rt_cdr_deid=rt_cdr_deid) From aedfc7da3b5ca322cbe350cc5d40b9b629a04a6f Mon Sep 17 00:00:00 2001 From: Michael Schmidt Date: Thu, 21 Dec 2023 13:37:46 -0600 Subject: [PATCH 09/19] [DC-3631] Update cope_survey to reduce false positive results due to concept domains (#1827) * [DC-3631] Add helpful descriptions * [DC-3631] remove trailing whitespace * [DC-3631] Remove space in function use * [DC-3631] update failure query * [DC-3631] Use meaningful variable name "result" to distinguish and prevent confusion by a single number "1" * [DC-3631] correct variable name in query * [DC-3631] Update and parameterize query * [DC-3631] Update failure logic * [DC-3631] Implement query * [DC-3631] Follow basic Python conventions * [DC-3631] Rename ambiguous "df" to "summary" * [DC-3631] update query summary sections, formatting * [DC-3631] Cleanup staged/testing snippets and update comments * [DC-3631] Add result summary * [DC-3631] Python list does not have a shape * [DC-3631] Remove "pipeline_table" variable * [DC-3631] template table instead of test table procedure_occurrence * [DC-3631] Remove procedure concept into {{table_name}} * [DC-3631] Explicit description on parameter's purpose --- 
.../cdr_deid_qa_report7_cope_survey.py | 285 +++++++++--------- 1 file changed, 145 insertions(+), 140 deletions(-) diff --git a/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report7_cope_survey.py b/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report7_cope_survey.py index f834cb5fcc..b7c1c22be0 100644 --- a/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report7_cope_survey.py +++ b/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_qa_report7_cope_survey.py @@ -29,11 +29,11 @@ # + papermill={"duration": 0.023643, "end_time": "2021-02-02T22:30:31.880820", "exception": false, "start_time": "2021-02-02T22:30:31.857177", "status": "completed"} tags=["parameters"] # Parameters -project_id = "" -com_cdr = "" -deid_cdr = "" -sandbox="" -run_as="" +project_id = "" # The project to examine +com_cdr = "" # The comibend dataset +deid_cdr = "" # the deid dataset +sandbox = "" # curation_sandbox dataset, not one related to deid_stage +run_as = "" # The account used to run checks # + @@ -43,13 +43,13 @@ client = BigQueryClient(project_id, credentials=impersonation_creds) # - -# df will have a summary in the end -df = pd.DataFrame(columns = ['query', 'result']) +# a summary of results is at the end +summary = pd.DataFrame(columns = ['query', 'result']) # + [markdown] papermill={"duration": 0.02327, "end_time": "2021-02-02T22:30:32.708257", "exception": false, "start_time": "2021-02-02T22:30:32.684987", "status": "completed"} tags=[] # # 1 done Verify that the COPE Survey Data identified to be suppressed as de-identification action in OBSERVATION table have been removed from the de-id dataset. # -# these concept_ids should be suppressed as shown in the spread sheet 'COPE - All Surveys Privacy Rules', and was temporally saved to curation_sandbox.temp_cope_privacy_rules. Moving forward, we only need to update this table accordingly. +# these concept_ids should be suppressed as shown in the spread sheet 'COPE - All Surveys Privacy Rules', and was temporally saved to curation_sandbox.temp_cope_privacy_rules. Moving forward, we only need to update this table accordingly. 
# # https://docs.google.com/spreadsheets/d/1UuUVcRdlp2HkBaVdROFsM4ZX_bfffg6ZoEbqj94MlXU/edit#gid=0 # @@ -59,51 +59,51 @@ # these concept_ids should be suppressed query = JINJA_ENV.from_string(""" select OMOP_conceptID,New_Requirement -from `{{project_id}}.curation_sandbox.temp_cope_privacy_rules` +from `{{project_id}}.curation_sandbox.temp_cope_privacy_rules` where New_Requirement like 'suppress%' or New_Requirement like 'row suppression' """) q = query.render(project_id=project_id,sandbox=sandbox) -df1=execute(client, q) -df1.shape +result = execute(client, q) +result.shape -df1 +result query = JINJA_ENV.from_string(""" SELECT observation_source_concept_id, concept_name,concept_code,vocabulary_id, observation_concept_id, -COUNT(1) AS n_row_not_pass +COUNT(1) AS n_row_not_pass FROM `{{project_id}}.{{deid_cdr}}.observation` ob JOIN `{{project_id}}.{{deid_cdr}}.concept` c ON ob.observation_source_concept_id=c.concept_id WHERE observation_source_concept_id IN -(select OMOP_conceptID from `{{project_id}}.curation_sandbox.temp_cope_privacy_rules` +(select OMOP_conceptID from `{{project_id}}.curation_sandbox.temp_cope_privacy_rules` where New_Requirement like 'suppress%' or New_Requirement like 'row suppression') OR observation_concept_id IN -(select OMOP_conceptID from `{{project_id}}.curation_sandbox.temp_cope_privacy_rules` +(select OMOP_conceptID from `{{project_id}}.curation_sandbox.temp_cope_privacy_rules` where New_Requirement like 'suppress%' or New_Requirement like 'row suppression') GROUP BY 1,2,3,4,5 ORDER BY n_row_not_pass DESC """) q = query.render(project_id=project_id,sandbox=sandbox,deid_cdr=deid_cdr) -df1=execute(client, q) -df1.shape +result = execute(client, q) +result.shape -df1 +result -if df1['n_row_not_pass'].sum()==0: - df = df.append({'query' : 'Query1 No COPE in deid_observation table', 'result' : 'Pass'}, - ignore_index = True) +if result['n_row_not_pass'].sum()==0: + summary = summary.append({'query' : 'Query1 No COPE in deid_observation table', 'result' : 'Pass'}, + ignore_index = True) else: - df = df.append({'query' : 'Query1 No COPE in deid_observation table' , 'result' : 'Failure'}, - ignore_index = True) + summary = summary.append({'query' : 'Query1 No COPE in deid_observation table' , 'result' : 'Failure'}, + ignore_index = True) # + [markdown] papermill={"duration": 0.023633, "end_time": "2021-02-02T22:30:36.860798", "exception": false, "start_time": "2021-02-02T22:30:36.837165", "status": "completed"} tags=[] # # 2 done Verify if a survey version is provided for the COPE survey. # # [DC-1040] # -# expected results: all the person_id and the questionnaire_response_id has a survey_version_concept_id +# expected results: all the person_id and the questionnaire_response_id has a survey_version_concept_id # original sql missed something. 
# # these should be generalized 2100000002,2100000003,2100000004 @@ -112,9 +112,9 @@ # - query = JINJA_ENV.from_string(""" -SELECT survey_version_concept_id, +SELECT survey_version_concept_id, count (*) row_counts, -CASE WHEN +CASE WHEN COUNT(*) > 0 THEN 0 ELSE 1 END @@ -126,26 +126,26 @@ LEFT JOIN `{{project_id}}.{{deid_cdr}}.observation_ext` ext USING(observation_id) WHERE cr.concept_id_1 IN (1333174,1333343,1333207,1333310,1332811,1332812,1332715,1332813,1333101,1332814,1332815,1332816,1332817,1332818) - AND cr.relationship_id = "PPI parent code of" + AND cr.relationship_id = "PPI parent code of" group by 1 order by row_counts """) q = query.render(project_id=project_id,deid_cdr=deid_cdr) -df1=execute(client, q) -df1.shape +result = execute(client, q) +result.shape -df1 +result -if df1['Failure_row_counts'].sum()==0: - df = df.append({'query' : 'Query2 survey version provided', 'result' : 'Pass'}, - ignore_index = True) +if result['Failure_row_counts'].sum()==0: + summary = summary.append({'query' : 'Query2 survey version provided', 'result' : 'Pass'}, + ignore_index = True) else: - df = df.append({'query' : 'Query2 survey version provided', 'result' : 'Failure'}, - ignore_index = True) + summary = summary.append({'query' : 'Query2 survey version provided', 'result' : 'Failure'}, + ignore_index = True) # + [markdown] papermill={"duration": 0.023649, "end_time": "2021-02-02T22:30:39.115495", "exception": false, "start_time": "2021-02-02T22:30:39.091846", "status": "completed"} tags=[] # # 3 done no change Verify that all structured concepts related to COVID are NOT suppressed in EHR tables -# +# # DC-891 # # 756055,4100065,37311061,439676,37311060,45763724 @@ -154,10 +154,9 @@ # - query = JINJA_ENV.from_string(""" - SELECT measurement_concept_id, concept_name,concept_code,vocabulary_id, COUNT(1) AS n_row_not_pass, -CASE WHEN +CASE WHEN COUNT(*) > 0 THEN 0 ELSE 1 END @@ -171,21 +170,21 @@ """) q = query.render(project_id=project_id,deid_cdr=deid_cdr) -df1=execute(client, q) -df1.shape +result = execute(client, q) +result.shape -df1 +result -if df1['Failure_row_counts'].sum()==0: - df = df.append({'query' : 'Query3 No COPE in deid_measurement table', 'result' : 'Pass'}, - ignore_index = True) +if result['Failure_row_counts'].sum()==0: + summary = summary.append({'query' : 'Query3 No COPE in deid_measurement table', 'result' : 'Pass'}, + ignore_index = True) else: - df = df.append({'query' : 'Query3 No COPE in deid_measurement table' , 'result' : 'Failure'}, - ignore_index = True) + summary = summary.append({'query' : 'Query3 No COPE in deid_measurement table' , 'result' : 'Failure'}, + ignore_index = True) # + [markdown] papermill={"duration": 0.023649, "end_time": "2021-02-02T22:30:39.115495", "exception": false, "start_time": "2021-02-02T22:30:39.091846", "status": "completed"} tags=[] # # 4 done no change Verify that all structured concepts related to COVID are NOT suppressed in EHR condition_occurrence -# +# # DC-891 # # 756055,4100065,37311061,439676,37311060,45763724 @@ -194,10 +193,9 @@ # - query = JINJA_ENV.from_string(""" - SELECT condition_concept_id, concept_name,concept_code,vocabulary_id, COUNT(1) AS n_row_not_pass, -CASE WHEN +CASE WHEN COUNT(*) > 0 THEN 0 ELSE 1 END @@ -208,25 +206,24 @@ WHERE condition_concept_id IN (4100065, 37311061, 439676) GROUP BY 1,2,3,4 ORDER BY n_row_not_pass DESC - """) q = query.render(project_id=project_id,deid_cdr=deid_cdr) -df1=execute(client, q) -df1.shape +result = execute(client, q) +result.shape -df1 +result -if 
df1['Failure_row_counts'].sum()==0: - df = df.append({'query' : 'Query4 COVID concepts suppression in deid_observation table', 'result' : 'Pass'}, - ignore_index = True) +if result['Failure_row_counts'].sum()==0: + summary = summary.append({'query' : 'Query4 COVID concepts suppression in deid_observation table', 'result' : 'Pass'}, + ignore_index = True) else: - df = df.append({'query' : 'Query4 COVID concepts suppression in deid_observation table' , 'result' : 'Failure'}, - ignore_index = True) + summary = summary.append({'query' : 'Query4 COVID concepts suppression in deid_observation table' , 'result' : 'Failure'}, + ignore_index = True) # + [markdown] papermill={"duration": 0.023649, "end_time": "2021-02-02T22:30:39.115495", "exception": false, "start_time": "2021-02-02T22:30:39.091846", "status": "completed"} tags=[] # # 5 done no change Verify that all structured concepts related to COVID are NOT suppressed in EHR observation -# +# # DC-891 # # 756055,4100065,37311061,439676,37311060,45763724 @@ -235,10 +232,9 @@ # - query = JINJA_ENV.from_string(""" - SELECT observation_concept_id, concept_name,concept_code,vocabulary_id,observation_source_concept_id, COUNT(1) AS n_row_not_pass, -CASE WHEN +CASE WHEN COUNT(*) > 0 THEN 0 ELSE 1 END @@ -251,37 +247,36 @@ ORDER BY n_row_not_pass DESC """) q = query.render(project_id=project_id,deid_cdr=deid_cdr) -df1=execute(client, q) -df1.shape +result = execute(client, q) +result.shape -df1 +result -if df1['Failure_row_counts'].sum()==0: - df = df.append({'query' : 'Query5 COVID concepts suppression in observation table', 'result' : 'Pass'}, - ignore_index = True) +if result['Failure_row_counts'].sum()==0: + summary = summary.append({'query' : 'Query5 COVID concepts suppression in observation table', 'result' : 'Pass'}, + ignore_index = True) else: - df = df.append({'query' : 'Query5 COVID concepts suppression in observation table' , 'result' : 'Failure'}, - ignore_index = True) + summary = summary.append({'query' : 'Query5 COVID concepts suppression in observation table' , 'result' : 'Failure'}, + ignore_index = True) # # 6 done updated Verify these concepts are NOT suppressed in EHR observation -# +# # [DC-1747] # these concepts 1333015, 1333023 are not longer suppressed # -# 1332737, [DC-1665] +# 1332737, [DC-1665] # # 1333291 # # 1332904,1333140 should be generalized to 1332737 , # update ?need to rewrite?? # -# 1332843 should be generalized. +# 1332843 should be generalized. 
query = JINJA_ENV.from_string(""" - SELECT observation_source_concept_id, concept_name,concept_code,vocabulary_id,observation_concept_id, COUNT(1) AS n_row_pass, -CASE WHEN +CASE WHEN COUNT(*) > 0 THEN 0 ELSE 1 END @@ -290,127 +285,137 @@ FROM `{{project_id}}.{{deid_cdr}}.observation` ob JOIN `{{project_id}}.{{deid_cdr}}.concept` c ON ob.observation_source_concept_id=c.concept_id -WHERE observation_source_concept_id IN (1333015, 1333023, 1332737,1333291,1332904,1333140,1332843) +WHERE observation_source_concept_id IN (1333015, 1333023, 1332737,1333291,1332904,1333140,1332843) OR observation_concept_id IN (1333015, 1333023,1332737,1333291,1332904,1333140,1332843 ) GROUP BY 1,2,3,4,5 ORDER BY n_row_pass DESC """) -q = query.render(project_id=project_id,deid_cdr=deid_cdr) -df1=execute(client, q) -df1.shape +q = query.render(project_id=project_id, + deid_cdr=deid_cdr) +result = execute(client, q) +result.shape -df1 +result -if (df1['Failure_row_counts'].sum()==0) and (df1[df1['observation_source_concept_id'].isin(['1332904','1333140'])].empty) : - df = df.append({'query' : 'Query6 The concepts are not suppressed in observation table', 'result' : 'Pass'}, - ignore_index = True) +if (result['Failure_row_counts'].sum()==0) and (result[result['observation_source_concept_id'].isin(['1332904','1333140'])].empty) : + summary = summary.append({'query' : 'Query6 The concepts are not suppressed in observation table', 'result' : 'Pass'}, + ignore_index = True) else: - df = df.append({'query' : 'Query6 The concepts are not suppressed in observation table' , 'result' : 'Failure'}, - ignore_index = True) + summary = summary.append({'query' : 'Query6 The concepts are not suppressed in observation table' , 'result' : 'Failure'}, + ignore_index = True) -# # 7 done Vaccine-related concepts as these EHR-submitted COVID concepts are allowed from RT +# # 7 done Vaccine-related concepts as these EHR-submitted COVID concepts are allowed from RT # DC-2374 -# this query was from DC-1752 +# this query was from DC-1752 +# +# Aslo, ensure concepts existing in combined are maintained in deid, and concepts absent in combine should not appear in deid. 
See [DC-3631](https://precisionmedicineinitiative.atlassian.net/browse/DC-3631) query = JINJA_ENV.from_string(""" - -DECLARE vocabulary_tables DEFAULT ['vocabulary', 'concept', 'source_to_concept_map', +DECLARE vocabulary_tables DEFAULT ['vocabulary', 'concept', 'source_to_concept_map', 'concept_class', 'concept_synonym', 'concept_ancestor', 'concept_relationship', 'relationship', 'drug_strength']; -SELECT table_name,column_name +SELECT table_name,column_name FROM `{{project_id}}.{{deid_cdr}}.INFORMATION_SCHEMA.COLUMNS` c JOIN `{{project_id}}.{{deid_cdr}}.__TABLES__` t ON c.table_name = t.table_id -WHERE - table_name NOT IN UNNEST(vocabulary_tables) and +WHERE + table_name NOT IN UNNEST(vocabulary_tables) and t.row_count > 0 AND table_name NOT LIKE '\\\_%' AND table_name in ('procedure_occurrence','drug_exposure') AND column_name in ('procedure_concept_id','procedure_source_concept_id','drug_concept_id','drug_source_concept_id') """) -q = query.render(project_id=project_id,deid_cdr=deid_cdr) -target_tables=execute(client, q) +q = query.render(project_id=project_id, + deid_cdr=deid_cdr) +target_tables = execute(client, q) target_tables.shape +target_tables + # + #table_name="drug_exposure" #@column_name="drug_concept_id" -def my_sql(table_name,column_name): +def target_of(table_name, column_name): query = JINJA_ENV.from_string(""" - -SELECT +SELECT '{{table_name}}' AS table_name, '{{column_name}}' AS column_name, +concept_id_in_combined, COUNT(*) AS row_counts, -CASE WHEN - COUNT(*) > 0 +CASE WHEN + COUNT(*) > 0 AND sub.concept_id_in_combined IS NOT NULL THEN 0 ELSE 1 END AS Failure_row_counts - FROM `{{project_id}}.{{deid_cdr}}.{{table_name}}` c -JOIN `{{project_id}}.{{deid_cdr}}.concept` on concept_id={{column_name}} - WHERE ( - -- done by name and vocab -- -- this alone should be enough, no need for others -- - REGEXP_CONTAINS(concept_name, r'(?i)(COVID)') AND - REGEXP_CONTAINS(concept_name, r'(?i)(VAC)') AND - vocabulary_id not in ('PPI') - ) OR ( - -- done by code and vocab -- +JOIN ( + SELECT concept_id as concept_id_in_combined + FROM `{{project_id}}.{{com_cdr}}.{{table_name}}` c + JOIN `{{project_id}}.{{deid_cdr}}.concept` + on concept_id={{table_name}}_id + WHERE (REGEXP_CONTAINS(concept_name, r'(?i)(COVID)') AND + REGEXP_CONTAINS(concept_name, r'(?i)(VAC)') AND + vocabulary_id not in ('PPI')) + OR ( REGEXP_CONTAINS(concept_code, r'(207)|(208)|(210)|(212)|(213)') -- not 211 -- - and vocabulary_id = 'CVX' - ) OR ( - -- done by code and vocab -- - REGEXP_CONTAINS(concept_code, r'(91300)|(91301)|(91302)|(91303)|(0031A)|(0021A)|(0022A)|(0002A)|(0001A)|(0012A)|(0011A)') -- no 91304 -- - and vocabulary_id = 'CPT4' + AND vocabulary_id = 'CVX' + ) OR ( + REGEXP_CONTAINS(concept_code, r'(91300)|(91301)|(91302)|(91303)|(0031A)|(0021A)|(0022A)|(0002A)|(0001A)|(0012A)|(0011A)') + AND vocabulary_id = 'CPT4' ) + AND domain_id LIKE '%LEFT(c.domain_id, 3)%' + ) sub + on concept_id_in_combined={{table_name}}_id + GROUP BY concept_id_in_combined """) - q = query.render(project_id=project_id,deid_cdr=deid_cdr,table_name=table_name,column_name=column_name) - df11=execute(client, q) - return df11 + q = query.render(project_id=project_id, + com_cdr=com_cdr, + deid_cdr=deid_cdr, + table_name=table_name, + column_name=column_name) + r = execute(client, q) + return r -# - - +# + # use a loop to get table name AND column name AND run sql function -result = [my_sql (table_name, column_name) for table_name, column_name in zip(target_tables['table_name'], target_tables['column_name'])] -result -# if Row_count 
is '0' in "Combined" dataset as well, '0' showing up in this check is not a problem +tables = [t for t in target_tables['table_name']] +columns = [c for c in target_tables['column_name']] + +result_list = [] +for t, c in zip(tables, columns): + result_list.append(target_of(t, c)) +result_list # + # AND then get the result back FROM loop result list -n=len(target_tables.index) -res2 = pd.DataFrame(result[0]) - -for x in range(1,n): - res2=res2.append(result[x]) - -#res2=res2.sort_values(by='row_counts_failure', ascending=False) -res2 +n = len(target_tables.index) +final_result = pd.DataFrame(result_list[0]) + +for i in range(1, n): + final_result = final_result.append(result_list[i]) + +final_result = final_result.sort_values(by='Failure_row_counts', ascending=False) +final_result # - -if res2['Failure_row_counts'].sum()==0: - df = df.append({'query' : 'Query7 COVID Vaccine-related concepts NOT suppressed in EHR tables', 'result' : 'Pass'}, - ignore_index = True) +if final_result['Failure_row_counts'].sum() == 0: + summary = summary.append({'query' : 'Query7 COVID Vaccine-related concepts NOT suppressed in EHR tables', 'result' : 'Pass'}, + ignore_index = True) else: - df = df.append({'query' : 'Query7 COVID Vaccine-related concepts NOT suppressed in EHR tables' , 'result' : 'Failure'}, - ignore_index = True) + summary = summary.append({'query' : 'Query7 COVID Vaccine-related concepts NOT suppressed in EHR tables' , 'result' : 'Failure'}, + ignore_index = True) -# # Summary_deid_COPE_survey - # + def highlight_cells(val): color = 'red' if 'Failure' in val else 'white' - return f'background-color: {color}' - -df.style.applymap(highlight_cells).set_properties(**{'text-align': 'left'}) -# - - + return f'background-color: {color}' +summary.style.applymap(highlight_cells).set_properties(**{'text-align': 'left'}) \ No newline at end of file From 02ca7cca11e1b7a11b876ade0212f52d261f7473 Mon Sep 17 00:00:00 2001 From: Ratul <91090217+ratuagga@users.noreply.github.com> Date: Tue, 26 Dec 2023 15:33:25 -0600 Subject: [PATCH 10/19] [DC-3658] Update Participant Validation QC check for excluded sites (#1840) [DC-3658] Updated Participant validation QC check. --- data_steward/analytics/cdr_ops/participant_validation_qc.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/data_steward/analytics/cdr_ops/participant_validation_qc.py b/data_steward/analytics/cdr_ops/participant_validation_qc.py index 9f2aa6884a..2bcbade549 100644 --- a/data_steward/analytics/cdr_ops/participant_validation_qc.py +++ b/data_steward/analytics/cdr_ops/participant_validation_qc.py @@ -23,6 +23,7 @@ LOOKUP_DATASET_ID = "" # Identifies the lookup dataset VALIDATION_DATASET_ID = "" # Identifies the validation dataset EXCLUDED_SITES = "''" # List of excluded sites passed as string: eg. "'hpo_id1', 'hpo_id_2', 'hpo_id3',..." 
+EXCLUDE_IDENTITY_MATCH = "'identity_match_" + EXCLUDED_SITES.replace(" '", "'identity_match_")[1:] RUN_AS = "" # - @@ -57,6 +58,7 @@ SELECT *, ROW_NUMBER() OVER(PARTITION BY table_name ORDER BY partition_id DESC) r FROM `{PROJECT_ID}.{DRC_DATASET_ID}.INFORMATION_SCHEMA.PARTITIONS` WHERE partition_id NOT IN ('__NULL__') + AND table_name NOT IN ({EXCLUDE_IDENTITY_MATCH}) ) WHERE r = 1 ORDER BY total_rows DESC @@ -99,7 +101,7 @@ SELECT LOWER(hpo_id) as hpo_id FROM `{PROJECT_ID}.{LOOKUP_DATASET_ID}.hpo_site_id_mappings` WHERE TRIM(hpo_id) IS NOT NULL - AND TRIM(hpo_id) NOT IN ('', {EXCLUDED_SITES}) + AND TRIM(LOWER(hpo_id)) NOT IN ('', {EXCLUDED_SITES}) ) SELECT CONCAT( "SELECT * FROM (", @@ -132,7 +134,7 @@ SELECT LOWER(hpo_id) as hpo_id FROM `{PROJECT_ID}.{LOOKUP_DATASET_ID}.hpo_site_id_mappings` WHERE TRIM(hpo_id) IS NOT NULL - AND TRIM(hpo_id) NOT IN ('', {EXCLUDED_SITES}) + AND TRIM(LOWER(hpo_id)) NOT IN ('', {EXCLUDED_SITES}) ) SELECT ARRAY_TO_STRING(ARRAY_AGG(FORMAT( """ From 271a0361992f696c3cdd1be3b5addcee8659a38b Mon Sep 17 00:00:00 2001 From: Ratul <91090217+ratuagga@users.noreply.github.com> Date: Tue, 26 Dec 2023 16:11:28 -0600 Subject: [PATCH 11/19] [DC-3659] Update check controlled tier part 2 to include standard classifications (#1839) [DC-3659] Updated CT Part2 QC check. --- .../cdr_ops/controlled_tier_qc/check_controlled_tier_part2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_steward/analytics/cdr_ops/controlled_tier_qc/check_controlled_tier_part2.py b/data_steward/analytics/cdr_ops/controlled_tier_qc/check_controlled_tier_part2.py index a29a16aaa1..fb7c3584b2 100644 --- a/data_steward/analytics/cdr_ops/controlled_tier_qc/check_controlled_tier_part2.py +++ b/data_steward/analytics/cdr_ops/controlled_tier_qc/check_controlled_tier_part2.py @@ -1016,7 +1016,7 @@ def my_sql(table_name, column_name): FROM `{{project_id}}.{{ct_dataset}}.concept` c JOIN `{{project_id}}.{{ct_dataset}}.{{table_name}}` ON (concept_id={{column_name}}) -WHERE standard_concept !='S' +WHERE standard_concept not in ('S', 'C') AND {{column_name}} !=0 """) q = query.render(project_id=project_id, From 44983fc360cecfd97a18a24d57179331745d8ad2 Mon Sep 17 00:00:00 2001 From: Ratul <91090217+ratuagga@users.noreply.github.com> Date: Tue, 26 Dec 2023 17:05:27 -0600 Subject: [PATCH 12/19] [DC-3650] Include PII Validation in Snapshot script (#1837) [DC-3650] validation dataset script to validate Hpo instead of cron job. 
--- .../participants/snapshot_validation_dataset.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/data_steward/validation/participants/snapshot_validation_dataset.py b/data_steward/validation/participants/snapshot_validation_dataset.py index 87807bf741..4194c37094 100644 --- a/data_steward/validation/participants/snapshot_validation_dataset.py +++ b/data_steward/validation/participants/snapshot_validation_dataset.py @@ -18,6 +18,7 @@ from constants.validation.participants.snapshot_validaiton_dataset import ( PARTITIONS_QUERY, SNAPSHOT_TABLE_QUERY) from bq_utils import get_hpo_info, get_table_id +from validation.participants.validate import setup_and_validate_participants LOGGER = logging.getLogger(__name__) @@ -103,6 +104,15 @@ def create_snapshot(client: BigQueryClient, release_tag: str) -> str: return dataset.dataset_id +def validate_hpo_ids(bq_client, skip_list): + for item in get_hpo_info(): + hpo_id = item['hpo_id'] + if hpo_id in skip_list: + continue + # Prevent updating udfs for all hpo_sites + setup_and_validate_participants(bq_client, hpo_id, update_udf=False) + + def get_arg_parser(): parser = argparse.ArgumentParser( description="""Generate validation snapshot""") @@ -125,6 +135,10 @@ def get_arg_parser(): dest='run_as_email', help='Service account to impersonate', required=True) + parser.add_argument( + '--hpo_id_ex', + nargs='*', + help='List of HPOs to exclude from processing (none by default)') return parser @@ -141,6 +155,9 @@ def main(): bq_client = BigQueryClient(args.project_id, credentials=impersonation_creds) + # Validate hpo_ids + validate_hpo_ids(bq_client, skip_list=args.hpo_id_ex) + dataset_id = create_snapshot(bq_client, args.release_tag) create_id_match_tables(bq_client, dataset_id) From 1623312fbdfdb2655d429018dda17334aae1057e Mon Sep 17 00:00:00 2001 From: Ratul <91090217+ratuagga@users.noreply.github.com> Date: Wed, 27 Dec 2023 10:21:19 -0600 Subject: [PATCH 13/19] [DC-3651] Fix Combined Backup script for git_version (#1835) [DC-3651] Combined Backup script fix. 
--- data_steward/tools/create_combined_backup_dataset.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/data_steward/tools/create_combined_backup_dataset.py b/data_steward/tools/create_combined_backup_dataset.py index 53cda9c264..2b519f2300 100644 --- a/data_steward/tools/create_combined_backup_dataset.py +++ b/data_steward/tools/create_combined_backup_dataset.py @@ -529,14 +529,14 @@ def main(raw_args=None): '--target_dataset', combined_backup ]) today = datetime.now().strftime('%Y-%m-%d') + git_version = str(get_git_tag()) add_cdr_metadata.main([ '--component', add_cdr_metadata.INSERT, '--project_id', client.project, - '--target_dataset', combined_backup, '--etl_version', - get_git_tag(), '--ehr_source', args.unioned_ehr_dataset, - '--ehr_cutoff_date', args.ehr_cutoff_date, '--rdr_source', - args.rdr_dataset, '--cdr_generation_date', today, - '--vocabulary_version', args.vocab_dataset, '--rdr_export_date', - args.rdr_export_date + '--target_dataset', combined_backup, '--etl_version', git_version, + '--ehr_source', args.unioned_ehr_dataset, '--ehr_cutoff_date', + args.ehr_cutoff_date, '--rdr_source', args.rdr_dataset, + '--cdr_generation_date', today, '--vocabulary_version', + args.vocab_dataset, '--rdr_export_date', args.rdr_export_date ], bq_client=client) LOGGER.info('EHR + RDR combine completed') From 9e9033e877460d9563931204737e2a92e8126105 Mon Sep 17 00:00:00 2001 From: Hiro Mishima <90213198+hiro-mishima@users.noreply.github.com> Date: Tue, 16 Jan 2024 10:26:09 -0500 Subject: [PATCH 14/19] [DC-3333] Python 3.11 upgrade (#1798) * [DC-3333] Update venv and docker config * [DC-3333] Update cicleci settings * [DC-3333] Fix failing lint checks * [DC-3333] Revert Pandas for QC notebook compatibility * [DC-3333] Update failing ppi_branching_test * [DC-3333] Update bq to comply with latest version * [DC-3333] Disable custom log temporarily * [DC-3333] Update custom logging * [DC-3333] Delete custom loggers * [DC-3333] Fix the failing unit tests * [DC-3333] Fix the failing unit test * [DC-3333] Fix the failing unit test * [DC-3333] Fix failing load_vocab test * [DC-3333] Remove moz_sql_parser reference * [DC-3333] Update Docker image * [DC-3333] Update gsdk to the latest * [DC-3333] Correct checksum of Google SDK --- .circleci/config.yml | 6 +- data_steward/README.md | 6 +- data_steward/admin/admin_api.py | 19 +- data_steward/bq_utils.py | 8 +- .../null_invalid_foreign_keys.py | 3 +- data_steward/curation_logging/__init__.py | 0 .../curation_logging/curation_gae_handler.py | 512 ----------- .../curation_logging/gcp_request_log_pb2.py | 847 ------------------ data_steward/deid/requirements.txt | 55 +- data_steward/dev_requirements.txt | 29 +- data_steward/gcloud/bq/__init__.py | 26 +- data_steward/requirements.txt | 396 ++++---- data_steward/retraction/retract_data_bq.py | 8 +- data_steward/tools/import_rdr_dataset.py | 3 +- data_steward/tools/load_vocab.py | 8 +- data_steward/utils/bq.py | 30 +- data_steward/validation/main.py | 19 +- docker-compose.yml | 9 +- docker/develop/Dockerfile | 114 +-- .../cleaning_rules/ppi_branching_test.py | 7 +- .../retraction/retract_data_bq_test.py | 49 +- .../retraction/retract_data_gcs_test.py | 2 +- .../data_steward/validation/ehr_union_test.py | 73 +- .../data_steward/validation/export_test.py | 2 +- tests/test_util.py | 2 +- .../data_steward/admin/admin_api_test.py | 7 +- .../cleaning_rules/ppi_branching_test.py | 20 +- ...place_standard_id_in_domain_tables_test.py | 27 +- .../gcp_stackdriver_logger_test.py | 328 ------- 
.../data_steward/gcloud/bq/bq_test.py | 13 - .../unit_tests/data_steward/utils/bq_test.py | 13 - .../validation/app_errors_test.py | 8 +- .../data_steward/validation/ehr_union_test.py | 4 +- .../data_steward/validation/main_test.py | 18 +- .../participants/identity_match_test.py | 2 +- 35 files changed, 510 insertions(+), 2163 deletions(-) delete mode 100644 data_steward/curation_logging/__init__.py delete mode 100644 data_steward/curation_logging/curation_gae_handler.py delete mode 100644 data_steward/curation_logging/gcp_request_log_pb2.py delete mode 100644 tests/unit_tests/data_steward/curation_logging/gcp_stackdriver_logger_test.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 34c153723e..17e30f575a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -7,15 +7,15 @@ orbs: job_defaults: &job_defaults machine: - image: ubuntu-2004:202107-02 + image: ubuntu-2204:2022.04.2 docker_layer_caching: true parallelism: 1 # CircleCI 2.0 does not support environment variables that refer to each other the same way as 1.0 did. # If any of these refer to each other, rewrite them so that they don't or see https://circleci.com/docs/2.0/env-vars/#interpolating-environment-variables-to-set-other-environment-variables . environment: - CIRCLE_ARTIFACTS: /tmp/circleci-artifacts - - GSDK_VERSION: 360.0.0 - - GSDK_CHECKSUM: 6192cb2791f592da6d372888dbd7ee81d3d91f255d844ab5fc518fc97e453648 + - GSDK_VERSION: 454.0.0 + - GSDK_CHECKSUM: 58fd3e6d34e6a6e4a4afbfd3a1470ea23ef7ed6e69841c4eb89829ef833dac2c - CIRCLECI_CLI_VERSION: "0.1.22924" - CIRCLECI_CLI_CHECKSUM: "4187a5245f06dd8e1d51d5a99ad40ed9e8963397cecf006fd2d6a04ac374bef6" diff --git a/data_steward/README.md b/data_steward/README.md index 7dcede7154..290dab6c20 100644 --- a/data_steward/README.md +++ b/data_steward/README.md @@ -4,7 +4,7 @@ Specification document and data curation processes for data submitted to the DRC ## Development Requirements - * Python 3.7.x (download from [here](https://www.python.org/downloads/) and install) + * Python 3.11.x (download from [here](https://www.python.org/downloads/) and install) * pip (download [get-pip.py](https://bootstrap.pypa.io/get-pip.py) and run `python get-pip.py`) * _Recommended: [virtualenv](https://pypi.python.org/pypi/virtualenv)_ @@ -29,8 +29,8 @@ the internet. 
The following environment variables are needed to configure access * Create a virtual environment and install requirements by running ``` - # create a new environment with python3.7 as the default python - virtualenv -p "$(which python3.7)" + # create a new environment with python3.11 as the default python + virtualenv -p "$(which python3.11)" # activate it source /bin/activate diff --git a/data_steward/admin/admin_api.py b/data_steward/admin/admin_api.py index 2710a7a266..c209aa5ca0 100644 --- a/data_steward/admin/admin_api.py +++ b/data_steward/admin/admin_api.py @@ -14,9 +14,8 @@ import api_util import app_identity from admin import key_rotation, prod_pid_detection -from curation_logging.curation_gae_handler import (begin_request_logging, - end_request_logging, - initialize_logging) + +import google.cloud.logging as gc_logging PREFIX = '/admin/v1/' REMOVE_EXPIRED_KEYS_RULE = f'{PREFIX}RemoveExpiredServiceAccountKeys' @@ -31,6 +30,10 @@ DETECT_PID_VIOLATION_RULE = f'{PREFIX}DetectPersonIdViolation' +# Set up logging client so the logs will be grouped with "Correlate by" +logging_client = gc_logging.Client() +logging_client.setup_logging() + app = Flask(__name__) @@ -93,11 +96,6 @@ def detect_pid_violation(): return 'detect-pid-violation-complete' -@app.before_first_request -def set_up_logging(): - initialize_logging() - - app.add_url_rule(REMOVE_EXPIRED_KEYS_RULE, endpoint='remove_expired_keys', view_func=remove_expired_keys, @@ -107,8 +105,3 @@ def set_up_logging(): endpoint='detect_pid_violation', view_func=detect_pid_violation, methods=['GET']) - -app.before_request( - begin_request_logging) # Must be first before_request() call. - -app.teardown_request(end_request_logging) diff --git a/data_steward/bq_utils.py b/data_steward/bq_utils.py index ec85674633..fe502eba28 100644 --- a/data_steward/bq_utils.py +++ b/data_steward/bq_utils.py @@ -157,8 +157,8 @@ def create_service(): return build('bigquery', 'v2', cache={}) -@deprecated(reason='Use resources.get_table_id(table_name, hpo_id=None) instead' - ) +@deprecated( + reason='Use resources.get_table_id(table_name, hpo_id=None) instead') def get_table_id(hpo_id, table_name): """ Get the bigquery table id associated with an HPOs CDM table @@ -172,8 +172,8 @@ def get_table_id(hpo_id, table_name): return hpo_id + '_' + table_name -@deprecated(reason='Use gcloud.bq.BigQueryClient.get_table(self, table) instead' - ) +@deprecated( + reason='Use gcloud.bq.BigQueryClient.get_table(self, table) instead') def get_table_info(table_id, dataset_id=None, project_id=None): """ Get metadata describing a table diff --git a/data_steward/cdr_cleaner/cleaning_rules/null_invalid_foreign_keys.py b/data_steward/cdr_cleaner/cleaning_rules/null_invalid_foreign_keys.py index cb849f12b1..db6422db35 100644 --- a/data_steward/cdr_cleaner/cleaning_rules/null_invalid_foreign_keys.py +++ b/data_steward/cdr_cleaner/cleaning_rules/null_invalid_foreign_keys.py @@ -108,8 +108,7 @@ def get_column_mode_dict(self, table) -> Dict[str, str]: :return: dict. Table's column names as keys and the column's mode (nullable/required) as values. 
""" return { - field['name']: field['mode'] - for field in resources.fields_for(table) + field['name']: field['mode'] for field in resources.fields_for(table) } def get_foreign_keys(self, table): diff --git a/data_steward/curation_logging/__init__.py b/data_steward/curation_logging/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/data_steward/curation_logging/curation_gae_handler.py b/data_steward/curation_logging/curation_gae_handler.py deleted file mode 100644 index e533baf077..0000000000 --- a/data_steward/curation_logging/curation_gae_handler.py +++ /dev/null @@ -1,512 +0,0 @@ -"""Based closely on -https://github.com/all-of-us/raw-data-repository/blob/1.60.6/rdr_service/services/gcp_logging.py. This custom handler -groups all the log messages generated within the same http request into an operation, this grouping mechanism allows -us to quickly navigate to the relevant log message. """ -import collections -import json -import logging -import os -import string -import threading -import app_identity -from datetime import datetime, timezone -from enum import IntEnum -import random - -import requests -from flask import request, Response - -from google.api.monitored_resource_pb2 import MonitoredResource -from google.cloud import logging as gcp_logging -from google.cloud import logging_v2 as gcp_logging_v2 -from google.logging.type import http_request_pb2 as gcp_http_request_pb2 -from google.protobuf import json_format as gcp_json_format, any_pb2 as gcp_any_pb2 - -# Do not remove this import. -import curation_logging.gcp_request_log_pb2 # pylint: disable=unused-import - -# https://pypi.org/project/google-cloud-logging/ -# https://cloud.google.com/logging/docs/reference/v2/rpc/google.logging.v2 -# https://developers.google.com/resources/api-libraries/documentation/logging/v2/python/latest/logging_v2.entries.html - -# How many log lines should be batched before pushing them to StackDriver. -_LOG_BUFFER_SIZE = 100 - -GAE_LOGGING_MODULE_ID = 'app-' + os.environ.get('GAE_SERVICE', 'default') -GAE_LOGGING_VERSION_ID = os.environ.get('GAE_VERSION', 'devel') -LOG_NAME_TEMPLATE = 'projects/{project_id}/logs/appengine.googleapis.com%2Frequest_log' -REQUEST_LOG_TYPE = 'type.googleapis.com/google.appengine.logging.v1.RequestLog' - -# This is where we save all data that is tied to a specific execution thread. -_thread_store = threading.local() - - -class LogCompletionStatusEnum(IntEnum): - """ - Indicator for log entry completion status, which can span multiple log entries. - """ - COMPLETE = 0 - PARTIAL_BEGIN = 1 - PARTIAL_MORE = 2 - PARTIAL_FINISHED = 3 - - -def setup_logging_zone(): - """ - Attempt to get the project zone information. - return: Zone pb2 structure. - """ - zone = 'local-machine' - if 'GAE_SERVICE' in os.environ: - try: - resp = requests.get( - 'http://metadata.google.internal/computeMetadata/v1/instance/zone', - timeout=15.0) - if resp.status_code == 200: - zone = resp.text.strip() - # pylint: disable=broad-except - except Exception: - zone = 'unknown' - return zone - - -def setup_logging_resource(): - """ - Set the values for the Google Logging Resource object. Thread safe. - :return: MonitoredResource pb2 structure. 
- """ - labels = { - "project_id": app_identity.get_application_id(), - "module_id": GAE_LOGGING_MODULE_ID, - "version_id": GAE_LOGGING_VERSION_ID, - "zone": setup_logging_zone() - } - - # https://cloud.google.com/logging/docs/reference/v2/rpc/google.api#google.api.MonitoredResource - resource_pb2 = MonitoredResource(type='gae_app', labels=labels) - return resource_pb2 - - -# pylint: disable=unused-argument -def setup_log_line(record: logging.LogRecord, resource=None, method=None): - """ - Prepare a log event for sending to GCP StackDriver. Thread safe. - :param record: Log event record. - :param resource: request resource - :param method: request method - :return: LogLine proto buffer object - """ - event_ts = datetime.utcfromtimestamp(record.created) - event_ts = event_ts.replace(tzinfo=timezone.utc) - event_ts = event_ts.isoformat() - severity = gcp_logging._helpers._normalize_severity(record.levelno) - message = str(record.msg) % record.args if record.args else str(record.msg) - - # Look for embedded traceback source location override information - if '%%' in message: - tmp_sl = message[message.find('%%'):message.rfind('%%') + 2] - message = message.replace(tmp_sl, '') - tmp_sl = tmp_sl.replace('%%', '') - source_location = json.loads(tmp_sl) - - else: - funcname = record.funcName if record.funcName else '' - file = record.pathname if record.pathname else '' - lineno = record.lineno if record.lineno else 0 - source_location = { - "file": file, - "functionName": funcname, - "line": lineno - } - - message = message.replace('$$method$$', method if method else '') - message = message.replace('$$resource$$', resource if resource else '/') - - log_line = { - "logMessage": message, - "severity": severity, - "sourceLocation": source_location, - "time": event_ts - } - - return log_line - - -def get_highest_severity_level_from_lines(lines): - """ - Figure out the highest severity level in a given set of log records. - :param lines: List of log records - """ - if lines: - - s = sorted([line['severity'] for line in lines], - key=lambda severity: -severity) - return s[0] - else: - return gcp_logging_v2.gapic.enums.LogSeverity(200) - - -def setup_proto_payload(lines: list, log_status: LogCompletionStatusEnum, - **kwargs): - """ - Build the log protoPayload portion of the log entry. Thread safe. - :param lines: List of LogMessage lines to add. - :param log_status: Logging completion status value. - :return: RequestLog pb2 object. - """ - - # Base set of values for proto_payload object. - req_dict = { - "@type": REQUEST_LOG_TYPE, - "startTime": datetime.now(timezone.utc).isoformat(), - "ip": "0.0.0.0", - "first": True, - "finished": True, - "endTime": datetime.now(timezone.utc).isoformat(), - "responseSize": 355, - "line": [], - # If we see these lines in the logs, we know something isn't working correctly. - "userAgent": "Bad Mojo", - "resource": "/Bad-Mojo-Teacups/Logging", - } - - # Override any key values. - for k, v in kwargs.items(): - req_dict[k] = v - - # Set completion statuses - if log_status == LogCompletionStatusEnum.PARTIAL_BEGIN: - req_dict['finished'] = False - elif log_status == LogCompletionStatusEnum.PARTIAL_MORE: - req_dict['first'] = False - req_dict['finished'] = False - elif log_status == LogCompletionStatusEnum.PARTIAL_FINISHED: - req_dict['first'] = False - - if lines and len(lines) > 0: - for x in range(len(lines)): - req_dict["line"].append(lines[x]) - - # Convert dict to Generic pb2 message object, requires gcp_request_log_pb2 import. 
- request_log_pb2 = gcp_json_format.ParseDict(req_dict, gcp_any_pb2.Any()) - - return request_log_pb2 - - -def update_long_operation(request_log_id, op_status): - """ - Handle long operations. Thread safe. - :param request_log_id: request logging id. - :param op_status: LogCompletionStatusEnum value. - """ - if op_status == LogCompletionStatusEnum.COMPLETE: - first = last = True - else: - first = True if op_status == LogCompletionStatusEnum.PARTIAL_BEGIN else False - last = True if op_status == LogCompletionStatusEnum.PARTIAL_FINISHED else False - - # https://cloud.google.com/logging/docs/reference/v2/rpc/google.logging.v2#google.logging.v2.LogEntryOperation - operation_pb2 = gcp_logging_v2.proto.log_entry_pb2.LogEntryOperation( - id=request_log_id, - producer='appengine.googleapis.com/request_id', - first=first, - last=last) - - return operation_pb2 - - -class GCPStackDriverLogger(object): - """ - Sends log records to google stack driver logging. Each thread needs its own copy of this object. - Buffers up to `buffer_size` log records into one ProtoBuffer to be submitted. - """ - - def __init__(self, buffer_size=_LOG_BUFFER_SIZE): - - self._buffer_size = buffer_size - self._buffer = collections.deque() - - self._reset() - - self._logging_client = gcp_logging_v2.LoggingServiceV2Client() - self._operation_pb2 = None - - # Used to determine how long a request took. - self._first_log_ts = None - - def _reset(self): - - self._first_log_ts = None - - self.log_completion_status = LogCompletionStatusEnum.COMPLETE - self._operation_pb2 = None - - self._trace = None - self._start_time = None - self._end_time = None - - self._request_method = None - self._request_resource = None - self._request_agent = None - self._request_remote_addr = None - self._request_log_id = None - self._request_host = None - - # cloud tasks - self._request_taskname = None - self._request_queue = None - - self._response_status_code = 200 - self._response_size = None - - self._buffer.clear() - - def setup_from_request(self, _request, initial=False): - """ - Gather everything we need to log from the request object. - :param _request: Flask request object - :param initial: Is this the beginning of a request? If no, this means flask 'begin_request' call failed. - """ - # send any pending log entries in case 'end_request' was not called. - if len(self._buffer) and initial: - self.finalize() - - self._start_time = datetime.now(timezone.utc).isoformat() - self._request_method = _request.method - self._request_resource = _request.full_path - if self._request_resource and self._request_resource.endswith('?'): - self._request_resource = self._request_resource[:-1] - self._request_agent = str(_request.user_agent) - self._request_remote_addr = _request.headers.get( - 'X-Appengine-User-Ip', _request.remote_addr) - self._request_host = _request.headers.get( - 'X-Appengine-Default-Version-Hostname', _request.host) - self._request_log_id = _request.headers.get( - 'X-Appengine-Request-Log-Id', 'None') - - self._request_taskname = _request.headers.get('X-Appengine-Taskname', - None) - self._request_queue = _request.headers.get('X-Appengine-Queuename', - None) - - trace_id = _request.headers.get('X-Cloud-Trace-Context', '') - if trace_id: - trace_id = trace_id.split('/')[0] - trace = 'projects/{0}/traces/{1}'.format( - app_identity.get_application_id(), trace_id) - self._trace = trace - - def log_event(self, record: logging.LogRecord): - """ - Capture and store a log event record. 
- :param record: Python log record - """ - self._buffer.appendleft(record) - - if not self._first_log_ts: - self._first_log_ts = datetime.utcnow() - - if len(self._buffer) >= self._buffer_size: - if self.log_completion_status == LogCompletionStatusEnum.COMPLETE: - self.log_completion_status = LogCompletionStatusEnum.PARTIAL_BEGIN - self._operation_pb2 = update_long_operation( - self._request_log_id, self.log_completion_status) - - elif self.log_completion_status == LogCompletionStatusEnum.PARTIAL_BEGIN: - self.log_completion_status = LogCompletionStatusEnum.PARTIAL_MORE - self._operation_pb2 = update_long_operation( - self._request_log_id, self.log_completion_status) - - self.publish_to_stackdriver() - - def finalize(self, _response=None, _request=None): - """ - Finalize and send any log entries to StackDriver. - """ - if not self._start_time and _request: - self.setup_from_request(_request=_request, initial=False) - - if self.log_completion_status == LogCompletionStatusEnum.COMPLETE: - if len(self._buffer) == 0 and not _response: - # nothing to log - self._reset() - return - else: - self.log_completion_status = LogCompletionStatusEnum.PARTIAL_FINISHED - self._operation_pb2 = update_long_operation( - self._request_log_id, self.log_completion_status) - - # _response could be of the exception type if an exception is raised - if isinstance(_response, Response): - self._response_status_code = _response.status_code - self._response_size = len(_response.data) - else: - self._response_status_code = None - self._response_size = None - - self.publish_to_stackdriver() - self._reset() - - def publish_to_stackdriver(self): - """ - Send a set of log entries to StackDriver. - """ - insert_id = \ - ''.join(random.choice(string.ascii_uppercase + string.ascii_lowercase + string.digits) for _ in range(16)) - lines = list() - index = 0 - - while len(self._buffer): - line = self._buffer.pop() - lines.append( - setup_log_line(line, self._request_resource, - self._request_method)) - index += 1 - - self._end_time = datetime.now(timezone.utc).isoformat() - - log_entry_pb2_args = { - 'resource': setup_logging_resource(), - 'severity': get_highest_severity_level_from_lines(lines), - 'trace': self._trace, - 'insert_id': insert_id, - 'trace_sampled': True if self._trace else False - } - - if self._response_status_code: - log_entry_pb2_args[ - 'http_request'] = gcp_http_request_pb2.HttpRequest( - status=self._response_status_code) - # Transform the response code to a logging severity level. 
- tmp_code = int(round(self._response_status_code / 100, 0) * 100) - if tmp_code > int(log_entry_pb2_args['severity']): - log_entry_pb2_args[ - 'severity'] = gcp_logging_v2.gapic.enums.LogSeverity( - tmp_code) - - if not self._operation_pb2: - self.log_completion_status = LogCompletionStatusEnum.COMPLETE - self._operation_pb2 = update_long_operation( - self._request_log_id, self.log_completion_status) - - log_entry_pb2_args['operation'] = self._operation_pb2 - - proto_payload_args = { - 'startTime': self._start_time, - 'endTime': self._end_time, - 'method': self._request_method, - 'resource': self._request_resource, - 'userAgent': self._request_agent, - 'host': self._request_host, - 'ip': self._request_remote_addr, - 'responseSize': self._response_size, - 'status': self._response_status_code, - 'requestId': self._request_log_id, - 'traceId': self._trace, - 'versionId': os.environ.get('GAE_VERSION', 'devel'), - 'urlMapEntry': 'validation.main.app' - } - - if self._request_taskname: - proto_payload_args['taskName'] = self._request_taskname - proto_payload_args['taskQueueName'] = self._request_queue - - if self._first_log_ts: - if self._first_log_ts: - total_time = datetime.utcnow() - self._first_log_ts - else: - total_time = 0 - proto_payload_args['latency'] = '{0}.{1}s'.format( - total_time.seconds, total_time.microseconds) - - proto_payload_pb2 = setup_proto_payload(lines, - self.log_completion_status, - **proto_payload_args) - - log_entry_pb2_args['proto_payload'] = proto_payload_pb2 - - # https://cloud.google.com/logging/docs/reference/v2/rpc/google.logging.v2#google.logging.v2.LogEntry - log_entry_pb2 = gcp_logging_v2.types.log_entry_pb2.LogEntry( - **log_entry_pb2_args) - - self._logging_client.write_log_entries( - [log_entry_pb2], - log_name=LOG_NAME_TEMPLATE.format( - project_id=app_identity.get_application_id())) - - -def get_gcp_logger() -> GCPStackDriverLogger: - """ - Return the GCPStackDriverLogger object for this thread. - :return: GCPStackDriverLogger object - """ - if hasattr(_thread_store, 'logger'): - _logger = getattr(_thread_store, 'logger') - return _logger - - # We may need to initialize the logger for this thread. - if 'GAE_ENV' in os.environ: - _logger = GCPStackDriverLogger() - setattr(_thread_store, 'logger', _logger) - return _logger - - -class GCPLoggingHandler(logging.Handler): - - def emit(self, record: logging.LogRecord): - """ - Capture and store a log event record. - :param record: Python log record - """ - _logger = get_gcp_logger() - if _logger: - _logger.log_event(record) - return - - line = setup_log_line(record) - print(line) - - -def initialize_logging(log_level=logging.INFO): - """ - Setup GCP Stack Driver logging if we are running in App Engine. - :param log_level: Log level to use. - """ - if 'GAE_ENV' in os.environ: - # Configure root logger - root_logger = logging.getLogger() - root_logger.setLevel(log_level) - # Configure StackDriver logging handler - log_handler = GCPLoggingHandler() - log_handler.setLevel(log_level) - - # Add StackDriver logging handler to root logger. - root_logger.addHandler(log_handler) - - -def begin_request_logging(): - """ - Initialize logging for a new request. Not guarantied to always be called. - """ - _logger = get_gcp_logger() - if _logger: - _logger.setup_from_request(_request=request, initial=True) - - -def end_request_logging(response): - """ - Finalize and send any log entries. Not guarantied to always be called. 
- """ - _logger = get_gcp_logger() - if _logger: - _logger.finalize(_response=response, _request=request) - return response - - -def flush_request_logs(): - """ - Flush any pending log records. - """ - _logger = get_gcp_logger() - if _logger: - _logger.finalize(_request=request) diff --git a/data_steward/curation_logging/gcp_request_log_pb2.py b/data_steward/curation_logging/gcp_request_log_pb2.py deleted file mode 100644 index 3b350fafad..0000000000 --- a/data_steward/curation_logging/gcp_request_log_pb2.py +++ /dev/null @@ -1,847 +0,0 @@ -# pylint: disable-all -# Generated by the protocol buffer compiler. DO NOT EDIT! -# source: google/appengine/logging/v1/request_log.proto - -import sys -_b = sys.version_info[0] < 3 and (lambda x: x) or (lambda x: x.encode('latin1')) -from google.protobuf import descriptor as _descriptor -from google.protobuf import message as _message -from google.protobuf import reflection as _reflection -from google.protobuf import symbol_database as _symbol_database -from google.protobuf import descriptor_pb2 -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - -from google.logging.type import log_severity_pb2 as google_dot_logging_dot_type_dot_log__severity__pb2 -from google.protobuf import duration_pb2 as google_dot_protobuf_dot_duration__pb2 -from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2 - -DESCRIPTOR = _descriptor.FileDescriptor( - name='google/appengine/logging/v1/request_log.proto', - package='google.appengine.logging.v1', - syntax='proto3', - serialized_pb=_b( - '\n-google/appengine/logging/v1/request_log.proto\x12\x1bgoogle.appengine.logging.v1\x1a&google/logging/type/log_severity.proto\x1a\x1egoogle/protobuf/duration.proto\x1a\x1fgoogle/protobuf/timestamp.proto\"\xc2\x01\n\x07LogLine\x12(\n\x04time\x18\x01 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\x32\n\x08severity\x18\x02 \x01(\x0e\x32 .google.logging.type.LogSeverity\x12\x13\n\x0blog_message\x18\x03 \x01(\t\x12\x44\n\x0fsource_location\x18\x04 \x01(\x0b\x32+.google.appengine.logging.v1.SourceLocation\"C\n\x0eSourceLocation\x12\x0c\n\x04\x66ile\x18\x01 \x01(\t\x12\x0c\n\x04line\x18\x02 \x01(\x03\x12\x15\n\rfunction_name\x18\x03 \x01(\t\":\n\x0fSourceReference\x12\x12\n\nrepository\x18\x01 \x01(\t\x12\x13\n\x0brevision_id\x18\x02 \x01(\t\"\xbe\x06\n\nRequestLog\x12\x0e\n\x06\x61pp_id\x18\x01 \x01(\t\x12\x11\n\tmodule_id\x18% \x01(\t\x12\x12\n\nversion_id\x18\x02 \x01(\t\x12\x12\n\nrequest_id\x18\x03 \x01(\t\x12\n\n\x02ip\x18\x04 \x01(\t\x12.\n\nstart_time\x18\x06 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12,\n\x08\x65nd_time\x18\x07 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12*\n\x07latency\x18\x08 \x01(\x0b\x32\x19.google.protobuf.Duration\x12\x13\n\x0bmega_cycles\x18\t \x01(\x03\x12\x0e\n\x06method\x18\n \x01(\t\x12\x10\n\x08resource\x18\x0b \x01(\t\x12\x14\n\x0chttp_version\x18\x0c \x01(\t\x12\x0e\n\x06status\x18\r \x01(\x05\x12\x15\n\rresponse_size\x18\x0e \x01(\x03\x12\x10\n\x08referrer\x18\x0f \x01(\t\x12\x12\n\nuser_agent\x18\x10 \x01(\t\x12\x10\n\x08nickname\x18( \x01(\t\x12\x15\n\rurl_map_entry\x18\x11 \x01(\t\x12\x0c\n\x04host\x18\x14 \x01(\t\x12\x0c\n\x04\x63ost\x18\x15 \x01(\x01\x12\x17\n\x0ftask_queue_name\x18\x16 \x01(\t\x12\x11\n\ttask_name\x18\x17 \x01(\t\x12\x1b\n\x13was_loading_request\x18\x18 \x01(\x08\x12/\n\x0cpending_time\x18\x19 \x01(\x0b\x32\x19.google.protobuf.Duration\x12\x16\n\x0einstance_index\x18\x1a \x01(\x05\x12\x10\n\x08\x66inished\x18\x1b \x01(\x08\x12\r\n\x05\x66irst\x18* 
\x01(\x08\x12\x13\n\x0binstance_id\x18\x1c \x01(\t\x12\x32\n\x04line\x18\x1d \x03(\x0b\x32$.google.appengine.logging.v1.LogLine\x12\x1a\n\x12\x61pp_engine_release\x18& \x01(\t\x12\x10\n\x08trace_id\x18\' \x01(\t\x12\x46\n\x10source_reference\x18) \x03(\x0b\x32,.google.appengine.logging.v1.SourceReferenceBx\n\x1f\x63om.google.appengine.logging.v1B\x0fRequestLogProtoP\x01ZBgoogle.golang.org/genproto/googleapis/appengine/logging/v1;loggingb\x06proto3' - ), - dependencies=[ - google_dot_logging_dot_type_dot_log__severity__pb2.DESCRIPTOR, - google_dot_protobuf_dot_duration__pb2.DESCRIPTOR, - google_dot_protobuf_dot_timestamp__pb2.DESCRIPTOR, - ]) - -_LOGLINE = _descriptor.Descriptor( - name='LogLine', - full_name='google.appengine.logging.v1.LogLine', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='time', - full_name='google.appengine.logging.v1.LogLine.time', - index=0, - number=1, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='severity', - full_name='google.appengine.logging.v1.LogLine.severity', - index=1, - number=2, - type=14, - cpp_type=8, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='log_message', - full_name='google.appengine.logging.v1.LogLine.log_message', - index=2, - number=3, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=_b("").decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='source_location', - full_name='google.appengine.logging.v1.LogLine.source_location', - index=3, - number=4, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - ], - extensions=[], - nested_types=[], - enum_types=[], - options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[], - serialized_start=184, - serialized_end=378, -) - -_SOURCELOCATION = _descriptor.Descriptor( - name='SourceLocation', - full_name='google.appengine.logging.v1.SourceLocation', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='file', - full_name='google.appengine.logging.v1.SourceLocation.file', - index=0, - number=1, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=_b("").decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='line', - full_name='google.appengine.logging.v1.SourceLocation.line', - index=1, - number=2, - type=3, - cpp_type=2, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='function_name', - full_name='google.appengine.logging.v1.SourceLocation.function_name', - index=2, - number=3, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - 
default_value=_b("").decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - ], - extensions=[], - nested_types=[], - enum_types=[], - options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[], - serialized_start=380, - serialized_end=447, -) - -_SOURCEREFERENCE = _descriptor.Descriptor( - name='SourceReference', - full_name='google.appengine.logging.v1.SourceReference', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='repository', - full_name='google.appengine.logging.v1.SourceReference.repository', - index=0, - number=1, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=_b("").decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='revision_id', - full_name='google.appengine.logging.v1.SourceReference.revision_id', - index=1, - number=2, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=_b("").decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - ], - extensions=[], - nested_types=[], - enum_types=[], - options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[], - serialized_start=449, - serialized_end=507, -) - -_REQUESTLOG = _descriptor.Descriptor( - name='RequestLog', - full_name='google.appengine.logging.v1.RequestLog', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='app_id', - full_name='google.appengine.logging.v1.RequestLog.app_id', - index=0, - number=1, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=_b("").decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='module_id', - full_name='google.appengine.logging.v1.RequestLog.module_id', - index=1, - number=37, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=_b("").decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='version_id', - full_name='google.appengine.logging.v1.RequestLog.version_id', - index=2, - number=2, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=_b("").decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='request_id', - full_name='google.appengine.logging.v1.RequestLog.request_id', - index=3, - number=3, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=_b("").decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='ip', - full_name='google.appengine.logging.v1.RequestLog.ip', - index=4, - number=4, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=_b("").decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - 
name='start_time', - full_name='google.appengine.logging.v1.RequestLog.start_time', - index=5, - number=6, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='end_time', - full_name='google.appengine.logging.v1.RequestLog.end_time', - index=6, - number=7, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='latency', - full_name='google.appengine.logging.v1.RequestLog.latency', - index=7, - number=8, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='mega_cycles', - full_name='google.appengine.logging.v1.RequestLog.mega_cycles', - index=8, - number=9, - type=3, - cpp_type=2, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='method', - full_name='google.appengine.logging.v1.RequestLog.method', - index=9, - number=10, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=_b("").decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='resource', - full_name='google.appengine.logging.v1.RequestLog.resource', - index=10, - number=11, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=_b("").decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='http_version', - full_name='google.appengine.logging.v1.RequestLog.http_version', - index=11, - number=12, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=_b("").decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='status', - full_name='google.appengine.logging.v1.RequestLog.status', - index=12, - number=13, - type=5, - cpp_type=1, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='response_size', - full_name='google.appengine.logging.v1.RequestLog.response_size', - index=13, - number=14, - type=3, - cpp_type=2, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='referrer', - full_name='google.appengine.logging.v1.RequestLog.referrer', - index=14, - number=15, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=_b("").decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - 
_descriptor.FieldDescriptor( - name='user_agent', - full_name='google.appengine.logging.v1.RequestLog.user_agent', - index=15, - number=16, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=_b("").decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='nickname', - full_name='google.appengine.logging.v1.RequestLog.nickname', - index=16, - number=40, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=_b("").decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='url_map_entry', - full_name='google.appengine.logging.v1.RequestLog.url_map_entry', - index=17, - number=17, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=_b("").decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='host', - full_name='google.appengine.logging.v1.RequestLog.host', - index=18, - number=20, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=_b("").decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='cost', - full_name='google.appengine.logging.v1.RequestLog.cost', - index=19, - number=21, - type=1, - cpp_type=5, - label=1, - has_default_value=False, - default_value=float(0), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='task_queue_name', - full_name='google.appengine.logging.v1.RequestLog.task_queue_name', - index=20, - number=22, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=_b("").decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='task_name', - full_name='google.appengine.logging.v1.RequestLog.task_name', - index=21, - number=23, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=_b("").decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='was_loading_request', - full_name= - 'google.appengine.logging.v1.RequestLog.was_loading_request', - index=22, - number=24, - type=8, - cpp_type=7, - label=1, - has_default_value=False, - default_value=False, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='pending_time', - full_name='google.appengine.logging.v1.RequestLog.pending_time', - index=23, - number=25, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='instance_index', - full_name='google.appengine.logging.v1.RequestLog.instance_index', - index=24, - number=26, - type=5, - cpp_type=1, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - 
enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='finished', - full_name='google.appengine.logging.v1.RequestLog.finished', - index=25, - number=27, - type=8, - cpp_type=7, - label=1, - has_default_value=False, - default_value=False, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='first', - full_name='google.appengine.logging.v1.RequestLog.first', - index=26, - number=42, - type=8, - cpp_type=7, - label=1, - has_default_value=False, - default_value=False, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='instance_id', - full_name='google.appengine.logging.v1.RequestLog.instance_id', - index=27, - number=28, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=_b("").decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='line', - full_name='google.appengine.logging.v1.RequestLog.line', - index=28, - number=29, - type=11, - cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='app_engine_release', - full_name= - 'google.appengine.logging.v1.RequestLog.app_engine_release', - index=29, - number=38, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=_b("").decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='trace_id', - full_name='google.appengine.logging.v1.RequestLog.trace_id', - index=30, - number=39, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=_b("").decode('utf-8'), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='source_reference', - full_name='google.appengine.logging.v1.RequestLog.source_reference', - index=31, - number=41, - type=11, - cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - options=None), - ], - extensions=[], - nested_types=[], - enum_types=[], - options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[], - serialized_start=510, - serialized_end=1340, -) - -_LOGLINE.fields_by_name[ - 'time'].message_type = google_dot_protobuf_dot_timestamp__pb2._TIMESTAMP -_LOGLINE.fields_by_name[ - 'severity'].enum_type = google_dot_logging_dot_type_dot_log__severity__pb2._LOGSEVERITY -_LOGLINE.fields_by_name['source_location'].message_type = _SOURCELOCATION -_REQUESTLOG.fields_by_name[ - 'start_time'].message_type = google_dot_protobuf_dot_timestamp__pb2._TIMESTAMP -_REQUESTLOG.fields_by_name[ - 'end_time'].message_type = google_dot_protobuf_dot_timestamp__pb2._TIMESTAMP -_REQUESTLOG.fields_by_name[ - 'latency'].message_type = google_dot_protobuf_dot_duration__pb2._DURATION -_REQUESTLOG.fields_by_name[ - 'pending_time'].message_type = google_dot_protobuf_dot_duration__pb2._DURATION 
-_REQUESTLOG.fields_by_name['line'].message_type = _LOGLINE -_REQUESTLOG.fields_by_name['source_reference'].message_type = _SOURCEREFERENCE -DESCRIPTOR.message_types_by_name['LogLine'] = _LOGLINE -DESCRIPTOR.message_types_by_name['SourceLocation'] = _SOURCELOCATION -DESCRIPTOR.message_types_by_name['SourceReference'] = _SOURCEREFERENCE -DESCRIPTOR.message_types_by_name['RequestLog'] = _REQUESTLOG -_sym_db.RegisterFileDescriptor(DESCRIPTOR) - -LogLine = _reflection.GeneratedProtocolMessageType( - 'LogLine', - (_message.Message,), - dict( - DESCRIPTOR=_LOGLINE, - __module__='google.appengine.logging.v1.request_log_pb2' - # @@protoc_insertion_point(class_scope:google.appengine.logging.v1.LogLine) - )) -_sym_db.RegisterMessage(LogLine) - -SourceLocation = _reflection.GeneratedProtocolMessageType( - 'SourceLocation', - (_message.Message,), - dict( - DESCRIPTOR=_SOURCELOCATION, - __module__='google.appengine.logging.v1.request_log_pb2' - # @@protoc_insertion_point(class_scope:google.appengine.logging.v1.SourceLocation) - )) -_sym_db.RegisterMessage(SourceLocation) - -SourceReference = _reflection.GeneratedProtocolMessageType( - 'SourceReference', - (_message.Message,), - dict( - DESCRIPTOR=_SOURCEREFERENCE, - __module__='google.appengine.logging.v1.request_log_pb2' - # @@protoc_insertion_point(class_scope:google.appengine.logging.v1.SourceReference) - )) -_sym_db.RegisterMessage(SourceReference) - -RequestLog = _reflection.GeneratedProtocolMessageType( - 'RequestLog', - (_message.Message,), - dict( - DESCRIPTOR=_REQUESTLOG, - __module__='google.appengine.logging.v1.request_log_pb2' - # @@protoc_insertion_point(class_scope:google.appengine.logging.v1.RequestLog) - )) -_sym_db.RegisterMessage(RequestLog) - -DESCRIPTOR.has_options = True -DESCRIPTOR._options = _descriptor._ParseOptions( - descriptor_pb2.FileOptions(), - _b('\n\037com.google.appengine.logging.v1B\017RequestLogProtoP\001ZBgoogle.golang.org/genproto/googleapis/appengine/logging/v1;logging' - )) -try: - # THESE ELEMENTS WILL BE DEPRECATED. - # Please use the generated *_pb2_grpc.py files instead. 
- import grpc - from grpc.beta import implementations as beta_implementations - from grpc.beta import interfaces as beta_interfaces - from grpc.framework.common import cardinality - from grpc.framework.interfaces.face import utilities as face_utilities -except ImportError: - pass -# @@protoc_insertion_point(module_scope) diff --git a/data_steward/deid/requirements.txt b/data_steward/deid/requirements.txt index d9fc82c052..36d7ae303f 100644 --- a/data_steward/deid/requirements.txt +++ b/data_steward/deid/requirements.txt @@ -1,14 +1,41 @@ -google-api-core -google-auth -google-auth-httplib2 -google-auth-oauthlib -google-cloud-bigquery -google-cloud-core -google-resumable-media -googleapis-common-protos -numpy -pandas -pandas-gbq -pydata-google-auth -pymongo -tqdm +cachetools==5.3.2 +certifi==2023.7.22 +charset-normalizer==3.3.1 +db-dtypes==1.1.1 +dnspython==2.4.2 +google-api-core==2.12.0 +google-auth==2.23.4 +google-auth-httplib2==0.1.1 +google-auth-oauthlib==1.1.0 +google-cloud-bigquery==3.13.0 +google-cloud-bigquery-storage==2.22.0 +google-cloud-core==2.3.3 +google-crc32c==1.5.0 +google-resumable-media==2.6.0 +googleapis-common-protos==1.61.0 +grpcio==1.59.2 +grpcio-status==1.59.2 +httplib2==0.22.0 +idna==3.4 +numpy==1.26.1 +oauthlib==3.2.2 +packaging==23.2 +pandas==1.5.3 +pandas-gbq==0.19.2 +proto-plus==1.22.3 +protobuf==4.24.4 +pyarrow==13.0.0 +pyasn1==0.5.0 +pyasn1-modules==0.3.0 +pydata-google-auth==1.8.2 +pymongo==4.5.0 +pyparsing==3.1.1 +python-dateutil==2.8.2 +pytz==2023.3.post1 +requests==2.31.0 +requests-oauthlib==1.3.1 +rsa==4.9 +six==1.16.0 +tqdm==4.66.1 +tzdata==2023.3 +urllib3==2.0.7 diff --git a/data_steward/dev_requirements.txt b/data_steward/dev_requirements.txt index 3b66e5ca2c..23b0188ec2 100644 --- a/data_steward/dev_requirements.txt +++ b/data_steward/dev_requirements.txt @@ -1,10 +1,21 @@ -coverage==4.5.2 -pylint==2.4.4 -modernize==0.7 -unittest-xml-reporting>=2.5.2,==2.* -yapf==0.29.0 -lxml==4.5.0 -beautifulsoup4==4.8.2 -soupsieve==1.9.5 +appdirs==1.4.4 +astroid==3.0.1 +beautifulsoup4==4.12.2 bs4==0.0.1 -pipdeptree==1.0.0 +coverage==7.3.2 +dill==0.3.7 +fissix==21.11.13 +importlib-metadata==6.8.0 +isort==5.12.0 +lxml==4.9.3 +mccabe==0.7.0 +modernize==0.8.0 +pipdeptree==2.13.0 +platformdirs==3.11.0 +pylint==3.0.2 +soupsieve==2.5 +tomli==2.0.1 +tomlkit==0.12.1 +unittest-xml-reporting==3.2.0 +yapf==0.40.2 +zipp==3.17.0 diff --git a/data_steward/gcloud/bq/__init__.py b/data_steward/gcloud/bq/__init__.py index 54980abc97..f8d56a6b76 100644 --- a/data_steward/gcloud/bq/__init__.py +++ b/data_steward/gcloud/bq/__init__.py @@ -28,6 +28,7 @@ is_rdr_dataset, is_mapping_table from constants.utils import bq as consts from common import JINJA_ENV, IDENTITY_MATCH, PARTICIPANT_MATCH, PIPELINE_TABLES, SITE_MASKING_TABLE_ID +from resources import get_bq_col_type tracer_provider = TracerProvider() trace.set_tracer_provider(tracer_provider) @@ -106,22 +107,6 @@ def get_table_schema(self, table_name: str, fields=None) -> list: return schema - def _to_standard_sql_type(self, field_type: str) -> str: - """ - Get standard SQL type corresponding to a SchemaField type - - :param field_type: type in SchemaField object (can be legacy or standard SQL type) - :return: standard SQL type name - """ - upper_field_type = field_type.upper() - standard_sql_type_code = bigquery.schema.LEGACY_TO_STANDARD_TYPES.get( - upper_field_type) - if not standard_sql_type_code: - raise ValueError(f'{field_type} is not a valid field type') - standard_sql_type = bigquery.StandardSqlDataTypes( - standard_sql_type_code) 
- return standard_sql_type.name - def _to_sql_field(self, field: bigquery.SchemaField) -> bigquery.SchemaField: """ @@ -130,9 +115,12 @@ def _to_sql_field(self, :param field: the schema field object :return: a converted schema field object """ - return bigquery.SchemaField( - field.name, self._to_standard_sql_type(field.field_type), - field.mode, field.description, field.fields) + return bigquery.SchemaField(name=field.name, + field_type=get_bq_col_type( + field.field_type), + mode=field.mode, + description=field.description, + fields=field.fields) def get_validated_schema_fields( schema_filepath: str) -> typing.List[bigquery.SchemaField]: diff --git a/data_steward/requirements.txt b/data_steward/requirements.txt index da774adef7..ce2ce2037c 100644 --- a/data_steward/requirements.txt +++ b/data_steward/requirements.txt @@ -1,192 +1,256 @@ -aniso8601==1.3.0 +aniso8601==9.0.1 +annotated-types==0.6.0 ansiwrap==0.8.4 +anyio==4.0.0 appdirs==1.4.4 -appnope==0.1.2 -argon2-cffi==21.1.0 +appnope==0.1.3 +argon2-cffi==23.1.0 +argon2-cffi-bindings==21.2.0 +arrow==1.3.0 asn1crypto==1.5.1 -astroid==2.3.3 +astroid==3.0.1 +asttokens==2.4.1 async-generator==1.10 -attrs==20.3.0 +async-lru==2.0.4 +attrs==23.1.0 +Babel==2.13.1 backports-abc==0.5 backports.shutil-get-terminal-size==1.0.0 -beautifulsoup4==4.8.2 -black==21.4b2 -bleach==3.1.0 -Bottleneck==1.3.2 +beautifulsoup4==4.12.2 +black==23.10.1 +bleach==6.1.0 +blinker==1.6.3 +Bottleneck==1.3.7 bs4==0.0.1 -cachetools==2.0.1 -certifi==2017.11.5 -cffi==1.14.5 -chardet==3.0.4 -click==7.1.2 -configparser==4.0.2 -contextlib2==0.6.0.post1 -coverage==4.5.2 -cycler==0.10.0 -decorator==4.4.1 -defusedxml==0.6.0 -Deprecated==1.2.13 +cachetools==5.3.2 +certifi==2023.7.22 +cffi==1.16.0 +chardet==5.2.0 +charset-normalizer==3.3.1 +click==8.1.7 +comm==0.1.4 +configparser==6.0.0 +contextlib2==21.6.0 +contourpy==1.1.1 +coverage==7.3.2 +cycler==0.12.1 +db-dtypes==1.1.1 +debugpy==1.8.0 +decorator==5.1.1 +defusedxml==0.7.1 +Deprecated==1.2.14 deprecation==2.1.0 +dill==0.3.7 docopt==0.4.0 -dpath==1.4.2 -entrypoints==0.3 -Flask==0.10 +dpath==2.1.6 +entrypoints==0.4 +executing==2.0.1 +fastjsonschema==2.18.1 +fissix==21.11.13 +Flask==3.0.0 +fonttools==4.43.1 +fqdn==1.5.1 funcsigs==1.0.2 -gitdb==4.0.9 -GitPython==3.1.27 -google-api-core==1.28.0 -google-api-python-client==2.6.0 -google-auth==1.30.0 -google-auth-httplib2==0.1.0 -google-auth-oauthlib==0.4.4 -google-cloud-bigquery==2.18.0 -google-cloud-bigquery-storage==2.4.0 -google-cloud-core==1.6.0 -google-cloud-logging==1.15.1 -google-cloud-secret-manager==2.8.0 -google-cloud-storage==1.38.0 -google-cloud-trace==1.3.0 -google-crc32c==1.1.2 -google-resumable-media==1.2.0 -googleapis-common-protos==1.53.0 +gitdb==4.0.11 +GitPython==3.1.40 +google-api-core==2.12.0 +google-api-python-client==2.106.0 +google-auth==2.23.4 +google-auth-httplib2==0.1.1 +google-auth-oauthlib==1.1.0 +google-cloud-appengine-logging==1.3.2 +google-cloud-audit-log==0.2.5 +google-cloud-bigquery==3.13.0 +google-cloud-bigquery-storage==2.22.0 +google-cloud-core==2.3.3 +google-cloud-logging==3.8.0 +google-cloud-secret-manager==2.16.4 +google-cloud-storage==2.13.0 +google-cloud-trace==1.11.3 +google-crc32c==1.5.0 +google-resumable-media==2.6.0 +googleapis-common-protos==1.61.0 GoogleAppEngineCloudStorageClient==1.9.22.1 -greenlet==1.1.2 -grpc-google-iam-v1==0.12.3 -grpcio==1.37.1 -gunicorn==20.0.2 +greenlet==3.0.1 +grpc-google-iam-v1==0.12.6 +grpcio==1.59.2 +grpcio-status==1.59.2 +gunicorn==21.2.0 htmlmin==0.1.12 -httplib2==0.19.1 -idna==2.10 -ImageHash==4.2.1 
-importlib-metadata==0.23 -iniconfig==1.1.1 +httplib2==0.22.0 +idna==3.4 +ImageHash==4.3.1 +importlib-metadata==6.8.0 +iniconfig==2.0.0 ipaddress==1.0.23 -ipykernel==4.10.1 -ipython==5.8.0 +ipykernel==6.26.0 +ipython==8.17.2 ipython-genutils==0.2.0 -ipywidgets==7.5.1 -isort==4.3.21 -itsdangerous==0.24 -Jinja2==2.11.3 -joblib==1.0.1 -jsonschema==3.1.1 +ipywidgets==8.1.1 +isoduration==20.11.0 +isort==5.12.0 +itsdangerous==2.1.2 +jedi==0.19.1 +Jinja2==3.1.2 +joblib==1.1.1 +json5==0.9.14 +jsonpointer==2.4 +jsonschema==4.19.2 +jsonschema-specifications==2023.7.1 jupyter==1.0.0 -jupyter-client==6.1.7 -jupyter-console==5.2.0 -jupyter-core==4.6.1 -jupyterlab-pygments==0.1.2 -jupytext==1.7.1 -kiwisolver==1.3.1 -lazy-object-proxy==1.4.3 -libcst==0.3.19 -lxml==4.5.0 -mandrill==1.0.59 -Markdown==2.6.9 -markdown-it-py==0.5.8 -MarkupSafe==1.1.1 -matplotlib==3.4.1 -matplotlib-venn==0.11.7 -mccabe==0.6.1 -missingno==0.5.0 -mistune==0.8.4 -mo-future==2.48.19205 -mock==4.0.3 -modernize==0.7 -more-itertools==5.0.0 -moz-sql-parser==2.44.19084 -multimethod==1.4 -mypy-extensions==0.4.3 -nbclient==0.5.3 -nbconvert==6.0.7 -nbformat==5.1.3 -nest-asyncio==1.5.1 -networkx==2.4 -notebook==6.4.0 -numpy==1.20.3 +jupyter-console==6.6.3 +jupyter-events==0.8.0 +jupyter-lsp==2.2.0 +jupyter_client==8.5.0 +jupyter_core==5.5.0 +jupyter_server==2.9.1 +jupyter_server_terminals==0.4.4 +jupyterlab==4.0.7 +jupyterlab-pygments==0.2.2 +jupyterlab-widgets==3.0.9 +jupyterlab_server==2.25.0 +jupytext==1.15.2 +kiwisolver==1.4.5 +lazy-object-proxy==1.9.0 +libcst==1.1.0 +lxml==4.9.3 +mandrill==1.0.60 +Markdown==3.5 +markdown-it-py==3.0.0 +MarkupSafe==2.1.3 +matplotlib==3.8.0 +matplotlib-inline==0.1.6 +matplotlib-venn==0.11.9 +mccabe==0.7.0 +mdit-py-plugins==0.4.0 +mdurl==0.1.2 +missingno==0.5.2 +mistune==3.0.2 +mo-dots==4.22.21108 +mo-future==3.147.20327 +mo-imports==3.149.20327 +mo-kwargs==4.22.21108 +mo-logs==4.23.21108 +mock==5.1.0 +modernize==0.8.0 +more-itertools==10.1.0 +moz-sql-parser==4.40.21126 +multimethod==1.10 +mypy-extensions==1.0.0 +nbclient==0.8.0 +nbconvert==7.10.0 +nbformat==5.9.2 +nest-asyncio==1.5.8 +networkx==3.2.1 +notebook==7.0.6 +notebook_shim==0.2.3 +numpy==1.26.1 oauth2client==4.1.3 -oauthlib==3.1.0 -opentelemetry-api==1.9.1 -opentelemetry-exporter-gcp-trace==1.1.0 -opentelemetry-sdk==1.9.1 -opentelemetry-semantic-conventions==0.28b1 -packaging==20.9 -pandas==1.2.4 -pandas-gbq==0.15.0 -pandas-profiling==3.0.0 -pandocfilters==1.4.3 -papermill==2.2.2 -pathlib2==2.3.5 -pathspec==0.8.1 -pbr==3.1.1 +oauthlib==3.2.2 +opentelemetry-api==1.20.0 +opentelemetry-exporter-gcp-trace==1.6.0 +opentelemetry-resourcedetector-gcp==1.6.0a0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +overrides==7.4.0 +packaging==23.2 +pandas==1.5.3 +pandas-gbq==0.19.2 +pandas-profiling==3.2.0 +pandocfilters==1.5.0 +papermill==2.4.0 +parso==0.8.3 +pathlib2==2.3.7.post1 +pathspec==0.11.2 +pbr==5.11.1 pexpect==4.8.0 -pg8000==1.24.1 -phik==0.12.0 +pg8000==1.30.3 +phik==0.12.3 pickleshare==0.7.5 -Pillow==8.2.0 -pipdeptree==1.0.0 -pluggy==1.0.0 -prometheus-client==0.7.1 -prompt-toolkit==1.0.18 -proto-plus==1.18.1 -protobuf==3.17.1 -pg8000==1.24.1 +Pillow==10.1.0 +pipdeptree==2.13.0 +platformdirs==3.11.0 +pluggy==1.3.0 +prometheus-client==0.18.0 +prompt-toolkit==3.0.39 +proto-plus==1.22.3 +protobuf==4.24.4 +psutil==5.9.6 ptyprocess==0.7.0 -py==1.10.0 -pyarrow==4.0.1 -pyasn1==0.4.7 -pyasn1-modules==0.2.6 -pycparser==2.20 -pydantic==1.8.2 -pydata-google-auth==0.1.3 -Pygments==2.4.2 -pylint==2.4.4 -pyparsing==2.4.5 
-pyrsistent==0.15.5 -pytest==6.2.5 -python-dateutil==2.8.1 -pytz==2017.3 -PyWavelets==1.1.1 -PyYAML==5.4.1 -pyzmq==18.1.0 +pure-eval==0.2.2 +py==1.11.0 +pyarrow==13.0.0 +pyasn1==0.5.0 +pyasn1-modules==0.3.0 +pycparser==2.21 +pydantic==2.4.2 +pydantic_core==2.10.1 +pydata-google-auth==1.8.2 +Pygments==2.16.1 +pylint==3.0.2 +pyparsing==3.1.1 +pyrsistent==0.20.0 +pytest==7.4.3 +python-dateutil==2.8.2 +python-json-logger==2.0.7 +pytz==2023.3.post1 +PyWavelets==1.4.1 +PyYAML==6.0.1 +pyzmq==25.1.1 qgrid==1.3.1 -qtconsole==4.5.5 -regex==2021.4.4 -requests==2.25.1 -requests-oauthlib==1.3.0 -rsa==3.4.2 -scipy==1.6.3 -scramp==1.4.1 -seaborn==0.11.1 -Send2Trash==1.5.0 +qtconsole==5.4.4 +QtPy==2.4.1 +referencing==0.30.2 +regex==2023.10.3 +requests==2.31.0 +requests-oauthlib==1.3.1 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rpds-py==0.10.6 +rsa==4.9 +scipy==1.11.3 +scramp==1.4.4 +seaborn==0.13.0 +Send2Trash==1.8.2 simplegeneric==0.8.1 -simplejson==3.17.0 -singledispatch==3.4.0.3 -six==1.14.0 -smmap==5.0.0 -soupsieve==1.9.5 -SQLAlchemy==1.4.32 -tangled-up-in-unicode==0.1.0 -tenacity==7.0.0 -terminado==0.9.5 -testpath==0.4.4 +simplejson==3.19.2 +singledispatch==4.1.0 +six==1.16.0 +smmap==5.0.1 +sniffio==1.3.0 +soupsieve==2.5 +SQLAlchemy==2.0.22 +stack-data==0.6.3 +tangled-up-in-unicode==0.2.0 +tenacity==8.2.3 +terminado==0.17.1 +testpath==0.6.0 textwrap3==0.9.2 +tinycss2==1.2.1 toml==0.10.2 -tornado==6.1 -tqdm==4.60.0 -traitlets==4.3.3 -typed-ast==1.4.3 -typing-extensions==3.7.4.3 -typing-inspect==0.6.0 -unittest-xml-reporting==2.5.2 -uritemplate==3.0.0 -urllib3==1.24.3 +tomli==2.0.1 +tomlkit==0.12.1 +tornado==6.3.3 +tqdm==4.66.1 +traitlets==5.13.0 +typed-ast==1.5.5 +types-python-dateutil==2.8.19.14 +typing-inspect==0.9.0 +typing_extensions==4.8.0 +tzdata==2023.3 +unittest-xml-reporting==3.2.0 +uri-template==1.3.0 +uritemplate==4.1.1 +urllib3==2.0.7 utils==1.0.1 -visions==0.7.1 -wcwidth==0.1.7 +visions==0.7.4 +wcwidth==0.2.9 +webcolors==1.13 webencodings==0.5.1 -Werkzeug==0.12.2 -widgetsnbextension==3.5.1 -wrapt==1.11.2 +websocket-client==1.6.4 +Werkzeug==3.0.1 +widgetsnbextension==4.0.9 +wrapt==1.15.0 xlrd==2.0.1 -yapf==0.29.0 -zipp==3.4.1 +yapf==0.40.2 +zipp==3.17.0 diff --git a/data_steward/retraction/retract_data_bq.py b/data_steward/retraction/retract_data_bq.py index 9ee05ea7ab..6dd57d75a2 100644 --- a/data_steward/retraction/retract_data_bq.py +++ b/data_steward/retraction/retract_data_bq.py @@ -317,10 +317,10 @@ def get_retraction_queries_fact_relationship( def get_tables_to_retract( - client: BigQueryClient, - dataset, - retraction_type, - hpo_id: Optional[str] = None, + client: BigQueryClient, + dataset, + retraction_type, + hpo_id: Optional[str] = None, ) -> List[str]: """ Creates a list of tables that need retraction in the dataset. 
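The schema refactor that appears in gcloud/bq/__init__.py above, and again in tools/load_vocab.py and utils/bq.py below, drops the removed `_to_standard_sql_type` helpers and rebuilds `SchemaField` objects through `resources.get_bq_col_type`, presumably because `bigquery.StandardSqlDataTypes` is no longer available in the google-cloud-bigquery 3.x client pinned in requirements.txt. The following is a minimal sketch of that conversion pattern, assuming only that `get_bq_col_type` maps a legacy type name such as `INTEGER` to its standard SQL equivalent (`INT64`); the stand-in lookup below is illustrative and is not the project helper from data_steward/resources.py.

```python
# Sketch only: the real get_bq_col_type lives in data_steward/resources.py and
# is assumed (not shown in this patch) to behave like the lookup below.
from google.cloud import bigquery

# Stand-in mapping for illustration; not the project's actual table.
_LEGACY_TO_STANDARD = {'INTEGER': 'INT64', 'FLOAT': 'FLOAT64', 'BOOLEAN': 'BOOL'}


def get_bq_col_type_stub(field_type: str) -> str:
    """Map a legacy BigQuery type name to its standard SQL name."""
    return _LEGACY_TO_STANDARD.get(field_type.upper(), field_type.upper())


def to_sql_field(field: bigquery.SchemaField) -> bigquery.SchemaField:
    """Rebuild a SchemaField with a standard SQL type, keeping the other attributes."""
    return bigquery.SchemaField(name=field.name,
                                field_type=get_bq_col_type_stub(field.field_type),
                                mode=field.mode,
                                description=field.description,
                                fields=field.fields)


legacy_field = bigquery.SchemaField('person_id', 'INTEGER', 'REQUIRED')
print(to_sql_field(legacy_field).field_type)  # -> INT64
```

The keyword-argument style in the sketch mirrors the replacement call sites shown in the diff, which keeps the rebuilt field readable even as the `SchemaField` constructor grows optional parameters across client releases.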
diff --git a/data_steward/tools/import_rdr_dataset.py b/data_steward/tools/import_rdr_dataset.py index 8e11c826a1..701f7b4dae 100644 --- a/data_steward/tools/import_rdr_dataset.py +++ b/data_steward/tools/import_rdr_dataset.py @@ -175,7 +175,8 @@ def create_rdr_tables(client, destination_dataset, rdr_project, f'FROM `{source_table_id}` ' f'WHERE participant_id IS NOT Null') else: - sql = (f'SELECT {fields_name_str} ' f'FROM `{source_table_id}`') + sql = (f'SELECT {fields_name_str} ' + f'FROM `{source_table_id}`') job_config = bigquery.job.QueryJobConfig( write_disposition=bigquery.job.WriteDisposition.WRITE_EMPTY, diff --git a/data_steward/tools/load_vocab.py b/data_steward/tools/load_vocab.py index 0742acfbff..62cfa8dbb7 100644 --- a/data_steward/tools/load_vocab.py +++ b/data_steward/tools/load_vocab.py @@ -138,9 +138,11 @@ def safe_schema_for(bq_client: BigQueryClient, table: str) -> List[SchemaField]: :return: a list of SchemaField objects """ return [ - SchemaField( - f.name, 'string' if f.field_type.lower() in DATE_TIME_TYPES else - f.field_type, f.mode, f.description) + SchemaField(name=f.name, + field_type='string' if f.field_type.lower() + in DATE_TIME_TYPES else f.field_type, + mode=f.mode, + description=f.description) for f in bq_client.get_table_schema(table) ] diff --git a/data_steward/utils/bq.py b/data_steward/utils/bq.py index 9f553bc049..ab4fc6af79 100644 --- a/data_steward/utils/bq.py +++ b/data_steward/utils/bq.py @@ -17,7 +17,7 @@ # Project Imports from utils import auth from constants.utils import bq as consts -from resources import fields_for +from resources import fields_for, get_bq_col_type from common import JINJA_ENV _MAX_RESULTS_PADDING = 100 @@ -163,26 +163,6 @@ def upload_csv_data_to_bq_table(client, dataset_id, table_name, fq_file_path, return result -@deprecated( - reason= - 'Use gcloud.bq.BigQueryClient._to_standard_sql_type(self, field_type: str) instead' -) -def _to_standard_sql_type(field_type: str) -> str: - """ - Get standard SQL type corresponding to a SchemaField type - - :param field_type: type in SchemaField object (can be legacy or standard SQL type) - :return: standard SQL type name - """ - upper_field_type = field_type.upper() - standard_sql_type_code = bigquery.schema.LEGACY_TO_STANDARD_TYPES.get( - upper_field_type) - if not standard_sql_type_code: - raise ValueError(f'{field_type} is not a valid field type') - standard_sql_type = bigquery.StandardSqlDataTypes(standard_sql_type_code) - return standard_sql_type.name - - @deprecated( reason= 'Use gcloud.bq.BigQueryClient._to_sql_field(self,field: bigquery.SchemaField) instead' @@ -194,9 +174,11 @@ def _to_sql_field(field: bigquery.SchemaField) -> bigquery.SchemaField: :param field: the schema field object :return: a converted schema field object """ - return bigquery.SchemaField(field.name, - _to_standard_sql_type(field.field_type), - field.mode, field.description, field.fields) + return bigquery.SchemaField(name=field.name, + field_type=get_bq_col_type(field.field_type), + mode=field.mode, + description=field.description, + fields=field.fields) @deprecated(reason=""" diff --git a/data_steward/validation/main.py b/data_steward/validation/main.py index 2e7c5af8dc..3cdb1e3f2d 100644 --- a/data_steward/validation/main.py +++ b/data_steward/validation/main.py @@ -19,6 +19,7 @@ from google.cloud import bigquery from google.cloud.storage.bucket import Blob from google.cloud.exceptions import GoogleCloudError +import google.cloud.logging as gc_logging from googleapiclient.errors import HttpError # 
Project imports @@ -37,8 +38,6 @@ from common import ACHILLES_EXPORT_PREFIX_STRING, ACHILLES_EXPORT_DATASOURCES_JSON, BIGQUERY_DATASET_ID, UNIONED_DATASET_ID from constants.validation import hpo_report as report_consts from constants.validation import main as consts -from curation_logging.curation_gae_handler import begin_request_logging, end_request_logging, \ - initialize_logging from retraction import retract_data_bq, retract_data_gcs from validation import achilles, achilles_heel, ehr_union, export, hpo_report from validation import email_notification as en @@ -51,6 +50,10 @@ app = Flask(__name__) +# Set up logging client so the logs will be grouped with "Correlate by" +logging_client = gc_logging.Client() +logging_client.setup_logging() + # register application error handlers app.register_blueprint(errors_blueprint) @@ -1149,11 +1152,6 @@ def ps_api_cron(): return consts.PS_API_SUCCESS -@app.before_first_request -def set_up_logging(): - initialize_logging() - - app.add_url_rule(consts.PREFIX + 'ValidateAllHpoFiles', endpoint='validate_all_hpos', view_func=validate_all_hpos, @@ -1195,10 +1193,3 @@ def set_up_logging(): endpoint='ps_api_cron', view_func=ps_api_cron, methods=['GET']) - -app.before_request( - begin_request_logging) # Must be first before_request() call. - -app.teardown_request( - end_request_logging -) # teardown_request to be called regardless if there is an exception thrown diff --git a/docker-compose.yml b/docker-compose.yml index 5c1a8f8c86..557bbe659e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,7 +3,7 @@ version: "3" services: # base image upon which all other containers are derived develop: - image: "curation:develop-18.04" + image: "curation:develop-22.04" build: context: . dockerfile: docker/develop/Dockerfile @@ -11,11 +11,12 @@ services: # they were envvars. it may be necessary to replicate some of these in the # runtime "environments" config key args: - GSDK_VERSION: "359.0.0" - GSDK_CHECKSUM: "b84828fbd7c2c842edf3df4d514f01f7b6a9e587fa5b563b0f1dde4587574b1b" + GSDK_VERSION: 454.0.0 + GSDK_CHECKSUM: 58fd3e6d34e6a6e4a4afbfd3a1470ea23ef7ed6e69841c4eb89829ef833dac2c CIRCLECI_CLI_VERSION: "0.1.15973" CIRCLECI_CLI_CHECKSUM: "4187a5245f06dd8e1d51d5a99ad40ed9e8963397cecf006fd2d6a04ac374bef6" - # run-time environment variable values. + GID: 20 + UID: 502 # run-time environment variable values. # may containe duplicates from "build.args" map environment: APPLICATION_ID: "aou-res-curation-test" diff --git a/docker/develop/Dockerfile b/docker/develop/Dockerfile index efc29a4333..278bab949f 100644 --- a/docker/develop/Dockerfile +++ b/docker/develop/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:18.04 +FROM python:3.11 LABEL org.opencontainers.image.authors="daniel.p.carbone@vumc.org" LABEL org.opencontainers.image.url="https://github.com/all-of-us/curation" LABEL org.opencontainers.image.documentation="https://github.com/all-of-us/curation" @@ -17,19 +17,19 @@ LABEL org.opencontainers.image.description="Base development container image use # run the base container init first, as it is highly unlikely to change. 
# this allows us to modify subsequent steps without initating a full image rebuild -# install deps and link "python" and "python3" execs to 3.7 +# install deps and link "python" and "python3" execs to 3.11 RUN apt update \ - && apt upgrade -y \ - && apt install -y \ - curl \ - git \ - python3.7-dev \ - python3.7-venv \ - python3-pip \ - python3-wheel \ - wget \ - && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 1 \ - && update-alternatives --install /usr/bin/python python /usr/bin/python3.7 1 + && apt upgrade -y \ + && apt install -y \ + curl \ + git \ + python3.11-dev \ + python3.11-venv \ + python3-pip \ + python3-wheel \ + wget \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 \ + && update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 ## BUILD ARGS @@ -77,38 +77,38 @@ ENV GOOGLE_APPLICATION_CREDENTIALS "${CIRCLE_WORKING_DIRECTORY}/aou-res-curation # download circleci cli RUN wget --quiet "${CIRCLECI_CLI_DOWNLOAD}" \ - && if ! $(echo "${CIRCLECI_CLI_CHECKSUM}" "${CIRCLECI_CLI_TAR_FILE}" | sha256sum --check --status); \ - then echo "CircleCI cli tar integrity check failure, please update build arg with correct checksum and/or version";\ - exit 1;\ - fi; + && if ! $(echo "${CIRCLECI_CLI_CHECKSUM}" "${CIRCLECI_CLI_TAR_FILE}" | sha256sum --check --status); \ + then echo "CircleCI cli tar integrity check failure, please update build arg with correct checksum and/or version";\ + exit 1;\ + fi; RUN tar -xvf "${CIRCLECI_CLI_TAR_FILE}" -C "/tmp" \ - && mv "${CIRCLECI_CLI_TMP_PATH}/circleci" "/usr/bin/circleci" \ - && rm -rf "${CIRCLECI_CLI_TMP_PATH}" \ - && /usr/bin/circleci update + && mv "${CIRCLECI_CLI_TMP_PATH}/circleci" "/usr/bin/circleci" \ + && rm -rf "${CIRCLECI_CLI_TMP_PATH}" \ + && /usr/bin/circleci update # CONTAINER USER SETUP # create runtime user group RUN if ! find . | grep -q ":${GID}:" /etc/group; then \ - addgroup --gid "${GID}" "${CURATION_USER}"; \ - fi; + addgroup --gid "${GID}" "${CURATION_USER}"; \ + fi; # create runtime user RUN adduser \ - --shell /usr/bin/bash \ - --uid "${UID}" \ - --gid "${GID}" \ - --gecos "" \ - --disabled-login \ - "${CURATION_USER}" + --shell /usr/bin/bash \ + --uid "${UID}" \ + --gid "${GID}" \ + --gecos "" \ + --disabled-login \ + "${CURATION_USER}" # WORKING DIRECTORY SETUP # create our penultimate working directory RUN mkdir -p "${CIRCLE_WORKING_DIRECTORY}" \ - && chown -R "${CURATION_USER}" "${CIRCLE_WORKING_DIRECTORY}" \ - && chmod -R 775 "${CIRCLE_WORKING_DIRECTORY}" + && chown -R "${CURATION_USER}" "${CIRCLE_WORKING_DIRECTORY}" \ + && chmod -R 775 "${CIRCLE_WORKING_DIRECTORY}" # EVERYTHING ELSE @@ -126,33 +126,33 @@ RUN echo source /ci.env | tee -a "${CURATION_HOME}"/.bashrc "${CURATION_HOME}"/. 
# add a few envvars to runtime user's bashrc and profile files RUN echo export CIRCLE_WORKING_DIRECTORY="${CIRCLE_WORKING_DIRECTORY}" | \ - tee -a "${CURATION_HOME}"/.bashrc "${CURATION_HOME}"/.profile \ - && echo export CIRCLE_ARTIFACTS="${CIRCLE_ARTIFACTS}" | \ - tee -a "${CURATION_HOME}"/.bashrc "${CURATION_HOME}"/.profile \ - && echo export GOOGLE_APPLICATION_CREDENTIALS="${GOOGLE_APPLICATION_CREDENTIALS}" | \ - tee -a "${CURATION_HOME}"/.bashrc "${CURATION_HOME}"/.profile \ - && echo export CURATION_SCRIPTS_DIR="${CURATION_SCRIPTS_DIR}" | \ - tee -a "${CURATION_HOME}"/.bashrc "${CURATION_HOME}"/.profile \ - && echo export CURATION_COMMANDS_DIR="${CURATION_COMMANDS_DIR}" | \ - tee -a "${CURATION_HOME}"/.bashrc "${CURATION_HOME}"/.profile + tee -a "${CURATION_HOME}"/.bashrc "${CURATION_HOME}"/.profile \ + && echo export CIRCLE_ARTIFACTS="${CIRCLE_ARTIFACTS}" | \ + tee -a "${CURATION_HOME}"/.bashrc "${CURATION_HOME}"/.profile \ + && echo export GOOGLE_APPLICATION_CREDENTIALS="${GOOGLE_APPLICATION_CREDENTIALS}" | \ + tee -a "${CURATION_HOME}"/.bashrc "${CURATION_HOME}"/.profile \ + && echo export CURATION_SCRIPTS_DIR="${CURATION_SCRIPTS_DIR}" | \ + tee -a "${CURATION_HOME}"/.bashrc "${CURATION_HOME}"/.profile \ + && echo export CURATION_COMMANDS_DIR="${CURATION_COMMANDS_DIR}" | \ + tee -a "${CURATION_HOME}"/.bashrc "${CURATION_HOME}"/.profile # download gsdk and verify checksum RUN wget --quiet "${GSDK_DOWNLOAD}" \ - && if ! $(echo "${GSDK_CHECKSUM}" "${GSDK_TAR_FILE}" | sha256sum --check --status); \ - then echo "GSDK tar integrity check failure, please update build arg with correct checksum and/or version.";\ - exit 1; \ - fi; + && if ! $(echo "${GSDK_CHECKSUM}" "${GSDK_TAR_FILE}" | sha256sum --check --status); \ + then echo "GSDK tar integrity check failure, please update build arg with correct checksum and/or version.";\ + exit 1; \ + fi; # install gsdk, RUN mkdir -p "${GSDK_INSTALL_PATH}" \ - && tar -xzf "${GSDK_TAR_FILE}" -C "${CURATION_HOME}" \ - && rm "${GSDK_TAR_FILE}" \ - && cd "${GSDK_INSTALL_PATH}" \ - && ./install.sh --quiet \ - && echo "source ${GSDK_INSTALL_PATH}/path.bash.inc" | tee --append "${CURATION_HOME}/.bashrc" "${CURATION_HOME}/.profile" \ - && echo "source ${GSDK_INSTALL_PATH}/completion.bash.inc" | tee --append "${CURATION_HOME}/.bashrc" "${CURATION_HOME}/.profile" \ - && cd .. \ - && ./google-cloud-sdk/bin/gcloud components update --quiet + && tar -xzf "${GSDK_TAR_FILE}" -C "${CURATION_HOME}" \ + && rm "${GSDK_TAR_FILE}" \ + && cd "${GSDK_INSTALL_PATH}" \ + && ./install.sh --quiet \ + && echo "source ${GSDK_INSTALL_PATH}/path.bash.inc" | tee --append "${CURATION_HOME}/.bashrc" "${CURATION_HOME}/.profile" \ + && echo "source ${GSDK_INSTALL_PATH}/completion.bash.inc" | tee --append "${CURATION_HOME}/.bashrc" "${CURATION_HOME}/.profile" \ + && cd .. \ + && ./google-cloud-sdk/bin/gcloud components update --quiet # set container working directory to one created above WORKDIR "${CIRCLE_WORKING_DIRECTORY}" @@ -163,12 +163,12 @@ COPY --chown=${CURATION_USER} --chmod=755 data_steward/requirements.txt "${CIRCL # initialize venv and install python deps RUN python -m venv "${VENV_PATH}" \ - && echo "source ${VENV_ACTIVATE}" | tee -a "${CURATION_HOME}/.bashrc" "${CURATION_HOME}/.profile" \ - && echo "export PYTHONPATH=:${CIRCLE_WORKING_DIRECTORY}:${CIRCLE_WORKING_DIRECTORY}/data_steward:${CIRCLE_WORKING_DIRECTORY}/tests:\"\${PYTHONPATH}\"" \ - | tee -a "${CURATION_HOME}/.bashrc" "${CURATION_HOME}/.profile" \ - && . 
"${VENV_ACTIVATE}" \ - && python -m pip install --upgrade pip setuptools wheel \ - && python -m pip install -r data_steward/requirements.txt + && echo "source ${VENV_ACTIVATE}" | tee -a "${CURATION_HOME}/.bashrc" "${CURATION_HOME}/.profile" \ + && echo "export PYTHONPATH=:${CIRCLE_WORKING_DIRECTORY}:${CIRCLE_WORKING_DIRECTORY}/data_steward:${CIRCLE_WORKING_DIRECTORY}/tests:\"\${PYTHONPATH}\"" \ + | tee -a "${CURATION_HOME}/.bashrc" "${CURATION_HOME}/.profile" \ + && . "${VENV_ACTIVATE}" \ + && python -m pip install --upgrade pip setuptools wheel \ + && python -m pip install -r data_steward/requirements.txt # remove data_steward dir as it will be mounted inside the container as a volume later. RUN rm -rf "${CIRCLE_WORKING_DIRECTORY}/data_steward" diff --git a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/ppi_branching_test.py b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/ppi_branching_test.py index 661bc7d2a7..e7b1045f14 100644 --- a/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/ppi_branching_test.py +++ b/tests/integration_tests/data_steward/cdr_cleaner/cleaning_rules/ppi_branching_test.py @@ -248,9 +248,10 @@ def get_dataset_table_map(self) -> Dict[str, Set[str]]: """ cols = list(self.client.query(cols_query).result()) dataset_tables = { - dataset_id: set(col.table_name - for col in cols - if col.table_schema == dataset_id) + dataset_id: + set(col.table_name + for col in cols + if col.table_schema == dataset_id) for dataset_id in (self.dataset_id, self.sandbox_dataset_id) } return dataset_tables diff --git a/tests/integration_tests/data_steward/retraction/retract_data_bq_test.py b/tests/integration_tests/data_steward/retraction/retract_data_bq_test.py index 8cd0dbd6fc..b5e3ef439e 100644 --- a/tests/integration_tests/data_steward/retraction/retract_data_bq_test.py +++ b/tests/integration_tests/data_steward/retraction/retract_data_bq_test.py @@ -277,9 +277,10 @@ def setUp(self): @mock_patch_bundle def test_retract_unioned_ehr_rdr_and_ehr( - self, mock_ru_get_dataset_type, mock_rdb_get_dataset_type, mock_is_rdr, - mock_is_ehr, mock_is_unioned, mock_is_combined, mock_is_deid, - mock_is_fitbit, mock_ru_is_sandbox, mock_rdb_is_sandbox): + self, mock_ru_get_dataset_type, mock_rdb_get_dataset_type, + mock_is_rdr, mock_is_ehr, mock_is_unioned, mock_is_combined, + mock_is_deid, mock_is_fitbit, mock_ru_is_sandbox, + mock_rdb_is_sandbox): """ Test for unioned ehr dataset. run_bq_retraction with retraction_type = 'rdr_and_ehr'. @@ -313,9 +314,10 @@ def test_retract_unioned_ehr_rdr_and_ehr( @mock_patch_bundle def test_retract_unioned_ehr_only_ehr( - self, mock_ru_get_dataset_type, mock_rdb_get_dataset_type, mock_is_rdr, - mock_is_ehr, mock_is_unioned, mock_is_combined, mock_is_deid, - mock_is_fitbit, mock_ru_is_sandbox, mock_rdb_is_sandbox): + self, mock_ru_get_dataset_type, mock_rdb_get_dataset_type, + mock_is_rdr, mock_is_ehr, mock_is_unioned, mock_is_combined, + mock_is_deid, mock_is_fitbit, mock_ru_is_sandbox, + mock_rdb_is_sandbox): """ Test for unioned ehr dataset. run_bq_retraction with retraction_type = 'only_ehr'. 
@@ -350,9 +352,10 @@ def test_retract_unioned_ehr_only_ehr( @mock_patch_bundle def test_retract_combined_rdr_and_ehr( - self, mock_ru_get_dataset_type, mock_rdb_get_dataset_type, mock_is_rdr, - mock_is_ehr, mock_is_unioned, mock_is_combined, mock_is_deid, - mock_is_fitbit, mock_ru_is_sandbox, mock_rdb_is_sandbox): + self, mock_ru_get_dataset_type, mock_rdb_get_dataset_type, + mock_is_rdr, mock_is_ehr, mock_is_unioned, mock_is_combined, + mock_is_deid, mock_is_fitbit, mock_ru_is_sandbox, + mock_rdb_is_sandbox): """ Test for combined dataset. run_bq_retraction with retraction_type = 'rdr_and_ehr'. @@ -842,9 +845,10 @@ def test_retract_fitbit_only_ehr(self, mock_ru_get_dataset_type, @mock_patch_bundle def test_retract_deid_fitbit_rdr_and_ehr( - self, mock_ru_get_dataset_type, mock_rdb_get_dataset_type, mock_is_rdr, - mock_is_ehr, mock_is_unioned, mock_is_combined, mock_is_deid, - mock_is_fitbit, mock_ru_is_sandbox, mock_rdb_is_sandbox): + self, mock_ru_get_dataset_type, mock_rdb_get_dataset_type, + mock_is_rdr, mock_is_ehr, mock_is_unioned, mock_is_combined, + mock_is_deid, mock_is_fitbit, mock_ru_is_sandbox, + mock_rdb_is_sandbox): """ Test for fitbit dataset. run_bq_retraction with retraction_type = 'rdr_and_ehr'. @@ -873,9 +877,10 @@ def test_retract_deid_fitbit_rdr_and_ehr( @mock_patch_bundle def test_retract_deid_fitbit_only_ehr( - self, mock_ru_get_dataset_type, mock_rdb_get_dataset_type, mock_is_rdr, - mock_is_ehr, mock_is_unioned, mock_is_combined, mock_is_deid, - mock_is_fitbit, mock_ru_is_sandbox, mock_rdb_is_sandbox): + self, mock_ru_get_dataset_type, mock_rdb_get_dataset_type, + mock_is_rdr, mock_is_ehr, mock_is_unioned, mock_is_combined, + mock_is_deid, mock_is_fitbit, mock_ru_is_sandbox, + mock_rdb_is_sandbox): """ Test for deid fitbit dataset. run_bq_retraction with retraction_type = 'only_ehr'. @@ -1202,9 +1207,10 @@ def test_retract_sandbox_only_ehr(self, mock_ru_get_dataset_type, @mock_patch_bundle def test_retract_deid_sandbox_rdr_and_ehr( - self, mock_ru_get_dataset_type, mock_rdb_get_dataset_type, mock_is_rdr, - mock_is_ehr, mock_is_unioned, mock_is_combined, mock_is_deid, - mock_is_fitbit, mock_ru_is_sandbox, mock_rdb_is_sandbox): + self, mock_ru_get_dataset_type, mock_rdb_get_dataset_type, + mock_is_rdr, mock_is_ehr, mock_is_unioned, mock_is_combined, + mock_is_deid, mock_is_fitbit, mock_ru_is_sandbox, + mock_rdb_is_sandbox): """ Test for sandbox datasets. - run_bq_retraction with retraction_type = 'rdr_and_ehr'. @@ -1247,9 +1253,10 @@ def test_retract_deid_sandbox_rdr_and_ehr( @mock_patch_bundle def test_retract_deid_sandbox_only_ehr( - self, mock_ru_get_dataset_type, mock_rdb_get_dataset_type, mock_is_rdr, - mock_is_ehr, mock_is_unioned, mock_is_combined, mock_is_deid, - mock_is_fitbit, mock_ru_is_sandbox, mock_rdb_is_sandbox): + self, mock_ru_get_dataset_type, mock_rdb_get_dataset_type, + mock_is_rdr, mock_is_ehr, mock_is_unioned, mock_is_combined, + mock_is_deid, mock_is_fitbit, mock_ru_is_sandbox, + mock_rdb_is_sandbox): """ Test for sandbox datasets. - run_bq_retraction with retraction_type = 'only_ehr'. 
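For context on the signatures being re-wrapped above: every one of these tests receives its patched collaborators through the shared `mock_patch_bundle` decorator, which is why each reformatted signature lists the same ordered set of mock arguments. A decorator like that can be built by stacking `unittest.mock.patch` calls. The sketch below only illustrates the idea; the bundle's real definition lives elsewhere in the test module, and the patch targets shown here are placeholders rather than the repository's actual paths (patch targets are resolved lazily, so nothing is imported until a decorated test runs).

    from functools import reduce
    from unittest import mock

    # Placeholder targets; the real bundle patches retraction helpers such as
    # get_dataset_type and is_sandbox_dataset in retract_utils/retract_data_bq.
    _PATCH_TARGETS = [
        'retraction.retract_utils.get_dataset_type',
        'retraction.retract_data_bq.get_dataset_type',
        'retraction.retract_utils.is_sandbox_dataset',
    ]

    def mock_patch_bundle(func):
        # Applying the patches in list order makes the first target the
        # innermost (bottom-most) patch, so its mock is handed to the test as
        # the first mock argument after `self`.
        return reduce(lambda wrapped, target: mock.patch(target)(wrapped),
                      _PATCH_TARGETS, func)

Used as a plain `@mock_patch_bundle` decorator, this passes one mock per target into the test in list order, matching the long argument lists in the hunks above.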
diff --git a/tests/integration_tests/data_steward/retraction/retract_data_gcs_test.py b/tests/integration_tests/data_steward/retraction/retract_data_gcs_test.py index 57357746ee..294e1a4c0d 100644 --- a/tests/integration_tests/data_steward/retraction/retract_data_gcs_test.py +++ b/tests/integration_tests/data_steward/retraction/retract_data_gcs_test.py @@ -51,7 +51,7 @@ def setUp(self): @patch('retraction.retract_data_gcs.extract_pids_from_table') def test_integration_five_person_data_retraction_skip( - self, mock_extract_pids): + self, mock_extract_pids): """ Test for GCS bucket retraction. When PIDs to retract are not in the CSV file, no records will be deleted diff --git a/tests/integration_tests/data_steward/validation/ehr_union_test.py b/tests/integration_tests/data_steward/validation/ehr_union_test.py index 7cb8bf1657..9fcca0c1ab 100644 --- a/tests/integration_tests/data_steward/validation/ehr_union_test.py +++ b/tests/integration_tests/data_steward/validation/ehr_union_test.py @@ -5,8 +5,6 @@ import mock # Third party imports -import dpath -import moz_sql_parser # Project imports import bq_utils @@ -36,10 +34,6 @@ ''' -def first_or_none(l): - return next(iter(l or []), None) - - class EhrUnionTest(unittest.TestCase): dataset_id = BIGQUERY_DATASET_ID project_id = get_application_id() @@ -536,31 +530,21 @@ def get_table_hpo_subquery_error(self, table, dataset_in, dataset_out): subquery = ehr_union.table_hpo_subquery(table, NYC_HPO_ID, dataset_in, dataset_out) - # moz-sql-parser doesn't support the ROW_NUMBER() OVER() a analytical function of sql we are removing - # that statement from the returned query for the parser be able to parse out the query without erroring out. - - subquery = re.sub( - r",\s+ROW_NUMBER\(\) OVER \(PARTITION BY nm\..+?_id\) AS row_num", - " ", subquery) - # offset is being used as a column-name in note_nlp table. - # Although, BigQuery does not throw any errors for this, moz_sql_parser indentifies as a SQL Keyword. - # So, change required only in Test Script as a workaround. 
- if 'offset,' in subquery: - subquery = subquery.replace('offset,', '"offset",') - stmt = moz_sql_parser.parse(subquery) + # Lowercase the subquery, remove breaklines and duplicate spaces + # to use the string for the test easily + subquery = subquery.replace('\n', ' ').lower() + subquery = re.sub(r'\s+', ' ', subquery) # Sanity check it is a select statement - if 'select' not in stmt: + if 'select' not in subquery: return SUBQUERY_FAIL_MSG.format(expr='query type', table=table, expected='select', actual=str(stmt), subquery=subquery) - # Input table should be first in FROM expression - actual_from = first_or_none( - dpath.util.values(stmt, 'from/0/value/from/value') or - dpath.util.values(stmt, 'from')) + # Input table should be the first table name enclosed by `` in FROM clause + actual_from = subquery.split('from `', 1)[1].split('`', 1)[0] expected_from = f'{dataset_in}.{resources.get_table_id(table, hpo_id=NYC_HPO_ID)}' if expected_from != actual_from: return SUBQUERY_FAIL_MSG.format(expr='first object in FROM', @@ -574,17 +558,21 @@ def get_table_hpo_subquery_error(self, table, dataset_in, dataset_out): fields = resources.fields_for(table) id_field = f'{table}_id' key_ind = 0 - expected_join = None - actual_join = None + expr = '' + expected_join = '' + actual_joins = [] for field in fields: if field['name'] in self.mapped_fields: - # key_ind += 1 # TODO use this increment when we generalize solution for all foreign keys if field['name'] == id_field: # Primary key, mapping table associated with this one should be INNER joined key_ind += 1 expr = 'inner join on primary key' - actual_join = first_or_none( - dpath.util.values(stmt, 'from/%s/join/value' % key_ind)) + + # Inner join always comes at the beginning of joins in this case, + # so [0] is specified in actual_joins here. + actual_joins = [ + subquery.split('join ', 1)[1].split(' ', 1)[0] + ] expected_join = dataset_out + '.' + ehr_union.mapping_table_for( table) @@ -592,18 +580,10 @@ def get_table_hpo_subquery_error(self, table, dataset_in, dataset_out): # Foreign key, mapping table associated with the referenced table should be LEFT joined key_ind += 1 expr = 'left join on foreign key' - # Visit_detail table has 'visit_occurrence' column after 'care_site', which is different from - # other cdm tables, where 'visit_occurrence' comes before other foreign_keys. - # The test expects the same order as other cmd tables, so the expected-query has - # 'visit_occurrence' before 'care_site'. The following reorder is required to match the sequence - # to the actual-query. 
- if table == 'visit_detail' and key_ind == 2: - stmt['from'][2], stmt['from'][3], stmt['from'][4], stmt[ - 'from'][5] = stmt['from'][3], stmt['from'][4], stmt[ - 'from'][5], stmt['from'][2] - actual_join = first_or_none( - dpath.util.values(stmt, - 'from/%s/left join/value' % key_ind)) + actual_joins = [ + _.split(' ', 1)[0] for _ in subquery.split('left join') + ] + joined_table = field['name'].replace('_id', '') if field['name'] in self.self_reference_keys: @@ -611,12 +591,13 @@ def get_table_hpo_subquery_error(self, table, dataset_in, dataset_out): else: expected_join = f'{dataset_out}.{ehr_union.mapping_table_for(joined_table)}' - if expected_join != actual_join: - return SUBQUERY_FAIL_MSG.format(expr=expr, - table=table, - expected=expected_join, - actual=actual_join, - subquery=subquery) + if expected_join in actual_joins: + return SUBQUERY_FAIL_MSG.format( + expr=expr, + table=table, + expected=expected_join, + actual=', '.join(actual_joins), + subquery=subquery) def test_hpo_subquery(self): input_dataset_id = 'input' diff --git a/tests/integration_tests/data_steward/validation/export_test.py b/tests/integration_tests/data_steward/validation/export_test.py index 1c14c59d6a..cb198a860a 100644 --- a/tests/integration_tests/data_steward/validation/export_test.py +++ b/tests/integration_tests/data_steward/validation/export_test.py @@ -126,7 +126,7 @@ def test_run_export_without_datasource_id(self): @mock.patch("gcloud.gcs.LOOKUP_TABLES_DATASET_ID", dataset_id) @mock.patch('validation.export.is_hpo_id') def test_run_export_with_target_bucket_and_datasource_id( - self, mock_is_hpo_id): + self, mock_is_hpo_id): # validation/main.py INTEGRATION TEST mock_is_hpo_id.return_value = True folder_prefix: str = 'dummy-prefix-2018-03-24/' diff --git a/tests/test_util.py b/tests/test_util.py index e4fc166342..cb2b5bc68d 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -392,5 +392,5 @@ def mock_google_cloud_error(content: bytes = b'418: I\'m a teapot'): def mock_google_service_unavailable_error( - content: bytes = b'418: I\'m a teapot'): + content: bytes = b'418: I\'m a teapot'): return ServiceUnavailable(message=content.decode()) diff --git a/tests/unit_tests/data_steward/admin/admin_api_test.py b/tests/unit_tests/data_steward/admin/admin_api_test.py index 12c995cdf8..96e59cb8f7 100644 --- a/tests/unit_tests/data_steward/admin/admin_api_test.py +++ b/tests/unit_tests/data_steward/admin/admin_api_test.py @@ -1,7 +1,12 @@ import unittest import mock -from admin import admin_api +with mock.patch('google.cloud.logging.Client') as mock_gc_logging_client: + # mocking the client at the time of import so the script will not check the credential. 
+ mock_client = mock.MagicMock() + mock_gc_logging_client.return_value = mock_client + + from admin import admin_api class AdminApiTest(unittest.TestCase): diff --git a/tests/unit_tests/data_steward/cdr_cleaner/cleaning_rules/ppi_branching_test.py b/tests/unit_tests/data_steward/cdr_cleaner/cleaning_rules/ppi_branching_test.py index 726163dc1a..5558e6fb19 100644 --- a/tests/unit_tests/data_steward/cdr_cleaner/cleaning_rules/ppi_branching_test.py +++ b/tests/unit_tests/data_steward/cdr_cleaner/cleaning_rules/ppi_branching_test.py @@ -10,6 +10,7 @@ from cdr_cleaner.cleaning_rules.ppi_branching import PpiBranching, OBSERVATION, BACKUP_ROWS_QUERY, RULES_LOOKUP_TABLE_ID from common import JINJA_ENV from constants.utils import bq as consts +from resources import get_bq_col_type def _get_csv_row_count() -> int: @@ -49,20 +50,13 @@ def _get_create_or_replace_table_ddl(project, as_query: str = None, **table_options) -> str: - def _to_standard_sql_type(field_type) -> str: - upper_field_type = field_type.upper() - standard_sql_type_code = bigquery.schema.LEGACY_TO_STANDARD_TYPES.get( - upper_field_type) - if not standard_sql_type_code: - raise ValueError(f'{field_type} is not a valid field type') - standard_sql_type = bigquery.StandardSqlDataTypes( - standard_sql_type_code) - return standard_sql_type.name - def _to_sql_field(field): - return bigquery.SchemaField(field.name, - _to_standard_sql_type(field.field_type), - field.mode, field.description, field.fields) + return bigquery.SchemaField(name=field.name, + field_type=get_bq_col_type( + field.field_type), + mode=field.mode, + description=field.description, + fields=field.fields) CREATE_OR_REPLACE_TABLE_TPL = JINJA_ENV.from_string( consts.CREATE_OR_REPLACE_TABLE_QUERY) diff --git a/tests/unit_tests/data_steward/cdr_cleaner/cleaning_rules/replace_standard_id_in_domain_tables_test.py b/tests/unit_tests/data_steward/cdr_cleaner/cleaning_rules/replace_standard_id_in_domain_tables_test.py index ab9b485659..28c7c7cb79 100644 --- a/tests/unit_tests/data_steward/cdr_cleaner/cleaning_rules/replace_standard_id_in_domain_tables_test.py +++ b/tests/unit_tests/data_steward/cdr_cleaner/cleaning_rules/replace_standard_id_in_domain_tables_test.py @@ -64,11 +64,11 @@ def tearDown(self): @patch.object(ReplaceWithStandardConceptId, 'get_src_concept_id_logging_queries') def test_replace_standard_id_in_domain_tables( - self, mock_get_src_concept_id_logging_queries, - mock_get_sandbox_src_concept_id_update_queries, - mock_get_src_concept_id_update_queries, - mock_get_mapping_table_update_queries, - mock_get_delete_empty_sandbox_tables_queries): + self, mock_get_src_concept_id_logging_queries, + mock_get_sandbox_src_concept_id_update_queries, + mock_get_src_concept_id_update_queries, + mock_get_mapping_table_update_queries, + mock_get_delete_empty_sandbox_tables_queries): query = 'select this query' mock_get_src_concept_id_logging_queries.return_value = [{ @@ -155,8 +155,8 @@ def test_replace_standard_id_in_domain_tables( @patch.object(ReplaceWithStandardConceptId, 'parse_src_concept_id_logging_query') def test_get_src_concept_id_logging_queries( - self, mock_parse_src_concept_id_logging_query, - mock_parse_duplicate_id_update_query): + self, mock_parse_src_concept_id_logging_query, + mock_parse_duplicate_id_update_query): src_concept_id_logging_condition = 'condition UNION ALL procedure' duplicate_id_update_query = ('UPDATE `test_project_id.dataset_id' '._logging_standard_concept_id_replacement' @@ -186,8 +186,8 @@ def test_get_src_concept_id_logging_queries( 
@patch.object(ReplaceWithStandardConceptId, 'parse_sandbox_src_concept_id_update_query') def test_get_sandbox_src_concept_id_update_queries( - self, mock_parse_sandbox_src_concept_id_update_query, - mock_sandbox_table_for): + self, mock_parse_sandbox_src_concept_id_update_query, + mock_sandbox_table_for): sandbox_query_condition = 'condition sandbox query' sandbox_query_procedure = 'procedure sandbox query' condition_sandbox_table = 'condition_sandbox_table' @@ -232,7 +232,7 @@ def test_get_sandbox_src_concept_id_update_queries( @patch.object(ReplaceWithStandardConceptId, 'parse_src_concept_id_update_query') def test_get_src_concept_id_update_queries( - self, mock_parse_src_concept_id_update_query): + self, mock_parse_src_concept_id_update_query): update_query_condition = 'select a random value' update_query_procedure = 'select a random value procedure' mock_parse_src_concept_id_update_query.side_effect = [ @@ -260,8 +260,8 @@ def test_get_src_concept_id_update_queries( @mock.patch('resources.get_domain_id_field') @mock.patch('resources.fields_for') def test_parse_src_concept_id_update_query( - self, mock_fields_for, mock_get_domain_id_field, - mock_get_domain_concept_id, mock_get_domain_source_concept_id): + self, mock_fields_for, mock_get_domain_id_field, + mock_get_domain_concept_id, mock_get_domain_source_concept_id): mock_fields_for.return_value = [{ 'name': 'condition_occurrence_id' }, { @@ -298,7 +298,8 @@ def test_parse_src_concept_id_update_query( 'cdr_cleaner.cleaning_rules.replace_standard_id_in_domain_tables.mapping_table_for' ) def test_get_mapping_table_update_queries( - self, mock_mapping_table_for, mock_parse_mapping_table_update_query): + self, mock_mapping_table_for, + mock_parse_mapping_table_update_query): mock_mapping_table_for.side_effect = [ self.condition_mapping_table, self.procedure_mapping_table ] diff --git a/tests/unit_tests/data_steward/curation_logging/gcp_stackdriver_logger_test.py b/tests/unit_tests/data_steward/curation_logging/gcp_stackdriver_logger_test.py deleted file mode 100644 index c25c94edab..0000000000 --- a/tests/unit_tests/data_steward/curation_logging/gcp_stackdriver_logger_test.py +++ /dev/null @@ -1,328 +0,0 @@ -import logging -import mock -import unittest -from datetime import datetime, timedelta -from logging import LogRecord -from mock import patch -from mock import MagicMock, PropertyMock - -from google.api.monitored_resource_pb2 import MonitoredResource -from google.cloud.logging_v2.proto.log_entry_pb2 import LogEntryOperation -from google.protobuf import json_format as gcp_json_format, any_pb2 as gcp_any_pb2 -import pytz - -from curation_logging import curation_gae_handler -from curation_logging.curation_gae_handler import GCPStackDriverLogger, LogCompletionStatusEnum -from curation_logging.curation_gae_handler import GAE_LOGGING_MODULE_ID, GAE_LOGGING_VERSION_ID - -LOG_BUFFER_SIZE = 3 -SEVERITY_DEBUG = 100 # 100 is the equivalence of logging.DEBUG -SEVERITY_INFO = 200 # 200 is the equivalence of logging.INFO -SEVERITY_ERROR = 300 # 300 is the equivalence of logging.ERROR - - -class GCPStackDriverLoggerTest(unittest.TestCase): - - @classmethod - def setUpClass(cls): - print('**************************************************************') - print(cls.__name__) - print('**************************************************************') - - def setUp(self): - self.project_id = 'aou-res-curation-test' - self.request_method = 'GET' - self.request_full_path = '/admin/v1/RemoveExpiredServiceAccountKeys' - self.request_user_agent = 
'AppEngine-Google; (+http://code.google.com/appengine)' - self.request_ip = '0.1.0.1' - self.request_host_name = 'py3.aou-res-curation-test.appspot.com' - self.request_log_id = 'fake_request_id' - self.request_trace_id = 'fake_trace_id' - self.request_trace = 'projects/{0}/traces/{1}'.format( - self.project_id, self.request_trace_id) - - self.request_start_time = pytz.utc.localize(datetime(2020, 1, 1)) - self.request_end_time = pytz.utc.localize(datetime( - 2020, 1, 1)) + timedelta(minutes=1) - self.request_log_entry_ts = self.request_start_time + timedelta( - seconds=10) - self.log_record_created = self.request_start_time - timedelta( - seconds=10) - - self.mock_get_application_id_patcher = patch( - 'app_identity.get_application_id') - self.mock_get_application_id = self.mock_get_application_id_patcher.start( - ) - self.mock_get_application_id.return_value = self.project_id - - # Mock a flask request for testing - self.request = MagicMock() - type(self.request).method = PropertyMock( - return_value=self.request_method) - type(self.request).full_path = PropertyMock( - return_value=self.request_full_path) - type(self.request).user_agent = PropertyMock( - return_value=self.request_user_agent) - - headers = dict() - headers['X-Appengine-User-Ip'] = self.request_ip - headers['X-Appengine-Default-Version-Hostname'] = self.request_host_name - headers['X-Appengine-Request-Log-Id'] = self.request_log_id - headers['X-Appengine-Taskname'] = None - headers['X-Appengine-Queuename'] = None - headers['X-Cloud-Trace-Context'] = self.request_trace_id - - type(self.request).headers = PropertyMock(return_value=headers) - - # Define the log records for testing - self.file_path = 'data_steward/validation/main' - self.file_name = 'main' - self.info_log_record = self.create_log_record( - 'info', self.log_record_created, logging.INFO, self.file_name, - self.file_path, 10, 'info message') - self.debug_log_record = self.create_log_record( - 'debug', self.log_record_created, logging.DEBUG, self.file_name, - self.file_path, 11, 'debug message') - self.error_log_record = self.create_log_record( - 'error', self.log_record_created, logging.ERROR, self.file_name, - self.file_path, 12, 'error message') - - self.info_log_line = { - 'logMessage': 'info message', - 'severity': SEVERITY_INFO, - 'time': self.log_record_created.isoformat(), - 'sourceLocation': { - 'file': self.file_path, - 'functionName': self.file_name, - 'line': 10 - } - } - - self.debug_log_line = { - 'logMessage': 'debug message', - 'severity': SEVERITY_DEBUG, - 'time': self.log_record_created.isoformat(), - 'sourceLocation': { - 'file': self.file_path, - 'functionName': self.file_name, - 'line': 11 - } - } - - self.error_log_line = { - 'logMessage': 'error message', - 'severity': SEVERITY_ERROR, - 'time': self.log_record_created.isoformat(), - 'sourceLocation': { - 'file': self.file_path, - 'functionName': self.file_name, - 'line': 12 - } - } - - self.mock_logging_service_client_patcher = patch( - 'curation_logging.curation_gae_handler.gcp_logging_v2.LoggingServiceV2Client' - ) - self.mock_logging_service_client = self.mock_logging_service_client_patcher.start( - ) - - def tearDown(self): - self.mock_logging_service_client_patcher.stop() - self.request.stop() - - @staticmethod - def create_log_record(name, record_created, level_no, func_name, pathname, - lineno, msg): - log_record = LogRecord(name=name, - levelno=level_no, - lineno=lineno, - func=func_name, - pathname=pathname, - msg=msg, - level=level_no, - args={}, - exc_info={}) - log_record.created = 
record_created - return log_record - - @mock.patch('curation_logging.curation_gae_handler.datetime') - def test_gcp_stackdriver_logger(self, mock_datetime): - mock_datetime.now.return_value.isoformat.return_value = self.request_start_time.isoformat( - ) - mock_datetime.utcnow.return_value = self.request_start_time - mock_datetime.utcfromtimestamp.return_value = self.log_record_created - - # Initialize GCPStackDriverLogger - self.gcp_stackdriver_logger = GCPStackDriverLogger(LOG_BUFFER_SIZE) - self.gcp_stackdriver_logger.setup_from_request(self.request) - - self.assertIsNone(self.gcp_stackdriver_logger._first_log_ts) - self.assertEqual(self.gcp_stackdriver_logger._start_time, - self.request_start_time.isoformat()) - self.assertEqual(self.gcp_stackdriver_logger._request_method, - self.request_method) - self.assertEqual(self.gcp_stackdriver_logger._request_resource, - self.request_full_path) - self.assertEqual(self.gcp_stackdriver_logger._request_agent, - self.request_user_agent) - self.assertEqual(self.gcp_stackdriver_logger._request_remote_addr, - self.request_ip) - self.assertEqual(self.gcp_stackdriver_logger._request_host, - self.request_host_name) - self.assertEqual(self.gcp_stackdriver_logger._request_log_id, - self.request_log_id) - self.assertEqual(self.gcp_stackdriver_logger._trace, self.request_trace) - - self.gcp_stackdriver_logger.log_event(self.info_log_record) - self.gcp_stackdriver_logger.log_event(self.debug_log_record) - self.gcp_stackdriver_logger.log_event(self.error_log_record) - - self.assertEqual( - len(self.gcp_stackdriver_logger._buffer), 0, - 'expected log buffer to flush itself after being filled') - self.assertEqual( - self.mock_logging_service_client.return_value.write_log_entries. - call_count, 1) - - self.gcp_stackdriver_logger.finalize() - self.assertIsNone(self.gcp_stackdriver_logger._first_log_ts) - self.assertEqual(self.gcp_stackdriver_logger._start_time, None) - self.assertEqual(self.gcp_stackdriver_logger._request_method, None) - self.assertEqual(self.gcp_stackdriver_logger._request_resource, None) - self.assertEqual(self.gcp_stackdriver_logger._request_agent, None) - self.assertEqual(self.gcp_stackdriver_logger._request_remote_addr, None) - self.assertEqual(self.gcp_stackdriver_logger._request_host, None) - self.assertEqual(self.gcp_stackdriver_logger._request_log_id, None) - self.assertEqual(self.gcp_stackdriver_logger._trace, None) - - @mock.patch('curation_logging.curation_gae_handler.get_gcp_logger') - def test_initialize_logging(self, mock_get_gcp_logger): - with patch.dict('os.environ', {'GAE_ENV': ''}): - curation_gae_handler.initialize_logging(logging.DEBUG) - logging.info(self.info_log_record) - logging.debug(self.debug_log_line) - logging.error(self.error_log_record) - self.assertEqual(3, mock_get_gcp_logger.call_count) - - @mock.patch('requests.get') - def test_setup_logging_zone(self, mock_requests_get): - with patch.dict('os.environ', {'GAE_SERVICE': ''}): - timezone = 'test time zone' - mock_response = MagicMock() - type(mock_response).status_code = PropertyMock(return_value=200) - type(mock_response).text = timezone - mock_requests_get.return_value = mock_response - actual_time_zone = curation_gae_handler.setup_logging_zone() - self.assertEqual(actual_time_zone, timezone) - - type(mock_response).status_code = PropertyMock(return_value=500) - actual_time_zone = curation_gae_handler.setup_logging_zone() - self.assertEqual(actual_time_zone, 'local-machine') - - with patch.dict('os.environ', dict()): - actual_time_zone = 
curation_gae_handler.setup_logging_zone() - self.assertEqual(actual_time_zone, 'local-machine') - - @mock.patch('curation_logging.curation_gae_handler.setup_logging_zone') - def test_setup_logging_resource(self, mock_setup_logging_zone): - timezone = 'test time zone' - mock_setup_logging_zone.return_value = timezone - actual_resource = curation_gae_handler.setup_logging_resource() - expected_resource = MonitoredResource( - type='gae_app', - labels={ - 'project_id': self.project_id, - 'module_id': GAE_LOGGING_MODULE_ID, - 'version_id': GAE_LOGGING_VERSION_ID, - 'zone': timezone - }) - self.assertEqual(expected_resource, actual_resource) - - @mock.patch('curation_logging.curation_gae_handler.datetime') - @mock.patch( - 'curation_logging.curation_gae_handler.gcp_logging._helpers._normalize_severity' - ) - def test_setup_log_line(self, mock_normalize_severity, mock_datetime): - mock_datetime.utcfromtimestamp.return_value = self.log_record_created - mock_normalize_severity.side_effect = [ - SEVERITY_INFO, SEVERITY_DEBUG, SEVERITY_ERROR - ] - - actual_info_log_line = curation_gae_handler.setup_log_line( - self.info_log_record) - self.assertDictEqual(self.info_log_line, actual_info_log_line) - - actual_debug_log_line = curation_gae_handler.setup_log_line( - self.debug_log_record) - self.assertDictEqual(self.debug_log_line, actual_debug_log_line) - - actual_error_log_line = curation_gae_handler.setup_log_line( - self.error_log_record) - self.assertDictEqual(self.error_log_line, actual_error_log_line) - - def test_get_highest_severity_level_from_lines(self): - lines = [self.info_log_line, self.debug_log_line, self.error_log_line] - actual_severity_level = curation_gae_handler.get_highest_severity_level_from_lines( - lines) - self.assertEqual(SEVERITY_ERROR, actual_severity_level) - - def test_setup_proto_payload(self): - lines = [self.info_log_line, self.debug_log_line, self.error_log_line] - - proto_payload_args = { - 'startTime': self.request_start_time.isoformat(), - 'endTime': self.request_end_time.isoformat(), - 'method': self.request_method, - 'resource': self.request_full_path, - 'userAgent': self.request_user_agent, - 'host': self.request_host_name, - 'ip': self.request_ip, - 'responseSize': None - } - - actual_proto_payload = curation_gae_handler.setup_proto_payload( - lines, LogCompletionStatusEnum.PARTIAL_BEGIN, **proto_payload_args) - - expected_dict = dict( - { - '@type': curation_gae_handler.REQUEST_LOG_TYPE, - 'first': True, - 'finished': False, - 'line': lines - }, **proto_payload_args) - - expected_proto_payload = gcp_json_format.ParseDict( - expected_dict, gcp_any_pb2.Any()) - - self.assertEqual(expected_proto_payload, actual_proto_payload) - - def test_update_long_operation(self): - expected_operation = LogEntryOperation( - id=self.request_log_id, - producer='appengine.googleapis.com/request_id', - first=True, - last=True) - - actual_operation = curation_gae_handler.update_long_operation( - self.request_log_id, LogCompletionStatusEnum.COMPLETE) - self.assertEqual(expected_operation, actual_operation) - - expected_operation = LogEntryOperation( - id=self.request_log_id, - producer='appengine.googleapis.com/request_id', - first=True, - last=False) - - actual_operation = curation_gae_handler.update_long_operation( - self.request_log_id, LogCompletionStatusEnum.PARTIAL_BEGIN) - self.assertEqual(expected_operation, actual_operation) - - expected_operation = LogEntryOperation( - id=self.request_log_id, - producer='appengine.googleapis.com/request_id', - first=False, - last=False) - - 
actual_operation = curation_gae_handler.update_long_operation( - self.request_log_id, LogCompletionStatusEnum.PARTIAL_MORE) - self.assertEqual(expected_operation, actual_operation) diff --git a/tests/unit_tests/data_steward/gcloud/bq/bq_test.py b/tests/unit_tests/data_steward/gcloud/bq/bq_test.py index 5ae350edd7..b883335725 100644 --- a/tests/unit_tests/data_steward/gcloud/bq/bq_test.py +++ b/tests/unit_tests/data_steward/gcloud/bq/bq_test.py @@ -225,19 +225,6 @@ def test_get_table_schema(self): if field.field_type.upper() == "RECORD": self.assertEqual(len(field.fields), 2) - def test_to_standard_sql_type(self): - # All types used in schema files should successfully map to standard sql types - all_field_types = self.client._get_all_field_types() - for field_type in all_field_types: - result = self.client._to_standard_sql_type(field_type) - self.assertTrue(result) - - # Unknown types should raise ValueError - with self.assertRaises(ValueError) as c: - self.client._to_standard_sql_type('unknown_type') - self.assertEqual(str(c.exception), - f'unknown_type is not a valid field type') - @patch.object(BigQueryClient, 'copy_table') @patch('gcloud.bq.Client.list_tables') @patch('gcloud.bq.Client.list_jobs') diff --git a/tests/unit_tests/data_steward/utils/bq_test.py b/tests/unit_tests/data_steward/utils/bq_test.py index e2b264c0a1..81f88f6901 100644 --- a/tests/unit_tests/data_steward/utils/bq_test.py +++ b/tests/unit_tests/data_steward/utils/bq_test.py @@ -127,19 +127,6 @@ def test_update_labels_and_tags(self): new_labels_or_tags={'label': 'oranges'}, overwrite_ok=False) - def test_to_standard_sql_type(self): - # All types used in schema files should successfully map to standard sql types - all_field_types = _get_all_field_types() - for field_type in all_field_types: - result = bq._to_standard_sql_type(field_type) - self.assertTrue(result) - - # Unknown types should raise ValueError - with self.assertRaises(ValueError) as c: - bq._to_standard_sql_type('unknown_type') - self.assertEqual(str(c.exception), - f'unknown_type is not a valid field type') - def test_get_table_ddl(self): # Schema is determined by table name ddl = bq.get_create_or_replace_table_ddl(self.project_id, diff --git a/tests/unit_tests/data_steward/validation/app_errors_test.py b/tests/unit_tests/data_steward/validation/app_errors_test.py index 75bb3abc57..4fa9e6a86a 100644 --- a/tests/unit_tests/data_steward/validation/app_errors_test.py +++ b/tests/unit_tests/data_steward/validation/app_errors_test.py @@ -7,8 +7,14 @@ from googleapiclient.errors import HttpError import httplib2 -from validation import app_errors, main +from validation import app_errors from constants.validation import main as main_consts +with mock.patch('google.cloud.logging.Client') as mock_gc_logging_client: + # mocking the client at the time of import so the script will not check the credential. 
+ mock_client = mock.MagicMock() + mock_gc_logging_client.return_value = mock_client + + from validation import main class AppErrorHandlersTest(TestCase): diff --git a/tests/unit_tests/data_steward/validation/ehr_union_test.py b/tests/unit_tests/data_steward/validation/ehr_union_test.py index fa0d0b8f2f..a2812c55d1 100644 --- a/tests/unit_tests/data_steward/validation/ehr_union_test.py +++ b/tests/unit_tests/data_steward/validation/ehr_union_test.py @@ -225,8 +225,8 @@ def test_mapping_query(self, mock_hpo_info): @mock.patch('validation.ehr_union.get_person_to_observation_query') @mock.patch('validation.ehr_union.query') def test_move_ehr_person_to_observation( - self, mock_query, mock_get_person_to_observation_query, - mock_output_table_for, mock_hpo_info): + self, mock_query, mock_get_person_to_observation_query, + mock_output_table_for, mock_hpo_info): dataset_id = 'fake_dataset' output_table = 'fake_table' diff --git a/tests/unit_tests/data_steward/validation/main_test.py b/tests/unit_tests/data_steward/validation/main_test.py index 4a1661c1ba..7cee58ffad 100644 --- a/tests/unit_tests/data_steward/validation/main_test.py +++ b/tests/unit_tests/data_steward/validation/main_test.py @@ -15,9 +15,15 @@ from constants.validation import hpo_report as report_consts from constants.validation import main as main_consts from constants.validation.participants import identity_match as id_match_consts -from validation import main from tests.test_util import mock_google_http_error, mock_google_cloud_error, mock_google_service_unavailable_error +with mock.patch('google.cloud.logging.Client') as mock_gc_logging_client: + # mocking the client at the time of import so the script will not check the credential. + mock_client = mock.MagicMock() + mock_gc_logging_client.return_value = mock_client + + from validation import main + class ValidationMainTest(TestCase): @@ -332,11 +338,11 @@ def test_validate_all_hpos_exception(self, check_cron, mock_logging_error, @mock.patch('validation.main.is_first_validation_run') @mock.patch('validation.main.is_valid_rdr') def test_process_hpo_ignore_dirs( - self, mock_valid_rdr, mock_first_validation, - mock_has_all_required_files, mock_folder_items, mock_validation, - mock_get_hpo_name, mock_get_duplicate_counts_query, mock_query_rows, - mock_all_required_files_loaded, mock_run_achilles, mock_export, - mock_valid_folder_name, mock_query): + self, mock_valid_rdr, mock_first_validation, + mock_has_all_required_files, mock_folder_items, mock_validation, + mock_get_hpo_name, mock_get_duplicate_counts_query, mock_query_rows, + mock_all_required_files_loaded, mock_run_achilles, mock_export, + mock_valid_folder_name, mock_query): """ Test process_hpo with directories we want to ignore. 
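The three unit-test modules touched above (admin_api_test, app_errors_test, main_test) all use the same trick: `google.cloud.logging.Client` is patched before the application module is imported, because that module now constructs a logging client at import time. A stripped-down sketch of the pattern follows; `validation.main` and its Flask `app` come from the diffs above, the test class is illustrative, and running it assumes the repo's data_steward directory is on PYTHONPATH with google-cloud-logging installed.

    import unittest
    from unittest import mock

    # Patch the Cloud Logging client before importing the module under test so
    # the module-level gc_logging.Client() call never asks for real credentials.
    with mock.patch('google.cloud.logging.Client') as mock_gc_logging_client:
        mock_gc_logging_client.return_value = mock.MagicMock()
        from validation import main


    class ImportWithMockedLoggingTest(unittest.TestCase):

        def test_flask_app_created(self):
            # The Flask app is importable without contacting GCP.
            self.assertIsNotNone(main.app)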
diff --git a/tests/unit_tests/data_steward/validation/participants/identity_match_test.py b/tests/unit_tests/data_steward/validation/participants/identity_match_test.py index 7611958e89..0932153d9d 100644 --- a/tests/unit_tests/data_steward/validation/participants/identity_match_test.py +++ b/tests/unit_tests/data_steward/validation/participants/identity_match_test.py @@ -379,7 +379,7 @@ def test_match_participants_same_participant_simulate_write_errors(self): self.assertEqual(self.mock_validation_report.call_count, 0) def test_match_participants_same_participant_simulate_location_pii_read_errors( - self): + self): # pre conditions self.mock_location_pii.side_effect = test_util.mock_google_http_error( status_code=500, content=b'content', reason='reason') From ff2638714764c95c7c6b5fdb67e995536edc0816 Mon Sep 17 00:00:00 2001 From: Ratul <91090217+ratuagga@users.noreply.github.com> Date: Tue, 16 Jan 2024 09:39:35 -0600 Subject: [PATCH 15/19] [DC-3660] Update Combined QC notebook to include CE (#1838) * [DC-3660] Updated combined QC check. * [DC-3660] Added comment --- data_steward/analytics/cdr_ops/combined.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/data_steward/analytics/cdr_ops/combined.py b/data_steward/analytics/cdr_ops/combined.py index e3b31bc2e1..8b788e0461 100644 --- a/data_steward/analytics/cdr_ops/combined.py +++ b/data_steward/analytics/cdr_ops/combined.py @@ -867,6 +867,7 @@ def verify_dataset_labels(dataset): ext_tables = execute(client, ext_tables_query) result = [] for _, row in ext_tables.iterrows(): + # Only participant portal, EHR sites, and participant mediated EHR are allowed tpl = JINJA_ENV.from_string(""" SELECT \'{{table_name}}\' AS table_name, @@ -875,7 +876,7 @@ def verify_dataset_labels(dataset): FROM `{{project_id}}.{{dataset}}.{{table_name}}` WHERE NOT - REGEXP_CONTAINS(src_id, r'(?i)(Portal)|(EHR site)') + REGEXP_CONTAINS(src_id, r'(?i)(Portal)|(EHR)') OR src_id IS NULL GROUP BY 1,2 From d6efe80b8a9b5c13c08ee3f52e5dcc92b98afca8 Mon Sep 17 00:00:00 2001 From: Michael Schmidt Date: Tue, 16 Jan 2024 10:50:01 -0600 Subject: [PATCH 16/19] [DC-3673] Remove notebook check regarding cause_source_concept_id for aou_death (#1845) * [DC-3673] Delete query * [DC-3673] change "df1" to "result" * [DC-3673] change "df" to "summary" * [DC-3673] Minor formatting --- .../rt_cdr_qc/cdr_deid_clean_qa_report1.py | 137 ++++++------------ 1 file changed, 48 insertions(+), 89 deletions(-) diff --git a/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_clean_qa_report1.py b/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_clean_qa_report1.py index 3256cb3b7f..52f332ae06 100644 --- a/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_clean_qa_report1.py +++ b/data_steward/analytics/cdr_ops/rt_cdr_qc/cdr_deid_clean_qa_report1.py @@ -36,8 +36,8 @@ client = BigQueryClient(project_id, credentials=impersonation_creds) # - -# df will have a summary in the end -df = pd.DataFrame(columns=['query', 'result']) +# summary will have a summary in the end +summary = pd.DataFrame(columns=['query', 'result']) # ## QA queries on new CDR_deid_clean drop rows with 0 OR null @@ -56,20 +56,20 @@ FROM `{{project_id}}.{{deid_clean_cdr}}.observation` """) q = query.render(project_id=project_id, deid_clean_cdr=deid_clean_cdr) -df1 = execute(client, q) -if df1.loc[0].sum() == 0: - df = df.append({ +result = execute(client, q) +if result.loc[0].sum() == 0: + summary = summary.append({ 'query': 'Query1 observation', 'result': 'PASS' }, ignore_index=True) else: - df = df.append({ + summary 
= summary.append({ 'query': 'Query1 observation', 'result': 'Failure' }, ignore_index=True) -df1.T +result.T # + [markdown] papermill={"duration": 0.023633, "end_time": "2021-02-02T22:30:36.860798", "exception": false, "start_time": "2021-02-02T22:30:36.837165", "status": "completed"} tags=[] # # 2 Verify that in condition_occurrence if condition_occurrence_source_concept_id AND the condition_occurrence_concept_id both of those fields are null OR zero, the row should be removed. @@ -89,20 +89,20 @@ OR ( condition_source_concept_id IS NULL AND condition_concept_id IS NULL) """) q = query.render(project_id=project_id, deid_clean_cdr=deid_clean_cdr) -df1 = execute(client, q) -if df1.loc[0].sum() == 0: - df = df.append({ +result = execute(client, q) +if result.loc[0].sum() == 0: + summary = summary.append({ 'query': 'Query2 condition', 'result': 'PASS' }, ignore_index=True) else: - df = df.append({ + summary = summary.append({ 'query': 'Query2 condition', 'result': 'Failure' }, ignore_index=True) -df1.T +result.T # # 3 Verify that in procedure_occurrence table if procedure_occurrence_source_concept_id AND the procedure_occurrence_concept_id both of those fields are null OR zero, the row should be removed. @@ -117,20 +117,20 @@ FROM `{{project_id}}.{{deid_clean_cdr}}.procedure_occurrence` """) q = query.render(project_id=project_id, deid_clean_cdr=deid_clean_cdr) -df1 = execute(client, q) -if df1.loc[0].sum() == 0: - df = df.append({ +result = execute(client, q) +if result.loc[0].sum() == 0: + summary = summary.append({ 'query': 'Query3 procedure', 'result': 'PASS' }, ignore_index=True) else: - df = df.append({ + summary = summary.append({ 'query': 'Query3 procedure', 'result': 'Failure' }, ignore_index=True) -df1.T +result.T # + [markdown] papermill={"duration": 0.023649, "end_time": "2021-02-02T22:30:39.115495", "exception": false, "start_time": "2021-02-02T22:30:39.091846", "status": "completed"} tags=[] # # 4 Verify that in visit_occurrence table if visit_occurrence_source_concept_id AND the visit_occurrence_concept_id both of those fields are null OR zero, the row should be removed. @@ -146,20 +146,21 @@ FROM `{{project_id}}.{{deid_clean_cdr}}.visit_occurrence` """) q = query.render(project_id=project_id, deid_clean_cdr=deid_clean_cdr) -df1 = execute(client, q) -if df1.loc[0].sum() == 0: - df = df.append({ +result = execute(client, q) +if result.loc[0].sum() == 0: + summary = summary.append({ 'query': 'Query4 visit', 'result': 'PASS' }, ignore_index=True) else: - df = df.append({ + summary = summary.append({ 'query': 'Query4 visit', 'result': 'Failure' }, ignore_index=True) -df1.T +result.T + # + [markdown] papermill={"duration": 0.023649, "end_time": "2021-02-02T22:30:39.115495", "exception": false, "start_time": "2021-02-02T22:30:39.091846", "status": "completed"} tags=[] # # 5 Verify that in drug_exposure table if drug_exposure_source_concept_id AND the drug_exposure_concept_id both of those fields are null OR zero, the row should be removed. 
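Every cell in this notebook repeats the same three steps with the renamed variables: render the JINJA_ENV template, execute it, and append a PASS/Failure row to `summary`. Purely as a reading aid for the hunks that follow, that recurring check can be written once as a small helper. The helper name is illustrative; `client`, `execute`, and the rendered query `q` come from the notebook itself, and `DataFrame.append` is kept only because the notebook uses it (it assumes the pre-2.0 pandas pinned by the repo).

    def record_check(summary, label, q):
        """Run one QC query and append a PASS/Failure row to the summary frame."""
        result = execute(client, q)
        status = 'PASS' if result.loc[0].sum() == 0 else 'Failure'
        # DataFrame.append mirrors the notebook's own usage (pandas < 2.0).
        return summary.append({'query': label, 'result': status},
                              ignore_index=True)

    # e.g. for the visit_occurrence cell above:
    summary = record_check(summary, 'Query4 visit', q)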
@@ -175,20 +176,20 @@ FROM `{{project_id}}.{{deid_clean_cdr}}.drug_exposure` """) q = query.render(project_id=project_id, deid_clean_cdr=deid_clean_cdr) -df1 = execute(client, q) -if df1.loc[0].sum() == 0: - df = df.append({ +result = execute(client, q) +if result.loc[0].sum() == 0: + summary = summary.append({ 'query': 'Query5 drug_exposure', 'result': 'PASS' }, ignore_index=True) else: - df = df.append({ + summary = summary.append({ 'query': 'Query5 drug_exposure', 'result': 'Failure' }, ignore_index=True) -df1.T +result.T # + [markdown] papermill={"duration": 0.023649, "end_time": "2021-02-02T22:30:39.115495", "exception": false, "start_time": "2021-02-02T22:30:39.091846", "status": "completed"} tags=[] # # 6 Verify that in device_exposure table if device_exposure_source_concept_id AND the device_exposure_concept_id both of those fields are null OR zero, the row should be removed. @@ -204,20 +205,20 @@ FROM `{{project_id}}.{{deid_clean_cdr}}.device_exposure` """) q = query.render(project_id=project_id, deid_clean_cdr=deid_clean_cdr) -df1 = execute(client, q) -if df1.loc[0].sum() == 0: - df = df.append({ +result = execute(client, q) +if result.loc[0].sum() == 0: + summary = summary.append({ 'query': 'Query6 device', 'result': 'PASS' }, ignore_index=True) else: - df = df.append({ + summary = summary.append({ 'query': 'Query6 device', 'result': 'Failure' }, ignore_index=True) -df1.T +result.T # + [markdown] papermill={"duration": 0.023649, "end_time": "2021-02-02T22:30:39.115495", "exception": false, "start_time": "2021-02-02T22:30:39.091846", "status": "completed"} tags=[] # # 7 Verify that in measurement table if measurement_source_concept_id AND the measurement_concept_id both of those fields are null OR zero, the row should be removed. @@ -233,64 +234,23 @@ FROM`{{project_id}}.{{deid_clean_cdr}}.measurement` """) q = query.render(project_id=project_id, deid_clean_cdr=deid_clean_cdr) -df1 = execute(client, q) -if df1.loc[0].sum() == 0: - df = df.append({ +result = execute(client, q) +if result.loc[0].sum() == 0: + summary = summary.append({ 'query': 'Query7 measurement', 'result': 'PASS' }, ignore_index=True) else: - df = df.append({ - 'query': 'Query7 measurement', + summary = summary.append({ + 'query': 'Query7, measurement', 'result': 'Failure' }, ignore_index=True) -df1.T +result.T # - -# # 8 Verify that in aou_death table both cause_source_concept_id and cause_concept_id are null OR zero. 
-# -# - -# + -# how to COUNT NaN -# in old cdr_clean , the value is NaN; in contrast, the value is empty in new cdr_clean -query = JINJA_ENV.from_string(""" - -SELECT -SUM(CASE WHEN cause_source_concept_id != 0 AND cause_concept_id != 0 THEN 1 ELSE 0 END) AS n_cause_source_concept_id_both_not_0, -SUM(CASE WHEN cause_source_concept_id IS NOT NULL AND cause_concept_id IS NOT NULL THEN 1 ELSE 0 END) AS n_cause_source_concept_id_both_not_null, -SUM(CASE WHEN cause_source_concept_id != 0 AND cause_concept_id IS NOT NULL THEN 1 ELSE 0 END) AS n_cause_source_concept_id_either_0, -SUM(CASE WHEN cause_source_concept_id IS NOT NULL AND cause_concept_id !=0 THEN 1 ELSE 0 END) AS n_cause_source_concept_id_either_null -FROM `{{project_id}}.{{deid_clean_cdr}}.aou_death` - -""") -q = query.render(project_id=project_id, deid_clean_cdr=deid_clean_cdr) -df1 = execute(client, q) -if df1.loc[0].sum() == 0: - df = df.append( - { - 'query': - 'Query8 cause_source_concept_id/cause_concept_id is null in aou_death table', - 'result': - 'PASS' - }, - ignore_index=True) -else: - df = df.append( - { - 'query': - 'Query8 cause_source_concept_id/cause_concept_id is null in aou_death table', - 'result': - 'Failure' - }, - ignore_index=True) - -df1.T -# - - -# # 9 check State_of_Residence fields in the person_ext table in deid_clean +# # 8 check State_of_Residence fields in the person_ext table in deid_clean # # Generalization Rules for reference # @@ -308,24 +268,23 @@ WHERE state_of_residence_concept_id IS NOT NULL OR state_of_residence_source_value IS NOT NULL """) q = query.render(project_id=project_id, deid_clean_cdr=deid_clean_cdr) -df1 = execute(client, q) +result = execute(client, q) -if df1.loc[0].sum() == 0: - df = df.append( +if result.loc[0].sum() == 0: + summary = summary.append( { - 'query': 'Query9 State_of_Residence in person_ext', + 'query': 'Query8, State_of_Residence in person_ext', 'result': ' Failure' }, ignore_index=True) else: - df = df.append( + summary = summary.append( { - 'query': 'Query9 State_of_Residence in person_ext', + 'query': 'Query8, State_of_Residence in person_ext', 'result': 'PASS' }, ignore_index=True) -df1 - +result # - # # Summary_row_ICD_suppression @@ -337,4 +296,4 @@ def highlight_cells(val): return f'background-color: {color}' -df.style.applymap(highlight_cells).set_properties(**{'text-align': 'left'}) +summary.style.applymap(highlight_cells).set_properties(**{'text-align': 'left'}) From 875058180cd0843743501ee7d87b1e29da424df0 Mon Sep 17 00:00:00 2001 From: brendagutman <77469967+brendagutman@users.noreply.github.com> Date: Wed, 17 Jan 2024 16:27:46 -0600 Subject: [PATCH 17/19] [DC-3661] Update `fitbit_qc` parameters and descriptions (#1843) * [DC-3361] Update `fitbit_qc` parameters and descriptions * [DC-3661] suggested changes --- data_steward/analytics/cdr_ops/fitbit_qc.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/data_steward/analytics/cdr_ops/fitbit_qc.py b/data_steward/analytics/cdr_ops/fitbit_qc.py index dd1eb6d4e3..f1f1210f95 100644 --- a/data_steward/analytics/cdr_ops/fitbit_qc.py +++ b/data_steward/analytics/cdr_ops/fitbit_qc.py @@ -16,7 +16,7 @@ project_id: str = "" # identifies the project where datasets are located fitbit_dataset: str = "" # identifies the name of the new fitbit dataset sandbox_dataset: str = "" # the pipeline tables sandbox -source_dataset: str = "" # identifies the name of the rdr dataset +source_dataset: str = "" # identifies the name of the clean rdr dataset deid_dataset: str = "" # dataset contains 
wear_study table cutoff_date: str = "" # CDR cutoff date in YYYY--MM-DD format run_as: str = "" # service account email to impersonate @@ -166,7 +166,7 @@ project=project_id, dataset=fitbit_dataset, table_name=table, - sandbox_dataset=sandbox_dataset, + sandbox_dataset=f"{fitbit_dataset}_sandbox", date_column=date_columns[table], secondary_date_column=secondary_date_column.get(table))) union_all_query = '\nUNION ALL\n'.join(queries_list) @@ -291,13 +291,14 @@ SELECT src_id, ROUND(COUNT(CASE WHEN fb.person_id IS NULL THEN 1 ELSE NULL END) * 100 / COUNT(c_ws),1) AS percent_without_fb, -FROM (SELECT * FROM {{project_id}}.{{raw_rdr}}.observation WHERE observation_source_concept_id = 2100000010) o +FROM (SELECT * FROM {{project_id}}.{{source_dataset}}.observation WHERE observation_source_concept_id = 2100000010) o +JOIN {{project_id}}.{{source_dataset}}._mapping_observation USING(observation_id) JOIN consenting_ws_ids c_ws USING(person_id) LEFT JOIN fb_person_ids fb ON o.person_id = fb.person_id GROUP BY 1 """).render(project_id=project_id, dataset=fitbit_dataset, - raw_rdr=source_dataset, + source_dataset=source_dataset, pipeline=sandbox_dataset, deid_dataset=deid_dataset) From dbec3b70c8555b56e3aad4e825597348f9ef52eb Mon Sep 17 00:00:00 2001 From: brendagutman <77469967+brendagutman@users.noreply.github.com> Date: Wed, 17 Jan 2024 16:28:44 -0600 Subject: [PATCH 18/19] [DC-3668] Ignore race/ethnicity sub categories in ct notebook (#1846) * [DC-3668] Ignore race/ethnicity sub categories in ct notebook * [DC-3668] remove unsuppressed concepts from csv --- .../csv/Controlled_Tier_Concept_Level.csv | 35 ------------------- .../controlled_tier_qc/sql/query_templates.py | 2 ++ 2 files changed, 2 insertions(+), 35 deletions(-) diff --git a/data_steward/analytics/cdr_ops/controlled_tier_qc/csv/Controlled_Tier_Concept_Level.csv b/data_steward/analytics/cdr_ops/controlled_tier_qc/csv/Controlled_Tier_Concept_Level.csv index a032d67973..dae3b45c76 100644 --- a/data_steward/analytics/cdr_ops/controlled_tier_qc/csv/Controlled_Tier_Concept_Level.csv +++ b/data_steward/analytics/cdr_ops/controlled_tier_qc/csv/Controlled_Tier_Concept_Level.csv @@ -218,45 +218,10 @@ observation,observation_source_value,,NHPI_NHPISpecific,YES,DC-1366 observation,observation_source_value,,White_WhiteSpecific,YES,DC-1366 observation,observation_source_value,,Hispanic_HispanicSpecific,YES,DC-1366 observation,observation_source_value,,RaceEthnicityNoneOfThese_RaceEthnicityFreeTextBox,YES,DC-1366 -observation,observation_source_concept_id,1586151,,YES,DC-1366 -observation,observation_source_concept_id,1586150,,YES,DC-1366 -observation,observation_source_concept_id,1586152,,YES,DC-1366 -observation,observation_source_concept_id,1586153,,YES,DC-1366 -observation,observation_source_concept_id,1586154,,YES,DC-1366 -observation,observation_source_concept_id,1586155,,YES,DC-1366 -observation,observation_source_concept_id,1586156,,YES,DC-1366 observation,observation_source_concept_id,1586149,,YES,DC-1366 -observation,observation_concept_id,1586151,,YES,DC-1366 -observation,observation_concept_id,1586150,,YES,DC-1366 -observation,observation_concept_id,1586152,,YES,DC-1366 -observation,observation_concept_id,1586153,,YES,DC-1366 -observation,observation_concept_id,1586154,,YES,DC-1366 -observation,observation_concept_id,1586155,,YES,DC-1366 -observation,observation_concept_id,1586156,,YES,DC-1366 observation,observation_concept_id,1586149,,YES,DC-1366 -observation,value_as_concept_id,1586151,,YES,DC-1366 
-observation,value_as_concept_id,1586150,,YES,DC-1366 -observation,value_as_concept_id,1586152,,YES,DC-1366 -observation,value_as_concept_id,1586153,,YES,DC-1366 -observation,value_as_concept_id,1586154,,YES,DC-1366 -observation,value_as_concept_id,1586155,,YES,DC-1366 -observation,value_as_concept_id,1586156,,YES,DC-1366 observation,value_as_concept_id,1586149,,YES,DC-1366 -observation,value_source_concept_id,1586151,,YES,DC-1366 -observation,value_source_concept_id,1586150,,YES,DC-1366 -observation,value_source_concept_id,1586152,,YES,DC-1366 -observation,value_source_concept_id,1586153,,YES,DC-1366 -observation,value_source_concept_id,1586154,,YES,DC-1366 -observation,value_source_concept_id,1586155,,YES,DC-1366 -observation,value_source_concept_id,1586156,,YES,DC-1366 observation,value_source_concept_id,1586149,,YES,DC-1366 -observation,value_source_value,,Asian_AsianSpecific,YES,DC-1366 -observation,value_source_value,,AIAN_AIANSpecific,YES,DC-1366 -observation,value_source_value,,Black_BlackSpecific,YES,DC-1366 -observation,value_source_value,,MENA_MENASpecific,YES,DC-1366 -observation,value_source_value,,NHPI_NHPISpecific,YES,DC-1366 -observation,value_source_value,,White_WhiteSpecific,YES,DC-1366 -observation,value_source_value,,Hispanic_HispanicSpecific,YES,DC-1366 observation,value_source_value,,RaceEthnicityNoneOfThese_RaceEthnicityFreeTextBox,YES,DC-1366 observation,observation_source_concept_id,1333234,,YES,DC-1496 observation,observation_source_concept_id,1310066,,YES,DC-1496 diff --git a/data_steward/analytics/cdr_ops/controlled_tier_qc/sql/query_templates.py b/data_steward/analytics/cdr_ops/controlled_tier_qc/sql/query_templates.py index 23693cd325..87be381182 100644 --- a/data_steward/analytics/cdr_ops/controlled_tier_qc/sql/query_templates.py +++ b/data_steward/analytics/cdr_ops/controlled_tier_qc/sql/query_templates.py @@ -116,6 +116,8 @@ SELECT {{ new_id }} {% if mapping_table == 'site_maskings' %} FROM `{{ project_id }}.{{ pipeline_dataset }}.{{ mapping_table }}` + {% elif mapping_table == '_deid_questionnaire_response_map' %} + FROM `{{ project_id }}.{{ questionnaire_response_dataset }}.{{ mapping_table }}` {% else %} FROM `{{ project_id }}.{{ mapping_dataset }}.{{ mapping_table }}` {% endif %} From 9961da61a7e6ca096814920ec1423e7bbb3cefcc Mon Sep 17 00:00:00 2001 From: Michael Schmidt Date: Mon, 22 Jan 2024 08:33:52 -0600 Subject: [PATCH 19/19] [DC-3271] Remove irrelevant bq_utils import statements (#1841) * [DC-3632] Remove five cases of unused imports * [DC-3632] Update 5 more usages of imports, some formatting occurred * [DC-3632] remove 5 more usages of imports, some formatting occurred * [DC-3632] remove 5 more usages of imports, some formatting occurred * [DC-3632] remove final 2 occurrences, some formatting occurred * [DC-3632] Add required import back --- .../cdr_ops/ad_hoc_analyses/cdr_person_id.py | 25 ++-- .../cdr_ops/ad_hoc_analyses/cohort_testing.py | 29 ++-- .../cdr_ops/ad_hoc_analyses/coverage.py | 27 ++-- .../cdr_ops/ad_hoc_analyses/deid_race.py | 15 +- .../ehr_demographics_by_site.py | 77 +++++----- .../ad_hoc_analyses/ehr_ops_row_counts.py | 11 +- .../ad_hoc_analyses/generalized_dupes.py | 9 +- .../ad_hoc_analyses/identify_required_labs.py | 75 +++++----- .../list_tables_with_duplicate_domain_ids.py | 5 +- .../ad_hoc_analyses/person_gender_sex.py | 5 +- .../cdr_ops/ad_hoc_analyses/pop_retract.py | 21 ++- .../cdr_ops/ad_hoc_analyses/sex_gender_540.py | 1 - .../cdr_ops/ad_hoc_analyses/site_mapping.py | 11 +- .../standard_concepts_cdr_389.py | 13 +- 
.../data_loss_through_pipeline.py | 117 ++++++++------- .../date_disparity_with_respect_to_visit.py | 139 +++++++++--------- .../ehr_data_quality_dashboard_testing.py | 37 +++-- .../notes_volume_distribution.py | 19 ++- .../validation/metrics/required_labs.py | 4 +- .../retraction/retract_data_gcs_test.py | 1 - .../data_steward/validation/export_test.py | 1 - 21 files changed, 311 insertions(+), 331 deletions(-) diff --git a/data_steward/analytics/cdr_ops/ad_hoc_analyses/cdr_person_id.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/cdr_person_id.py index 6a4cae5c22..137cf93d92 100644 --- a/data_steward/analytics/cdr_ops/ad_hoc_analyses/cdr_person_id.py +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/cdr_person_id.py @@ -1,7 +1,6 @@ # # Person # ## Person ID validation -import bq_utils import utils.bq from notebooks.parameters import RDR_DATASET_ID, EHR_DATASET_ID @@ -13,7 +12,7 @@ hpo_ids = utils.bq.query(""" SELECT REPLACE(table_id, '_person', '') AS hpo_id FROM `{EHR_DATASET_ID}.__TABLES__` -WHERE table_id LIKE '%person' +WHERE table_id LIKE '%person' AND table_id NOT LIKE '%unioned_ehr_%' AND table_id NOT LIKE '\\\_%' """.format(EHR_DATASET_ID=EHR_DATASET_ID)).hpo_id.tolist() @@ -31,7 +30,7 @@ (SELECT COUNT(1) AS n FROM {EHR_DATASET_ID}.{h}_person e WHERE NOT EXISTS( - SELECT 1 + SELECT 1 FROM {RDR_DATASET_ID}.person r WHERE r.person_id = e.person_id)) not_in_rdr ON TRUE @@ -63,31 +62,31 @@ RDR_EHR_NAME_MATCH_QUERY = ''' WITH rdr_first_name AS - (SELECT DISTINCT person_id, - FIRST_VALUE(value_as_string) + (SELECT DISTINCT person_id, + FIRST_VALUE(value_as_string) OVER (PARTITION BY person_id, observation_source_value ORDER BY value_as_string) val FROM {RDR_DATASET_ID}.observation WHERE observation_source_value = 'PIIName_First'), rdr_last_name AS - (SELECT DISTINCT person_id, - FIRST_VALUE(value_as_string) + (SELECT DISTINCT person_id, + FIRST_VALUE(value_as_string) OVER (PARTITION BY person_id, observation_source_value ORDER BY value_as_string) val FROM {RDR_DATASET_ID}.observation WHERE observation_source_value = 'PIIName_Last'), rdr_name AS - (SELECT + (SELECT f.person_id person_id, - f.val first_name, + f.val first_name, l.val last_name FROM rdr_first_name f JOIN rdr_last_name l USING (person_id)) SELECT '{HPO_ID}' hpo_id, - rdr.person_id rdr_person_id, - rdr.first_name rdr_first_name, - rdr.last_name rdr_last_name, + rdr.person_id rdr_person_id, + rdr.first_name rdr_first_name, + rdr.last_name rdr_last_name, pii.person_id pii_person_id, pii.first_name pii_first_name, pii.middle_name pii_middle_name, @@ -97,7 +96,7 @@ FROM rdr_name rdr JOIN `{EHR_DATASET_ID}.{HPO_ID}_pii_name` pii ON pii.first_name = rdr.first_name - AND pii.last_name = rdr.last_name + AND pii.last_name = rdr.last_name LEFT JOIN `{EHR_DATASET_ID}.{HPO_ID}_person` p ON pii.person_id = p.person_id ''' diff --git a/data_steward/analytics/cdr_ops/ad_hoc_analyses/cohort_testing.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/cohort_testing.py index a119882de8..8bdcb29c29 100644 --- a/data_steward/analytics/cdr_ops/ad_hoc_analyses/cohort_testing.py +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/cohort_testing.py @@ -21,7 +21,6 @@ # - Record count (condition_occurrence) # # - We want to determine if these fluctations are potentially caused by OMOP vocabulary issues. If this is the case, we should be able to determine similar trends in AoU data. 
-import bq_utils import utils.bq from notebooks import parameters @@ -42,8 +41,8 @@ q4_2018_hypo_obs_card_query = """ SELECT DISTINCT -co.condition_concept_id, c.concept_name, COUNT(DISTINCT p.person_id) AS num_persons, -COUNT(DISTINCT co.condition_occurrence_id) as num_records, +co.condition_concept_id, c.concept_name, COUNT(DISTINCT p.person_id) AS num_persons, +COUNT(DISTINCT co.condition_occurrence_id) as num_records, ROUND(COUNT(DISTINCT co.condition_occurrence_id) / COUNT(DISTINCT p.person_id), 2) as records_per_capita FROM @@ -190,8 +189,8 @@ q2_2019_hypo_obs_card_query = """ SELECT DISTINCT -co.condition_concept_id, c.concept_name, COUNT(DISTINCT p.person_id) AS num_persons, -COUNT(DISTINCT co.condition_occurrence_id) as num_records, +co.condition_concept_id, c.concept_name, COUNT(DISTINCT p.person_id) AS num_persons, +COUNT(DISTINCT co.condition_occurrence_id) as num_records, ROUND(COUNT(DISTINCT co.condition_occurrence_id) / COUNT(DISTINCT p.person_id), 2) as records_per_capita FROM @@ -339,14 +338,14 @@ SELECT DISTINCT q4.*, q2.*, (SUM(q2.num_persons) - SUM(q4.old_num_persons)) as person_difference, -(SUM(q2.num_records) - SUM(q4.old_num_records)) as record_difference +(SUM(q2.num_records) - SUM(q4.old_num_records)) as record_difference FROM (SELECT DISTINCT - co.condition_concept_id as old_condition_concept_id, c.concept_name as old_concept_name, - COUNT(DISTINCT p.person_id) AS old_num_persons, - COUNT(DISTINCT co.condition_occurrence_id) as old_num_records, + co.condition_concept_id as old_condition_concept_id, c.concept_name as old_concept_name, + COUNT(DISTINCT p.person_id) AS old_num_persons, + COUNT(DISTINCT co.condition_occurrence_id) as old_num_records, ROUND(COUNT(DISTINCT co.condition_occurrence_id) / COUNT(DISTINCT p.person_id), 2) as old_records_per_capita FROM @@ -378,13 +377,13 @@ GROUP BY 1, 2 ORDER BY old_num_persons DESC) q4 - + LEFT JOIN - + (SELECT DISTINCT - co.condition_concept_id, c.concept_name, COUNT(DISTINCT p.person_id) AS num_persons, - COUNT(DISTINCT co.condition_occurrence_id) as num_records, + co.condition_concept_id, c.concept_name, COUNT(DISTINCT p.person_id) AS num_persons, + COUNT(DISTINCT co.condition_occurrence_id) as num_records, ROUND(COUNT(DISTINCT co.condition_occurrence_id) / COUNT(DISTINCT p.person_id), 2) as records_per_capita FROM @@ -416,10 +415,10 @@ GROUP BY 1, 2 ORDER BY num_persons DESC) q2 - + ON q4.old_condition_concept_id = q2.condition_concept_id - + GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ORDER BY old_num_persons DESC diff --git a/data_steward/analytics/cdr_ops/ad_hoc_analyses/coverage.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/coverage.py index 23755d5bf7..b136297338 100644 --- a/data_steward/analytics/cdr_ops/ad_hoc_analyses/coverage.py +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/coverage.py @@ -14,7 +14,6 @@ import warnings -import bq_utils import utils.bq from notebooks import parameters warnings.filterwarnings('ignore') @@ -33,11 +32,11 @@ def get_hpo_table_columns(hpo_id): :param hpo_id: hpo site id :return: dataframe with table name, column name and table row count """ - query = """SELECT table_name, column_name, t.row_count as table_row_count, '{hpo_id}' as hpo_id + query = """SELECT table_name, column_name, t.row_count as table_row_count, '{hpo_id}' as hpo_id FROM {dataset}.INFORMATION_SCHEMA.COLUMNS c JOIN {dataset}.__TABLES__ t on c.table_name=t.table_id WHERE STARTS_WITH(table_id, lower('{hpo_id}'))=true AND - NOT(table_id like '_mapping%') AND + NOT(table_id like '_mapping%') AND ( table_id like 
'%person' OR table_id like '%visit_occurrence' OR @@ -59,25 +58,25 @@ def get_hpo_table_columns(hpo_id): def create_hpo_completeness_query(table_columns, hpo_id): - query_with_concept_id = """SELECT current_datetime() as report_run_time, x.*, CASE WHEN total_rows=0 THEN 0 ELSE (num_nonnulls_zeros)/(total_rows) END as percent_field_populated + query_with_concept_id = """SELECT current_datetime() as report_run_time, x.*, CASE WHEN total_rows=0 THEN 0 ELSE (num_nonnulls_zeros)/(total_rows) END as percent_field_populated FROM ( SELECT '{table_name}' as table_name, '{column_name}' as column_name, '{hpo_id}' as site_name, - {table_row_count} as total_rows, + {table_row_count} as total_rows, sum(case when {column_name}=0 then 0 else 1 end) as num_nonnulls_zeros, - ({table_row_count} - count({column_name})) as non_populated_rows - FROM {dataset}.{table_name} - ) as x + ({table_row_count} - count({column_name})) as non_populated_rows + FROM {dataset}.{table_name} + ) as x """ - query_without_concept_id = """SELECT current_datetime() as report_run_time, x.*, CASE WHEN total_rows=0 THEN 0 ELSE (num_nonnulls_zeros)/(total_rows) END as percent_field_populated + query_without_concept_id = """SELECT current_datetime() as report_run_time, x.*, CASE WHEN total_rows=0 THEN 0 ELSE (num_nonnulls_zeros)/(total_rows) END as percent_field_populated FROM ( SELECT '{table_name}' as table_name, '{column_name}' as column_name, '{hpo_id}' as site_name, - {table_row_count} as total_rows, - count({column_name}) as num_nonnulls_zeros, - ({table_row_count} - count({column_name})) as non_populated_rows - FROM {dataset}.{table_name} - ) as x + {table_row_count} as total_rows, + count({column_name}) as num_nonnulls_zeros, + ({table_row_count} - count({column_name})) as non_populated_rows + FROM {dataset}.{table_name} + ) as x """ queries = [] for i, row in table_columns.iterrows(): diff --git a/data_steward/analytics/cdr_ops/ad_hoc_analyses/deid_race.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/deid_race.py index d9836fe200..9a74fe4da5 100644 --- a/data_steward/analytics/cdr_ops/ad_hoc_analyses/deid_race.py +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/deid_race.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- # + -import bq_utils import utils.bq from notebooks import render, parameters import pandas as pd @@ -27,19 +26,19 @@ MULTIRACIAL_DIST_QUERY = """ WITH race_combo AS -(SELECT o.person_id, - o.questionnaire_response_id, +(SELECT o.person_id, + o.questionnaire_response_id, STRING_AGG(REPLACE(c.concept_code, 'WhatRaceEthnicity_', ''), ' ' ORDER BY value_source_value) selected_races FROM {DATASET}.observation o - JOIN {VOCAB}.concept c ON o.value_source_concept_id = c.concept_id + JOIN {VOCAB}.concept c ON o.value_source_concept_id = c.concept_id WHERE observation_source_concept_id = 1586140 GROUP BY person_id, questionnaire_response_id) - -SELECT - selected_races, + +SELECT + selected_races, (LENGTH(selected_races) - LENGTH(REPLACE(selected_races, ' ', '')) + 1) AS selected_count, COUNT(DISTINCT person_id) row_count -FROM race_combo +FROM race_combo GROUP BY selected_races ORDER BY selected_count, selected_races """ diff --git a/data_steward/analytics/cdr_ops/ad_hoc_analyses/ehr_demographics_by_site.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/ehr_demographics_by_site.py index db02e480c9..442b4d6c51 100644 --- a/data_steward/analytics/cdr_ops/ad_hoc_analyses/ehr_demographics_by_site.py +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/ehr_demographics_by_site.py @@ -26,7 +26,6 @@ client = 
bigquery.Client() # %load_ext google.cloud.bigquery -import bq_utils import utils.bq from notebooks import parameters # %matplotlib inline @@ -95,18 +94,18 @@ racial_distribution_by_site_query = """ SELECT DISTINCT -a.*, b.number_from_site, ROUND(a.number_of_demographic / b.number_from_site * 100, 2) as percent_of_site_persons +a.*, b.number_from_site, ROUND(a.number_of_demographic / b.number_from_site * 100, 2) as percent_of_site_persons FROM (SELECT DISTINCT - mp.src_hpo_id, p.race_concept_id, c.concept_name, + mp.src_hpo_id, p.race_concept_id, c.concept_name, COUNT(p.race_concept_id) as number_of_demographic, FROM `{DATASET}.unioned_ehr_person` p LEFT JOIN `{DATASET}._mapping_person` mp ON - p.person_id = mp.src_person_id + p.person_id = mp.src_person_id LEFT JOIN `{DATASET}.concept` c ON @@ -141,17 +140,17 @@ def return_hpos_to_display(hpo_names, max_num_sites_to_display): Function is intended to return a means for divide the number of HPOs into an appropriate number of lists based on the maximum number of sites a user wants to display. - + This is useful for creating graphs that will only display a fraction of the total HPOs. - + Parameters ---------- hpo_names (list): list of all the health provider organizations (in string form) - + num_sites_to_display (int): user-specified number of sites to display in each graph - - + + Returns ------- all_hpos (list): contains several lists, each of which contains a number of sites @@ -195,17 +194,17 @@ def create_information_dictionary_for_sites(hpo_dfs, selected_hpo_names, """ Function is used to create a dictionary that contains the racial makeup of a selected number of sites (expressed as a percentage, from a source dataframe) - + Parameters ---------- hpo_dfs (dictonary): has the following structure key: string representing an HPO ID value: dataframe that contains information about the different race concepts (IDs and names) and their relative spread within the site - + selected_hpo_names (list): contains strings that represent the different HPOs that will ultimately be translated to a dictionary - + most_popular_race_cids (list): list of the most popular concept IDs (across all sites) @@ -253,23 +252,23 @@ def create_information_dictionary_for_sites(hpo_dfs, selected_hpo_names, def create_graphs(hpo_names_to_display, num_races_for_legend, racial_percentages, img_name): """ - Function is used to create and save graphs that show the racial distribution for + Function is used to create and save graphs that show the racial distribution for a selected number of sites - + Parameters ---------- hpo_names_to_display (list): list with a user-specified number of HPOs that are to be displayed in the graph - + num_races_for_legend (int): the number of races that are to be displayed next to the graph - + racial_percentages (dictionary): has the following structure key: race concept ID value: list, each index represents one of the sites in the 'selected_hpo_names' parameter. 
the value represents the proportion of persons from the HPO who have the reported race concept ID - + img_name (string): name for the image to be displayed """ num_sites_to_display = len(hpo_names_to_display) @@ -408,46 +407,46 @@ def create_query_for_particular_table(dataset, percent_of_table, table_name): - number of IDs for that particular group in the specified table - total number of IDs for the HPO - percentage of the records for the site that belong to that demographic class - + This query is then run through bigquery and returns a dataframe - - + + Parameters ---------- dataset (str): dataset to be queried (defined at the top of the workbook) - + percent_of_table (str): the string to represent the percentage of the records for the site that belong to the particular demographic class - + table_name (str): name of the table to be investigated - - + + Returns ------- dataframe (df): contains the information specified in the top of the docstring - + """ query = """ SELECT DISTINCT - a.src_hpo_id, a.race_concept_id, a.concept_name, - ROUND(a.number_of_demographic / b.number_from_site * 100, 2) as {percent_of_table} + a.src_hpo_id, a.race_concept_id, a.concept_name, + ROUND(a.number_of_demographic / b.number_from_site * 100, 2) as {percent_of_table} FROM (SELECT DISTINCT - mp.src_hpo_id, p.race_concept_id, c.concept_name, + mp.src_hpo_id, p.race_concept_id, c.concept_name, COUNT(p.race_concept_id) as number_of_demographic, FROM `{dataset}.unioned_ehr_{table_name}` x LEFT JOIN `{dataset}.unioned_ehr_person` p ON - x.person_id = p.person_id + x.person_id = p.person_id LEFT JOIN `{dataset}._mapping_person` mp ON - p.person_id = mp.src_person_id + p.person_id = mp.src_person_id LEFT JOIN `{dataset}.concept` c ON @@ -464,7 +463,7 @@ def create_query_for_particular_table(dataset, percent_of_table, table_name): LEFT JOIN `{dataset}.unioned_ehr_person` p ON - x.person_id = p.person_id + x.person_id = p.person_id LEFT JOIN `{dataset}._mapping_person` mp ON @@ -549,13 +548,13 @@ def find_all_distributions_for_site_race_combo(df, hpo, race, This function is used to calculate the relative 'underrepresentation' of a given race for a particular table when compared to the race's overall representation in the person table. - + For instance, a site may have 65% participants who identify as 'White'. The persons who identify with this race, however, only make up 60% of the drug_exposure_ids in the drug exposure table. This would result in a 'underrepresentation' of 5% for persons at this particular site for this particular table. - - + + Parameters ---------- df (df): dataframe that contains the following information in its fields: @@ -567,15 +566,15 @@ def find_all_distributions_for_site_race_combo(df, hpo, race, aforementioned race_concept_id e. 
the same metric as d but also for the condition, observation, procedure, and visit tables - + hpo (string): HPO whose 'representation' metric is going to be assessed - + race (string): race concept name that will be evaluated for 'representation' - + person_distribution: the proportion of person_ids for the particular site that belong to the aforementioned race - - + + Returns ------- difference_df: contains the 'difference' between the proportion of records diff --git a/data_steward/analytics/cdr_ops/ad_hoc_analyses/ehr_ops_row_counts.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/ehr_ops_row_counts.py index deb579b9d3..2f2501fb8f 100644 --- a/data_steward/analytics/cdr_ops/ad_hoc_analyses/ehr_ops_row_counts.py +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/ehr_ops_row_counts.py @@ -16,7 +16,6 @@ # + import datetime -import bq_utils import utils.bq from notebooks.parameters import RDR_PROJECT_ID, RDR_DATASET_ID, EHR_DATASET_ID @@ -65,17 +64,17 @@ # ## EHR Site Submission Counts utils.bq.query(''' -SELECT +SELECT l.Org_ID AS org_id, l.HPO_ID AS hpo_id, l.Site_Name AS site_name, - table_id AS table_id, + table_id AS table_id, row_count AS row_count FROM `{EHR_DATASET_ID}.__TABLES__` AS t -JOIN `lookup_tables.hpo_site_id_mappings` AS l +JOIN `lookup_tables.hpo_site_id_mappings` AS l ON STARTS_WITH(table_id,lower(l.HPO_ID))=true WHERE table_id like '%person%' AND -NOT(table_id like '%unioned_ehr_%') AND +NOT(table_id like '%unioned_ehr_%') AND l.hpo_id <> '' ORDER BY Display_Order '''.format(EHR_DATASET_ID=EHR_DATASET_ID)) @@ -84,7 +83,7 @@ hpo_ids = utils.bq.query(""" SELECT REPLACE(table_id, '_person', '') AS hpo_id FROM `{EHR_DATASET_ID}.__TABLES__` -WHERE table_id LIKE '%person' +WHERE table_id LIKE '%person' AND table_id NOT LIKE '%unioned_ehr_%' AND table_id NOT LIKE '\\\_%' """.format(EHR_DATASET_ID=EHR_DATASET_ID)).hpo_id.tolist() diff --git a/data_steward/analytics/cdr_ops/ad_hoc_analyses/generalized_dupes.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/generalized_dupes.py index 271dde74a1..d6dc0ce230 100644 --- a/data_steward/analytics/cdr_ops/ad_hoc_analyses/generalized_dupes.py +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/generalized_dupes.py @@ -1,5 +1,4 @@ # + -import bq_utils import utils.bq from notebooks import parameters @@ -17,7 +16,7 @@ COUNT(*) FROM `{DEID}.observation` AS o -JOIN +JOIN ( SELECT observation_id @@ -31,7 +30,7 @@ observation_id DESC) AS rank_order, observation_id FROM - `{DEID}.observation` + `{DEID}.observation` JOIN `{COMBINED}._mapping_observation` as map USING @@ -40,9 +39,9 @@ AND value_source_concept_id IN (2000000008, 2000000005, 2000000004, 2000000002) AND map.src_hpo_id like "rdr" ) o - WHERE + WHERE o.rank_order <> 1 -) unique_observation_ids +) unique_observation_ids ON o.observation_id = unique_observation_ids.observation_id """ q = DUPLICATE_GEN_RACE_QUERY.format(DEID=DEID, COMBINED=COMBINED) diff --git a/data_steward/analytics/cdr_ops/ad_hoc_analyses/identify_required_labs.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/identify_required_labs.py index c2bc3db3f9..b9b58a0c3f 100644 --- a/data_steward/analytics/cdr_ops/ad_hoc_analyses/identify_required_labs.py +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/identify_required_labs.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- -import bq_utils import utils.bq from notebooks import render @@ -27,13 +26,13 @@ -- 36208195 Lab terms not yet categorized -- 36207527 Clinical terms not yet categorized -- 36210656 Survey terms not yet categorized - - -- Exclude the list of 
the "coarse" generalized concept ids + + -- Exclude the list of the "coarse" generalized concept ids -- 40772590: Cholesterol -- 40782521: Leukocytes -- 40779250: Protein in the grandparent lookup - SELECT - excluded_ancestor_concept_id + SELECT + excluded_ancestor_concept_id FROM UNNEST([36208978, 36206173, 36208195, 36207527, 36210656, 40782521, 40779250, 40772590]) AS excluded_ancestor_concept_id ), @@ -51,22 +50,22 @@ IF(ex.excluded_ancestor_concept_id IS NULL, COALESCE(ca.min_levels_of_separation, -1), -1) AS distance FROM `ehr_ops.measurement_concept_sets` AS m - JOIN + JOIN `{VOCAB_DATASET_ID}.concept` AS c1 - ON + ON m.Measurement_OMOP_ID = c1.concept_id LEFT JOIN `{VOCAB_DATASET_ID}.concept_ancestor` AS ca ON - m.Measurement_OMOP_ID = ca.descendant_concept_id + m.Measurement_OMOP_ID = ca.descendant_concept_id AND ca.min_levels_of_separation = 1 - LEFT JOIN + LEFT JOIN get_excluded_ancestor_ids AS ex - ON + ON ca.ancestor_concept_id = ex.excluded_ancestor_concept_id LEFT JOIN `{VOCAB_DATASET_ID}.concept` AS c2 - ON + ON ca.ancestor_concept_id = c2.concept_id WHERE c2.concept_class_id IS NULL OR c2.concept_class_id = 'LOINC Group' ), @@ -85,23 +84,23 @@ IF(ex.excluded_ancestor_concept_id IS NULL, COALESCE(ca.min_levels_of_separation, -1), -1) AS distance FROM `ehr_ops.measurement_concept_sets` AS m - JOIN + JOIN `{VOCAB_DATASET_ID}.concept` AS c1 - ON + ON m.Measurement_OMOP_ID = c1.concept_id LEFT JOIN `{VOCAB_DATASET_ID}.concept_ancestor` AS ca ON - m.Measurement_OMOP_ID = ca.descendant_concept_id + m.Measurement_OMOP_ID = ca.descendant_concept_id AND ca.min_levels_of_separation IN (1, 2) - LEFT JOIN + LEFT JOIN get_excluded_ancestor_ids AS ex - ON + ON ca.ancestor_concept_id = ex.excluded_ancestor_concept_id LEFT JOIN `{VOCAB_DATASET_ID}.concept` AS c2 - ON - ca.ancestor_concept_id = c2.concept_id + ON + ca.ancestor_concept_id = c2.concept_id WHERE -- if there is not ancestors for the measurement_concept_id (ca.descendant_concept_id IS NULL) @@ -112,15 +111,15 @@ -- if the level of seperation is 2, we keep them only when the concept_name subsumes the grandparent concept_name (c2.concept_class_id = 'LOINC Hierarchy' AND ca.min_levels_of_separation = 2 AND c1.concept_name LIKE CONCAT('%', c2.concept_name , '%')) OR - -- if the level of seperation is 2, the 6 concept names (such as MCH [Entitic mass], MCV [Entitic volume]) do not follow the previous rule, + -- if the level of seperation is 2, the 6 concept names (such as MCH [Entitic mass], MCV [Entitic volume]) do not follow the previous rule, -- because the acronyms are used in the concept_name and full names are used in the grandparent concept_name (c2.concept_class_id = 'LOINC Hierarchy' AND ca.min_levels_of_separation = 2 AND c1.concept_id IN (3035941, 3024731, 3003338, 3012030, 3009744, 3023599)) ), -get_ancestors_loinc_hierarchy_distinct AS +get_ancestors_loinc_hierarchy_distinct AS ( - # For some concepts in LONIC Hierarchy, we include both parent and grandparent concept_ids, - # We want to remove the parent concept_id if the grandparent concept_id is present. + # For some concepts in LONIC Hierarchy, we include both parent and grandparent concept_ids, + # We want to remove the parent concept_id if the grandparent concept_id is present. 
SELECT DISTINCT Panel_OMOP_ID, Panel_Name, @@ -132,7 +131,7 @@ distance FROM ( - SELECT DISTINCT + SELECT DISTINCT *, dense_rank() over(PARTITION BY measurement_concept_id ORDER BY distance DESC) AS rank_order FROM get_ancestors_loinc_hierarchy @@ -142,9 +141,9 @@ get_loinc_group_descendant_concept_ids AS ( - # We use left join to concept_ancestor because not all the concepts have an ancestor, in which case + # We use left join to concept_ancestor because not all the concepts have an ancestor, in which case # we make the measurement_concept_id its own ancestor - SELECT + SELECT lg.Panel_OMOP_ID, lg.Panel_Name, lg.measurement_concept_id, @@ -157,18 +156,18 @@ COALESCE(c1.concept_class_id, lg.parent_concept_class_id) AS loinc_groupy_descendant_concept_class_id, COALESCE(ca1.min_levels_of_separation, -1) AS distance FROM get_direct_parents_loinc_group AS lg - LEFT JOIN + LEFT JOIN {VOCAB_DATASET_ID}.concept_ancestor AS ca1 ON - lg.parent_concept_id = ca1.ancestor_concept_id + lg.parent_concept_id = ca1.ancestor_concept_id AND ca1.min_levels_of_separation <> 0 LEFT JOIN {VOCAB_DATASET_ID}.concept AS c1 - ON ca1.descendant_concept_id = c1.concept_id + ON ca1.descendant_concept_id = c1.concept_id ), get_loinc_hierarchy_descendant_concept_ids AS ( - # We use left join to concept_ancestor because not all the concepts have an ancestor, in which case + # We use left join to concept_ancestor because not all the concepts have an ancestor, in which case # we make the measurement_concept_id its own ancestor SELECT lh.Panel_OMOP_ID, @@ -183,19 +182,19 @@ COALESCE(c1.concept_class_id, lh.ancestor_concept_class_id) AS loinc_hierarchy_descendant_concept_class_id, COALESCE(ca1.min_levels_of_separation, -1) AS distance FROM get_ancestors_loinc_hierarchy_distinct AS lh - LEFT JOIN + LEFT JOIN {VOCAB_DATASET_ID}.concept_ancestor AS ca1 ON lh.ancestor_concept_id = ca1.ancestor_concept_id AND ca1.min_levels_of_separation <> 0 LEFT JOIN {VOCAB_DATASET_ID}.concept AS c1 - ON ca1.descendant_concept_id = c1.concept_id + ON ca1.descendant_concept_id = c1.concept_id ), get_measurement_concept_sets_descendants AS ( - # We use a full outer join between the loinc_hierarchy descendants and loinc_group descendants - # in order to maximize the number of descendants retrieved by both classficiation systems. + # We use a full outer join between the loinc_hierarchy descendants and loinc_group descendants + # in order to maximize the number of descendants retrieved by both classficiation systems. 
SELECT DISTINCT COALESCE(lh.Panel_OMOP_ID, lg.Panel_OMOP_ID) AS panel_omop_id, COALESCE(lh.Panel_Name, lg.Panel_Name) AS panel_name, @@ -213,7 +212,7 @@ COALESCE(lh.loinc_hierarchy_descendant_concept_name, lg.loinc_groupy_descendant_concept_name) AS descendant_concept_name, COALESCE(lh.loinc_hierarchy_descendant_concept_class_id, lg.loinc_groupy_descendant_concept_class_id) AS descendant_concept_class_id FROM get_loinc_hierarchy_descendant_concept_ids AS lh - FULL OUTER JOIN + FULL OUTER JOIN get_loinc_group_descendant_concept_ids AS lg ON lh.loinc_hierarchy_descendant_concept_id = lg.loinc_groupy_descendant_concept_id @@ -228,20 +227,20 @@ COUNT(DISTINCT person_id) AS n_person, COUNT(DISTINCT measurement_id) AS n_meas, COUNT(DISTINCT descendant_concept_id) AS n_descendant -FROM +FROM ( SELECT measurement_id, person_id, IF(measurement_concept_id IS NULL OR measurement_concept_id=0, measurement_source_concept_id, measurement_concept_id) AS measurement_concept_id FROM - `{DATASET_ID}.measurement` + `{DATASET_ID}.measurement` ) meas JOIN `{DATASET_ID}._mapping_measurement` USING (measurement_id) -JOIN +JOIN get_measurement_concept_sets_descendants AS valid_lab ON meas.measurement_concept_id = valid_lab.descendant_concept_id @@ -251,7 +250,7 @@ 3, 4, 5 -ORDER BY +ORDER BY 1,2 """ diff --git a/data_steward/analytics/cdr_ops/ad_hoc_analyses/list_tables_with_duplicate_domain_ids.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/list_tables_with_duplicate_domain_ids.py index 4baab0bc7b..ca0370ff17 100644 --- a/data_steward/analytics/cdr_ops/ad_hoc_analyses/list_tables_with_duplicate_domain_ids.py +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/list_tables_with_duplicate_domain_ids.py @@ -12,7 +12,6 @@ # name: python3 # --- -import bq_utils import utils.bq from notebooks import parameters @@ -26,7 +25,7 @@ query = """ SELECT REPLACE(table_id, '_person', '') AS hpo_id FROM `{bq_dataset_id}.__TABLES__` -WHERE table_id LIKE '%person' +WHERE table_id LIKE '%person' AND table_id NOT LIKE '%unioned_ehr_%' AND table_id NOT LIKE '\\\_%' """.format(bq_dataset_id=bigquery_dataset_id) hpo_ids = utils.bq.query(query).tolist() @@ -48,7 +47,7 @@ FROM prod_drc_dataset.__TABLES__ T LEFT JOIN (select distinct '{h}_{d}' as table_name, count(*) as num_dups -from `{bq_dataset_id}.{h}_{d}` +from `{bq_dataset_id}.{h}_{d}` group by {d}_id having count(*) > 1 order by num_dups desc diff --git a/data_steward/analytics/cdr_ops/ad_hoc_analyses/person_gender_sex.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/person_gender_sex.py index 9311b51082..616d11a5c6 100644 --- a/data_steward/analytics/cdr_ops/ad_hoc_analyses/person_gender_sex.py +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/person_gender_sex.py @@ -7,7 +7,6 @@ # * `sex_at_birth_concept_id` contains the associated `value_as_concept_id` # * `sex_at_birth_source_concept_id` contains the associated `value_source_concept_id` # * `sex_at_birth_source_value` contains the `concept_code` associated with `sex_at_birth_source_concept_id` -import bq_utils import utils.bq from notebooks import render from notebooks.parameters import SANDBOX, DEID_DATASET_ID @@ -88,7 +87,7 @@ def df_to_gbq(df, destination_table, table_schema=None): # - UPDATED_PERSON_QUERY = """ -SELECT +SELECT p.person_id, g.gender_concept_id, p.year_of_birth, @@ -127,7 +126,7 @@ def df_to_gbq(df, destination_table, table_schema=None): table_schema=person_schema) PERSON_HIST_QUERY = """ -SELECT +SELECT p.gender_concept_id, p.gender_source_value, p.gender_source_concept_id, diff --git 
a/data_steward/analytics/cdr_ops/ad_hoc_analyses/pop_retract.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/pop_retract.py index 654cd90dd0..1fdf9b2241 100644 --- a/data_steward/analytics/cdr_ops/ad_hoc_analyses/pop_retract.py +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/pop_retract.py @@ -1,7 +1,6 @@ # + from jinja2 import Template -import bq_utils import utils.bq from notebooks import render from notebooks.defaults import is_deid_dataset @@ -36,14 +35,14 @@ # Determine associated research IDs for RDR participants whose data must be retracted AIAN_PID_QUERY = """ -SELECT DISTINCT +SELECT DISTINCT rdr.person_id AS person_id, deid.research_id AS research_id FROM `{RDR}.observation` rdr JOIN `{COMBINED}.deid_map` deid ON rdr.person_id = deid.person_id -WHERE - rdr.observation_source_concept_id = 1586140 +WHERE + rdr.observation_source_concept_id = 1586140 AND rdr.value_source_concept_id = 1586141 """ q = AIAN_PID_QUERY.format(RDR=RDR, COMBINED=COMBINED) @@ -80,24 +79,24 @@ def get_tables_with_person_id(input_dataset): WITH delete_row_counts AS ( {% for table in TABLES %} ( - SELECT '{{ table }}' AS table_name, + SELECT '{{ table }}' AS table_name, COUNT(1) AS rows_to_delete, (SELECT row_count FROM {{ INPUT_DATASET }}.__TABLES__ WHERE table_id = '{{ table }}') AS total_rows FROM `{{ INPUT_DATASET }}.{{ table }}` t WHERE EXISTS ( - SELECT 1 FROM `{{ ID_TABLE }}` + SELECT 1 FROM `{{ ID_TABLE }}` WHERE {{ 'research_id' if IS_INPUT_DATASET_DEID else 'person_id' }} = t.person_id) - ) + ) {% if not loop.last %} UNION ALL - {% endif %} + {% endif %} {% endfor %} ) -SELECT - d.table_name, +SELECT + d.table_name, d.total_rows AS input_row_count, d.rows_to_delete AS rows_to_delete, - d.total_rows - d.rows_to_delete AS expected_output_row_count, + d.total_rows - d.rows_to_delete AS expected_output_row_count, t.row_count AS actual_output_row_count, t.row_count = (d.total_rows - d.rows_to_delete) AS pass FROM delete_row_counts d diff --git a/data_steward/analytics/cdr_ops/ad_hoc_analyses/sex_gender_540.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/sex_gender_540.py index a12fb65d5e..aaa610554a 100644 --- a/data_steward/analytics/cdr_ops/ad_hoc_analyses/sex_gender_540.py +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/sex_gender_540.py @@ -28,7 +28,6 @@ # - gender_concept_id = value_as_concept_id # - gender_source_value = concept_code associated with value_source_concept_id # - gender_source_concept_id = value_source_concept_id -import bq_utils import utils.bq from notebooks import render, parameters import pandas as pd diff --git a/data_steward/analytics/cdr_ops/ad_hoc_analyses/site_mapping.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/site_mapping.py index 57b6730374..71cfbf19cd 100644 --- a/data_steward/analytics/cdr_ops/ad_hoc_analyses/site_mapping.py +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/site_mapping.py @@ -1,5 +1,4 @@ # + -import bq_utils import utils.bq from notebooks import render, parameters @@ -12,10 +11,10 @@ # ## Row counts in combined `_mapping*` and deid `*_ext` tables ROW_COUNTS_QUERY = """ -SELECT dataset_id, - REPLACE(REPLACE(table_id, '_mapping_', ''), '_ext', '') mapped_table, - table_id, - creation_time, +SELECT dataset_id, + REPLACE(REPLACE(table_id, '_mapping_', ''), '_ext', '') mapped_table, + table_id, + creation_time, last_modified_time, row_count FROM @@ -25,7 +24,7 @@ UNION ALL - SELECT * + SELECT * FROM {COMBINED}.__TABLES__ d1 WHERE table_id LIKE '\\\_mapping\\\_%') diff --git 
a/data_steward/analytics/cdr_ops/ad_hoc_analyses/standard_concepts_cdr_389.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/standard_concepts_cdr_389.py index bac69c94ec..47af6fe87d 100644 --- a/data_steward/analytics/cdr_ops/ad_hoc_analyses/standard_concepts_cdr_389.py +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/standard_concepts_cdr_389.py @@ -33,7 +33,6 @@ # # # #### This notebook also does not exclude instances where the concept_id = 0. -import bq_utils import utils.bq from notebooks import parameters @@ -53,7 +52,7 @@ co_query = """ SELECT DISTINCT -co.condition_concept_id as pre_cr_concept_id, c1.standard_concept as pre_cr_standard_concept, c1.concept_name as pre_cr_cn, +co.condition_concept_id as pre_cr_concept_id, c1.standard_concept as pre_cr_standard_concept, c1.concept_name as pre_cr_cn, co_combined.condition_concept_id as post_cr_concept_id, c2.standard_concept as post_cr_standard_concept, c2.concept_name as post_cr_cn, (LOWER(c2.domain_id) LIKE '%condition%') as post_cr_domain_correct, COUNT(*) as count, COUNT(DISTINCT mco.src_hpo_id) as num_sites_w_change @@ -127,7 +126,7 @@ de_query = """ SELECT DISTINCT -de.drug_concept_id as pre_cr_concept_id, c1.standard_concept as pre_cr_standard_concept, c1.concept_name as pre_cr_cn, +de.drug_concept_id as pre_cr_concept_id, c1.standard_concept as pre_cr_standard_concept, c1.concept_name as pre_cr_cn, de_combined.drug_concept_id as post_cr_concept_id, c2.standard_concept as post_cr_standard_concept, c2.concept_name as post_cr_cn, (LOWER(c2.domain_id) LIKE '%drug%') as post_cr_domain_correct, COUNT(*) as count, COUNT(DISTINCT mde.src_hpo_id) as num_sites_w_change @@ -202,7 +201,7 @@ m_query = """ SELECT DISTINCT -m.measurement_concept_id as pre_cr_concept_id, c1.standard_concept as pre_cr_standard_concept, c1.concept_name as pre_cr_cn, +m.measurement_concept_id as pre_cr_concept_id, c1.standard_concept as pre_cr_standard_concept, c1.concept_name as pre_cr_cn, m_combined.measurement_concept_id as post_cr_concept_id, c2.standard_concept as post_cr_standard_concept, c2.concept_name as post_cr_cn, (LOWER(c2.domain_id) LIKE '%measurement%') as post_cr_domain_correct, COUNT(*) as count, COUNT(DISTINCT mm.src_hpo_id) as num_sites_w_change @@ -272,7 +271,7 @@ v_query = """ SELECT DISTINCT -v.visit_concept_id as pre_cr_concept_id, c1.standard_concept as pre_cr_standard_concept, c1.concept_name as pre_cr_cn, +v.visit_concept_id as pre_cr_concept_id, c1.standard_concept as pre_cr_standard_concept, c1.concept_name as pre_cr_cn, v_combined.visit_concept_id as post_cr_concept_id, c2.standard_concept as post_cr_standard_concept, c2.concept_name as post_cr_cn, (LOWER(c2.domain_id) LIKE '%visit%') as post_cr_domain_correct, COUNT(*) as count, COUNT(DISTINCT mv.src_hpo_id) as num_sites_w_change @@ -342,7 +341,7 @@ p_query = """ SELECT DISTINCT -p.procedure_concept_id as pre_cr_concept_id, c1.standard_concept as pre_cr_standard_concept, c1.concept_name as pre_cr_cn, +p.procedure_concept_id as pre_cr_concept_id, c1.standard_concept as pre_cr_standard_concept, c1.concept_name as pre_cr_cn, p_combined.procedure_concept_id as post_cr_concept_id, c2.standard_concept as post_cr_standard_concept, c2.concept_name as post_cr_cn, (LOWER(c2.domain_id) LIKE '%procedure%') as post_cr_domain_correct, COUNT(*) as count, COUNT(DISTINCT mp.src_hpo_id) as num_sites_w_change @@ -411,7 +410,7 @@ o_query = """ SELECT DISTINCT -o.observation_concept_id as pre_cr_concept_id, c1.standard_concept as pre_cr_standard_concept, c1.concept_name as pre_cr_cn, 
+o.observation_concept_id as pre_cr_concept_id, c1.standard_concept as pre_cr_standard_concept, c1.concept_name as pre_cr_cn, o_combined.observation_concept_id as post_cr_concept_id, c2.standard_concept as post_cr_standard_concept, c2.concept_name as post_cr_cn, (LOWER(c2.domain_id) LIKE '%observation%') as post_cr_domain_correct, COUNT(*) as count, COUNT(DISTINCT mo.src_hpo_id) as num_sites_w_change diff --git a/data_steward/analytics/cdr_ops/systematic_scripts/data_loss_through_pipeline.py b/data_steward/analytics/cdr_ops/systematic_scripts/data_loss_through_pipeline.py index f18712905c..984cf7e996 100644 --- a/data_steward/analytics/cdr_ops/systematic_scripts/data_loss_through_pipeline.py +++ b/data_steward/analytics/cdr_ops/systematic_scripts/data_loss_through_pipeline.py @@ -29,7 +29,6 @@ # %load_ext google.cloud.bigquery # %matplotlib inline -import bq_utils import utils.bq from notebooks import parameters import pandas as pd @@ -58,25 +57,25 @@ def create_dicts_w_info(df, x_label, column_label): """ This function is used to create a dictionary that can be easily converted to a graphical representation based on the values for a particular dataframe - + Parameters ---------- df (dataframe): dataframe that contains the information to be converted - + x_label (string): the column of the dataframe whose rows will then be converted to they keys of a dictionary - + column_label (string): the column that contains the data quality metric being investigated - + Returns ------- data_qual_info (dictionary): has the following structure - + keys: the column for a particular dataframe that represents the elements that whose data quality is being compared (e.g. HPOs, different measurement/unit combinations) - + values: the data quality metric being compared """ rows = df[x_label].unique().tolist() @@ -98,28 +97,28 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, color, """ Function is used to create a bar graph for a particular dictionary with information about data quality - + Parameters ---------- info_dict (dictionary): contains information about data quality. The keys for the dictionary will serve as the x-axis labels whereas the values should serve as the 'y-value' for the particular bar - + xlabel (str): label to display across the x-axis - + ylabel (str): label to display across the y-axis - + title (str): title for the graph - + img_name (str): image used to save the image to the local repository - + color (str): character used to specify the colours of the bars - + total_diff_color (bool): indicates whether or not the last bar should be coloured red ( as opposed to the rest of the bars on the graph). This is typically used when the ultimate value of the dictionary is of particular important (e.g. representing an 'aggregate' metric across all of the sites) - + turnoff_x (bool): used to disable the x-axis labels (for each of the bars). This is typically used when there are so many x-axis labels that they overlap and obscure legibility """ @@ -191,23 +190,23 @@ def create_pie_chart(dataframe, title, img_name): """ Function is used to create a pie chart that can show how much each site contributes to the overall 'drop' between the unioned and combined datasets - + Function also saves the outputted pie chart to the current directory - + Parameters ---------- - dataframe (df): dataframe for a particular table. shows the following for + dataframe (df): dataframe for a particular table. shows the following for HPOs that uploaded data: - + a. 
the number of rows in the unioned dataset b. the number of rows in the combined dataset c. the total 'drop' of rows across unioned to combined, expressed as a percentage d. the relative 'contribution' of each site to the overall drop from unioned to combined - - + + title (str): title of the graph - + img_name (str): title of the image to be saved """ hpo_list = dataframe['source_hpo'].tolist()[1:] # do not take 'total' @@ -246,24 +245,24 @@ def generate_query(dataset, person_var, record_var, table_name, field_name): a. generate a string that can be fed into BigQuery b. create a dataframe that contains information about the number of people and records for a particular dataset - + Parameters ---------- dataset (string): name of the dataset that will be queried (originally from the parameters file) - + person_var (string): variable that dictates how the 'number of people' will be displayed in the resultant dataframe - + record_var (string): variable that dictates how the 'number of records' will be displayed in the resultant dataframe - + table_name (string): represents the table that is being queried - + field_name (string): represents the field that should count the number of records for a particular dataset/table combination. this is usually 'table name'_id - + Returns ------- @@ -295,11 +294,11 @@ def generate_query(dataset, person_var, record_var, table_name, field_name): def extract_first_int_from_series(series): """ Function is used to extract the first integer from a Pandas series object. - + Parameters ---------- series (series): Pandas series object - + Returns ------- integer (int): the first integer from a Pandas series object @@ -319,38 +318,38 @@ def create_aggregate_table_df(unioned, combined, deid, unioned_persons_string, record_string): """ Function is used to create a dataframe that can display the 'drop off' of records across multiple - stages of the pipeline. - - + stages of the pipeline. + + Parameters: ----------- - + unioned (dataframe): contains information regarding the number of persons and record in the unioned dataset - + combined (dataframe): contains information regarding the number of persons and record in the combined dataset - + deid (dataframe): contains information regarding the number of persons and record in the deid dataset - + unioned_person_string (str): column name to determine the number of persons in the unioned dataset - + combined_person_string (str): column name to determine the number of persons in the combined dataset - + deid_person_string (str): column name to determine the number of persons in the deid dataset - + unioned_records_string (str): column name to determine the number of records in the unioned dataset - + combined_records_string (str): column name to determine the number of records in the combined dataset deid_records_string (str): column name to determine the number of records in the deid dataset - + person_string (str): row title to indicate the person drop for each stage of the pipeline - + record_string (str): row title to indicate the record drop for each stage of the pipeline - - + + Returns: -------- df (dataframe): contains information about the record and person count drop across each stage of @@ -761,19 +760,19 @@ def generate_site_level_query(id_name, unioned, table_name, combined): b. the number of rows for the HPO for a particular table in the unioned dataset c. the number of rows for the HPO for a particular table in the combined dataset d. 
the total 'drop' of rows across unioned to combined, expressed as a percentage - + Parameters ---------- - id_name (string): represents the 'primary key' of the table (the unique identifier + id_name (string): represents the 'primary key' of the table (the unique identifier for each row) - + unioned (string): the name of the unioned dataset to be queried - + table_name (string): name of the table that is being investigated - + combined (string): the name of the combined dataset to be queried - - + + Returns ------- dataframe (df): contains all of the information outlined in the top of the docstring @@ -827,17 +826,17 @@ def add_total_drop_row(dataframe): """ Function is used to add a 'total' row at the bottom of a dataframe that shows the relative 'drop' across the pipeline (unioned to combined) for the different sites. - + This row will show: a. the number of rows in the unioned dataset b. the number of rows in the combined dataset c. the total 'drop' of rows across unioned to combined, expressed as a percentage - + Parameters: ---------- dataframe (df): dataframe for a particular table. shows a-c (above) for each of the HPOs that uploaded data - + Returns: -------- dataframe (df): the inputted dataframe with an additional 'total' row at the end @@ -869,16 +868,16 @@ def add_percent_of_drop_column(dataframe): Function is used to add a 'percent_of_drop' column that shows how much each site's 'drop' contributed to the 'overall' drop from the unioned to the combined steps of the pipeline. - + Parameters ---------- - dataframe (df): dataframe for a particular table. shows the following for + dataframe (df): dataframe for a particular table. shows the following for HPOs that uploaded data: - + a. the number of rows in the unioned dataset b. the number of rows in the combined dataset c. 
the total 'drop' of rows across unioned to combined, expressed as a percentage - + Returns ------- dataframe (df): the above dataframe with a new column that shows each site's diff --git a/data_steward/analytics/cdr_ops/systematic_scripts/date_disparity_with_respect_to_visit.py b/data_steward/analytics/cdr_ops/systematic_scripts/date_disparity_with_respect_to_visit.py index 532ca1febf..b75d78c410 100644 --- a/data_steward/analytics/cdr_ops/systematic_scripts/date_disparity_with_respect_to_visit.py +++ b/data_steward/analytics/cdr_ops/systematic_scripts/date_disparity_with_respect_to_visit.py @@ -38,7 +38,6 @@ # %load_ext google.cloud.bigquery # + -import bq_utils import utils.bq from notebooks import parameters @@ -72,11 +71,11 @@ p_v_query = """ SELECT DISTINCT -a.*, +a.*, (a.procedure_vis_start_diff + a.procedure_vis_end_diff + a.procedure_vis_start_dt_diff + a.procedure_vis_end_dt_diff + a.procedure_dt_vis_start_dt_diff + a.procedure_dt_vis_end_dt_diff) as total_diff -FROM +FROM ( SELECT - mpo.src_hpo_id, COUNT(mpo.src_hpo_id) as num_bad_records, + mpo.src_hpo_id, COUNT(mpo.src_hpo_id) as num_bad_records, IFNULL(ABS(DATE_DIFF(po.procedure_date, vo.visit_start_date, DAY)), 0) as procedure_vis_start_diff, IFNULL(ABS(DATE_DIFF(po.procedure_date, vo.visit_end_date, DAY)), 0) as procedure_vis_end_diff, IFNULL(ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), po.procedure_date, DAY)), 0) as procedure_vis_start_dt_diff, @@ -85,19 +84,19 @@ IFNULL(ABS(DATE_DIFF(CAST(po.procedure_datetime AS DATE), CAST(vo.visit_end_datetime AS DATE), DAY)), 0) as procedure_dt_vis_end_dt_diff, ( - ABS(DATE_DIFF(po.procedure_date, vo.visit_start_date, DAY)) = - ABS(DATE_DIFF(po.procedure_date, vo.visit_end_date, DAY)) + ABS(DATE_DIFF(po.procedure_date, vo.visit_start_date, DAY)) = + ABS(DATE_DIFF(po.procedure_date, vo.visit_end_date, DAY)) AND ABS(DATE_DIFF(po.procedure_date, vo.visit_end_date, DAY)) = - ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), po.procedure_date, DAY)) + ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), po.procedure_date, DAY)) AND ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), po.procedure_date, DAY)) = ABS(DATE_DIFF(CAST(vo.visit_end_datetime AS DATE), po.procedure_date, DAY)) AND - ABS(DATE_DIFF(CAST(vo.visit_end_datetime AS DATE), po.procedure_date, DAY)) = - ABS(DATE_DIFF(CAST(po.procedure_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) + ABS(DATE_DIFF(CAST(vo.visit_end_datetime AS DATE), po.procedure_date, DAY)) = + ABS(DATE_DIFF(CAST(po.procedure_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) AND - ABS(DATE_DIFF(CAST(po.procedure_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) = + ABS(DATE_DIFF(CAST(po.procedure_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) = ABS(DATE_DIFF(CAST(po.procedure_datetime AS DATE), CAST(vo.visit_end_datetime AS DATE), DAY)) ) as all_discrepancies_equal @@ -131,7 +130,7 @@ OR po.procedure_date > vo.visit_end_date) - OR + OR -- problem with datetime (po.procedure_datetime < vo.visit_start_datetime OR @@ -142,9 +141,9 @@ (po.procedure_date < CAST(vo.visit_start_datetime AS DATE) OR po.procedure_date > CAST(vo.visit_end_datetime AS DATE)) - + OR - + --problem with the datetime (CAST(po.procedure_datetime AS DATE) < CAST(vo.visit_start_datetime AS DATE) OR @@ -188,28 +187,28 @@ def create_dicts_w_info(df, """ This function is used to create a dictionary that can be easily converted to a graphical representation based on the values for a particular dataframe - + Parameters 
---------- df (dataframe): dataframe that contains the information to be converted - + table_visit_diff_string (string): the column that is used to calculate the 'average' difference between a date of interest and the visit start date. for instance, this would allow someone to specify the difference between the observation date and the visit start date. - + bad_records_string (string): the column of the dataframe whose rows will be summed and then converted to the keys of a dictionary. for instance 'num_bad_records' is often used to show the total number of 'bad' (discrepant) records for a particular site - + Returns ------- num_bad_records (dictionary): has the following structure keys: the HPOs values: the total number of 'bad' (discrepant) records for the particular column of interest - + table_visit_diff_dict (dictionary): has the following structure keys: the HPOs values: the 'average' difference between the two types of dates as specified @@ -262,27 +261,27 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, colour, """ Function is used to create a bar graph for a particular dictionary with information about data quality - + Parameters ---------- info_dict (dictionary): contains information about data quality. The keys for the dictionary will serve as the x-axis labels whereas the values should serve as the 'y-value' for the particular bar - + xlabel (str): label to display across the x-axis - + ylabel (str): label to display across the y-axis - + title (str): title for the graph - + img_name (str): image used to save the image to the local repository - + colour (str): character used to specify the colours of the bars - + total_diff_colour (bool): indicates whether or not the last bar should be coloured red ( as opposed to the rest of the bars on the graph). This is typically used when the ultimate value of the dictionary is of particular important (e.g. 
representing an 'aggregate' metric - across all of the sites) + across all of the sites) """ bar_list = plt.bar(range(len(info_dict)), list(info_dict.values()), @@ -339,11 +338,11 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, colour, observation_visit_query = """ SELECT DISTINCT -a.*, +a.*, (a.observation_vis_start_diff + a.observation_vis_end_diff + a.observation_vis_start_dt_diff + a.observation_vis_end_dt_diff + a.observation_dt_vis_start_dt_diff + a.observation_dt_vis_end_dt_diff) as total_diff -FROM +FROM ( SELECT - mo.src_hpo_id, COUNT(mo.src_hpo_id) as num_bad_records, + mo.src_hpo_id, COUNT(mo.src_hpo_id) as num_bad_records, IFNULL(ABS(DATE_DIFF(o.observation_date, vo.visit_start_date, DAY)), 0) as observation_vis_start_diff, IFNULL(ABS(DATE_DIFF(o.observation_date, vo.visit_end_date, DAY)), 0) as observation_vis_end_diff, IFNULL(ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), o.observation_date, DAY)), 0) as observation_vis_start_dt_diff, @@ -352,19 +351,19 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, colour, IFNULL(ABS(DATE_DIFF(CAST(o.observation_datetime AS DATE), CAST(vo.visit_end_datetime AS DATE), DAY)), 0) as observation_dt_vis_end_dt_diff, ( - ABS(DATE_DIFF(o.observation_date, vo.visit_start_date, DAY)) = - ABS(DATE_DIFF(o.observation_date, vo.visit_end_date, DAY)) + ABS(DATE_DIFF(o.observation_date, vo.visit_start_date, DAY)) = + ABS(DATE_DIFF(o.observation_date, vo.visit_end_date, DAY)) AND ABS(DATE_DIFF(o.observation_date, vo.visit_end_date, DAY)) = - ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), o.observation_date, DAY)) + ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), o.observation_date, DAY)) AND ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), o.observation_date, DAY)) = ABS(DATE_DIFF(CAST(vo.visit_end_datetime AS DATE), o.observation_date, DAY)) AND - ABS(DATE_DIFF(CAST(vo.visit_end_datetime AS DATE), o.observation_date, DAY)) = - ABS(DATE_DIFF(CAST(o.observation_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) + ABS(DATE_DIFF(CAST(vo.visit_end_datetime AS DATE), o.observation_date, DAY)) = + ABS(DATE_DIFF(CAST(o.observation_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) AND - ABS(DATE_DIFF(CAST(o.observation_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) = + ABS(DATE_DIFF(CAST(o.observation_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) = ABS(DATE_DIFF(CAST(o.observation_datetime AS DATE), CAST(vo.visit_end_datetime AS DATE), DAY)) ) as all_discrepancies_equal @@ -398,7 +397,7 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, colour, OR o.observation_date > vo.visit_end_date) - OR + OR -- problem with datetime (o.observation_datetime < vo.visit_start_datetime OR @@ -409,9 +408,9 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, colour, (o.observation_date < CAST(vo.visit_start_datetime AS DATE) OR o.observation_date > CAST(vo.visit_end_datetime AS DATE)) - + OR - + --problem with the datetime (CAST(o.observation_datetime AS DATE) < CAST(vo.visit_start_datetime AS DATE) OR @@ -487,11 +486,11 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, colour, measurement_visit_query = """ SELECT DISTINCT -a.*, +a.*, (a.measurement_vis_start_diff + a.measurement_vis_end_diff + a.measurement_vis_start_dt_diff + a.measurement_vis_end_dt_diff + a.measurement_dt_vis_start_dt_diff + a.measurement_dt_vis_end_dt_diff) as total_diff -FROM +FROM ( SELECT - mm.src_hpo_id, COUNT(mm.src_hpo_id) as num_bad_records, + 
mm.src_hpo_id, COUNT(mm.src_hpo_id) as num_bad_records, IFNULL(ABS(DATE_DIFF(m.measurement_date, vo.visit_start_date, DAY)), 0) as measurement_vis_start_diff, IFNULL(ABS(DATE_DIFF(m.measurement_date, vo.visit_end_date, DAY)), 0) as measurement_vis_end_diff, IFNULL(ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), m.measurement_date, DAY)), 0) as measurement_vis_start_dt_diff, @@ -500,19 +499,19 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, colour, IFNULL(ABS(DATE_DIFF(CAST(m.measurement_datetime AS DATE), CAST(vo.visit_end_datetime AS DATE), DAY)), 0) as measurement_dt_vis_end_dt_diff, ( - ABS(DATE_DIFF(m.measurement_date, vo.visit_start_date, DAY)) = - ABS(DATE_DIFF(m.measurement_date, vo.visit_end_date, DAY)) + ABS(DATE_DIFF(m.measurement_date, vo.visit_start_date, DAY)) = + ABS(DATE_DIFF(m.measurement_date, vo.visit_end_date, DAY)) AND ABS(DATE_DIFF(m.measurement_date, vo.visit_end_date, DAY)) = - ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), m.measurement_date, DAY)) + ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), m.measurement_date, DAY)) AND ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), m.measurement_date, DAY)) = ABS(DATE_DIFF(CAST(vo.visit_end_datetime AS DATE), m.measurement_date, DAY)) AND - ABS(DATE_DIFF(CAST(vo.visit_end_datetime AS DATE), m.measurement_date, DAY)) = - ABS(DATE_DIFF(CAST(m.measurement_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) + ABS(DATE_DIFF(CAST(vo.visit_end_datetime AS DATE), m.measurement_date, DAY)) = + ABS(DATE_DIFF(CAST(m.measurement_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) AND - ABS(DATE_DIFF(CAST(m.measurement_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) = + ABS(DATE_DIFF(CAST(m.measurement_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) = ABS(DATE_DIFF(CAST(m.measurement_datetime AS DATE), CAST(vo.visit_end_datetime AS DATE), DAY)) ) as all_discrepancies_equal @@ -546,7 +545,7 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, colour, OR m.measurement_date > vo.visit_end_date) - OR + OR -- problem with datetime (m.measurement_datetime < vo.visit_start_datetime OR @@ -557,9 +556,9 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, colour, (m.measurement_date < CAST(vo.visit_start_datetime AS DATE) OR m.measurement_date > CAST(vo.visit_end_datetime AS DATE)) - + OR - + --problem with the datetime (CAST(m.measurement_datetime AS DATE) < CAST(vo.visit_start_datetime AS DATE) OR @@ -634,21 +633,21 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, colour, condition_visit_query = """ SELECT DISTINCT -a.*, +a.*, (a.condition_vis_start_diff + a.condition_vis_start_dt_diff + a.condition_dt_vis_start_dt_diff) as total_diff -FROM +FROM ( SELECT - mco.src_hpo_id, COUNT(mco.src_hpo_id) as num_bad_records, + mco.src_hpo_id, COUNT(mco.src_hpo_id) as num_bad_records, IFNULL(ABS(DATE_DIFF(co.condition_start_date, vo.visit_start_date, DAY)), 0) as condition_vis_start_diff, IFNULL(ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), co.condition_start_date, DAY)), 0) as condition_vis_start_dt_diff, IFNULL(ABS(DATE_DIFF(CAST(co.condition_start_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)), 0) as condition_dt_vis_start_dt_diff, - + ( - ABS(DATE_DIFF(co.condition_start_date, vo.visit_start_date, DAY)) = - ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), co.condition_start_date, DAY)) + ABS(DATE_DIFF(co.condition_start_date, vo.visit_start_date, DAY)) = + 
ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), co.condition_start_date, DAY)) AND ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), co.condition_start_date, DAY)) = - ABS(DATE_DIFF(CAST(co.condition_start_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) + ABS(DATE_DIFF(CAST(co.condition_start_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) ) as all_discrepancies_equal FROM @@ -679,16 +678,16 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, colour, -- problem with procedure date (co.condition_start_date < vo.visit_start_date) - OR + OR -- problem with datetime (co.condition_start_datetime < vo.visit_start_datetime) OR -- problem with the datetime (extracting date for comparison) (co.condition_start_date < CAST(vo.visit_start_datetime AS DATE)) - + OR - + --problem with the datetime (CAST(co.condition_start_datetime AS DATE) < CAST(vo.visit_start_datetime AS DATE)) ) @@ -752,21 +751,21 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, colour, drug_visit_query = """ SELECT DISTINCT -a.*, +a.*, (a.drug_vis_start_diff + a.drug_vis_start_dt_diff + a.drug_dt_vis_start_dt_diff) as total_diff -FROM +FROM ( SELECT - mde.src_hpo_id, COUNT(mde.src_hpo_id) as num_bad_records, + mde.src_hpo_id, COUNT(mde.src_hpo_id) as num_bad_records, IFNULL(ABS(DATE_DIFF(de.drug_exposure_start_date, vo.visit_start_date, DAY)), 0) as drug_vis_start_diff, IFNULL(ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), de.drug_exposure_start_date, DAY)), 0) as drug_vis_start_dt_diff, IFNULL(ABS(DATE_DIFF(CAST(de.drug_exposure_start_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)), 0) as drug_dt_vis_start_dt_diff, - + ( - ABS(DATE_DIFF(de.drug_exposure_start_date, vo.visit_start_date, DAY)) = - ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), de.drug_exposure_start_date, DAY)) + ABS(DATE_DIFF(de.drug_exposure_start_date, vo.visit_start_date, DAY)) = + ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), de.drug_exposure_start_date, DAY)) AND ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), de.drug_exposure_start_date, DAY)) = - ABS(DATE_DIFF(CAST(de.drug_exposure_start_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) + ABS(DATE_DIFF(CAST(de.drug_exposure_start_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) ) as all_discrepancies_equal FROM @@ -797,16 +796,16 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, colour, -- problem with procedure date (de.drug_exposure_start_date < vo.visit_start_date) - OR + OR -- problem with datetime (de.drug_exposure_start_datetime < vo.visit_start_datetime) OR -- problem with the datetime (extracting date for comparison) (de.drug_exposure_start_date < CAST(vo.visit_start_datetime AS DATE)) - + OR - + --problem with the datetime (CAST(de.drug_exposure_start_datetime AS DATE) < CAST(vo.visit_start_datetime AS DATE)) ) diff --git a/data_steward/analytics/cdr_ops/systematic_scripts/ehr_data_quality_dashboard_testing.py b/data_steward/analytics/cdr_ops/systematic_scripts/ehr_data_quality_dashboard_testing.py index fb691788c8..469889ef28 100644 --- a/data_steward/analytics/cdr_ops/systematic_scripts/ehr_data_quality_dashboard_testing.py +++ b/data_steward/analytics/cdr_ops/systematic_scripts/ehr_data_quality_dashboard_testing.py @@ -21,7 +21,6 @@ # %load_ext google.cloud.bigquery # + -import bq_utils import utils.bq from notebooks import parameters @@ -66,7 +65,7 @@ JOIN `{}._mapping_measurement` mm ON -mm.measurement_id = m.measurement_id +mm.measurement_id = 
m.measurement_id JOIN `{}.concept` c ON @@ -108,7 +107,7 @@ JOIN `{}._mapping_measurement` mm ON -mm.measurement_id = m.measurement_id +mm.measurement_id = m.measurement_id JOIN `{}.concept` c ON @@ -196,7 +195,7 @@ JOIN `{}._mapping_measurement` mm ON - mm.measurement_id = m.measurement_id + mm.measurement_id = m.measurement_id JOIN `{}.concept` c ON @@ -238,7 +237,7 @@ JOIN `{}._mapping_measurement` mm ON - mm.measurement_id = m.measurement_id + mm.measurement_id = m.measurement_id JOIN `{}.concept` c ON @@ -305,28 +304,28 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, color, """ Function is used to create a bar graph for a particular dictionary with information about data quality - + Parameters ---------- info_dict (dictionary): contains information about data quality. The keys for the dictionary will serve as the x-axis labels whereas the values should serve as the 'y-value' for the particular bar - + xlabel (str): label to display across the x-axis - + ylabel (str): label to display across the y-axis - + title (str): title for the graph - + img_name (str): image used to save the image to the local repository - + color (str): character used to specify the colours of the bars - + total_diff_color (bool): indicates whether or not the last bar should be coloured red ( as opposed to the rest of the bars on the graph). This is typically used when the ultimate value of the dictionary is of particular important (e.g. representing an 'aggregate' metric across all of the sites) - + turnoff_x (bool): used to disable the x-axis labels (for each of the bars). This is typically used when there are so many x-axis labels that they overlap and obscure legibility """ @@ -357,25 +356,25 @@ def create_dicts_w_info(df, x_label, column_label): """ This function is used to create a dictionary that can be easily converted to a graphical representation based on the values for a particular dataframe - + Parameters ---------- df (dataframe): dataframe that contains the information to be converted - + x_label (string): the column of the dataframe whose rows will then be converted to they keys of a dictionary - + column_label (string): the column that contains the data quality metric being investigated - + Returns ------- data_qual_info (dictionary): has the following structure - + keys: the column for a particular dataframe that represents the elements that whose data quality is being compared (e.g. 
HPOs, different measurement/unit combinations) - + values: the data quality metric being compared """ rows = df[x_label].unique().tolist() diff --git a/data_steward/analytics/cdr_ops/systematic_scripts/notes_volume_distribution.py b/data_steward/analytics/cdr_ops/systematic_scripts/notes_volume_distribution.py index 2a6374cead..d2c7662837 100644 --- a/data_steward/analytics/cdr_ops/systematic_scripts/notes_volume_distribution.py +++ b/data_steward/analytics/cdr_ops/systematic_scripts/notes_volume_distribution.py @@ -21,7 +21,6 @@ # %load_ext google.cloud.bigquery # + -import bq_utils import utils.bq from notebooks import parameters @@ -68,11 +67,11 @@ def create_dicts_w_info(df, column_label): """ This function is used to create a dictionary that can be easily converted to a graphical representation based on the values for a particular dataframe - + Parameters ---------- df (dataframe): dataframe that contains the information to be converted - + column_label (string): the column of the dataframe whose rows will then be converted to the keys of the dictionary """ @@ -95,23 +94,23 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, colour, """ Function is used to create a bar graph for a particular dictionary with information about data quality - + Parameters ---------- info_dict (dictionary): contains information about data quality. The keys for the dictionary will serve as the x-axis labels whereas the values should serve as the 'y-value' for the particular bar - + xlabel (str): label to display across the x-axis - + ylabel (str): label to display across the y-axis - + title (str): title for the graph - + img_name (str): image used to save the image to the local repository - + colour (str): character used to specify the colours of the bars - + total_diff_colour (bool): indicates whether or not the last bar should be coloured red ( as opposed to the rest of the bars on the graph). This is typically used when the ultimate value of the dictionary is of particular important (e.g. 
representing an 'aggregate' metric diff --git a/data_steward/validation/metrics/required_labs.py b/data_steward/validation/metrics/required_labs.py index 2bcdd358f4..3b3b50ec30 100644 --- a/data_steward/validation/metrics/required_labs.py +++ b/data_steward/validation/metrics/required_labs.py @@ -7,9 +7,9 @@ # Project imports import app_identity -import bq_utils import resources import common +import bq_utils from constants import bq_utils as bq_consts from gcloud.bq import BigQueryClient from validation.metrics.required_labs_sql import (IDENTIFY_LABS_QUERY, @@ -165,7 +165,7 @@ def get_lab_concept_summary_query(client, hpo_id): Get the query that checks if the HPO site has submitted the required labs :param client: a BigQueryClient :param hpo_id: Identifies the HPO site - :return: + :return: """ dataset_id = common.BIGQUERY_DATASET_ID hpo_measurement_table = resources.get_table_id(common.MEASUREMENT, diff --git a/tests/integration_tests/data_steward/retraction/retract_data_gcs_test.py b/tests/integration_tests/data_steward/retraction/retract_data_gcs_test.py index 294e1a4c0d..16998d83da 100644 --- a/tests/integration_tests/data_steward/retraction/retract_data_gcs_test.py +++ b/tests/integration_tests/data_steward/retraction/retract_data_gcs_test.py @@ -10,7 +10,6 @@ # Project imports import app_identity -import bq_utils from common import BIGQUERY_DATASET_ID from tests import test_util from retraction import retract_data_gcs as rd diff --git a/tests/integration_tests/data_steward/validation/export_test.py b/tests/integration_tests/data_steward/validation/export_test.py index cb198a860a..7586cf559b 100644 --- a/tests/integration_tests/data_steward/validation/export_test.py +++ b/tests/integration_tests/data_steward/validation/export_test.py @@ -7,7 +7,6 @@ # Project imports import app_identity -import bq_utils import common from gcloud.gcs import StorageClient from gcloud.bq import BigQueryClient
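Note (illustrative, not part of the patch): the hunks above only strip trailing whitespace from the notebooks' docstrings, so for orientation here is a minimal, hypothetical sketch of the helper pattern those docstrings describe — create_dicts_w_info turning a dataframe column pair into a dict, and create_graphs rendering that dict as a bar chart with the final 'aggregate' bar highlighted in red. The function names and parameters mirror the signatures visible in the hunks; the bodies, defaults, and sample data below are assumptions, not the patched implementations.

# Illustrative sketch only; names follow the notebooks, logic is assumed.
import pandas as pd
import matplotlib.pyplot as plt


def create_dicts_w_info(df, x_label, column_label):
    """Map each value of df[x_label] to its df[column_label] value (a data quality metric)."""
    rows = df[x_label].unique().tolist()
    return {row: df.loc[df[x_label] == row, column_label].iloc[0] for row in rows}


def create_graphs(info_dict, xlabel, ylabel, title, img_name, colour,
                  total_diff_colour=False, turnoff_x=False):
    """Render the dict as a bar chart; optionally colour the last ('aggregate') bar red."""
    bars = plt.bar(range(len(info_dict)), list(info_dict.values()), color=colour)
    if total_diff_colour:
        bars[-1].set_color('r')  # highlight the aggregate metric
    if turnoff_x:
        plt.xticks([])  # too many site labels would overlap and obscure legibility
    else:
        plt.xticks(range(len(info_dict)), list(info_dict.keys()), rotation=90)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.tight_layout()
    plt.savefig(img_name, bbox_inches='tight')
    plt.close()


if __name__ == '__main__':
    # Hypothetical per-site counts with an 'aggregate' row appended last.
    df = pd.DataFrame({'src_hpo_id': ['hpo_a', 'hpo_b', 'aggregate'],
                       'num_bad_records': [12, 3, 15]})
    metrics = create_dicts_w_info(df, 'src_hpo_id', 'num_bad_records')
    create_graphs(metrics, 'HPO site', 'Records outside visit dates',
                  'Records with date discrepancies by site',
                  'date_discrepancies.png', 'b', total_diff_colour=True)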