diff --git a/data_steward/analytics/cdr_ops/ad_hoc_analyses/cdr_person_id.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/cdr_person_id.py index 6a4cae5c22..137cf93d92 100644 --- a/data_steward/analytics/cdr_ops/ad_hoc_analyses/cdr_person_id.py +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/cdr_person_id.py @@ -1,7 +1,6 @@ # # Person # ## Person ID validation -import bq_utils import utils.bq from notebooks.parameters import RDR_DATASET_ID, EHR_DATASET_ID @@ -13,7 +12,7 @@ hpo_ids = utils.bq.query(""" SELECT REPLACE(table_id, '_person', '') AS hpo_id FROM `{EHR_DATASET_ID}.__TABLES__` -WHERE table_id LIKE '%person' +WHERE table_id LIKE '%person' AND table_id NOT LIKE '%unioned_ehr_%' AND table_id NOT LIKE '\\\_%' """.format(EHR_DATASET_ID=EHR_DATASET_ID)).hpo_id.tolist() @@ -31,7 +30,7 @@ (SELECT COUNT(1) AS n FROM {EHR_DATASET_ID}.{h}_person e WHERE NOT EXISTS( - SELECT 1 + SELECT 1 FROM {RDR_DATASET_ID}.person r WHERE r.person_id = e.person_id)) not_in_rdr ON TRUE @@ -63,31 +62,31 @@ RDR_EHR_NAME_MATCH_QUERY = ''' WITH rdr_first_name AS - (SELECT DISTINCT person_id, - FIRST_VALUE(value_as_string) + (SELECT DISTINCT person_id, + FIRST_VALUE(value_as_string) OVER (PARTITION BY person_id, observation_source_value ORDER BY value_as_string) val FROM {RDR_DATASET_ID}.observation WHERE observation_source_value = 'PIIName_First'), rdr_last_name AS - (SELECT DISTINCT person_id, - FIRST_VALUE(value_as_string) + (SELECT DISTINCT person_id, + FIRST_VALUE(value_as_string) OVER (PARTITION BY person_id, observation_source_value ORDER BY value_as_string) val FROM {RDR_DATASET_ID}.observation WHERE observation_source_value = 'PIIName_Last'), rdr_name AS - (SELECT + (SELECT f.person_id person_id, - f.val first_name, + f.val first_name, l.val last_name FROM rdr_first_name f JOIN rdr_last_name l USING (person_id)) SELECT '{HPO_ID}' hpo_id, - rdr.person_id rdr_person_id, - rdr.first_name rdr_first_name, - rdr.last_name rdr_last_name, + rdr.person_id rdr_person_id, + rdr.first_name rdr_first_name, + rdr.last_name rdr_last_name, pii.person_id pii_person_id, pii.first_name pii_first_name, pii.middle_name pii_middle_name, @@ -97,7 +96,7 @@ FROM rdr_name rdr JOIN `{EHR_DATASET_ID}.{HPO_ID}_pii_name` pii ON pii.first_name = rdr.first_name - AND pii.last_name = rdr.last_name + AND pii.last_name = rdr.last_name LEFT JOIN `{EHR_DATASET_ID}.{HPO_ID}_person` p ON pii.person_id = p.person_id ''' diff --git a/data_steward/analytics/cdr_ops/ad_hoc_analyses/cohort_testing.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/cohort_testing.py index a119882de8..8bdcb29c29 100644 --- a/data_steward/analytics/cdr_ops/ad_hoc_analyses/cohort_testing.py +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/cohort_testing.py @@ -21,7 +21,6 @@ # - Record count (condition_occurrence) # # - We want to determine if these fluctuations are potentially caused by OMOP vocabulary issues. If this is the case, we should be able to determine similar trends in AoU data. 
-import bq_utils import utils.bq from notebooks import parameters @@ -42,8 +41,8 @@ q4_2018_hypo_obs_card_query = """ SELECT DISTINCT -co.condition_concept_id, c.concept_name, COUNT(DISTINCT p.person_id) AS num_persons, -COUNT(DISTINCT co.condition_occurrence_id) as num_records, +co.condition_concept_id, c.concept_name, COUNT(DISTINCT p.person_id) AS num_persons, +COUNT(DISTINCT co.condition_occurrence_id) as num_records, ROUND(COUNT(DISTINCT co.condition_occurrence_id) / COUNT(DISTINCT p.person_id), 2) as records_per_capita FROM @@ -190,8 +189,8 @@ q2_2019_hypo_obs_card_query = """ SELECT DISTINCT -co.condition_concept_id, c.concept_name, COUNT(DISTINCT p.person_id) AS num_persons, -COUNT(DISTINCT co.condition_occurrence_id) as num_records, +co.condition_concept_id, c.concept_name, COUNT(DISTINCT p.person_id) AS num_persons, +COUNT(DISTINCT co.condition_occurrence_id) as num_records, ROUND(COUNT(DISTINCT co.condition_occurrence_id) / COUNT(DISTINCT p.person_id), 2) as records_per_capita FROM @@ -339,14 +338,14 @@ SELECT DISTINCT q4.*, q2.*, (SUM(q2.num_persons) - SUM(q4.old_num_persons)) as person_difference, -(SUM(q2.num_records) - SUM(q4.old_num_records)) as record_difference +(SUM(q2.num_records) - SUM(q4.old_num_records)) as record_difference FROM (SELECT DISTINCT - co.condition_concept_id as old_condition_concept_id, c.concept_name as old_concept_name, - COUNT(DISTINCT p.person_id) AS old_num_persons, - COUNT(DISTINCT co.condition_occurrence_id) as old_num_records, + co.condition_concept_id as old_condition_concept_id, c.concept_name as old_concept_name, + COUNT(DISTINCT p.person_id) AS old_num_persons, + COUNT(DISTINCT co.condition_occurrence_id) as old_num_records, ROUND(COUNT(DISTINCT co.condition_occurrence_id) / COUNT(DISTINCT p.person_id), 2) as old_records_per_capita FROM @@ -378,13 +377,13 @@ GROUP BY 1, 2 ORDER BY old_num_persons DESC) q4 - + LEFT JOIN - + (SELECT DISTINCT - co.condition_concept_id, c.concept_name, COUNT(DISTINCT p.person_id) AS num_persons, - COUNT(DISTINCT co.condition_occurrence_id) as num_records, + co.condition_concept_id, c.concept_name, COUNT(DISTINCT p.person_id) AS num_persons, + COUNT(DISTINCT co.condition_occurrence_id) as num_records, ROUND(COUNT(DISTINCT co.condition_occurrence_id) / COUNT(DISTINCT p.person_id), 2) as records_per_capita FROM @@ -416,10 +415,10 @@ GROUP BY 1, 2 ORDER BY num_persons DESC) q2 - + ON q4.old_condition_concept_id = q2.condition_concept_id - + GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ORDER BY old_num_persons DESC diff --git a/data_steward/analytics/cdr_ops/ad_hoc_analyses/coverage.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/coverage.py index 23755d5bf7..b136297338 100644 --- a/data_steward/analytics/cdr_ops/ad_hoc_analyses/coverage.py +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/coverage.py @@ -14,7 +14,6 @@ import warnings -import bq_utils import utils.bq from notebooks import parameters warnings.filterwarnings('ignore') @@ -33,11 +32,11 @@ def get_hpo_table_columns(hpo_id): :param hpo_id: hpo site id :return: dataframe with table name, column name and table row count """ - query = """SELECT table_name, column_name, t.row_count as table_row_count, '{hpo_id}' as hpo_id + query = """SELECT table_name, column_name, t.row_count as table_row_count, '{hpo_id}' as hpo_id FROM {dataset}.INFORMATION_SCHEMA.COLUMNS c JOIN {dataset}.__TABLES__ t on c.table_name=t.table_id WHERE STARTS_WITH(table_id, lower('{hpo_id}'))=true AND - NOT(table_id like '_mapping%') AND + NOT(table_id like '_mapping%') AND ( table_id like 
'%person' OR table_id like '%visit_occurrence' OR @@ -59,25 +58,25 @@ def get_hpo_table_columns(hpo_id): def create_hpo_completeness_query(table_columns, hpo_id): - query_with_concept_id = """SELECT current_datetime() as report_run_time, x.*, CASE WHEN total_rows=0 THEN 0 ELSE (num_nonnulls_zeros)/(total_rows) END as percent_field_populated + query_with_concept_id = """SELECT current_datetime() as report_run_time, x.*, CASE WHEN total_rows=0 THEN 0 ELSE (num_nonnulls_zeros)/(total_rows) END as percent_field_populated FROM ( SELECT '{table_name}' as table_name, '{column_name}' as column_name, '{hpo_id}' as site_name, - {table_row_count} as total_rows, + {table_row_count} as total_rows, sum(case when {column_name}=0 then 0 else 1 end) as num_nonnulls_zeros, - ({table_row_count} - count({column_name})) as non_populated_rows - FROM {dataset}.{table_name} - ) as x + ({table_row_count} - count({column_name})) as non_populated_rows + FROM {dataset}.{table_name} + ) as x """ - query_without_concept_id = """SELECT current_datetime() as report_run_time, x.*, CASE WHEN total_rows=0 THEN 0 ELSE (num_nonnulls_zeros)/(total_rows) END as percent_field_populated + query_without_concept_id = """SELECT current_datetime() as report_run_time, x.*, CASE WHEN total_rows=0 THEN 0 ELSE (num_nonnulls_zeros)/(total_rows) END as percent_field_populated FROM ( SELECT '{table_name}' as table_name, '{column_name}' as column_name, '{hpo_id}' as site_name, - {table_row_count} as total_rows, - count({column_name}) as num_nonnulls_zeros, - ({table_row_count} - count({column_name})) as non_populated_rows - FROM {dataset}.{table_name} - ) as x + {table_row_count} as total_rows, + count({column_name}) as num_nonnulls_zeros, + ({table_row_count} - count({column_name})) as non_populated_rows + FROM {dataset}.{table_name} + ) as x """ queries = [] for i, row in table_columns.iterrows(): diff --git a/data_steward/analytics/cdr_ops/ad_hoc_analyses/deid_race.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/deid_race.py index d9836fe200..9a74fe4da5 100644 --- a/data_steward/analytics/cdr_ops/ad_hoc_analyses/deid_race.py +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/deid_race.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- # + -import bq_utils import utils.bq from notebooks import render, parameters import pandas as pd @@ -27,19 +26,19 @@ MULTIRACIAL_DIST_QUERY = """ WITH race_combo AS -(SELECT o.person_id, - o.questionnaire_response_id, +(SELECT o.person_id, + o.questionnaire_response_id, STRING_AGG(REPLACE(c.concept_code, 'WhatRaceEthnicity_', ''), ' ' ORDER BY value_source_value) selected_races FROM {DATASET}.observation o - JOIN {VOCAB}.concept c ON o.value_source_concept_id = c.concept_id + JOIN {VOCAB}.concept c ON o.value_source_concept_id = c.concept_id WHERE observation_source_concept_id = 1586140 GROUP BY person_id, questionnaire_response_id) - -SELECT - selected_races, + +SELECT + selected_races, (LENGTH(selected_races) - LENGTH(REPLACE(selected_races, ' ', '')) + 1) AS selected_count, COUNT(DISTINCT person_id) row_count -FROM race_combo +FROM race_combo GROUP BY selected_races ORDER BY selected_count, selected_races """ diff --git a/data_steward/analytics/cdr_ops/ad_hoc_analyses/ehr_demographics_by_site.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/ehr_demographics_by_site.py index db02e480c9..442b4d6c51 100644 --- a/data_steward/analytics/cdr_ops/ad_hoc_analyses/ehr_demographics_by_site.py +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/ehr_demographics_by_site.py @@ -26,7 +26,6 @@ client = 
bigquery.Client() # %load_ext google.cloud.bigquery -import bq_utils import utils.bq from notebooks import parameters # %matplotlib inline @@ -95,18 +94,18 @@ racial_distribution_by_site_query = """ SELECT DISTINCT -a.*, b.number_from_site, ROUND(a.number_of_demographic / b.number_from_site * 100, 2) as percent_of_site_persons +a.*, b.number_from_site, ROUND(a.number_of_demographic / b.number_from_site * 100, 2) as percent_of_site_persons FROM (SELECT DISTINCT - mp.src_hpo_id, p.race_concept_id, c.concept_name, + mp.src_hpo_id, p.race_concept_id, c.concept_name, COUNT(p.race_concept_id) as number_of_demographic, FROM `{DATASET}.unioned_ehr_person` p LEFT JOIN `{DATASET}._mapping_person` mp ON - p.person_id = mp.src_person_id + p.person_id = mp.src_person_id LEFT JOIN `{DATASET}.concept` c ON @@ -141,17 +140,17 @@ def return_hpos_to_display(hpo_names, max_num_sites_to_display): Function is intended to return a means to divide the number of HPOs into an appropriate number of lists based on the maximum number of sites a user wants to display. - + This is useful for creating graphs that will only display a fraction of the total HPOs. - + Parameters ---------- hpo_names (list): list of all the health provider organizations (in string form) - + num_sites_to_display (int): user-specified number of sites to display in each graph - - + + Returns ------- all_hpos (list): contains several lists, each of which contains a number of sites @@ -195,17 +194,17 @@ def create_information_dictionary_for_sites(hpo_dfs, selected_hpo_names, """ Function is used to create a dictionary that contains the racial makeup of a selected number of sites (expressed as a percentage, from a source dataframe) - + Parameters ---------- hpo_dfs (dictionary): has the following structure key: string representing an HPO ID value: dataframe that contains information about the different race concepts (IDs and names) and their relative spread within the site - + selected_hpo_names (list): contains strings that represent the different HPOs that will ultimately be translated to a dictionary - + most_popular_race_cids (list): list of the most popular concept IDs (across all sites) @@ -253,23 +252,23 @@ def create_information_dictionary_for_sites(hpo_dfs, selected_hpo_names, def create_graphs(hpo_names_to_display, num_races_for_legend, racial_percentages, img_name): """ - Function is used to create and save graphs that show the racial distribution for + Function is used to create and save graphs that show the racial distribution for a selected number of sites - + Parameters ---------- hpo_names_to_display (list): list with a user-specified number of HPOs that are to be displayed in the graph - + num_races_for_legend (int): the number of races that are to be displayed next to the graph - + racial_percentages (dictionary): has the following structure key: race concept ID value: list, each index represents one of the sites in the 'selected_hpo_names' parameter. 
the value represents the proportion of persons from the HPO who have the reported race concept ID - + img_name (string): name for the image to be displayed """ num_sites_to_display = len(hpo_names_to_display) @@ -408,46 +407,46 @@ def create_query_for_particular_table(dataset, percent_of_table, table_name): - number of IDs for that particular group in the specified table - total number of IDs for the HPO - percentage of the records for the site that belong to that demographic class - + This query is then run through BigQuery and returns a dataframe - - + + Parameters ---------- dataset (str): dataset to be queried (defined at the top of the workbook) - + percent_of_table (str): the string to represent the percentage of the records for the site that belong to the particular demographic class - + table_name (str): name of the table to be investigated - - + + Returns ------- dataframe (df): contains the information specified in the top of the docstring - + """ query = """ SELECT DISTINCT - a.src_hpo_id, a.race_concept_id, a.concept_name, - ROUND(a.number_of_demographic / b.number_from_site * 100, 2) as {percent_of_table} + a.src_hpo_id, a.race_concept_id, a.concept_name, + ROUND(a.number_of_demographic / b.number_from_site * 100, 2) as {percent_of_table} FROM (SELECT DISTINCT - mp.src_hpo_id, p.race_concept_id, c.concept_name, + mp.src_hpo_id, p.race_concept_id, c.concept_name, COUNT(p.race_concept_id) as number_of_demographic, FROM `{dataset}.unioned_ehr_{table_name}` x LEFT JOIN `{dataset}.unioned_ehr_person` p ON - x.person_id = p.person_id + x.person_id = p.person_id LEFT JOIN `{dataset}._mapping_person` mp ON - p.person_id = mp.src_person_id + p.person_id = mp.src_person_id LEFT JOIN `{dataset}.concept` c ON @@ -464,7 +463,7 @@ def create_query_for_particular_table(dataset, percent_of_table, table_name): LEFT JOIN `{dataset}.unioned_ehr_person` p ON - x.person_id = p.person_id + x.person_id = p.person_id LEFT JOIN `{dataset}._mapping_person` mp ON @@ -549,13 +548,13 @@ def find_all_distributions_for_site_race_combo(df, hpo, race, This function is used to calculate the relative 'underrepresentation' of a given race for a particular table when compared to the race's overall representation in the person table. - + For instance, a site may have 65% of participants who identify as 'White'. The persons who identify with this race, however, only make up 60% of the drug_exposure_ids in the drug exposure table. This would result in an 'underrepresentation' of 5% for persons at this particular site for this particular table. - - + + Parameters ---------- df (df): dataframe that contains the following information in its fields: @@ -567,15 +566,15 @@ aforementioned race_concept_id e. 
the same metric as d but also for the condition, observation, procedure, and visit tables - + hpo (string): HPO whose 'representation' metric is going to be assessed - + race (string): race concept name that will be evaluated for 'representation' - + person_distribution: the proportion of person_ids for the particular site that belong to the aforementioned race - - + + Returns ------- difference_df: contains the 'difference' between the proportion of records diff --git a/data_steward/analytics/cdr_ops/ad_hoc_analyses/ehr_ops_row_counts.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/ehr_ops_row_counts.py index deb579b9d3..2f2501fb8f 100644 --- a/data_steward/analytics/cdr_ops/ad_hoc_analyses/ehr_ops_row_counts.py +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/ehr_ops_row_counts.py @@ -16,7 +16,6 @@ # + import datetime -import bq_utils import utils.bq from notebooks.parameters import RDR_PROJECT_ID, RDR_DATASET_ID, EHR_DATASET_ID @@ -65,17 +64,17 @@ # ## EHR Site Submission Counts utils.bq.query(''' -SELECT +SELECT l.Org_ID AS org_id, l.HPO_ID AS hpo_id, l.Site_Name AS site_name, - table_id AS table_id, + table_id AS table_id, row_count AS row_count FROM `{EHR_DATASET_ID}.__TABLES__` AS t -JOIN `lookup_tables.hpo_site_id_mappings` AS l +JOIN `lookup_tables.hpo_site_id_mappings` AS l ON STARTS_WITH(table_id,lower(l.HPO_ID))=true WHERE table_id like '%person%' AND -NOT(table_id like '%unioned_ehr_%') AND +NOT(table_id like '%unioned_ehr_%') AND l.hpo_id <> '' ORDER BY Display_Order '''.format(EHR_DATASET_ID=EHR_DATASET_ID)) @@ -84,7 +83,7 @@ hpo_ids = utils.bq.query(""" SELECT REPLACE(table_id, '_person', '') AS hpo_id FROM `{EHR_DATASET_ID}.__TABLES__` -WHERE table_id LIKE '%person' +WHERE table_id LIKE '%person' AND table_id NOT LIKE '%unioned_ehr_%' AND table_id NOT LIKE '\\\_%' """.format(EHR_DATASET_ID=EHR_DATASET_ID)).hpo_id.tolist() diff --git a/data_steward/analytics/cdr_ops/ad_hoc_analyses/generalized_dupes.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/generalized_dupes.py index 271dde74a1..d6dc0ce230 100644 --- a/data_steward/analytics/cdr_ops/ad_hoc_analyses/generalized_dupes.py +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/generalized_dupes.py @@ -1,5 +1,4 @@ # + -import bq_utils import utils.bq from notebooks import parameters @@ -17,7 +16,7 @@ COUNT(*) FROM `{DEID}.observation` AS o -JOIN +JOIN ( SELECT observation_id @@ -31,7 +30,7 @@ observation_id DESC) AS rank_order, observation_id FROM - `{DEID}.observation` + `{DEID}.observation` JOIN `{COMBINED}._mapping_observation` as map USING @@ -40,9 +39,9 @@ AND value_source_concept_id IN (2000000008, 2000000005, 2000000004, 2000000002) AND map.src_hpo_id like "rdr" ) o - WHERE + WHERE o.rank_order <> 1 -) unique_observation_ids +) unique_observation_ids ON o.observation_id = unique_observation_ids.observation_id """ q = DUPLICATE_GEN_RACE_QUERY.format(DEID=DEID, COMBINED=COMBINED) diff --git a/data_steward/analytics/cdr_ops/ad_hoc_analyses/identify_required_labs.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/identify_required_labs.py index c2bc3db3f9..b9b58a0c3f 100644 --- a/data_steward/analytics/cdr_ops/ad_hoc_analyses/identify_required_labs.py +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/identify_required_labs.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- -import bq_utils import utils.bq from notebooks import render @@ -27,13 +26,13 @@ -- 36208195 Lab terms not yet categorized -- 36207527 Clinical terms not yet categorized -- 36210656 Survey terms not yet categorized - - -- Exclude the list of 
the "coarse" generalized concept ids -- 40772590: Cholesterol -- 40782521: Leukocytes -- 40779250: Protein in the grandparent lookup - SELECT - excluded_ancestor_concept_id + SELECT + excluded_ancestor_concept_id FROM UNNEST([36208978, 36206173, 36208195, 36207527, 36210656, 40782521, 40779250, 40772590]) AS excluded_ancestor_concept_id ), @@ -51,22 +50,22 @@ IF(ex.excluded_ancestor_concept_id IS NULL, COALESCE(ca.min_levels_of_separation, -1), -1) AS distance FROM `ehr_ops.measurement_concept_sets` AS m - JOIN + JOIN `{VOCAB_DATASET_ID}.concept` AS c1 - ON + ON m.Measurement_OMOP_ID = c1.concept_id LEFT JOIN `{VOCAB_DATASET_ID}.concept_ancestor` AS ca ON - m.Measurement_OMOP_ID = ca.descendant_concept_id + m.Measurement_OMOP_ID = ca.descendant_concept_id AND ca.min_levels_of_separation = 1 - LEFT JOIN + LEFT JOIN get_excluded_ancestor_ids AS ex - ON + ON ca.ancestor_concept_id = ex.excluded_ancestor_concept_id LEFT JOIN `{VOCAB_DATASET_ID}.concept` AS c2 - ON + ON ca.ancestor_concept_id = c2.concept_id WHERE c2.concept_class_id IS NULL OR c2.concept_class_id = 'LOINC Group' ), @@ -85,23 +84,23 @@ IF(ex.excluded_ancestor_concept_id IS NULL, COALESCE(ca.min_levels_of_separation, -1), -1) AS distance FROM `ehr_ops.measurement_concept_sets` AS m - JOIN + JOIN `{VOCAB_DATASET_ID}.concept` AS c1 - ON + ON m.Measurement_OMOP_ID = c1.concept_id LEFT JOIN `{VOCAB_DATASET_ID}.concept_ancestor` AS ca ON - m.Measurement_OMOP_ID = ca.descendant_concept_id + m.Measurement_OMOP_ID = ca.descendant_concept_id AND ca.min_levels_of_separation IN (1, 2) - LEFT JOIN + LEFT JOIN get_excluded_ancestor_ids AS ex - ON + ON ca.ancestor_concept_id = ex.excluded_ancestor_concept_id LEFT JOIN `{VOCAB_DATASET_ID}.concept` AS c2 - ON - ca.ancestor_concept_id = c2.concept_id + ON + ca.ancestor_concept_id = c2.concept_id WHERE -- if there are no ancestors for the measurement_concept_id (ca.descendant_concept_id IS NULL) @@ -112,15 +111,15 @@ -- if the level of separation is 2, we keep them only when the concept_name subsumes the grandparent concept_name (c2.concept_class_id = 'LOINC Hierarchy' AND ca.min_levels_of_separation = 2 AND c1.concept_name LIKE CONCAT('%', c2.concept_name , '%')) OR - -- if the level of separation is 2, the 6 concept names (such as MCH [Entitic mass], MCV [Entitic volume]) do not follow the previous rule, + -- if the level of separation is 2, the 6 concept names (such as MCH [Entitic mass], MCV [Entitic volume]) do not follow the previous rule, -- because the acronyms are used in the concept_name and full names are used in the grandparent concept_name (c2.concept_class_id = 'LOINC Hierarchy' AND ca.min_levels_of_separation = 2 AND c1.concept_id IN (3035941, 3024731, 3003338, 3012030, 3009744, 3023599)) ), -get_ancestors_loinc_hierarchy_distinct AS +get_ancestors_loinc_hierarchy_distinct AS ( - # For some concepts in LOINC Hierarchy, we include both parent and grandparent concept_ids, - # We want to remove the parent concept_id if the grandparent concept_id is present. + # For some concepts in LOINC Hierarchy, we include both parent and grandparent concept_ids, + # We want to remove the parent concept_id if the grandparent concept_id is present. 
SELECT DISTINCT Panel_OMOP_ID, Panel_Name, @@ -132,7 +131,7 @@ distance FROM ( - SELECT DISTINCT + SELECT DISTINCT *, dense_rank() over(PARTITION BY measurement_concept_id ORDER BY distance DESC) AS rank_order FROM get_ancestors_loinc_hierarchy @@ -142,9 +141,9 @@ get_loinc_group_descendant_concept_ids AS ( - # We use left join to concept_ancestor because not all the concepts have an ancestor, in which case + # We use left join to concept_ancestor because not all the concepts have an ancestor, in which case # we make the measurement_concept_id its own ancestor - SELECT + SELECT lg.Panel_OMOP_ID, lg.Panel_Name, lg.measurement_concept_id, @@ -157,18 +156,18 @@ COALESCE(c1.concept_class_id, lg.parent_concept_class_id) AS loinc_groupy_descendant_concept_class_id, COALESCE(ca1.min_levels_of_separation, -1) AS distance FROM get_direct_parents_loinc_group AS lg - LEFT JOIN + LEFT JOIN {VOCAB_DATASET_ID}.concept_ancestor AS ca1 ON - lg.parent_concept_id = ca1.ancestor_concept_id + lg.parent_concept_id = ca1.ancestor_concept_id AND ca1.min_levels_of_separation <> 0 LEFT JOIN {VOCAB_DATASET_ID}.concept AS c1 - ON ca1.descendant_concept_id = c1.concept_id + ON ca1.descendant_concept_id = c1.concept_id ), get_loinc_hierarchy_descendant_concept_ids AS ( - # We use left join to concept_ancestor because not all the concepts have an ancestor, in which case + # We use left join to concept_ancestor because not all the concepts have an ancestor, in which case # we make the measurement_concept_id its own ancestor SELECT lh.Panel_OMOP_ID, @@ -183,19 +182,19 @@ COALESCE(c1.concept_class_id, lh.ancestor_concept_class_id) AS loinc_hierarchy_descendant_concept_class_id, COALESCE(ca1.min_levels_of_separation, -1) AS distance FROM get_ancestors_loinc_hierarchy_distinct AS lh - LEFT JOIN + LEFT JOIN {VOCAB_DATASET_ID}.concept_ancestor AS ca1 ON lh.ancestor_concept_id = ca1.ancestor_concept_id AND ca1.min_levels_of_separation <> 0 LEFT JOIN {VOCAB_DATASET_ID}.concept AS c1 - ON ca1.descendant_concept_id = c1.concept_id + ON ca1.descendant_concept_id = c1.concept_id ), get_measurement_concept_sets_descendants AS ( - # We use a full outer join between the loinc_hierarchy descendants and loinc_group descendants - # in order to maximize the number of descendants retrieved by both classification systems. + # We use a full outer join between the loinc_hierarchy descendants and loinc_group descendants + # in order to maximize the number of descendants retrieved by both classification systems. 
SELECT DISTINCT COALESCE(lh.Panel_OMOP_ID, lg.Panel_OMOP_ID) AS panel_omop_id, COALESCE(lh.Panel_Name, lg.Panel_Name) AS panel_name, @@ -213,7 +212,7 @@ COALESCE(lh.loinc_hierarchy_descendant_concept_name, lg.loinc_groupy_descendant_concept_name) AS descendant_concept_name, COALESCE(lh.loinc_hierarchy_descendant_concept_class_id, lg.loinc_groupy_descendant_concept_class_id) AS descendant_concept_class_id FROM get_loinc_hierarchy_descendant_concept_ids AS lh - FULL OUTER JOIN + FULL OUTER JOIN get_loinc_group_descendant_concept_ids AS lg ON lh.loinc_hierarchy_descendant_concept_id = lg.loinc_groupy_descendant_concept_id @@ -228,20 +227,20 @@ COUNT(DISTINCT person_id) AS n_person, COUNT(DISTINCT measurement_id) AS n_meas, COUNT(DISTINCT descendant_concept_id) AS n_descendant -FROM +FROM ( SELECT measurement_id, person_id, IF(measurement_concept_id IS NULL OR measurement_concept_id=0, measurement_source_concept_id, measurement_concept_id) AS measurement_concept_id FROM - `{DATASET_ID}.measurement` + `{DATASET_ID}.measurement` ) meas JOIN `{DATASET_ID}._mapping_measurement` USING (measurement_id) -JOIN +JOIN get_measurement_concept_sets_descendants AS valid_lab ON meas.measurement_concept_id = valid_lab.descendant_concept_id @@ -251,7 +250,7 @@ 3, 4, 5 -ORDER BY +ORDER BY 1,2 """ diff --git a/data_steward/analytics/cdr_ops/ad_hoc_analyses/list_tables_with_duplicate_domain_ids.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/list_tables_with_duplicate_domain_ids.py index 4baab0bc7b..ca0370ff17 100644 --- a/data_steward/analytics/cdr_ops/ad_hoc_analyses/list_tables_with_duplicate_domain_ids.py +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/list_tables_with_duplicate_domain_ids.py @@ -12,7 +12,6 @@ # name: python3 # --- -import bq_utils import utils.bq from notebooks import parameters @@ -26,7 +25,7 @@ query = """ SELECT REPLACE(table_id, '_person', '') AS hpo_id FROM `{bq_dataset_id}.__TABLES__` -WHERE table_id LIKE '%person' +WHERE table_id LIKE '%person' AND table_id NOT LIKE '%unioned_ehr_%' AND table_id NOT LIKE '\\\_%' """.format(bq_dataset_id=bigquery_dataset_id) hpo_ids = utils.bq.query(query).tolist() @@ -48,7 +47,7 @@ FROM prod_drc_dataset.__TABLES__ T LEFT JOIN (select distinct '{h}_{d}' as table_name, count(*) as num_dups -from `{bq_dataset_id}.{h}_{d}` +from `{bq_dataset_id}.{h}_{d}` group by {d}_id having count(*) > 1 order by num_dups desc diff --git a/data_steward/analytics/cdr_ops/ad_hoc_analyses/person_gender_sex.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/person_gender_sex.py index 9311b51082..616d11a5c6 100644 --- a/data_steward/analytics/cdr_ops/ad_hoc_analyses/person_gender_sex.py +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/person_gender_sex.py @@ -7,7 +7,6 @@ # * `sex_at_birth_concept_id` contains the associated `value_as_concept_id` # * `sex_at_birth_source_concept_id` contains the associated `value_source_concept_id` # * `sex_at_birth_source_value` contains the `concept_code` associated with `sex_at_birth_source_concept_id` -import bq_utils import utils.bq from notebooks import render from notebooks.parameters import SANDBOX, DEID_DATASET_ID @@ -88,7 +87,7 @@ def df_to_gbq(df, destination_table, table_schema=None): # - UPDATED_PERSON_QUERY = """ -SELECT +SELECT p.person_id, g.gender_concept_id, p.year_of_birth, @@ -127,7 +126,7 @@ def df_to_gbq(df, destination_table, table_schema=None): table_schema=person_schema) PERSON_HIST_QUERY = """ -SELECT +SELECT p.gender_concept_id, p.gender_source_value, p.gender_source_concept_id, diff --git 
a/data_steward/analytics/cdr_ops/ad_hoc_analyses/pop_retract.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/pop_retract.py index 654cd90dd0..1fdf9b2241 100644 --- a/data_steward/analytics/cdr_ops/ad_hoc_analyses/pop_retract.py +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/pop_retract.py @@ -1,7 +1,6 @@ # + from jinja2 import Template -import bq_utils import utils.bq from notebooks import render from notebooks.defaults import is_deid_dataset @@ -36,14 +35,14 @@ # Determine associated research IDs for RDR participants whose data must be retracted AIAN_PID_QUERY = """ -SELECT DISTINCT +SELECT DISTINCT rdr.person_id AS person_id, deid.research_id AS research_id FROM `{RDR}.observation` rdr JOIN `{COMBINED}.deid_map` deid ON rdr.person_id = deid.person_id -WHERE - rdr.observation_source_concept_id = 1586140 +WHERE + rdr.observation_source_concept_id = 1586140 AND rdr.value_source_concept_id = 1586141 """ q = AIAN_PID_QUERY.format(RDR=RDR, COMBINED=COMBINED) @@ -80,24 +79,24 @@ def get_tables_with_person_id(input_dataset): WITH delete_row_counts AS ( {% for table in TABLES %} ( - SELECT '{{ table }}' AS table_name, + SELECT '{{ table }}' AS table_name, COUNT(1) AS rows_to_delete, (SELECT row_count FROM {{ INPUT_DATASET }}.__TABLES__ WHERE table_id = '{{ table }}') AS total_rows FROM `{{ INPUT_DATASET }}.{{ table }}` t WHERE EXISTS ( - SELECT 1 FROM `{{ ID_TABLE }}` + SELECT 1 FROM `{{ ID_TABLE }}` WHERE {{ 'research_id' if IS_INPUT_DATASET_DEID else 'person_id' }} = t.person_id) - ) + ) {% if not loop.last %} UNION ALL - {% endif %} + {% endif %} {% endfor %} ) -SELECT - d.table_name, +SELECT + d.table_name, d.total_rows AS input_row_count, d.rows_to_delete AS rows_to_delete, - d.total_rows - d.rows_to_delete AS expected_output_row_count, + d.total_rows - d.rows_to_delete AS expected_output_row_count, t.row_count AS actual_output_row_count, t.row_count = (d.total_rows - d.rows_to_delete) AS pass FROM delete_row_counts d diff --git a/data_steward/analytics/cdr_ops/ad_hoc_analyses/sex_gender_540.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/sex_gender_540.py index a12fb65d5e..aaa610554a 100644 --- a/data_steward/analytics/cdr_ops/ad_hoc_analyses/sex_gender_540.py +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/sex_gender_540.py @@ -28,7 +28,6 @@ # - gender_concept_id = value_as_concept_id # - gender_source_value = concept_code associated with value_source_concept_id # - gender_source_concept_id = value_source_concept_id -import bq_utils import utils.bq from notebooks import render, parameters import pandas as pd diff --git a/data_steward/analytics/cdr_ops/ad_hoc_analyses/site_mapping.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/site_mapping.py index 57b6730374..71cfbf19cd 100644 --- a/data_steward/analytics/cdr_ops/ad_hoc_analyses/site_mapping.py +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/site_mapping.py @@ -1,5 +1,4 @@ # + -import bq_utils import utils.bq from notebooks import render, parameters @@ -12,10 +11,10 @@ # ## Row counts in combined `_mapping*` and deid `*_ext` tables ROW_COUNTS_QUERY = """ -SELECT dataset_id, - REPLACE(REPLACE(table_id, '_mapping_', ''), '_ext', '') mapped_table, - table_id, - creation_time, +SELECT dataset_id, + REPLACE(REPLACE(table_id, '_mapping_', ''), '_ext', '') mapped_table, + table_id, + creation_time, last_modified_time, row_count FROM @@ -25,7 +24,7 @@ UNION ALL - SELECT * + SELECT * FROM {COMBINED}.__TABLES__ d1 WHERE table_id LIKE '\\\_mapping\\\_%') diff --git 
a/data_steward/analytics/cdr_ops/ad_hoc_analyses/standard_concepts_cdr_389.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/standard_concepts_cdr_389.py index bac69c94ec..47af6fe87d 100644 --- a/data_steward/analytics/cdr_ops/ad_hoc_analyses/standard_concepts_cdr_389.py +++ b/data_steward/analytics/cdr_ops/ad_hoc_analyses/standard_concepts_cdr_389.py @@ -33,7 +33,6 @@ # # # #### This notebook also does not exclude instances where the concept_id = 0. -import bq_utils import utils.bq from notebooks import parameters @@ -53,7 +52,7 @@ co_query = """ SELECT DISTINCT -co.condition_concept_id as pre_cr_concept_id, c1.standard_concept as pre_cr_standard_concept, c1.concept_name as pre_cr_cn, +co.condition_concept_id as pre_cr_concept_id, c1.standard_concept as pre_cr_standard_concept, c1.concept_name as pre_cr_cn, co_combined.condition_concept_id as post_cr_concept_id, c2.standard_concept as post_cr_standard_concept, c2.concept_name as post_cr_cn, (LOWER(c2.domain_id) LIKE '%condition%') as post_cr_domain_correct, COUNT(*) as count, COUNT(DISTINCT mco.src_hpo_id) as num_sites_w_change @@ -127,7 +126,7 @@ de_query = """ SELECT DISTINCT -de.drug_concept_id as pre_cr_concept_id, c1.standard_concept as pre_cr_standard_concept, c1.concept_name as pre_cr_cn, +de.drug_concept_id as pre_cr_concept_id, c1.standard_concept as pre_cr_standard_concept, c1.concept_name as pre_cr_cn, de_combined.drug_concept_id as post_cr_concept_id, c2.standard_concept as post_cr_standard_concept, c2.concept_name as post_cr_cn, (LOWER(c2.domain_id) LIKE '%drug%') as post_cr_domain_correct, COUNT(*) as count, COUNT(DISTINCT mde.src_hpo_id) as num_sites_w_change @@ -202,7 +201,7 @@ m_query = """ SELECT DISTINCT -m.measurement_concept_id as pre_cr_concept_id, c1.standard_concept as pre_cr_standard_concept, c1.concept_name as pre_cr_cn, +m.measurement_concept_id as pre_cr_concept_id, c1.standard_concept as pre_cr_standard_concept, c1.concept_name as pre_cr_cn, m_combined.measurement_concept_id as post_cr_concept_id, c2.standard_concept as post_cr_standard_concept, c2.concept_name as post_cr_cn, (LOWER(c2.domain_id) LIKE '%measurement%') as post_cr_domain_correct, COUNT(*) as count, COUNT(DISTINCT mm.src_hpo_id) as num_sites_w_change @@ -272,7 +271,7 @@ v_query = """ SELECT DISTINCT -v.visit_concept_id as pre_cr_concept_id, c1.standard_concept as pre_cr_standard_concept, c1.concept_name as pre_cr_cn, +v.visit_concept_id as pre_cr_concept_id, c1.standard_concept as pre_cr_standard_concept, c1.concept_name as pre_cr_cn, v_combined.visit_concept_id as post_cr_concept_id, c2.standard_concept as post_cr_standard_concept, c2.concept_name as post_cr_cn, (LOWER(c2.domain_id) LIKE '%visit%') as post_cr_domain_correct, COUNT(*) as count, COUNT(DISTINCT mv.src_hpo_id) as num_sites_w_change @@ -342,7 +341,7 @@ p_query = """ SELECT DISTINCT -p.procedure_concept_id as pre_cr_concept_id, c1.standard_concept as pre_cr_standard_concept, c1.concept_name as pre_cr_cn, +p.procedure_concept_id as pre_cr_concept_id, c1.standard_concept as pre_cr_standard_concept, c1.concept_name as pre_cr_cn, p_combined.procedure_concept_id as post_cr_concept_id, c2.standard_concept as post_cr_standard_concept, c2.concept_name as post_cr_cn, (LOWER(c2.domain_id) LIKE '%procedure%') as post_cr_domain_correct, COUNT(*) as count, COUNT(DISTINCT mp.src_hpo_id) as num_sites_w_change @@ -411,7 +410,7 @@ o_query = """ SELECT DISTINCT -o.observation_concept_id as pre_cr_concept_id, c1.standard_concept as pre_cr_standard_concept, c1.concept_name as pre_cr_cn, 
+o.observation_concept_id as pre_cr_concept_id, c1.standard_concept as pre_cr_standard_concept, c1.concept_name as pre_cr_cn, o_combined.observation_concept_id as post_cr_concept_id, c2.standard_concept as post_cr_standard_concept, c2.concept_name as post_cr_cn, (LOWER(c2.domain_id) LIKE '%observation%') as post_cr_domain_correct, COUNT(*) as count, COUNT(DISTINCT mo.src_hpo_id) as num_sites_w_change diff --git a/data_steward/analytics/cdr_ops/systematic_scripts/data_loss_through_pipeline.py b/data_steward/analytics/cdr_ops/systematic_scripts/data_loss_through_pipeline.py index f18712905c..984cf7e996 100644 --- a/data_steward/analytics/cdr_ops/systematic_scripts/data_loss_through_pipeline.py +++ b/data_steward/analytics/cdr_ops/systematic_scripts/data_loss_through_pipeline.py @@ -29,7 +29,6 @@ # %load_ext google.cloud.bigquery # %matplotlib inline -import bq_utils import utils.bq from notebooks import parameters import pandas as pd @@ -58,25 +57,25 @@ def create_dicts_w_info(df, x_label, column_label): """ This function is used to create a dictionary that can be easily converted to a graphical representation based on the values for a particular dataframe - + Parameters ---------- df (dataframe): dataframe that contains the information to be converted - + x_label (string): the column of the dataframe whose rows will then be converted to the keys of a dictionary - + column_label (string): the column that contains the data quality metric being investigated - + Returns ------- data_qual_info (dictionary): has the following structure - + keys: the column for a particular dataframe that represents the elements whose data quality is being compared (e.g. HPOs, different measurement/unit combinations) - + values: the data quality metric being compared """ rows = df[x_label].unique().tolist() @@ -98,28 +97,28 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, color, """ Function is used to create a bar graph for a particular dictionary with information about data quality - + Parameters ---------- info_dict (dictionary): contains information about data quality. The keys for the dictionary will serve as the x-axis labels whereas the values should serve as the 'y-value' for the particular bar - + xlabel (str): label to display across the x-axis - + ylabel (str): label to display across the y-axis - + title (str): title for the graph - + img_name (str): name used to save the image to the local repository - + color (str): character used to specify the colours of the bars - + total_diff_color (bool): indicates whether or not the last bar should be coloured red ( as opposed to the rest of the bars on the graph). This is typically used when the ultimate value of the dictionary is of particular importance (e.g. representing an 'aggregate' metric across all of the sites) - + turnoff_x (bool): used to disable the x-axis labels (for each of the bars). This is typically used when there are so many x-axis labels that they overlap and obscure legibility """ @@ -191,23 +190,23 @@ def create_pie_chart(dataframe, title, img_name): """ Function is used to create a pie chart that can show how much each site contributes to the overall 'drop' between the unioned and combined datasets - + Function also saves the outputted pie chart to the current directory - + Parameters ---------- - dataframe (df): dataframe for a particular table. shows the following for + dataframe (df): dataframe for a particular table. shows the following for HPOs that uploaded data: - + a. 
the number of rows in the unioned dataset b. the number of rows in the combined dataset c. the total 'drop' of rows across unioned to combined, expressed as a percentage d. the relative 'contribution' of each site to the overall drop from unioned to combined - - + + title (str): title of the graph - + img_name (str): title of the image to be saved """ hpo_list = dataframe['source_hpo'].tolist()[1:] # do not take 'total' @@ -246,24 +245,24 @@ a. generate a string that can be fed into BigQuery b. create a dataframe that contains information about the number of people and records for a particular dataset - + Parameters ---------- dataset (string): name of the dataset that will be queried (originally from the parameters file) - + person_var (string): variable that dictates how the 'number of people' will be displayed in the resultant dataframe - + record_var (string): variable that dictates how the 'number of records' will be displayed in the resultant dataframe - + table_name (string): represents the table that is being queried - + field_name (string): represents the field that should count the number of records for a particular dataset/table combination. this is usually 'table name'_id - + Returns ------- @@ -295,11 +294,11 @@ def generate_query(dataset, person_var, record_var, table_name, field_name): def extract_first_int_from_series(series): """ Function is used to extract the first integer from a Pandas series object. - + Parameters ---------- series (series): Pandas series object - + Returns ------- integer (int): the first integer from a Pandas series object @@ -319,38 +318,38 @@ def create_aggregate_table_df(unioned, combined, deid, unioned_persons_string, record_string): """ Function is used to create a dataframe that can display the 'drop off' of records across multiple - stages of the pipeline. - - + stages of the pipeline. + + Parameters: ----------- - + unioned (dataframe): contains information regarding the number of persons and records in the unioned dataset - + combined (dataframe): contains information regarding the number of persons and records in the combined dataset - + deid (dataframe): contains information regarding the number of persons and records in the deid dataset - + unioned_persons_string (str): column name to determine the number of persons in the unioned dataset - + combined_person_string (str): column name to determine the number of persons in the combined dataset - + deid_person_string (str): column name to determine the number of persons in the deid dataset - + unioned_records_string (str): column name to determine the number of records in the unioned dataset - + combined_records_string (str): column name to determine the number of records in the combined dataset deid_records_string (str): column name to determine the number of records in the deid dataset - + person_string (str): row title to indicate the person drop for each stage of the pipeline - + record_string (str): row title to indicate the record drop for each stage of the pipeline - - + + Returns: -------- df (dataframe): contains information about the record and person count drop across each stage of @@ -761,19 +760,19 @@ def generate_site_level_query(id_name, unioned, table_name, combined): b. the number of rows for the HPO for a particular table in the unioned dataset c. the number of rows for the HPO for a particular table in the combined dataset d. 
the total 'drop' of rows across unioned to combined, expressed as a percentage - + Parameters ---------- - id_name (string): represents the 'primary key' of the table (the unique identifier + id_name (string): represents the 'primary key' of the table (the unique identifier for each row) - + unioned (string): the name of the unioned dataset to be queried - + table_name (string): name of the table that is being investigated - + combined (string): the name of the combined dataset to be queried - - + + Returns ------- dataframe (df): contains all of the information outlined in the top of the docstring @@ -827,17 +826,17 @@ def add_total_drop_row(dataframe): """ Function is used to add a 'total' row at the bottom of a dataframe that shows the relative 'drop' across the pipeline (unioned to combined) for the different sites. - + This row will show: a. the number of rows in the unioned dataset b. the number of rows in the combined dataset c. the total 'drop' of rows across unioned to combined, expressed as a percentage - + Parameters: ---------- dataframe (df): dataframe for a particular table. shows a-c (above) for each of the HPOs that uploaded data - + Returns: -------- dataframe (df): the inputted dataframe with an additional 'total' row at the end @@ -869,16 +868,16 @@ def add_percent_of_drop_column(dataframe): Function is used to add a 'percent_of_drop' column that shows how much each site's 'drop' contributed to the 'overall' drop from the unioned to the combined steps of the pipeline. - + Parameters ---------- - dataframe (df): dataframe for a particular table. shows the following for + dataframe (df): dataframe for a particular table. shows the following for HPOs that uploaded data: - + a. the number of rows in the unioned dataset b. the number of rows in the combined dataset c. 
the total 'drop' of rows across unioned to combined, expressed as a percentage - + Returns ------- dataframe (df): the above dataframe with a new column that shows each site's diff --git a/data_steward/analytics/cdr_ops/systematic_scripts/date_disparity_with_respect_to_visit.py b/data_steward/analytics/cdr_ops/systematic_scripts/date_disparity_with_respect_to_visit.py index 532ca1febf..b75d78c410 100644 --- a/data_steward/analytics/cdr_ops/systematic_scripts/date_disparity_with_respect_to_visit.py +++ b/data_steward/analytics/cdr_ops/systematic_scripts/date_disparity_with_respect_to_visit.py @@ -38,7 +38,6 @@ # %load_ext google.cloud.bigquery # + -import bq_utils import utils.bq from notebooks import parameters @@ -72,11 +71,11 @@ p_v_query = """ SELECT DISTINCT -a.*, +a.*, (a.procedure_vis_start_diff + a.procedure_vis_end_diff + a.procedure_vis_start_dt_diff + a.procedure_vis_end_dt_diff + a.procedure_dt_vis_start_dt_diff + a.procedure_dt_vis_end_dt_diff) as total_diff -FROM +FROM ( SELECT - mpo.src_hpo_id, COUNT(mpo.src_hpo_id) as num_bad_records, + mpo.src_hpo_id, COUNT(mpo.src_hpo_id) as num_bad_records, IFNULL(ABS(DATE_DIFF(po.procedure_date, vo.visit_start_date, DAY)), 0) as procedure_vis_start_diff, IFNULL(ABS(DATE_DIFF(po.procedure_date, vo.visit_end_date, DAY)), 0) as procedure_vis_end_diff, IFNULL(ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), po.procedure_date, DAY)), 0) as procedure_vis_start_dt_diff, @@ -85,19 +84,19 @@ IFNULL(ABS(DATE_DIFF(CAST(po.procedure_datetime AS DATE), CAST(vo.visit_end_datetime AS DATE), DAY)), 0) as procedure_dt_vis_end_dt_diff, ( - ABS(DATE_DIFF(po.procedure_date, vo.visit_start_date, DAY)) = - ABS(DATE_DIFF(po.procedure_date, vo.visit_end_date, DAY)) + ABS(DATE_DIFF(po.procedure_date, vo.visit_start_date, DAY)) = + ABS(DATE_DIFF(po.procedure_date, vo.visit_end_date, DAY)) AND ABS(DATE_DIFF(po.procedure_date, vo.visit_end_date, DAY)) = - ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), po.procedure_date, DAY)) + ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), po.procedure_date, DAY)) AND ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), po.procedure_date, DAY)) = ABS(DATE_DIFF(CAST(vo.visit_end_datetime AS DATE), po.procedure_date, DAY)) AND - ABS(DATE_DIFF(CAST(vo.visit_end_datetime AS DATE), po.procedure_date, DAY)) = - ABS(DATE_DIFF(CAST(po.procedure_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) + ABS(DATE_DIFF(CAST(vo.visit_end_datetime AS DATE), po.procedure_date, DAY)) = + ABS(DATE_DIFF(CAST(po.procedure_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) AND - ABS(DATE_DIFF(CAST(po.procedure_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) = + ABS(DATE_DIFF(CAST(po.procedure_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) = ABS(DATE_DIFF(CAST(po.procedure_datetime AS DATE), CAST(vo.visit_end_datetime AS DATE), DAY)) ) as all_discrepancies_equal @@ -131,7 +130,7 @@ OR po.procedure_date > vo.visit_end_date) - OR + OR -- problem with datetime (po.procedure_datetime < vo.visit_start_datetime OR @@ -142,9 +141,9 @@ (po.procedure_date < CAST(vo.visit_start_datetime AS DATE) OR po.procedure_date > CAST(vo.visit_end_datetime AS DATE)) - + OR - + --problem with the datetime (CAST(po.procedure_datetime AS DATE) < CAST(vo.visit_start_datetime AS DATE) OR @@ -188,28 +187,28 @@ def create_dicts_w_info(df, """ This function is used to create a dictionary that can be easily converted to a graphical representation based on the values for a particular dataframe - + Parameters 
---------- df (dataframe): dataframe that contains the information to be converted - + table_visit_diff_string (string): the column that is used to calculate the 'average' difference between a date of interest and the visit start date. For instance, this would allow someone to specify the difference between the observation date and the visit start date. - + bad_records_string (string): the column of the dataframe whose rows will be summed and then converted to the keys of a dictionary. For instance, 'num_bad_records' is often used to show the total number of 'bad' (discrepant) records for a particular site - + Returns ------- num_bad_records (dictionary): has the following structure keys: the HPOs values: the total number of 'bad' (discrepant) records for the particular column of interest - + table_visit_diff_dict (dictionary): has the following structure keys: the HPOs values: the 'average' difference between the two types of dates as specified @@ -262,27 +261,27 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, colour, """ Function is used to create a bar graph for a particular dictionary with information about data quality - + Parameters ---------- info_dict (dictionary): contains information about data quality. The keys for the dictionary will serve as the x-axis labels whereas the values should serve as the 'y-value' for the particular bar - + xlabel (str): label to display across the x-axis - + ylabel (str): label to display across the y-axis - + title (str): title for the graph - + img_name (str): name used to save the image to the local repository - + colour (str): character used to specify the colours of the bars - + total_diff_colour (bool): indicates whether or not the last bar should be coloured red ( as opposed to the rest of the bars on the graph). This is typically used when the ultimate value of the dictionary is of particular importance (e.g. 
representing an 'aggregate' metric - across all of the sites) + across all of the sites) """ bar_list = plt.bar(range(len(info_dict)), list(info_dict.values()), @@ -339,11 +338,11 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, colour, observation_visit_query = """ SELECT DISTINCT -a.*, +a.*, (a.observation_vis_start_diff + a.observation_vis_end_diff + a.observation_vis_start_dt_diff + a.observation_vis_end_dt_diff + a.observation_dt_vis_start_dt_diff + a.observation_dt_vis_end_dt_diff) as total_diff -FROM +FROM ( SELECT - mo.src_hpo_id, COUNT(mo.src_hpo_id) as num_bad_records, + mo.src_hpo_id, COUNT(mo.src_hpo_id) as num_bad_records, IFNULL(ABS(DATE_DIFF(o.observation_date, vo.visit_start_date, DAY)), 0) as observation_vis_start_diff, IFNULL(ABS(DATE_DIFF(o.observation_date, vo.visit_end_date, DAY)), 0) as observation_vis_end_diff, IFNULL(ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), o.observation_date, DAY)), 0) as observation_vis_start_dt_diff, @@ -352,19 +351,19 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, colour, IFNULL(ABS(DATE_DIFF(CAST(o.observation_datetime AS DATE), CAST(vo.visit_end_datetime AS DATE), DAY)), 0) as observation_dt_vis_end_dt_diff, ( - ABS(DATE_DIFF(o.observation_date, vo.visit_start_date, DAY)) = - ABS(DATE_DIFF(o.observation_date, vo.visit_end_date, DAY)) + ABS(DATE_DIFF(o.observation_date, vo.visit_start_date, DAY)) = + ABS(DATE_DIFF(o.observation_date, vo.visit_end_date, DAY)) AND ABS(DATE_DIFF(o.observation_date, vo.visit_end_date, DAY)) = - ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), o.observation_date, DAY)) + ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), o.observation_date, DAY)) AND ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), o.observation_date, DAY)) = ABS(DATE_DIFF(CAST(vo.visit_end_datetime AS DATE), o.observation_date, DAY)) AND - ABS(DATE_DIFF(CAST(vo.visit_end_datetime AS DATE), o.observation_date, DAY)) = - ABS(DATE_DIFF(CAST(o.observation_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) + ABS(DATE_DIFF(CAST(vo.visit_end_datetime AS DATE), o.observation_date, DAY)) = + ABS(DATE_DIFF(CAST(o.observation_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) AND - ABS(DATE_DIFF(CAST(o.observation_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) = + ABS(DATE_DIFF(CAST(o.observation_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) = ABS(DATE_DIFF(CAST(o.observation_datetime AS DATE), CAST(vo.visit_end_datetime AS DATE), DAY)) ) as all_discrepancies_equal @@ -398,7 +397,7 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, colour, OR o.observation_date > vo.visit_end_date) - OR + OR -- problem with datetime (o.observation_datetime < vo.visit_start_datetime OR @@ -409,9 +408,9 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, colour, (o.observation_date < CAST(vo.visit_start_datetime AS DATE) OR o.observation_date > CAST(vo.visit_end_datetime AS DATE)) - + OR - + --problem with the datetime (CAST(o.observation_datetime AS DATE) < CAST(vo.visit_start_datetime AS DATE) OR @@ -487,11 +486,11 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, colour, measurement_visit_query = """ SELECT DISTINCT -a.*, +a.*, (a.measurement_vis_start_diff + a.measurement_vis_end_diff + a.measurement_vis_start_dt_diff + a.measurement_vis_end_dt_diff + a.measurement_dt_vis_start_dt_diff + a.measurement_dt_vis_end_dt_diff) as total_diff -FROM +FROM ( SELECT - mm.src_hpo_id, COUNT(mm.src_hpo_id) as num_bad_records, + 
mm.src_hpo_id, COUNT(mm.src_hpo_id) as num_bad_records, IFNULL(ABS(DATE_DIFF(m.measurement_date, vo.visit_start_date, DAY)), 0) as measurement_vis_start_diff, IFNULL(ABS(DATE_DIFF(m.measurement_date, vo.visit_end_date, DAY)), 0) as measurement_vis_end_diff, IFNULL(ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), m.measurement_date, DAY)), 0) as measurement_vis_start_dt_diff, @@ -500,19 +499,19 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, colour, IFNULL(ABS(DATE_DIFF(CAST(m.measurement_datetime AS DATE), CAST(vo.visit_end_datetime AS DATE), DAY)), 0) as measurement_dt_vis_end_dt_diff, ( - ABS(DATE_DIFF(m.measurement_date, vo.visit_start_date, DAY)) = - ABS(DATE_DIFF(m.measurement_date, vo.visit_end_date, DAY)) + ABS(DATE_DIFF(m.measurement_date, vo.visit_start_date, DAY)) = + ABS(DATE_DIFF(m.measurement_date, vo.visit_end_date, DAY)) AND ABS(DATE_DIFF(m.measurement_date, vo.visit_end_date, DAY)) = - ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), m.measurement_date, DAY)) + ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), m.measurement_date, DAY)) AND ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), m.measurement_date, DAY)) = ABS(DATE_DIFF(CAST(vo.visit_end_datetime AS DATE), m.measurement_date, DAY)) AND - ABS(DATE_DIFF(CAST(vo.visit_end_datetime AS DATE), m.measurement_date, DAY)) = - ABS(DATE_DIFF(CAST(m.measurement_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) + ABS(DATE_DIFF(CAST(vo.visit_end_datetime AS DATE), m.measurement_date, DAY)) = + ABS(DATE_DIFF(CAST(m.measurement_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) AND - ABS(DATE_DIFF(CAST(m.measurement_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) = + ABS(DATE_DIFF(CAST(m.measurement_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) = ABS(DATE_DIFF(CAST(m.measurement_datetime AS DATE), CAST(vo.visit_end_datetime AS DATE), DAY)) ) as all_discrepancies_equal @@ -546,7 +545,7 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, colour, OR m.measurement_date > vo.visit_end_date) - OR + OR -- problem with datetime (m.measurement_datetime < vo.visit_start_datetime OR @@ -557,9 +556,9 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, colour, (m.measurement_date < CAST(vo.visit_start_datetime AS DATE) OR m.measurement_date > CAST(vo.visit_end_datetime AS DATE)) - + OR - + --problem with the datetime (CAST(m.measurement_datetime AS DATE) < CAST(vo.visit_start_datetime AS DATE) OR @@ -634,21 +633,21 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, colour, condition_visit_query = """ SELECT DISTINCT -a.*, +a.*, (a.condition_vis_start_diff + a.condition_vis_start_dt_diff + a.condition_dt_vis_start_dt_diff) as total_diff -FROM +FROM ( SELECT - mco.src_hpo_id, COUNT(mco.src_hpo_id) as num_bad_records, + mco.src_hpo_id, COUNT(mco.src_hpo_id) as num_bad_records, IFNULL(ABS(DATE_DIFF(co.condition_start_date, vo.visit_start_date, DAY)), 0) as condition_vis_start_diff, IFNULL(ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), co.condition_start_date, DAY)), 0) as condition_vis_start_dt_diff, IFNULL(ABS(DATE_DIFF(CAST(co.condition_start_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)), 0) as condition_dt_vis_start_dt_diff, - + ( - ABS(DATE_DIFF(co.condition_start_date, vo.visit_start_date, DAY)) = - ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), co.condition_start_date, DAY)) + ABS(DATE_DIFF(co.condition_start_date, vo.visit_start_date, DAY)) = + 
@@ -752,21 +751,21 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, colour,
 
 drug_visit_query = """
 SELECT DISTINCT
-a.*, 
+a.*,
 (a.drug_vis_start_diff + a.drug_vis_start_dt_diff + a.drug_dt_vis_start_dt_diff) as total_diff
-FROM 
+FROM
 (
 SELECT
-mde.src_hpo_id, COUNT(mde.src_hpo_id) as num_bad_records, 
+mde.src_hpo_id, COUNT(mde.src_hpo_id) as num_bad_records,
 IFNULL(ABS(DATE_DIFF(de.drug_exposure_start_date, vo.visit_start_date, DAY)), 0) as drug_vis_start_diff,
 IFNULL(ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), de.drug_exposure_start_date, DAY)), 0) as drug_vis_start_dt_diff,
 IFNULL(ABS(DATE_DIFF(CAST(de.drug_exposure_start_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)), 0) as drug_dt_vis_start_dt_diff,
-
+
 (
-ABS(DATE_DIFF(de.drug_exposure_start_date, vo.visit_start_date, DAY)) = 
-ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), de.drug_exposure_start_date, DAY)) 
+ABS(DATE_DIFF(de.drug_exposure_start_date, vo.visit_start_date, DAY)) =
+ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), de.drug_exposure_start_date, DAY))
 AND
 ABS(DATE_DIFF(CAST(vo.visit_start_datetime AS DATE), de.drug_exposure_start_date, DAY)) =
-ABS(DATE_DIFF(CAST(de.drug_exposure_start_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY)) 
+ABS(DATE_DIFF(CAST(de.drug_exposure_start_datetime AS DATE), CAST(vo.visit_start_datetime AS DATE), DAY))
 ) as all_discrepancies_equal
 
 FROM
@@ -797,16 +796,16 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, colour,
 -- problem with procedure date
 (de.drug_exposure_start_date < vo.visit_start_date)
 
-OR 
+OR
 
 -- problem with datetime
 (de.drug_exposure_start_datetime < vo.visit_start_datetime)
 
 OR
 
 -- problem with the datetime (extracting date for comparison)
 (de.drug_exposure_start_date < CAST(vo.visit_start_datetime AS DATE))
-
+
 OR
-
+
 --problem with the datetime
 (CAST(de.drug_exposure_start_datetime AS DATE) < CAST(vo.visit_start_datetime AS DATE))
 )
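In all four queries, all_discrepancies_equal chains pairwise equalities (a = b AND b = c AND ...) because SQL has no single "all equal" predicate; the flag is TRUE exactly when every date/datetime representation disagrees with the visit by the same number of days, which points at one systematically shifted field rather than several independent errors. The same check in Python, for intuition (a sketch, not code from the notebook):

def all_discrepancies_equal(diffs):
    """Return True when every pairwise day-difference has the same value.

    Mirrors the chained `a = b AND b = c ...` predicate in the SQL above.
    """
    return len(set(diffs)) <= 1

all_discrepancies_equal([3, 3, 3])   # True: a uniform 3-day shift
all_discrepancies_equal([3, 0, 14])  # False: inconsistent discrepancies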
diff --git a/data_steward/analytics/cdr_ops/systematic_scripts/ehr_data_quality_dashboard_testing.py b/data_steward/analytics/cdr_ops/systematic_scripts/ehr_data_quality_dashboard_testing.py
index fb691788c8..469889ef28 100644
--- a/data_steward/analytics/cdr_ops/systematic_scripts/ehr_data_quality_dashboard_testing.py
+++ b/data_steward/analytics/cdr_ops/systematic_scripts/ehr_data_quality_dashboard_testing.py
@@ -21,7 +21,6 @@
 # %load_ext google.cloud.bigquery
 
 # +
-import bq_utils
 import utils.bq
 from notebooks import parameters
@@ -66,7 +65,7 @@
 JOIN
 `{}._mapping_measurement` mm
 ON
-mm.measurement_id = m.measurement_id 
+mm.measurement_id = m.measurement_id
 JOIN
 `{}.concept` c
 ON
@@ -108,7 +107,7 @@
 JOIN
 `{}._mapping_measurement` mm
 ON
-mm.measurement_id = m.measurement_id 
+mm.measurement_id = m.measurement_id
 JOIN
 `{}.concept` c
 ON
@@ -196,7 +195,7 @@
 JOIN
 `{}._mapping_measurement` mm
 ON
-  mm.measurement_id = m.measurement_id 
+  mm.measurement_id = m.measurement_id
 JOIN
 `{}.concept` c
 ON
@@ -238,7 +237,7 @@
 JOIN
 `{}._mapping_measurement` mm
 ON
-  mm.measurement_id = m.measurement_id 
+  mm.measurement_id = m.measurement_id
 JOIN
 `{}.concept` c
 ON
@@ -305,28 +304,28 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, color,
     """
     Function is used to create a bar graph for a particular dictionary with
     information about data quality
-    
+
     Parameters
     ----------
     info_dict (dictionary): contains information about data quality. The keys
         for the dictionary will serve as the x-axis labels whereas the values
         should serve as the 'y-value' for the particular bar
-    
+
     xlabel (str): label to display across the x-axis
-    
+
     ylabel (str): label to display across the y-axis
-    
+
     title (str): title for the graph
-    
+
     img_name (str): image used to save the image to the local repository
-    
+
     color (str): character used to specify the colours of the bars
-    
+
     total_diff_color (bool): indicates whether or not the last bar should be coloured red (
         as opposed to the rest of the bars on the graph). This is typically used when
         the ultimate value of the dictionary is of particular important (e.g.
         representing an 'aggregate' metric across all of the sites)
-    
+
     turnoff_x (bool): used to disable the x-axis labels (for each of the bars). This
         is typically used when there are so many x-axis labels that they overlap
         and obscure legibility
     """
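The docstring above pins down the shape of create_graphs, including the bar_list = plt.bar(...) call visible in the earlier hunk. A minimal implementation consistent with that docstring; the body below is an illustration, not the notebook's actual code:

import matplotlib.pyplot as plt

def create_graphs_sketch(info_dict, xlabel, ylabel, title, img_name, color,
                         total_diff_color, turnoff_x):
    """Bar graph helper consistent with the docstring above (illustrative)."""
    bar_list = plt.bar(range(len(info_dict)), list(info_dict.values()),
                       align='center', color=color)
    if total_diff_color:
        bar_list[len(info_dict) - 1].set_color('r')  # flag the aggregate bar
    if turnoff_x:
        plt.xticks([])  # many labels would overlap and obscure legibility
    else:
        plt.xticks(range(len(info_dict)), list(info_dict.keys()), rotation=90)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.savefig(img_name, bbox_inches='tight')
    plt.show()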
@@ -357,25 +356,25 @@ def create_dicts_w_info(df, x_label, column_label):
     """
     This function is used to create a dictionary that can be easily converted to
     a graphical representation based on the values for a particular dataframe
-    
+
     Parameters
     ----------
     df (dataframe): dataframe that contains the information to be converted
-    
+
     x_label (string): the column of the dataframe whose rows will then be converted
         to they keys of a dictionary
-    
+
     column_label (string): the column that contains the data quality metric being
         investigated
-    
+
     Returns
     -------
     data_qual_info (dictionary): has the following structure
-    
+
         keys: the column for a particular dataframe that represents the elements
            that whose data quality is being compared (e.g. HPOs, different
            measurement/unit combinations)
-    
+
         values: the data quality metric being compared
     """
     rows = df[x_label].unique().tolist()
diff --git a/data_steward/analytics/cdr_ops/systematic_scripts/notes_volume_distribution.py b/data_steward/analytics/cdr_ops/systematic_scripts/notes_volume_distribution.py
index 2a6374cead..d2c7662837 100644
--- a/data_steward/analytics/cdr_ops/systematic_scripts/notes_volume_distribution.py
+++ b/data_steward/analytics/cdr_ops/systematic_scripts/notes_volume_distribution.py
@@ -21,7 +21,6 @@
 # %load_ext google.cloud.bigquery
 
 # +
-import bq_utils
 import utils.bq
 from notebooks import parameters
@@ -68,11 +67,11 @@ def create_dicts_w_info(df, column_label):
     """
     This function is used to create a dictionary that can be easily converted to
     a graphical representation based on the values for a particular dataframe
-    
+
     Parameters
     ----------
     df (dataframe): dataframe that contains the information to be converted
-    
+
     column_label (string): the column of the dataframe whose rows will then be
         converted to the keys of the dictionary
     """
@@ -95,23 +94,23 @@ def create_graphs(info_dict, xlabel, ylabel, title, img_name, colour,
     """
     Function is used to create a bar graph for a particular dictionary with
     information about data quality
-    
+
     Parameters
     ----------
     info_dict (dictionary): contains information about data quality. The keys
         for the dictionary will serve as the x-axis labels whereas the values
         should serve as the 'y-value' for the particular bar
-    
+
     xlabel (str): label to display across the x-axis
-    
+
     ylabel (str): label to display across the y-axis
-    
+
     title (str): title for the graph
-    
+
     img_name (str): image used to save the image to the local repository
-    
+
     colour (str): character used to specify the colours of the bars
-    
+
     total_diff_colour (bool): indicates whether or not the last bar should be coloured red (
         as opposed to the rest of the bars on the graph). This is typically used when
         the ultimate value of the dictionary is of particular important (e.g.
         representing an 'aggregate' metric
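Both notebooks carry a create_dicts_w_info variant built on the same core transformation: one dataframe column supplies the dictionary keys, another supplies the metric values, starting from the rows = df[x_label].unique().tolist() line shown above. A sketch of the two-argument form, assuming one row per key as in the per-HPO dataframes (not the notebook's actual body):

import pandas as pd

def create_dicts_w_info_sketch(df, x_label, column_label):
    """Map each value of df[x_label] to its metric in df[column_label]."""
    rows = df[x_label].unique().tolist()
    return {row: df.loc[df[x_label] == row, column_label].iloc[0]
            for row in rows}

hpo_df = pd.DataFrame({'src_hpo_id': ['hpo_a', 'hpo_b'],
                       'total_diff': [12, 0]})
data_qual_info = create_dicts_w_info_sketch(hpo_df, 'src_hpo_id', 'total_diff')
# {'hpo_a': 12, 'hpo_b': 0}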
diff --git a/data_steward/validation/metrics/required_labs.py b/data_steward/validation/metrics/required_labs.py
index 2bcdd358f4..3b3b50ec30 100644
--- a/data_steward/validation/metrics/required_labs.py
+++ b/data_steward/validation/metrics/required_labs.py
@@ -7,9 +7,9 @@
 # Project imports
 import app_identity
-import bq_utils
 import resources
 import common
+import bq_utils
 from constants import bq_utils as bq_consts
 from gcloud.bq import BigQueryClient
 from validation.metrics.required_labs_sql import (IDENTIFY_LABS_QUERY,
@@ -165,7 +165,7 @@ def get_lab_concept_summary_query(client, hpo_id):
     Get the query that checks if the HPO site has submitted the required labs
     :param client: a BigQueryClient
     :param hpo_id: Identifies the HPO site
-    :return: 
+    :return:
     """
     dataset_id = common.BIGQUERY_DATASET_ID
     hpo_measurement_table = resources.get_table_id(common.MEASUREMENT,
diff --git a/tests/integration_tests/data_steward/retraction/retract_data_gcs_test.py b/tests/integration_tests/data_steward/retraction/retract_data_gcs_test.py
index 294e1a4c0d..16998d83da 100644
--- a/tests/integration_tests/data_steward/retraction/retract_data_gcs_test.py
+++ b/tests/integration_tests/data_steward/retraction/retract_data_gcs_test.py
@@ -10,7 +10,6 @@
 # Project imports
 import app_identity
-import bq_utils
 from common import BIGQUERY_DATASET_ID
 from tests import test_util
 from retraction import retract_data_gcs as rd
diff --git a/tests/integration_tests/data_steward/validation/export_test.py b/tests/integration_tests/data_steward/validation/export_test.py
index cb198a860a..7586cf559b 100644
--- a/tests/integration_tests/data_steward/validation/export_test.py
+++ b/tests/integration_tests/data_steward/validation/export_test.py
@@ -7,7 +7,6 @@
 # Project imports
 import app_identity
-import bq_utils
 import common
 from gcloud.gcs import StorageClient
 from gcloud.bq import BigQueryClient
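Aside from dropping the now-unused bq_utils imports (and, in required_labs.py, moving the surviving import after common), the bulk of this change is mechanical trailing-whitespace removal. A sweep like the sketch below would flag any stragglers before review; the root directory here is illustrative:

import pathlib

# Report every line carrying trailing whitespace under a directory.
for path in pathlib.Path('data_steward').rglob('*.py'):
    for lineno, line in enumerate(path.read_text().splitlines(), start=1):
        if line != line.rstrip():
            print(f'{path}:{lineno}: trailing whitespace')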