From 6ec4fe1f9e667e69b1f2e21553473a260d2e53bb Mon Sep 17 00:00:00 2001
From: Thomas Yu
Date: Mon, 26 Aug 2024 11:07:15 -0700
Subject: [PATCH] [GEN-703] Validation status with version (#574)

* Add in process start date for validation status snapshot table

* only download file when necessary

* Add in hack to not download all the files when validating

* Add comment

* Format code

* Add version and tests

* Fix query

* Run black

* Add black version and remove code

* Change to 2

* Use specific black version

* Shift comment back

* comment out hack for downloads for now

* add linting

---------

Co-authored-by: rxu17 <26471741+rxu17@users.noreply.github.com>
---
 .github/workflows/ci.yml          |  7 +++---
 .pre-commit-config.yaml           |  8 +++----
 genie/dashboard_table_updater.py  |  6 +++---
 genie/input_to_database.py        | 33 ++++++++++++++++++++++------
 tests/test_database_to_staging.py |  5 -----
 tests/test_input_to_database.py   | 36 ++++++++++++++++++++++++++-----
 6 files changed, 69 insertions(+), 26 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 19e79da2..0ec1a25f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -59,7 +59,8 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - uses: psf/black@stable
-
+        with:
+          version: "~=23.12"
   deploy:
     needs: [test, lint]
     runs-on: ubuntu-latest
@@ -108,7 +109,7 @@ jobs:
         uses: docker/metadata-action@v5
         with:
           images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
-
+
       - name: Format tags as registry refs
         id: registry_refs
         env:
@@ -127,4 +128,4 @@ jobs:
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
           cache-from: ${{ steps.registry_refs.outputs.tags }},mode=max
-          cache-to: ${{ steps.registry_refs.outputs.tags }},mode=max
\ No newline at end of file
+          cache-to: ${{ steps.registry_refs.outputs.tags }},mode=max
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 4a80f8b8..01be96d7 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -42,10 +42,10 @@ repos:
 #     -   id: blacken-docs
 #         additional_dependencies: [black]
 
--   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: 'v1.3.0'
-    hooks:
-    -   id: mypy
+# -   repo: https://github.com/pre-commit/mirrors-mypy
+#     rev: 'v1.3.0'
+#     hooks:
+#     -   id: mypy
 #         additional_dependencies: [pydantic~=1.10]
 
 # Checks for missing docstrings
diff --git a/genie/dashboard_table_updater.py b/genie/dashboard_table_updater.py
index 33a44d5f..99dfb3ca 100644
--- a/genie/dashboard_table_updater.py
+++ b/genie/dashboard_table_updater.py
@@ -460,9 +460,9 @@ def update_sample_difference_table(syn, database_mappingdf):
         .applymap(int)
     )
 
-    diff_between_releasesdf[["Clinical", "Mutation", "CNV", "SEG", "Fusions"]] = (
-        new_values
-    )
+    diff_between_releasesdf[
+        ["Clinical", "Mutation", "CNV", "SEG", "Fusions"]
+    ] = new_values
 
     load._update_table(
         syn,
diff --git a/genie/input_to_database.py b/genie/input_to_database.py
index bcedaace..b2426cf2 100644
--- a/genie/input_to_database.py
+++ b/genie/input_to_database.py
@@ -322,10 +322,9 @@ def validatefile(
     error_list = check_file_status["error_list"]
 
     messages_to_send = []
 
-    # Need to figure out to how to remove this
     # This must pass in filenames, because filetype is determined by entity
-    # name Not by actual path of file
+    # name not by actual path of file
     validator = validate.GenieValidationHelper(
         syn=syn,
         project_id=project_id,
@@ -336,8 +335,13 @@ def validatefile(
         ancillary_files=ancillary_files,
     )
     filetype = validator.file_type
-
     if check_file_status["to_validate"]:
+        # HACK: Don't download again if only_validate is not True, but all
+        # files need to be downloaded currently when validation + processing
+        # isn't split up
+        # if entities[0].get("path") is None:
+        #     validator.entitylist = [syn.get(entity) for entity in entities]
+
         valid_cls, message = validator.validate_single_file(
             oncotree_link=genie_config["oncotreeLink"], nosymbol_check=False
         )
@@ -525,6 +529,7 @@ def build_validation_status_table(input_valid_statuses: List[dict]):
         "modifiedOn",
         "fileType",
         "center",
+        "version",
         "entity",
     ]
     input_status_rows = []
@@ -539,6 +544,7 @@ def build_validation_status_table(input_valid_statuses: List[dict]):
             "modifiedOn": entity_date_to_timestamp(entity.properties.modifiedOn),
             "fileType": input_status["fileType"],
             "center": input_status["center"],
+            "version": entity.versionNumber,
             "entity": entity,
         }
         input_status_rows.append(row)
@@ -562,7 +568,15 @@ def build_error_tracking_table(invalid_errors: List[dict]):
         Error tracking dataframe
 
     """
-    error_table_columns = ["id", "errors", "name", "fileType", "center", "entity"]
+    error_table_columns = [
+        "id",
+        "errors",
+        "name",
+        "fileType",
+        "center",
+        "version",
+        "entity",
+    ]
     invalid_error_rows = []
     for invalid_error in invalid_errors:
         entity = invalid_error["entity"]
@@ -572,6 +586,7 @@ def build_error_tracking_table(invalid_errors: List[dict]):
             "name": entity.name,
            "fileType": invalid_error["fileType"],
             "center": invalid_error["center"],
+            "version": entity.versionNumber,
             "entity": entity,
         }
         invalid_error_rows.append(row)
@@ -763,9 +778,11 @@ def validation(
             user_message_dict[user].append(file_messages)
 
     validation_statusdf = build_validation_status_table(input_valid_statuses)
+
     error_trackingdf = build_error_tracking_table(invalid_errors)
     new_tables = _update_tables_content(validation_statusdf, error_trackingdf)
+
     validation_statusdf = new_tables["validation_statusdf"]
     error_trackingdf = new_tables["error_trackingdf"]
     duplicated_filesdf = new_tables["duplicated_filesdf"]
@@ -793,7 +810,6 @@ def validation(
         validation_status_table=validation_status_table,
         error_tracker_table=error_tracker_table,
     )
-
     valid_filesdf = validation_statusdf.query('status == "VALIDATED"')
     return valid_filesdf[["id", "path", "fileType", "name"]]
 
@@ -870,7 +886,12 @@ def center_input_to_database(
     center_input_synid = genie_config["center_config"][center]["inputSynId"]
     logger.info("Center: " + center)
     center_files = extract.get_center_input_files(
-        syn, center_input_synid, center, process
+        syn=syn,
+        synid=center_input_synid,
+        center=center,
+        process=process,
+        # HACK: Don't download all the files when only validate
+        # downloadFile=(not only_validate),
     )
     # ancillary_files = get_ancillary_files(
diff --git a/tests/test_database_to_staging.py b/tests/test_database_to_staging.py
index 5ed17fe5..14999afb 100644
--- a/tests/test_database_to_staging.py
+++ b/tests/test_database_to_staging.py
@@ -250,7 +250,6 @@ def test_that_runMAFinBED_calls_expected_calls(syn, test, staging):
             )
         ),
     ) as patch_store_maf_in_bed_filtered_variants:
-
         # setting some testing vars
         test_script_dir = "test_file_dir/"
         test_notinbed_file_path = os.path.join(test_script_dir, "../R/notinbed.csv")
@@ -479,7 +478,6 @@ def test_that_mutation_in_cis_filter_has_expected_calls_when_mutations_in_cis_is
             name="flaggedVariants",
         ),
     ) as patch_get_flagged_variants:
-
         # setting some testing vars
         test_center_mapping_df = pd.DataFrame(
             dict(Center=["SAGE"], stagingSynId=["synZZZZZ"])
@@ -588,7 +586,6 @@ def test_that_mutation_in_cis_filter_has_expected_calls_when_mutations_in_cis_is
             name="flaggedVariants",
         ),
     ) as patch_get_flagged_variants:
-
         # setting some testing vars
         test_center_mapping_df = pd.DataFrame(
             dict(Center=["SAGE"], stagingSynId=["synZZZZZ"])
@@ -697,7 +694,6 @@ def test_that_get_mutation_in_cis_flagged_variants_returns_expected_flagged_vari
     with patch.object(
         extract, "get_syntabledf", return_value=flagged_df
     ) as patch_get_syntabledf:
-
         result = database_to_staging.get_mutation_in_cis_flagged_variants(
             syn, variant_filtering_synId="synZZZZZ"
         )
@@ -737,7 +733,6 @@ def test_that_get_mutation_in_cis_filtered_samples_returns_expected_variants(
     with patch.object(
         extract, "get_syntabledf", return_value=filtered_df
     ) as patch_get_syntabledf:
-
         result = database_to_staging.get_mutation_in_cis_filtered_samples(
             syn, variant_filtering_synId="synZZZZZ"
         )
diff --git a/tests/test_input_to_database.py b/tests/test_input_to_database.py
index 59a76ef8..6eb651f5 100644
--- a/tests/test_input_to_database.py
+++ b/tests/test_input_to_database.py
@@ -26,6 +26,7 @@
     name="data_clinical_supp_sample_SAGE.txt",
     modifiedOn="2019-03-24T12:00:00.Z",
     md5="44444",
+    versionNumber=3,
 )
 
 patient_clinical_synid = "syn11111"
@@ -679,6 +680,7 @@ def setup_method(self):
                 1553428800000,
                 "clinical",
                 center,
+                sample_clinical_entity.versionNumber,
                 sample_clinical_entity,
             ]
         ]
@@ -692,6 +694,7 @@ def setup_method(self):
                 "modifiedOn",
                 "fileType",
                 "center",
+                "version",
                 "entity",
             ]
         )
@@ -706,6 +709,7 @@ def setup_method(self):
                 "modifiedOn",
                 "fileType",
                 "center",
+                "version",
                 "entity",
             ],
         )
@@ -716,14 +720,16 @@ def setup_method(self):
                 sample_clinical_entity.name,
                 "clinical",
                 center,
+                sample_clinical_entity.versionNumber,
                 sample_clinical_entity,
             ]
         ]
         self.errors_df = pd.DataFrame(
-            error, columns=["id", "errors", "name", "fileType", "center", "entity"]
+            error,
+            columns=["id", "errors", "name", "fileType", "center", "version", "entity"],
         )
         self.empty_errors = pd.DataFrame(
-            columns=["id", "errors", "name", "fileType", "center", "entity"]
+            columns=["id", "errors", "name", "fileType", "center", "version", "entity"]
         )
 
         self.with_dupsdf = pd.DataFrame(
@@ -744,6 +750,7 @@ def setup_method(self):
                 ],
                 "center": ["SAGE"] * 5,
                 "fileType": ["type"] * 5,
+                "version": [3] * 5,
                 "entity": ["entity"] * 5,
             }
         )
@@ -768,12 +775,15 @@ def setup_method(self):
                 ],
                 "center": ["SAGE"] * 5,
                 "fileType": ["type"] * 5,
+                "version": [3] * 5,
                 "entity": ["entity"] * 5,
             }
         )
         self.empty_dup = pd.DataFrame(
-            columns=["id", "name", "center", "fileType", "entity", "errors"]
+            columns=["id", "name", "center", "fileType", "version", "entity", "errors"]
         )
+        # self.empty_dup = self.empty_dup.astype({"version": int})
+        # self.empty_dup.index = self.empty_dup.index.astype('object')
 
     def test_build_validation_status_table(self):
         input_valid_statuses = [
@@ -836,7 +846,12 @@ def test_dups_get_duplicated_files(self):
     def test_nodups_get_duplicated_files(self):
         """Test no duplicated"""
         dupsdf = input_to_database.get_duplicated_files(self.no_dupsdf)
-        assert dupsdf.equals(self.empty_dup)
+        # These empty frames won't be equal without these conversions
+        # HACK: Convert the index type to the same type
+        self.empty_dup.index = self.empty_dup.index.astype("int")
+        # HACK: Convert the dtype of the "version" column to the same type
+        self.empty_dup["version"] = self.empty_dup["version"].astype("int")
+        pd.testing.assert_frame_equal(dupsdf, self.empty_dup)
 
     def test__update_tables_content(self):
         """Tests duplicates are added to the tables and errors/statues are
@@ -970,6 +985,7 @@ def test_validation(
         entity = synapseclient.Entity(
             id="syn1234",
             md5="44444",
+            versionNumber=3,
             path="/path/to/foobar.txt",
             name="data_clinical_supp_SAGE.txt",
         )
@@ -984,6 +1000,7 @@ def test_validation(
                 entity.name,
                 modified_on,
                 filetype,
+                entity.versionNumber,
                 center,
             ]
         ]
@@ -997,7 +1014,14 @@ def test_validation(
         errortracking_mock = emptytable_mock()
         valiate_cls = Mock()
         with patch.object(
-            syn, "tableQuery", side_effect=[validationstatus_mock, errortracking_mock]
+            syn,
+            "tableQuery",
+            side_effect=[
+                validationstatus_mock,
+                errortracking_mock,
+                validationstatus_mock,
+                errortracking_mock,
+            ],
         ) as patch_query, patch.object(
             input_to_database,
             "validatefile",
@@ -1006,6 +1030,8 @@ def test_validation(
             input_to_database,
             "build_validation_status_table",
             return_value=self.validation_statusdf,
+        ), patch.object(
+            syn, "get", return_value=entity
         ), patch.object(
             input_to_database, "build_error_tracking_table", return_value=self.errors_df
         ), patch.object(