From 6ec4fe1f9e667e69b1f2e21553473a260d2e53bb Mon Sep 17 00:00:00 2001
From: Thomas Yu
Date: Mon, 26 Aug 2024 11:07:15 -0700
Subject: [PATCH] [GEN-703] Validation status with version (#574)

* Add in process start date for validation status snapshot table

* only download file when necessary

* Add in hack to not download all the files when validating

* Add comment

* Format code

* Add version and tests

* Fix query

* Run black

* Add black version and remove code

* Change to 2

* Use specific black version

* Shift comment back

* comment out hack for downloads for now

* add linting

---------

Co-authored-by: rxu17 <26471741+rxu17@users.noreply.github.com>
---
 .github/workflows/ci.yml          |  7 +++---
 .pre-commit-config.yaml           |  8 +++----
 genie/dashboard_table_updater.py  |  6 +++---
 genie/input_to_database.py        | 33 ++++++++++++++++++++++------
 tests/test_database_to_staging.py |  5 -----
 tests/test_input_to_database.py   | 36 ++++++++++++++++++++++++++-----
 6 files changed, 69 insertions(+), 26 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 19e79da2..0ec1a25f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -59,7 +59,8 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - uses: psf/black@stable
-
+        with:
+          version: "~=23.12"
   deploy:
     needs: [test, lint]
     runs-on: ubuntu-latest
@@ -108,7 +109,7 @@ jobs:
         uses: docker/metadata-action@v5
         with:
           images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
-
+
       - name: Format tags as registry refs
         id: registry_refs
         env:
@@ -127,4 +128,4 @@ jobs:
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
           cache-from: ${{ steps.registry_refs.outputs.tags }},mode=max
-          cache-to: ${{ steps.registry_refs.outputs.tags }},mode=max
\ No newline at end of file
+          cache-to: ${{ steps.registry_refs.outputs.tags }},mode=max
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 4a80f8b8..01be96d7 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -42,10 +42,10 @@ repos:
 #     -   id: blacken-docs
 #         additional_dependencies: [black]
 
--   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: 'v1.3.0'
-    hooks:
-    -   id: mypy
+# -   repo: https://github.com/pre-commit/mirrors-mypy
+#     rev: 'v1.3.0'
+#     hooks:
+#     -   id: mypy
 #         additional_dependencies: [pydantic~=1.10]
 
 # Checks for missing docstrings
diff --git a/genie/dashboard_table_updater.py b/genie/dashboard_table_updater.py
index 33a44d5f..99dfb3ca 100644
--- a/genie/dashboard_table_updater.py
+++ b/genie/dashboard_table_updater.py
@@ -460,9 +460,9 @@ def update_sample_difference_table(syn, database_mappingdf):
         .applymap(int)
     )
 
-    diff_between_releasesdf[["Clinical", "Mutation", "CNV", "SEG", "Fusions"]] = (
-        new_values
-    )
+    diff_between_releasesdf[
+        ["Clinical", "Mutation", "CNV", "SEG", "Fusions"]
+    ] = new_values
 
     load._update_table(
         syn,
diff --git a/genie/input_to_database.py b/genie/input_to_database.py
index bcedaace..b2426cf2 100644
--- a/genie/input_to_database.py
+++ b/genie/input_to_database.py
@@ -322,10 +322,9 @@ def validatefile(
     error_list = check_file_status["error_list"]
 
     messages_to_send = []
 
-    # Need to figure out to how to remove this
     # This must pass in filenames, because filetype is determined by entity
-    # name Not by actual path of file
+    # name not by actual path of file
     validator = validate.GenieValidationHelper(
         syn=syn,
         project_id=project_id,
@@ -336,8 +335,13 @@ def validatefile(
         ancillary_files=ancillary_files,
     )
     filetype = validator.file_type
-
     if check_file_status["to_validate"]:
+        # HACK: Don't download again if only_validate is not True, but all
+        # files need to be downloaded currently when validation + processing
+        # isn't split up
+        # if entities[0].get("path") is None:
+        #     validator.entitylist = [syn.get(entity) for entity in entities]
+
         valid_cls, message = validator.validate_single_file(
             oncotree_link=genie_config["oncotreeLink"], nosymbol_check=False
         )
@@ -525,6 +529,7 @@ def build_validation_status_table(input_valid_statuses: List[dict]):
         "modifiedOn",
         "fileType",
         "center",
+        "version",
         "entity",
     ]
     input_status_rows = []
@@ -539,6 +544,7 @@ def build_validation_status_table(input_valid_statuses: List[dict]):
             "modifiedOn": entity_date_to_timestamp(entity.properties.modifiedOn),
             "fileType": input_status["fileType"],
             "center": input_status["center"],
+            "version": entity.versionNumber,
             "entity": entity,
         }
         input_status_rows.append(row)
@@ -562,7 +568,15 @@ def build_error_tracking_table(invalid_errors: List[dict]):
         Error tracking dataframe
 
     """
-    error_table_columns = ["id", "errors", "name", "fileType", "center", "entity"]
+    error_table_columns = [
+        "id",
+        "errors",
+        "name",
+        "fileType",
+        "center",
+        "version",
+        "entity",
+    ]
     invalid_error_rows = []
     for invalid_error in invalid_errors:
         entity = invalid_error["entity"]
@@ -572,6 +586,7 @@ def build_error_tracking_table(invalid_errors: List[dict]):
             "name": entity.name,
            "fileType": invalid_error["fileType"],
             "center": invalid_error["center"],
+            "version": entity.versionNumber,
             "entity": entity,
         }
         invalid_error_rows.append(row)
@@ -763,9 +778,11 @@ def validation(
             user_message_dict[user].append(file_messages)
 
     validation_statusdf = build_validation_status_table(input_valid_statuses)
+
     error_trackingdf = build_error_tracking_table(invalid_errors)
     new_tables = _update_tables_content(validation_statusdf, error_trackingdf)
+
     validation_statusdf = new_tables["validation_statusdf"]
     error_trackingdf = new_tables["error_trackingdf"]
     duplicated_filesdf = new_tables["duplicated_filesdf"]
@@ -793,7 +810,6 @@ def validation(
         validation_status_table=validation_status_table,
         error_tracker_table=error_tracker_table,
     )
-
     valid_filesdf = validation_statusdf.query('status == "VALIDATED"')
     return valid_filesdf[["id", "path", "fileType", "name"]]
 
@@ -870,7 +886,12 @@ def center_input_to_database(
     center_input_synid = genie_config["center_config"][center]["inputSynId"]
     logger.info("Center: " + center)
     center_files = extract.get_center_input_files(
-        syn, center_input_synid, center, process
+        syn=syn,
+        synid=center_input_synid,
+        center=center,
+        process=process,
+        # HACK: Don't download all the files when only validate
+        # downloadFile=(not only_validate),
     )
     # ancillary_files = get_ancillary_files(
diff --git a/tests/test_database_to_staging.py b/tests/test_database_to_staging.py
index 5ed17fe5..14999afb 100644
--- a/tests/test_database_to_staging.py
+++ b/tests/test_database_to_staging.py
@@ -250,7 +250,6 @@ def test_that_runMAFinBED_calls_expected_calls(syn, test, staging):
             )
         ),
     ) as patch_store_maf_in_bed_filtered_variants:
-
         # setting some testing vars
         test_script_dir = "test_file_dir/"
         test_notinbed_file_path = os.path.join(test_script_dir, "../R/notinbed.csv")
@@ -479,7 +478,6 @@ def test_that_mutation_in_cis_filter_has_expected_calls_when_mutations_in_cis_is
             name="flaggedVariants",
         ),
     ) as patch_get_flagged_variants:
-
         # setting some testing vars
         test_center_mapping_df = pd.DataFrame(
             dict(Center=["SAGE"], stagingSynId=["synZZZZZ"])
@@ -588,7 +586,6 @@ def test_that_mutation_in_cis_filter_has_expected_calls_when_mutations_in_cis_is
             name="flaggedVariants",
         ),
     ) as patch_get_flagged_variants:
-
         # setting some testing vars
         test_center_mapping_df = pd.DataFrame(
             dict(Center=["SAGE"], stagingSynId=["synZZZZZ"])
@@ -697,7 +694,6 @@ def test_that_get_mutation_in_cis_flagged_variants_returns_expected_flagged_vari
     with patch.object(
         extract, "get_syntabledf", return_value=flagged_df
     ) as patch_get_syntabledf:
-
         result = database_to_staging.get_mutation_in_cis_flagged_variants(
             syn, variant_filtering_synId="synZZZZZ"
         )
@@ -737,7 +733,6 @@ def test_that_get_mutation_in_cis_filtered_samples_returns_expected_variants(
     with patch.object(
         extract, "get_syntabledf", return_value=filtered_df
     ) as patch_get_syntabledf:
-
         result = database_to_staging.get_mutation_in_cis_filtered_samples(
             syn, variant_filtering_synId="synZZZZZ"
         )
diff --git a/tests/test_input_to_database.py b/tests/test_input_to_database.py
index 59a76ef8..6eb651f5 100644
--- a/tests/test_input_to_database.py
+++ b/tests/test_input_to_database.py
@@ -26,6 +26,7 @@
     name="data_clinical_supp_sample_SAGE.txt",
     modifiedOn="2019-03-24T12:00:00.Z",
     md5="44444",
+    versionNumber=3,
 )
 
 patient_clinical_synid = "syn11111"
@@ -679,6 +680,7 @@ def setup_method(self):
                 1553428800000,
                 "clinical",
                 center,
+                sample_clinical_entity.versionNumber,
                 sample_clinical_entity,
             ]
         ]
@@ -692,6 +694,7 @@ def setup_method(self):
                 "modifiedOn",
                 "fileType",
                 "center",
+                "version",
                 "entity",
             ]
         )
@@ -706,6 +709,7 @@ def setup_method(self):
                 "modifiedOn",
                 "fileType",
                 "center",
+                "version",
                 "entity",
             ],
         )
@@ -716,14 +720,16 @@ def setup_method(self):
                 sample_clinical_entity.name,
                 "clinical",
                 center,
+                sample_clinical_entity.versionNumber,
                 sample_clinical_entity,
             ]
         ]
         self.errors_df = pd.DataFrame(
-            error, columns=["id", "errors", "name", "fileType", "center", "entity"]
+            error,
+            columns=["id", "errors", "name", "fileType", "center", "version", "entity"],
         )
         self.empty_errors = pd.DataFrame(
-            columns=["id", "errors", "name", "fileType", "center", "entity"]
+            columns=["id", "errors", "name", "fileType", "center", "version", "entity"]
         )
 
         self.with_dupsdf = pd.DataFrame(
@@ -744,6 +750,7 @@ def setup_method(self):
                 ],
                 "center": ["SAGE"] * 5,
                 "fileType": ["type"] * 5,
+                "version": [3] * 5,
                 "entity": ["entity"] * 5,
             }
         )
@@ -768,12 +775,15 @@ def setup_method(self):
                 ],
                 "center": ["SAGE"] * 5,
                 "fileType": ["type"] * 5,
+                "version": [3] * 5,
                 "entity": ["entity"] * 5,
             }
         )
         self.empty_dup = pd.DataFrame(
-            columns=["id", "name", "center", "fileType", "entity", "errors"]
+            columns=["id", "name", "center", "fileType", "version", "entity", "errors"]
         )
+        # self.empty_dup = self.empty_dup.astype({"version": int})
+        # self.empty_dup.index = self.empty_dup.index.astype('object')
 
     def test_build_validation_status_table(self):
         input_valid_statuses = [
@@ -836,7 +846,12 @@ def test_dups_get_duplicated_files(self):
     def test_nodups_get_duplicated_files(self):
         """Test no duplicated"""
         dupsdf = input_to_database.get_duplicated_files(self.no_dupsdf)
-        assert dupsdf.equals(self.empty_dup)
+        # These empty frames won't be equal without these conversions
+        # HACK: Convert the index type to the same type
+        self.empty_dup.index = self.empty_dup.index.astype("int")
+        # HACK: Convert the dtype of the "version" column to the same type
+        self.empty_dup["version"] = self.empty_dup["version"].astype("int")
+        pd.testing.assert_frame_equal(dupsdf, self.empty_dup)
 
     def test__update_tables_content(self):
         """Tests duplicates are added to the tables and errors/statues are
@@ -970,6 +985,7 @@ def test_validation(
         entity = synapseclient.Entity(
             id="syn1234",
             md5="44444",
+            versionNumber=3,
             path="/path/to/foobar.txt",
             name="data_clinical_supp_SAGE.txt",
         )
@@ -984,6 +1000,7 @@ def test_validation(
                 entity.name,
                 modified_on,
                 filetype,
+                entity.versionNumber,
                 center,
             ]
         ]
@@ -997,7 +1014,14 @@ def test_validation(
         errortracking_mock = emptytable_mock()
         valiate_cls = Mock()
         with patch.object(
-            syn, "tableQuery", side_effect=[validationstatus_mock, errortracking_mock]
+            syn,
+            "tableQuery",
+            side_effect=[
+                validationstatus_mock,
+                errortracking_mock,
+                validationstatus_mock,
+                errortracking_mock,
+            ],
         ) as patch_query, patch.object(
             input_to_database,
             "validatefile",
@@ -1006,6 +1030,8 @@ def test_validation(
             input_to_database,
             "build_validation_status_table",
             return_value=self.validation_statusdf,
+        ), patch.object(
+            syn, "get", return_value=entity
         ), patch.object(
             input_to_database, "build_error_tracking_table", return_value=self.errors_df
         ), patch.object(