Skip to content

Commit

Permalink
[GEN-703] Validation status with version (#574)
Browse files Browse the repository at this point in the history
* Add in process start date for validation status snapshot table

* only download file when necessary

* Add in hack to not download all the files when validating

* Add comment

* Format code

* Add version and tests

* Fix query

* Run black

* Add black version and remove code

* Change to 2

* Use specific black version

* Shift comment back

* comment out hack for downloads for now

* add linting

---------

Co-authored-by: rxu17 <[email protected]>
  • Loading branch information
thomasyu888 and rxu17 authored Aug 26, 2024
1 parent 4a2b9dd commit 6ec4fe1
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 26 deletions.
7 changes: 4 additions & 3 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ jobs:
steps:
- uses: actions/checkout@v4
- uses: psf/black@stable

with:
version: "~=23.12"
deploy:
needs: [test, lint]
runs-on: ubuntu-latest
Expand Down Expand Up @@ -108,7 +109,7 @@ jobs:
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}

- name: Format tags as registry refs
id: registry_refs
env:
Expand All @@ -127,4 +128,4 @@ jobs:
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-from: ${{ steps.registry_refs.outputs.tags }},mode=max
cache-to: ${{ steps.registry_refs.outputs.tags }},mode=max
cache-to: ${{ steps.registry_refs.outputs.tags }},mode=max
8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,10 @@ repos:
# - id: blacken-docs
# additional_dependencies: [black]

- repo: https://github.com/pre-commit/mirrors-mypy
rev: 'v1.3.0'
hooks:
- id: mypy
# - repo: https://github.com/pre-commit/mirrors-mypy
# rev: 'v1.3.0'
# hooks:
# - id: mypy
# additional_dependencies: [pydantic~=1.10]

# Checks for missing docstrings
Expand Down
6 changes: 3 additions & 3 deletions genie/dashboard_table_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,9 +460,9 @@ def update_sample_difference_table(syn, database_mappingdf):
.applymap(int)
)

diff_between_releasesdf[["Clinical", "Mutation", "CNV", "SEG", "Fusions"]] = (
new_values
)
diff_between_releasesdf[
["Clinical", "Mutation", "CNV", "SEG", "Fusions"]
] = new_values

load._update_table(
syn,
Expand Down
33 changes: 27 additions & 6 deletions genie/input_to_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,10 +322,9 @@ def validatefile(
error_list = check_file_status["error_list"]

messages_to_send = []

# Need to figure out to how to remove this
# This must pass in filenames, because filetype is determined by entity
# name Not by actual path of file
# name not by actual path of file
validator = validate.GenieValidationHelper(
syn=syn,
project_id=project_id,
Expand All @@ -336,8 +335,13 @@ def validatefile(
ancillary_files=ancillary_files,
)
filetype = validator.file_type

if check_file_status["to_validate"]:
# HACK: Don't download again if only_validate is not True, but all
# files need to be downloaded currently when validation + processing
# isn't split up
# if entities[0].get("path") is None:
# validator.entitylist = [syn.get(entity) for entity in entities]

valid_cls, message = validator.validate_single_file(
oncotree_link=genie_config["oncotreeLink"], nosymbol_check=False
)
Expand Down Expand Up @@ -525,6 +529,7 @@ def build_validation_status_table(input_valid_statuses: List[dict]):
"modifiedOn",
"fileType",
"center",
"version",
"entity",
]
input_status_rows = []
Expand All @@ -539,6 +544,7 @@ def build_validation_status_table(input_valid_statuses: List[dict]):
"modifiedOn": entity_date_to_timestamp(entity.properties.modifiedOn),
"fileType": input_status["fileType"],
"center": input_status["center"],
"version": entity.versionNumber,
"entity": entity,
}
input_status_rows.append(row)
Expand All @@ -562,7 +568,15 @@ def build_error_tracking_table(invalid_errors: List[dict]):
Error tracking dataframe
"""
error_table_columns = ["id", "errors", "name", "fileType", "center", "entity"]
error_table_columns = [
"id",
"errors",
"name",
"fileType",
"center",
"version",
"entity",
]
invalid_error_rows = []
for invalid_error in invalid_errors:
entity = invalid_error["entity"]
Expand All @@ -572,6 +586,7 @@ def build_error_tracking_table(invalid_errors: List[dict]):
"name": entity.name,
"fileType": invalid_error["fileType"],
"center": invalid_error["center"],
"version": entity.versionNumber,
"entity": entity,
}
invalid_error_rows.append(row)
Expand Down Expand Up @@ -763,9 +778,11 @@ def validation(
user_message_dict[user].append(file_messages)

validation_statusdf = build_validation_status_table(input_valid_statuses)

error_trackingdf = build_error_tracking_table(invalid_errors)

new_tables = _update_tables_content(validation_statusdf, error_trackingdf)

validation_statusdf = new_tables["validation_statusdf"]
error_trackingdf = new_tables["error_trackingdf"]
duplicated_filesdf = new_tables["duplicated_filesdf"]
Expand Down Expand Up @@ -793,7 +810,6 @@ def validation(
validation_status_table=validation_status_table,
error_tracker_table=error_tracker_table,
)

valid_filesdf = validation_statusdf.query('status == "VALIDATED"')
return valid_filesdf[["id", "path", "fileType", "name"]]

Expand Down Expand Up @@ -870,7 +886,12 @@ def center_input_to_database(
center_input_synid = genie_config["center_config"][center]["inputSynId"]
logger.info("Center: " + center)
center_files = extract.get_center_input_files(
syn, center_input_synid, center, process
syn=syn,
synid=center_input_synid,
center=center,
process=process,
# HACK: Don't download all the files when only validate
# downloadFile=(not only_validate),
)

# ancillary_files = get_ancillary_files(
Expand Down
5 changes: 0 additions & 5 deletions tests/test_database_to_staging.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,6 @@ def test_that_runMAFinBED_calls_expected_calls(syn, test, staging):
)
),
) as patch_store_maf_in_bed_filtered_variants:

# setting some testing vars
test_script_dir = "test_file_dir/"
test_notinbed_file_path = os.path.join(test_script_dir, "../R/notinbed.csv")
Expand Down Expand Up @@ -479,7 +478,6 @@ def test_that_mutation_in_cis_filter_has_expected_calls_when_mutations_in_cis_is
name="flaggedVariants",
),
) as patch_get_flagged_variants:

# setting some testing vars
test_center_mapping_df = pd.DataFrame(
dict(Center=["SAGE"], stagingSynId=["synZZZZZ"])
Expand Down Expand Up @@ -588,7 +586,6 @@ def test_that_mutation_in_cis_filter_has_expected_calls_when_mutations_in_cis_is
name="flaggedVariants",
),
) as patch_get_flagged_variants:

# setting some testing vars
test_center_mapping_df = pd.DataFrame(
dict(Center=["SAGE"], stagingSynId=["synZZZZZ"])
Expand Down Expand Up @@ -697,7 +694,6 @@ def test_that_get_mutation_in_cis_flagged_variants_returns_expected_flagged_vari
with patch.object(
extract, "get_syntabledf", return_value=flagged_df
) as patch_get_syntabledf:

result = database_to_staging.get_mutation_in_cis_flagged_variants(
syn, variant_filtering_synId="synZZZZZ"
)
Expand Down Expand Up @@ -737,7 +733,6 @@ def test_that_get_mutation_in_cis_filtered_samples_returns_expected_variants(
with patch.object(
extract, "get_syntabledf", return_value=filtered_df
) as patch_get_syntabledf:

result = database_to_staging.get_mutation_in_cis_filtered_samples(
syn, variant_filtering_synId="synZZZZZ"
)
Expand Down
36 changes: 31 additions & 5 deletions tests/test_input_to_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
name="data_clinical_supp_sample_SAGE.txt",
modifiedOn="2019-03-24T12:00:00.Z",
md5="44444",
versionNumber=3,
)

patient_clinical_synid = "syn11111"
Expand Down Expand Up @@ -679,6 +680,7 @@ def setup_method(self):
1553428800000,
"clinical",
center,
sample_clinical_entity.versionNumber,
sample_clinical_entity,
]
]
Expand All @@ -692,6 +694,7 @@ def setup_method(self):
"modifiedOn",
"fileType",
"center",
"version",
"entity",
]
)
Expand All @@ -706,6 +709,7 @@ def setup_method(self):
"modifiedOn",
"fileType",
"center",
"version",
"entity",
],
)
Expand All @@ -716,14 +720,16 @@ def setup_method(self):
sample_clinical_entity.name,
"clinical",
center,
sample_clinical_entity.versionNumber,
sample_clinical_entity,
]
]
self.errors_df = pd.DataFrame(
error, columns=["id", "errors", "name", "fileType", "center", "entity"]
error,
columns=["id", "errors", "name", "fileType", "center", "version", "entity"],
)
self.empty_errors = pd.DataFrame(
columns=["id", "errors", "name", "fileType", "center", "entity"]
columns=["id", "errors", "name", "fileType", "center", "version", "entity"]
)

self.with_dupsdf = pd.DataFrame(
Expand All @@ -744,6 +750,7 @@ def setup_method(self):
],
"center": ["SAGE"] * 5,
"fileType": ["type"] * 5,
"version": [3] * 5,
"entity": ["entity"] * 5,
}
)
Expand All @@ -768,12 +775,15 @@ def setup_method(self):
],
"center": ["SAGE"] * 5,
"fileType": ["type"] * 5,
"version": [3] * 5,
"entity": ["entity"] * 5,
}
)
self.empty_dup = pd.DataFrame(
columns=["id", "name", "center", "fileType", "entity", "errors"]
columns=["id", "name", "center", "fileType", "version", "entity", "errors"]
)
# self.empty_dup = self.empty_dup.astype({"version": int})
# self.empty_dup.index = self.empty_dup.index.astype('object')

def test_build_validation_status_table(self):
input_valid_statuses = [
Expand Down Expand Up @@ -836,7 +846,12 @@ def test_dups_get_duplicated_files(self):
def test_nodups_get_duplicated_files(self):
"""Test no duplicated"""
dupsdf = input_to_database.get_duplicated_files(self.no_dupsdf)
assert dupsdf.equals(self.empty_dup)
# These empty frames won't be equal without these conversions
# HACK: Convert the index type to the same type
self.empty_dup.index = self.empty_dup.index.astype("int")
# HACK: Convert the dtype of the "version" column to the same type
self.empty_dup["version"] = self.empty_dup["version"].astype("int")
pd.testing.assert_frame_equal(dupsdf, self.empty_dup)

def test__update_tables_content(self):
"""Tests duplicates are added to the tables and errors/statues are
Expand Down Expand Up @@ -970,6 +985,7 @@ def test_validation(
entity = synapseclient.Entity(
id="syn1234",
md5="44444",
versionNumber=3,
path="/path/to/foobar.txt",
name="data_clinical_supp_SAGE.txt",
)
Expand All @@ -984,6 +1000,7 @@ def test_validation(
entity.name,
modified_on,
filetype,
entity.versionNumber,
center,
]
]
Expand All @@ -997,7 +1014,14 @@ def test_validation(
errortracking_mock = emptytable_mock()
valiate_cls = Mock()
with patch.object(
syn, "tableQuery", side_effect=[validationstatus_mock, errortracking_mock]
syn,
"tableQuery",
side_effect=[
validationstatus_mock,
errortracking_mock,
validationstatus_mock,
errortracking_mock,
],
) as patch_query, patch.object(
input_to_database,
"validatefile",
Expand All @@ -1006,6 +1030,8 @@ def test_validation(
input_to_database,
"build_validation_status_table",
return_value=self.validation_statusdf,
), patch.object(
syn, "get", return_value=entity
), patch.object(
input_to_database, "build_error_tracking_table", return_value=self.errors_df
), patch.object(
Expand Down

0 comments on commit 6ec4fe1

Please sign in to comment.