[GEN-703] Validation status with version #574

Merged · 16 commits · Aug 26, 2024
.github/workflows/ci.yml (7 changes: 4 additions & 3 deletions)
@@ -59,7 +59,8 @@ jobs:
steps:
- uses: actions/checkout@v4
- uses: psf/black@stable

with:
version: "~=23.12"
deploy:
needs: [test, lint]
runs-on: ubuntu-latest
@@ -108,7 +109,7 @@ jobs:
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}

- name: Format tags as registry refs
id: registry_refs
env:
@@ -127,4 +128,4 @@ jobs:
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-from: ${{ steps.registry_refs.outputs.tags }},mode=max
cache-to: ${{ steps.registry_refs.outputs.tags }},mode=max
cache-to: ${{ steps.registry_refs.outputs.tags }},mode=max
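The workflow now pins psf/black to "~=23.12", a compatible-release specifier. A quick sketch of what that constraint accepts, using the packaging library (the candidate versions below are illustrative, not from this PR):

```python
from packaging.specifiers import SpecifierSet

# "~=23.12" is a compatible-release pin: equivalent to >=23.12, ==23.*
spec = SpecifierSet("~=23.12")

for candidate in ["23.11.0", "23.12.0", "23.12.1", "24.1.0"]:
    print(candidate, candidate in spec)
# 23.11.0 False, 23.12.0 True, 23.12.1 True, 24.1.0 False
```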
.pre-commit-config.yaml (8 changes: 4 additions & 4 deletions)
@@ -42,10 +42,10 @@ repos:
# - id: blacken-docs
# additional_dependencies: [black]

- repo: https://github.com/pre-commit/mirrors-mypy
rev: 'v1.3.0'
hooks:
- id: mypy
# - repo: https://github.com/pre-commit/mirrors-mypy
# rev: 'v1.3.0'
# hooks:
# - id: mypy
# additional_dependencies: [pydantic~=1.10]

# Checks for missing docstrings
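With the mirrors-mypy hook commented out here, type checking no longer runs on commit. A minimal sketch of invoking mypy ad hoc through its documented Python API; the "genie" target and the --ignore-missing-imports flag are illustrative assumptions, not part of this PR:

```python
from mypy import api

# Run mypy programmatically now that the pre-commit hook is disabled.
stdout, stderr, exit_status = api.run(["genie", "--ignore-missing-imports"])
print(stdout)
if exit_status != 0:
    print("mypy reported type errors")
```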
genie/dashboard_table_updater.py (6 changes: 3 additions & 3 deletions)
@@ -460,9 +460,9 @@ def update_sample_difference_table(syn, database_mappingdf):
.applymap(int)
)

diff_between_releasesdf[["Clinical", "Mutation", "CNV", "SEG", "Fusions"]] = (
new_values
)
diff_between_releasesdf[
["Clinical", "Mutation", "CNV", "SEG", "Fusions"]
] = new_values

load._update_table(
syn,
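The only change in this file is Black reformatting of a multi-column assignment; the values and the applymap(int) cast are untouched. A standalone sketch of the same pandas pattern, with hypothetical column names and values standing in for the real GENIE data:

```python
import pandas as pd

# Hypothetical stand-ins for diff_between_releasesdf / new_values in the hunk above.
df = pd.DataFrame({"Clinical": [0, 0], "Mutation": [0, 0]})
new_values = pd.DataFrame({"Clinical": ["1", "2"], "Mutation": ["3", "4"]})

# Assign several columns in one statement; applymap(int) mirrors the cast in the diff.
df[["Clinical", "Mutation"]] = new_values.applymap(int)
print(df)
```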
genie/input_to_database.py (33 changes: 27 additions & 6 deletions)
@@ -322,10 +322,9 @@ def validatefile(
error_list = check_file_status["error_list"]

messages_to_send = []

# Need to figure out to how to remove this
# This must pass in filenames, because filetype is determined by entity
# name Not by actual path of file
# name not by actual path of file
validator = validate.GenieValidationHelper(
syn=syn,
project_id=project_id,
@@ -336,8 +335,13 @@
ancillary_files=ancillary_files,
)
filetype = validator.file_type

if check_file_status["to_validate"]:
# HACK: Don't download again if only_validate is not True, but all
# files need to be downloaded currently when validation + processing
# isn't split up
# if entities[0].get("path") is None:
# validator.entitylist = [syn.get(entity) for entity in entities]

valid_cls, message = validator.validate_single_file(
oncotree_link=genie_config["oncotreeLink"], nosymbol_check=False
)
@@ -525,6 +529,7 @@ def build_validation_status_table(input_valid_statuses: List[dict]):
"modifiedOn",
"fileType",
"center",
"version",
"entity",
]
input_status_rows = []
@@ -539,6 +544,7 @@ def build_validation_status_table(input_valid_statuses: List[dict]):
"modifiedOn": entity_date_to_timestamp(entity.properties.modifiedOn),
"fileType": input_status["fileType"],
"center": input_status["center"],
"version": entity.versionNumber,
"entity": entity,
}
input_status_rows.append(row)
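The new "version" column is populated from the Synapse entity's versionNumber property. A minimal sketch of where that value comes from with synapseclient (the Synapse ID is a placeholder; assumes a logged-in session):

```python
import synapseclient

syn = synapseclient.Synapse()
syn.login()  # assumes cached Synapse credentials

# Placeholder Synapse ID; downloadFile=False fetches metadata only.
entity = syn.get("syn00000000", downloadFile=False)
print(entity.versionNumber)  # the value now written into the "version" column
```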
@@ -562,7 +568,15 @@ def build_error_tracking_table(invalid_errors: List[dict]):
Error tracking dataframe

"""
error_table_columns = ["id", "errors", "name", "fileType", "center", "entity"]
error_table_columns = [
"id",
"errors",
"name",
"fileType",
"center",
"version",
"entity",
]
invalid_error_rows = []
for invalid_error in invalid_errors:
entity = invalid_error["entity"]
@@ -572,6 +586,7 @@ def build_error_tracking_table(invalid_errors: List[dict]):
"name": entity.name,
"fileType": invalid_error["fileType"],
"center": invalid_error["center"],
"version": entity.versionNumber,
"entity": entity,
}
invalid_error_rows.append(row)
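Both tracking tables are assembled the same way: a list of row dicts framed with an explicit column list. A small sketch with made-up row values, showing that for a list of dicts the columns argument also selects the keys and fixes their order:

```python
import pandas as pd

# Hypothetical rows shaped like the ones build_error_tracking_table collects.
rows = [
    {
        "id": "syn123",
        "errors": "invalid header",
        "name": "data_clinical_supp.txt",
        "fileType": "clinical",
        "center": "SAGE",
        "version": 3,
        "entity": object(),
    }
]
columns = ["id", "errors", "name", "fileType", "center", "version", "entity"]

# With a list of dicts, `columns` both selects the keys and sets their order.
error_trackingdf = pd.DataFrame(rows, columns=columns)
print(error_trackingdf.columns.tolist())
```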
@@ -763,9 +778,11 @@ def validation(
user_message_dict[user].append(file_messages)

validation_statusdf = build_validation_status_table(input_valid_statuses)

error_trackingdf = build_error_tracking_table(invalid_errors)

new_tables = _update_tables_content(validation_statusdf, error_trackingdf)

validation_statusdf = new_tables["validation_statusdf"]
error_trackingdf = new_tables["error_trackingdf"]
duplicated_filesdf = new_tables["duplicated_filesdf"]
@@ -793,7 +810,6 @@ def validation(
validation_status_table=validation_status_table,
error_tracker_table=error_tracker_table,
)

valid_filesdf = validation_statusdf.query('status == "VALIDATED"')
return valid_filesdf[["id", "path", "fileType", "name"]]
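validation() ends by filtering the status table down to files that passed. A minimal standalone sketch of that DataFrame.query filter, using made-up rows:

```python
import pandas as pd

# Hypothetical status frame with the columns validation() returns.
validation_statusdf = pd.DataFrame(
    {
        "id": ["syn1", "syn2"],
        "path": ["/tmp/a.txt", "/tmp/b.txt"],
        "fileType": ["clinical", "maf"],
        "name": ["a.txt", "b.txt"],
        "status": ["VALIDATED", "INVALID"],
    }
)

# Same filter as the return statement above: keep only validated files.
valid_filesdf = validation_statusdf.query('status == "VALIDATED"')
print(valid_filesdf[["id", "path", "fileType", "name"]])
```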

@@ -870,7 +886,12 @@ def center_input_to_database(
center_input_synid = genie_config["center_config"][center]["inputSynId"]
logger.info("Center: " + center)
center_files = extract.get_center_input_files(
syn, center_input_synid, center, process
syn=syn,
synid=center_input_synid,
center=center,
process=process,
# HACK: Don't download all the files when only validate
# downloadFile=(not only_validate),
)

# ancillary_files = get_ancillary_files(
tests/test_database_to_staging.py (5 changes: 0 additions & 5 deletions)
@@ -250,7 +250,6 @@ def test_that_runMAFinBED_calls_expected_calls(syn, test, staging):
)
),
) as patch_store_maf_in_bed_filtered_variants:

# setting some testing vars
test_script_dir = "test_file_dir/"
test_notinbed_file_path = os.path.join(test_script_dir, "../R/notinbed.csv")
@@ -479,7 +478,6 @@ def test_that_mutation_in_cis_filter_has_expected_calls_when_mutations_in_cis_is
name="flaggedVariants",
),
) as patch_get_flagged_variants:

# setting some testing vars
test_center_mapping_df = pd.DataFrame(
dict(Center=["SAGE"], stagingSynId=["synZZZZZ"])
@@ -588,7 +586,6 @@ def test_that_mutation_in_cis_filter_has_expected_calls_when_mutations_in_cis_is
name="flaggedVariants",
),
) as patch_get_flagged_variants:

# setting some testing vars
test_center_mapping_df = pd.DataFrame(
dict(Center=["SAGE"], stagingSynId=["synZZZZZ"])
@@ -697,7 +694,6 @@ def test_that_get_mutation_in_cis_flagged_variants_returns_expected_flagged_vari
with patch.object(
extract, "get_syntabledf", return_value=flagged_df
) as patch_get_syntabledf:

result = database_to_staging.get_mutation_in_cis_flagged_variants(
syn, variant_filtering_synId="synZZZZZ"
)
@@ -737,7 +733,6 @@ def test_that_get_mutation_in_cis_filtered_samples_returns_expected_variants(
with patch.object(
extract, "get_syntabledf", return_value=filtered_df
) as patch_get_syntabledf:

result = database_to_staging.get_mutation_in_cis_filtered_samples(
syn, variant_filtering_synId="synZZZZZ"
)
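The edits in this file only drop blank lines that used to follow the `with patch.object(...)` headers; behavior is unchanged. For reference, a self-contained sketch of the patch.object pattern these tests rely on (the module and function names here are hypothetical stand-ins, not the real genie code):

```python
from unittest.mock import patch


class Extract:
    """Hypothetical stand-in for the genie `extract` module."""

    @staticmethod
    def get_syntabledf(syn, query):
        raise RuntimeError("would hit Synapse")


def get_flagged_variants(syn):
    return Extract.get_syntabledf(syn, "SELECT * FROM synZZZZZ")


def test_get_flagged_variants_queries_table():
    # Replace the table query for the duration of the block and capture calls.
    with patch.object(
        Extract, "get_syntabledf", return_value="flagged_df"
    ) as patch_get_syntabledf:
        result = get_flagged_variants(syn=None)
        patch_get_syntabledf.assert_called_once()
        assert result == "flagged_df"
```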
tests/test_input_to_database.py (36 changes: 31 additions & 5 deletions)
@@ -26,6 +26,7 @@
name="data_clinical_supp_sample_SAGE.txt",
modifiedOn="2019-03-24T12:00:00.Z",
md5="44444",
versionNumber=3,
)

patient_clinical_synid = "syn11111"
@@ -679,6 +680,7 @@ def setup_method(self):
1553428800000,
"clinical",
center,
sample_clinical_entity.versionNumber,
sample_clinical_entity,
]
]
@@ -692,6 +694,7 @@ def setup_method(self):
"modifiedOn",
"fileType",
"center",
"version",
"entity",
]
)
@@ -706,6 +709,7 @@ def setup_method(self):
"modifiedOn",
"fileType",
"center",
"version",
"entity",
],
)
@@ -716,14 +720,16 @@ def setup_method(self):
sample_clinical_entity.name,
"clinical",
center,
sample_clinical_entity.versionNumber,
sample_clinical_entity,
]
]
self.errors_df = pd.DataFrame(
error, columns=["id", "errors", "name", "fileType", "center", "entity"]
error,
columns=["id", "errors", "name", "fileType", "center", "version", "entity"],
)
self.empty_errors = pd.DataFrame(
columns=["id", "errors", "name", "fileType", "center", "entity"]
columns=["id", "errors", "name", "fileType", "center", "version", "entity"]
)

self.with_dupsdf = pd.DataFrame(
Expand All @@ -744,6 +750,7 @@ def setup_method(self):
],
"center": ["SAGE"] * 5,
"fileType": ["type"] * 5,
"version": [3] * 5,
"entity": ["entity"] * 5,
}
)
@@ -768,12 +775,15 @@ def setup_method(self):
],
"center": ["SAGE"] * 5,
"fileType": ["type"] * 5,
"version": [3] * 5,
"entity": ["entity"] * 5,
}
)
self.empty_dup = pd.DataFrame(
columns=["id", "name", "center", "fileType", "entity", "errors"]
columns=["id", "name", "center", "fileType", "version", "entity", "errors"]
)
# self.empty_dup = self.empty_dup.astype({"version": int})
# self.empty_dup.index = self.empty_dup.index.astype('object')

def test_build_validation_status_table(self):
input_valid_statuses = [
Expand Down Expand Up @@ -836,7 +846,12 @@ def test_dups_get_duplicated_files(self):
def test_nodups_get_duplicated_files(self):
"""Test no duplicated"""
dupsdf = input_to_database.get_duplicated_files(self.no_dupsdf)
assert dupsdf.equals(self.empty_dup)
# These empty frames won't be equal without these conversions
# HACK: Convert the index type to the same type
self.empty_dup.index = self.empty_dup.index.astype("int")
# HACK: Convert the dtype of the "version" column to the same type
self.empty_dup["version"] = self.empty_dup["version"].astype("int")
pd.testing.assert_frame_equal(dupsdf, self.empty_dup)
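The dtype/index HACK above is needed because pd.testing.assert_frame_equal is strict about dtypes (and index types) by default, and two empty frames built different ways rarely agree on them. A small illustration; the exact empty-frame dtypes and index types vary across pandas versions, hence the hedged comparison flags:

```python
import pandas as pd

# An empty frame declared only by its columns: every column defaults to object dtype.
expected = pd.DataFrame(columns=["id", "version"])

# An empty frame whose "version" column kept an integer dtype, as happens when
# an already-typed frame is filtered down to zero rows.
actual = pd.DataFrame(
    {"id": pd.Series(dtype="object"), "version": pd.Series(dtype="int64")}
)

print(expected.dtypes.tolist())  # object, object
print(actual.dtypes.tolist())    # object, int64

# assert_frame_equal checks dtypes by default, so the frames must either be
# coerced to match (the HACK in the test) or compared loosely as done here.
# Empty-frame index types also differ across pandas versions, so relax that too.
pd.testing.assert_frame_equal(
    actual, expected, check_dtype=False, check_index_type=False
)
```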

def test__update_tables_content(self):
"""Tests duplicates are added to the tables and errors/statues are
@@ -970,6 +985,7 @@ def test_validation(
entity = synapseclient.Entity(
id="syn1234",
md5="44444",
versionNumber=3,
path="/path/to/foobar.txt",
name="data_clinical_supp_SAGE.txt",
)
@@ -984,6 +1000,7 @@ def test_validation(
entity.name,
modified_on,
filetype,
entity.versionNumber,
center,
]
]
@@ -997,7 +1014,14 @@ def test_validation(
errortracking_mock = emptytable_mock()
valiate_cls = Mock()
with patch.object(
syn, "tableQuery", side_effect=[validationstatus_mock, errortracking_mock]
syn,
"tableQuery",
side_effect=[
validationstatus_mock,
errortracking_mock,
validationstatus_mock,
errortracking_mock,
],
) as patch_query, patch.object(
input_to_database,
"validatefile",
Expand All @@ -1006,6 +1030,8 @@ def test_validation(
input_to_database,
"build_validation_status_table",
return_value=self.validation_statusdf,
), patch.object(
syn, "get", return_value=entity
), patch.object(
input_to_database, "build_error_tracking_table", return_value=self.errors_df
), patch.object(
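The patched tableQuery now supplies its return values through a side_effect list, handing back a different mock on each successive call. A minimal sketch of that mechanic with hypothetical stand-ins (FakeSyn is not the real synapseclient class):

```python
from unittest.mock import MagicMock, patch


class FakeSyn:
    """Hypothetical stand-in for a synapseclient.Synapse instance."""

    def tableQuery(self, query):
        raise RuntimeError("would hit Synapse")


syn = FakeSyn()
validationstatus_mock = MagicMock(name="validation_status")
errortracking_mock = MagicMock(name="error_tracking")

# side_effect with a list returns the next item on each successive call,
# so one patch can serve several tableQuery calls with different results.
with patch.object(
    syn,
    "tableQuery",
    side_effect=[
        validationstatus_mock,
        errortracking_mock,
        validationstatus_mock,
        errortracking_mock,
    ],
) as patch_query:
    assert syn.tableQuery("q1") is validationstatus_mock
    assert syn.tableQuery("q2") is errortracking_mock
    assert patch_query.call_count == 2
```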