Skip to content

Commit

Permalink
Merge branch 'develop' into gen-1028-report-dashboard
Browse files Browse the repository at this point in the history
  • Loading branch information
rxu17 committed Jan 26, 2024
2 parents 436ba86 + 889698d commit 7106422
Show file tree
Hide file tree
Showing 9 changed files with 350 additions and 54 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ These are instructions on how you would develop and test the pipeline locally.
If you are having trouble with the above, try installing via `pipenv`
1. Specify a python version that is supported by this repo:
1. Specify a python version that is supported by this repo:
```pipenv --python <python_version>```
1. [pipenv install from requirements file](https://docs.pipenv.org/en/latest/advanced.html#importing-from-requirements-txt)
Expand Down
63 changes: 35 additions & 28 deletions genie/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,49 +44,35 @@ def build_parser():

subparsers = parser.add_subparsers(
title="commands",
description="The following commands are available:",
help='For additional help: "genie <COMMAND> -h"',
description="The following commands are available: ",
help='For additional help use: "genie <COMMAND> -h"',
)

parser_validate = subparsers.add_parser(
"validate", help="Validates GENIE file formats"
"validate", help="Validates GENIE file formats. "
)

parser_validate.add_argument(
"filepath",
type=str,
nargs="+",
help="File(s) that you are validating."
"If you validation your clinical files and you have both sample and "
"patient files, you must provide both",
help="File(s) that you are validating. "
"If you have separate clinical sample and patient files, "
"you must provide both files when validating.",
)

parser_validate.add_argument("center", type=str, help="Contributing Centers")

parser_validate.add_argument(
"--format_registry_packages",
type=str,
nargs="+",
default=["genie_registry"],
help="Python package name(s) to get valid file formats from (default: %(default)s).",
)

parser_validate.add_argument(
"--oncotree_link", type=str, help="Link to oncotree code"
)

validate_group = parser_validate.add_mutually_exclusive_group()

validate_group.add_argument(
"--filetype",
type=str,
help="By default, the validator uses the filename to match "
help="Use the --filetype {FILETYPE} parameter to ignore filename validation. "
"By default, the validator uses the filename to match "
"the file format. If your filename is incorrectly named, "
"it will be invalid. If you know the file format you are "
"validating, you can ignore the filename validation and skip "
"to file content validation. "
"Note, the filetypes with SP at "
"the end are for special sponsored projects.",
"it will be invalid. "
"Options: [maf, vcf, clinical, assayinfo, bed, cna, sv, seg, mutationsInCis]",
)

validate_group.add_argument(
Expand All @@ -98,18 +84,39 @@ def build_parser():
"to this directory.",
)

parser_validate.add_argument(
"--oncotree_link",
type=str,
help="Specify an oncotree url when validating your clinical "
"file "
"(e.g: https://oncotree.info/api/tumorTypes/tree?version=oncotree_2021_11_02). "
"By default the oncotree version used will be specified in this entity: "
"syn13890902",
)

parser_validate.add_argument(
"--nosymbol-check",
action="store_true",
help="Ignores specific post-processing validation criteria related to HUGO symbols "
"in the structural variant and cna files.",
)

# TODO: remove this default when private genie project is ready
parser_validate.add_argument(
"--project_id",
type=str,
default="syn3380222",
help="Synapse Project ID where data is stored. (default: %(default)s).",
help="FOR DEVELOPER USE ONLY: Synapse Project ID where data is stored. "
"(default: %(default)s).",
)

parser_validate.add_argument(
"--nosymbol-check",
action="store_true",
help="Do not check hugo symbols of fusion and cna file",
"--format_registry_packages",
type=str,
nargs="+",
default=["genie_registry"],
help="FOR DEVELOPER USE ONLY: Python package name(s) to get valid file formats "
"from (default: %(default)s).",
)

parser_validate.set_defaults(func=validate._perform_validate)
Expand Down
73 changes: 68 additions & 5 deletions genie/database_to_staging.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,73 @@
SV_CENTER_PATH = os.path.join(GENIE_RELEASE_DIR, "data_sv_%s.txt")
BED_DIFFS_SEQASSAY_PATH = os.path.join(GENIE_RELEASE_DIR, "diff_%s.csv")

# Columns kept in the release MAF, in output order. store_maf_files indexes
# each configured MAF chunk with this list, so the written files share one
# fixed column set and order regardless of the column order of the input.
FULL_MAF_RELEASE_COLUMNS = [
"Hugo_Symbol",
"Entrez_Gene_Id",
"Center",
"NCBI_Build",
"Chromosome",
"Start_Position",
"End_Position",
"Strand",
"Consequence",
"Variant_Classification",
"Variant_Type",
"Reference_Allele",
"Tumor_Seq_Allele1",
"Tumor_Seq_Allele2",
"dbSNP_RS",
"dbSNP_Val_Status",
"Tumor_Sample_Barcode",
"Matched_Norm_Sample_Barcode",
"Match_Norm_Seq_Allele1",
"Match_Norm_Seq_Allele2",
"Tumor_Validation_Allele1",
"Tumor_Validation_Allele2",
"Match_Norm_Validation_Allele1",
"Match_Norm_Validation_Allele2",
"Verification_Status",
"Validation_Status",
"Mutation_Status",
"Sequencing_Phase",
"Sequence_Source",
"Validation_Method",
"Score",
"BAM_File",
"Sequencer",
"t_ref_count",
"t_alt_count",
"n_ref_count",
"n_alt_count",
"HGVSc",
"HGVSp",
"HGVSp_Short",
"Transcript_ID",
"RefSeq",
"Protein_position",
"Codons",
"Exon_Number",
"gnomAD_AF",
"gnomAD_AFR_AF",
"gnomAD_AMR_AF",
"gnomAD_ASJ_AF",
"gnomAD_EAS_AF",
"gnomAD_FIN_AF",
"gnomAD_NFE_AF",
"gnomAD_OTH_AF",
"gnomAD_SAS_AF",
"FILTER",
"Polyphen_Prediction",
"Polyphen_Score",
"SIFT_Prediction",
"SIFT_Score",
"SWISSPROT",
"n_depth",
"t_depth",
"Annotation_Status",
"mutationInCis_Flag",
]


# TODO: Add to transform.py
def _to_redact_interval(df_col):
Expand Down Expand Up @@ -755,10 +822,7 @@ def store_maf_files(
with open(MUTATIONS_CENTER_PATH % center, "w"):
pass
used_entities = []
# Must get the headers (because can't assume headers are the same order)
maf_ent = syn.get(centerMafSynIdsDf.id[0])
headerdf = pd.read_csv(maf_ent.path, sep="\t", comment="#", nrows=0)
column_order = headerdf.columns
for _, mafSynId in enumerate(centerMafSynIdsDf.id):
maf_ent = syn.get(mafSynId)
logger.info(maf_ent.path)
Expand All @@ -771,13 +835,12 @@ def store_maf_files(
)

for mafchunk in mafchunks:
# Reorder column headers
mafchunk = mafchunk[column_order]
# Get center for center staging maf
# Configure maf
configured_mafdf = configure_maf(
mafchunk, remove_mafinbed_variants, flagged_mutationInCis_variants
)
configured_mafdf = configured_mafdf[FULL_MAF_RELEASE_COLUMNS]
# Create maf for release
merged_mafdf = remove_maf_samples(
configured_mafdf, keep_for_merged_consortium_samples
Expand Down
32 changes: 32 additions & 0 deletions genie/process_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -948,3 +948,35 @@ def create_new_fileformat_table(
"newdb_mappingdf": newdb_mappingdf,
"moved_ent": moved_ent,
}


def create_missing_columns(dataset: pd.DataFrame, schema: dict) -> pd.DataFrame:
    """Add any schema columns missing from the dataset, filled with NA values.

    Each missing column is filled with the NA value that matches its declared
    data type. Columns declared as "integer" or "boolean" are always cast to
    the pandas nullable dtypes (Int64 / boolean), whether or not they were
    missing, because the default numpy dtypes cannot hold NAs.

    The input dataset is never modified; all changes are made on a copy.

    Args:
        dataset (pd.DataFrame): input dataset to fill missing columns for
        schema (dict): the expected schema {column_name(str): data_type(str)}
            for the input dataset. Supported data types for missing columns
            are "string", "integer", "float" and "boolean".

    Returns:
        pd.DataFrame: a new dataset containing exactly the schema's columns,
            in the schema's order

    Raises:
        KeyError: if a column is missing from the dataset and its data type
            is not one of the supported data types
    """
    missing_values = {
        "string": "",
        "integer": None,
        "float": float("nan"),
        "boolean": None,
    }
    # only way to preserve NAs for these specific dtype columns
    nullable_dtypes = {
        "integer": "Int64",
        "boolean": pd.BooleanDtype(),
    }
    # Work on a copy so the caller's frame is never altered, no matter which
    # columns are missing or in which order the schema lists them.
    result = dataset.copy()
    for column, data_type in schema.items():
        if column not in result.columns:
            result[column] = missing_values[data_type]
        if data_type in nullable_dtypes:
            result[column] = result[column].astype(nullable_dtypes[data_type])
    return result[list(schema.keys())]
10 changes: 6 additions & 4 deletions genie/process_mutation.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,7 @@ def check_annotation_error_reports(
maf_table_synid (str): synapse_id of the narrow maf table
full_error_report (pd.DataFrame): the failed annotations error report
center (str): the center this is for
"""
maf_table_df = extract.get_syntabledf(
syn=syn,
Expand All @@ -351,10 +352,11 @@ def check_annotation_error_reports(
"Annotation_Status = 'FAILED'"
),
)
assert len(maf_table_df) == len(full_error_report), (
"Genome nexus's failed annotations error report rows doesn't match"
f"maf table's failed annotations for {center}"
)
if len(maf_table_df) != len(full_error_report):
logger.warning(
"Genome nexus's failed annotations error report rows doesn't match"
f"maf table's failed annotations for {center}"
)


def store_annotation_error_reports(
Expand Down
7 changes: 6 additions & 1 deletion genie/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,13 @@ def validate_single_file(self, **kwargs):
valid: Boolean value of validation status
"""
if self.file_type not in self._format_registry:
allowed_filetypes = list(self._format_registry.keys())
error_message = (
f"Your filename is incorrect! Please change your filename before you run the validator or specify --filetype if you are running the validator locally. "
f"If specifying filetype, options are: [{', '.join(allowed_filetypes)}]\n"
)
valid_result_cls = example_filetype_format.ValidationResults(
errors="Your filename is incorrect! Please change your filename before you run the validator or specify --filetype if you are running the validator locally",
errors=error_message,
warnings="",
)
else:
Expand Down
Loading

0 comments on commit 7106422

Please sign in to comment.