Merge branch 'develop' into gen-1028-report-dashboard

Sage-Bionetworks · Jan 26, 2024 · 7106422 · 7106422
2 parents 436ba86 + 889698d
commit 7106422
Show file tree

Hide file tree

Showing 9 changed files with 350 additions and 54 deletions.
diff --git a/README.md b/README.md
@@ -66,7 +66,7 @@ These are instructions on how you would develop and test the pipeline locally.
 
     If you are having trouble with the above, try installing via `pipenv`
 
-    1. Specify a python version that is supported by this repo: 
+    1. Specify a python version that is supported by this repo:
         ```pipenv --python <python_version>```
 
     1. [pipenv install from requirements file](https://docs.pipenv.org/en/latest/advanced.html#importing-from-requirements-txt)

diff --git a/genie/__main__.py b/genie/__main__.py
@@ -44,49 +44,35 @@ def build_parser():
 
     subparsers = parser.add_subparsers(
         title="commands",
-        description="The following commands are available:",
-        help='For additional help: "genie <COMMAND> -h"',
+        description="The following commands are available: ",
+        help='For additional help use: "genie <COMMAND> -h"',
     )
 
     parser_validate = subparsers.add_parser(
-        "validate", help="Validates GENIE file formats"
+        "validate", help="Validates GENIE file formats. "
     )
 
     parser_validate.add_argument(
         "filepath",
         type=str,
         nargs="+",
-        help="File(s) that you are validating."
-        "If you validation your clinical files and you have both sample and "
-        "patient files, you must provide both",
+        help="File(s) that you are validating. "
+        "If you have separate clinical sample and patient files, "
+        "you must provide both files when validating.",
     )
 
     parser_validate.add_argument("center", type=str, help="Contributing Centers")
 
-    parser_validate.add_argument(
-        "--format_registry_packages",
-        type=str,
-        nargs="+",
-        default=["genie_registry"],
-        help="Python package name(s) to get valid file formats from (default: %(default)s).",
-    )
-
-    parser_validate.add_argument(
-        "--oncotree_link", type=str, help="Link to oncotree code"
-    )
-
     validate_group = parser_validate.add_mutually_exclusive_group()
 
     validate_group.add_argument(
         "--filetype",
         type=str,
-        help="By default, the validator uses the filename to match "
+        help="Use the --filetype {FILETYPE} parameter to ignore filename validation. "
+        "By default, the validator uses the filename to match "
         "the file format.  If your filename is incorrectly named, "
-        "it will be invalid.  If you know the file format you are "
-        "validating, you can ignore the filename validation and skip "
-        "to file content validation. "
-        "Note, the filetypes with SP at "
-        "the end are for special sponsored projects.",
+        "it will be invalid. "
+        "Options: [maf, vcf, clinical, assayinfo, bed, cna, sv, seg, mutationsInCis]",
     )
 
     validate_group.add_argument(
@@ -98,18 +84,39 @@ def build_parser():
         "to this directory.",
     )
 
+    parser_validate.add_argument(
+        "--oncotree_link",
+        type=str,
+        help="Specify an oncotree url when validating your clinical "
+        "file "
+        "(e.g: https://oncotree.info/api/tumorTypes/tree?version=oncotree_2021_11_02). "
+        "By default the oncotree version used will be specified in this entity: "
+        "syn13890902",
+    )
+
+    parser_validate.add_argument(
+        "--nosymbol-check",
+        action="store_true",
+        help="Ignores specific post-processing validation criteria related to HUGO symbols "
+        "in the structural variant and cna files.",
+    )
+
     # TODO: remove this default when private genie project is ready
     parser_validate.add_argument(
         "--project_id",
         type=str,
         default="syn3380222",
-        help="Synapse Project ID where data is stored. (default: %(default)s).",
+        help="FOR DEVELOPER USE ONLY: Synapse Project ID where data is stored. "
+        "(default: %(default)s).",
     )
 
     parser_validate.add_argument(
-        "--nosymbol-check",
-        action="store_true",
-        help="Do not check hugo symbols of fusion and cna file",
+        "--format_registry_packages",
+        type=str,
+        nargs="+",
+        default=["genie_registry"],
+        help="FOR DEVELOPER USE ONLY: Python package name(s) to get valid file formats "
+        "from (default: %(default)s).",
     )
 
     parser_validate.set_defaults(func=validate._perform_validate)

diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py
@@ -35,6 +35,73 @@
 SV_CENTER_PATH = os.path.join(GENIE_RELEASE_DIR, "data_sv_%s.txt")
 BED_DIFFS_SEQASSAY_PATH = os.path.join(GENIE_RELEASE_DIR, "diff_%s.csv")
 
+FULL_MAF_RELEASE_COLUMNS = [  # columns, in order, selected from each configured MAF chunk in store_maf_files
+    "Hugo_Symbol",
+    "Entrez_Gene_Id",
+    "Center",
+    "NCBI_Build",
+    "Chromosome",
+    "Start_Position",
+    "End_Position",
+    "Strand",
+    "Consequence",
+    "Variant_Classification",
+    "Variant_Type",
+    "Reference_Allele",
+    "Tumor_Seq_Allele1",
+    "Tumor_Seq_Allele2",
+    "dbSNP_RS",
+    "dbSNP_Val_Status",
+    "Tumor_Sample_Barcode",
+    "Matched_Norm_Sample_Barcode",
+    "Match_Norm_Seq_Allele1",
+    "Match_Norm_Seq_Allele2",
+    "Tumor_Validation_Allele1",
+    "Tumor_Validation_Allele2",
+    "Match_Norm_Validation_Allele1",
+    "Match_Norm_Validation_Allele2",
+    "Verification_Status",
+    "Validation_Status",
+    "Mutation_Status",
+    "Sequencing_Phase",
+    "Sequence_Source",
+    "Validation_Method",
+    "Score",
+    "BAM_File",
+    "Sequencer",
+    "t_ref_count",
+    "t_alt_count",
+    "n_ref_count",
+    "n_alt_count",
+    "HGVSc",
+    "HGVSp",
+    "HGVSp_Short",
+    "Transcript_ID",
+    "RefSeq",
+    "Protein_position",
+    "Codons",
+    "Exon_Number",
+    "gnomAD_AF",
+    "gnomAD_AFR_AF",
+    "gnomAD_AMR_AF",
+    "gnomAD_ASJ_AF",
+    "gnomAD_EAS_AF",
+    "gnomAD_FIN_AF",
+    "gnomAD_NFE_AF",
+    "gnomAD_OTH_AF",
+    "gnomAD_SAS_AF",
+    "FILTER",
+    "Polyphen_Prediction",
+    "Polyphen_Score",
+    "SIFT_Prediction",
+    "SIFT_Score",
+    "SWISSPROT",
+    "n_depth",
+    "t_depth",
+    "Annotation_Status",
+    "mutationInCis_Flag",
+]
+
 
 # TODO: Add to transform.py
 def _to_redact_interval(df_col):
@@ -755,10 +822,7 @@ def store_maf_files(
         with open(MUTATIONS_CENTER_PATH % center, "w"):
             pass
     used_entities = []
-    # Must get the headers (because can't assume headers are the same order)
     maf_ent = syn.get(centerMafSynIdsDf.id[0])
-    headerdf = pd.read_csv(maf_ent.path, sep="\t", comment="#", nrows=0)
-    column_order = headerdf.columns
     for _, mafSynId in enumerate(centerMafSynIdsDf.id):
         maf_ent = syn.get(mafSynId)
         logger.info(maf_ent.path)
@@ -771,13 +835,12 @@ def store_maf_files(
             )
 
             for mafchunk in mafchunks:
-                # Reorder column headers
-                mafchunk = mafchunk[column_order]
                 # Get center for center staging maf
                 # Configure maf
                 configured_mafdf = configure_maf(
                     mafchunk, remove_mafinbed_variants, flagged_mutationInCis_variants
                 )
+                configured_mafdf = configured_mafdf[FULL_MAF_RELEASE_COLUMNS]
                 # Create maf for release
                 merged_mafdf = remove_maf_samples(
                     configured_mafdf, keep_for_merged_consortium_samples

diff --git a/genie/process_functions.py b/genie/process_functions.py
@@ -948,3 +948,35 @@ def create_new_fileformat_table(
         "newdb_mappingdf": newdb_mappingdf,
         "moved_ent": moved_ent,
     }
+
+
+def create_missing_columns(dataset: pd.DataFrame, schema: dict) -> pd.DataFrame:
+    """Add any schema columns missing from the dataset, filled with NA values.
+
+    Each missing column is created and filled with the NA value that matches
+    its declared data type. Integer and boolean columns (both pre-existing and
+    newly created) are cast to the pandas nullable dtypes (``Int64`` and
+    ``boolean``) because that is the only way to keep NAs in those columns.
+
+    The input dataset is not modified; a new dataframe is returned.
+
+    Args:
+        dataset (pd.DataFrame): input dataset to fill missing columns for
+        schema (dict): the expected schema {column_name(str): data_type(str)}
+            for the input dataset. Supported data types for missing columns
+            are "string", "integer", "float" and "boolean".
+
+    Returns:
+        pd.DataFrame: copy of the dataset containing exactly the schema's
+            columns, in the schema's order
+
+    Raises:
+        KeyError: if a column has to be created and its data type is not one
+            of the supported data types
+    """
+    missing_values = {
+        "string": "",
+        "integer": None,
+        "float": float("nan"),
+        "boolean": None,
+    }
+    # work on a copy so the dtype casts below never alter the caller's dataframe
+    dataset = dataset.copy()
+    for column, data_type in schema.items():
+        if column not in dataset.columns:
+            dataset = dataset.assign(**{column: missing_values[data_type]})
+
+        # only way to preserve NAs for these specific dtype columns
+        if data_type == "integer":
+            dataset[column] = dataset[column].astype("Int64")
+        elif data_type == "boolean":
+            dataset[column] = dataset[column].astype(pd.BooleanDtype())
+    return dataset[list(schema)]
diff --git a/genie/process_mutation.py b/genie/process_mutation.py
@@ -342,6 +342,7 @@ def check_annotation_error_reports(
         maf_table_synid (str): synapse_id of the narrow maf table
         full_error_report (pd.DataFrame): the failed annotations error report
         center (str): the center this is for
+
     """
     maf_table_df = extract.get_syntabledf(
         syn=syn,
@@ -351,10 +352,11 @@ def check_annotation_error_reports(
             "Annotation_Status = 'FAILED'"
         ),
     )
-    assert len(maf_table_df) == len(full_error_report), (
-        "Genome nexus's failed annotations error report rows doesn't match"
-        f"maf table's failed annotations for {center}"
-    )
+    if len(maf_table_df) != len(full_error_report):
+        logger.warning(
+            "Genome nexus's failed annotations error report rows doesn't match"
+            f"maf table's failed annotations for {center}"
+        )
 
 
 def store_annotation_error_reports(

diff --git a/genie/validate.py b/genie/validate.py
@@ -95,8 +95,13 @@ def validate_single_file(self, **kwargs):
             valid: Boolean value of validation status
         """
         if self.file_type not in self._format_registry:
+            allowed_filetypes = list(self._format_registry.keys())
+            error_message = (
+                f"Your filename is incorrect! Please change your filename before you run the validator or specify --filetype if you are running the validator locally. "
+                f"If specifying filetype, options are: [{', '.join(allowed_filetypes)}]\n"
+            )
             valid_result_cls = example_filetype_format.ValidationResults(
-                errors="Your filename is incorrect! Please change your filename before you run the validator or specify --filetype if you are running the validator locally",
+                errors=error_message,
                 warnings="",
             )
         else: