Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GEN-974] Allow NaN, nan and NA strings for mutation data #549

Merged
merged 9 commits into from
Feb 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions genie/config.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Configuration to obtain registry classes"""

import importlib
import logging

Expand Down
1 change: 1 addition & 0 deletions genie/create_case_lists.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Creates case lists per cancer type
"""

from collections import defaultdict
import csv
import os
Expand Down
15 changes: 9 additions & 6 deletions genie/dashboard_table_updater.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Updates dashboard tables"""

import argparse
import datetime
import logging
Expand Down Expand Up @@ -347,9 +348,11 @@ def update_oncotree_code_tables(syn, database_mappingdf):
oncotree_mapping = process_functions.get_oncotree_code_mappings(oncotree_link)

clinicaldf["PRIMARY_CODES"] = [
oncotree_mapping[i.upper()]["ONCOTREE_PRIMARY_NODE"]
if i.upper() in oncotree_mapping.keys()
else "DEPRECATED_CODE"
(
oncotree_mapping[i.upper()]["ONCOTREE_PRIMARY_NODE"]
if i.upper() in oncotree_mapping.keys()
else "DEPRECATED_CODE"
)
for i in clinicaldf.ONCOTREE_CODE
]

Expand Down Expand Up @@ -457,9 +460,9 @@ def update_sample_difference_table(syn, database_mappingdf):
.applymap(int)
)

diff_between_releasesdf[
["Clinical", "Mutation", "CNV", "SEG", "Fusions"]
] = new_values
diff_between_releasesdf[["Clinical", "Mutation", "CNV", "SEG", "Fusions"]] = (
new_values
)

load._update_table(
syn,
Expand Down
40 changes: 25 additions & 15 deletions genie/database_to_staging.py
Original file line number Diff line number Diff line change
Expand Up @@ -1052,30 +1052,38 @@ def store_clinical_files(
}

clinicaldf["CANCER_TYPE"] = [
oncotree_dict[code.upper()]["CANCER_TYPE"]
if code.upper() in oncotree_dict.keys()
else float("nan")
(
oncotree_dict[code.upper()]["CANCER_TYPE"]
if code.upper() in oncotree_dict.keys()
else float("nan")
)
for code in clinicaldf["ONCOTREE_CODE"]
]

clinicaldf["CANCER_TYPE_DETAILED"] = [
oncotree_dict[code.upper()]["CANCER_TYPE_DETAILED"]
if code.upper() in oncotree_dict.keys()
else float("nan")
(
oncotree_dict[code.upper()]["CANCER_TYPE_DETAILED"]
if code.upper() in oncotree_dict.keys()
else float("nan")
)
for code in clinicaldf["ONCOTREE_CODE"]
]

clinicaldf["ONCOTREE_PRIMARY_NODE"] = [
oncotree_dict[code.upper()]["ONCOTREE_PRIMARY_NODE"]
if code.upper() in oncotree_dict.keys()
else float("nan")
(
oncotree_dict[code.upper()]["ONCOTREE_PRIMARY_NODE"]
if code.upper() in oncotree_dict.keys()
else float("nan")
)
for code in clinicaldf["ONCOTREE_CODE"]
]

clinicaldf["ONCOTREE_SECONDARY_NODE"] = [
oncotree_dict[code.upper()]["ONCOTREE_SECONDARY_NODE"]
if code.upper() in oncotree_dict.keys()
else float("nan")
(
oncotree_dict[code.upper()]["ONCOTREE_SECONDARY_NODE"]
if code.upper() in oncotree_dict.keys()
else float("nan")
)
for code in clinicaldf["ONCOTREE_CODE"]
]

Expand All @@ -1086,9 +1094,11 @@ def store_clinical_files(
# descriptions can match
clinicaldf["AGE_AT_SEQ_REPORT_DAYS"] = clinicaldf["AGE_AT_SEQ_REPORT"]
clinicaldf["AGE_AT_SEQ_REPORT"] = [
int(math.floor(int(float(age)) / 365.25))
if process_functions.checkInt(age)
else age
(
int(math.floor(int(float(age)) / 365.25))
if process_functions.checkInt(age)
else age
)
for age in clinicaldf["AGE_AT_SEQ_REPORT"]
]
clinicaldf["AGE_AT_SEQ_REPORT"][clinicaldf["AGE_AT_SEQ_REPORT"] == ">32485"] = ">89"
Expand Down
1 change: 1 addition & 0 deletions genie/example_filetype_format.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""TODO: Rename this to model.py
This contains the GENIE model objects
"""

from abc import ABCMeta
from dataclasses import dataclass
import logging
Expand Down
1 change: 1 addition & 0 deletions genie/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
This module contains all the functions that stores data
to Synapse
"""

import logging
import os
import time
Expand Down
1 change: 1 addition & 0 deletions genie/process_functions.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Processing functions that are used in the GENIE pipeline"""

import datetime
import json
import logging
Expand Down
1 change: 1 addition & 0 deletions genie/process_mutation.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Process mutation files
TODO deprecate this module and spread functions around"""

from collections import namedtuple
import logging
import os
Expand Down
23 changes: 23 additions & 0 deletions genie/transform.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""This module contains all the transformation functions used throughout the GENIE
package"""

from typing import List
import warnings

import pandas as pd
Expand Down Expand Up @@ -64,3 +66,24 @@ def _convert_df_with_mixed_dtypes(read_csv_params: dict) -> pd.DataFrame:
df = pd.read_csv(**read_csv_params, low_memory=False, engine="c")
warnings.resetwarnings()
return df


def _convert_values_to_na(
input_df: pd.DataFrame, values_to_replace: List[str], columns_to_convert: List[str]
) -> pd.DataFrame:
"""Converts given values to NA in an input dataset

Args:
input_df (pd.DataFrame): input dataset
values_to_replace (List[str]): string values to replace with na
columns_to_convert (List[str]): subset of columns to convert with na in

Returns:
pd.DataFrame: dataset with specified values replaced with NAs
"""
if not input_df.empty:
replace_mapping = {value: None for value in values_to_replace}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see you're converting to None here, but should it be float('nan') instead?

Copy link
Contributor Author

@rxu17 rxu17 Feb 6, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for bringing this up.

So, I see that this was originally handled in maf.py's _validate. I'm not seeing anything on the processing side (annotation-tools) for maf.py that resembles what happens in validation, which is surprising, because I would think we'd want to do the same there?

I think I have to move part of the code above into the _convert_values_to_na method in genie/transform.py, just to make sure the columns have int/float dtypes for both validation and processing.

Or am I missing something and we don't need them to be numeric for processing?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After experimenting, I think having None works. A dataframe column with string values and None (e.g., pd.DataFrame({"bye":["2314", None, "124"]})) converts to float just fine.

Copy link
Contributor Author

@rxu17 rxu17 Feb 7, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems like for vcf files we don't have to worry about handling numeric columns in either validation or processing, because POS is the only expected numeric column and it can't have NAs anyway.

So only the maf processing side needs the handling.

Copy link
Member

@thomasyu888 thomasyu888 Feb 7, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So most of the heavy lifting of processing is done via Genome Nexus, and once we write it out into a CSV, it doesn't matter what "type" the column is.

That said - we do want to be diligent about making sure the data itself isn't changing, but '1' and 1 are going to look the same when we write them out (unless we deliberately add quotes).

Copy link
Contributor Author

@rxu17 rxu17 Feb 7, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah I see, then I think it's okay as it is, since we've come this far without much issue and without having to implement that on the maf processing side. I am for keeping None, mainly because we can have more than just numeric columns, and conversion to numeric works fine even with None values in the column.

input_df[columns_to_convert] = input_df[columns_to_convert].replace(
replace_mapping
)
return input_df
1 change: 1 addition & 0 deletions genie/write_invalid_reasons.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Write invalid reasons"""

import logging
import os

Expand Down
1 change: 1 addition & 0 deletions genie_registry/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Initialize GENIE registry"""

# Import logging last to not take in synapseclient logging
import logging

Expand Down
1 change: 1 addition & 0 deletions genie_registry/assay.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Assay information class"""

import os
import yaml

Expand Down
1 change: 1 addition & 0 deletions genie_registry/bed.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""GENIE bed class and functions"""

import os
import logging
import subprocess
Expand Down
1 change: 1 addition & 0 deletions genie_registry/clinical.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Clinical file format validation and processing"""

# from __future__ import annotations
import datetime
from io import StringIO
Expand Down
63 changes: 47 additions & 16 deletions genie_registry/maf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from io import StringIO
import os
import logging
import os
from typing import List

import pandas as pd

Expand Down Expand Up @@ -198,10 +199,6 @@ def _validate(self, mutationDF):
for col in numerical_cols:
col_exists = process_functions.checkColExist(mutationDF, col)
if col_exists:
# Since NA is an allowed value, when reading in the dataframe
# the 'NA' string is not converted. This will convert all
# 'NA' values in the numerical columns into actual float('nan')
mutationDF.loc[mutationDF[col] == "NA", col] = float("nan")
# Attempt to convert column to float
try:
mutationDF[col] = mutationDF[col].astype(float)
Expand Down Expand Up @@ -352,13 +349,38 @@ def _cross_validate(self, mutationDF: pd.DataFrame) -> tuple:
)
return errors, warnings

def _get_dataframe(self, filePathList):
"""Get mutation dataframe"""
# Must do this because pandas.read_csv will allow for a file to
# have more column headers than content. E.g.
# A,B,C,D,E
# 1,2
# 2,3
def _get_dataframe(self, filePathList: List[str]) -> pd.DataFrame:
"""Get mutation dataframe

1) Starts reading the first line in the file
2) Skips lines that starts with #
3) Reads in second line
4) Checks that first line fields matches second line. Must do this because
pandas.read_csv will allow for a file to have more column headers than content.
E.g) A,B,C,D,E
1,2
2,3

5) We keep the 'NA', 'nan', and 'NaN' as strings in the data because
these are valid allele values
then convert the ones in the non-allele columns back to actual NAs

NOTE: Because allele columns are case-insensitive in maf data, we must
standardize the case of the columns when checking for the non-allele columns
to convert the NA strings to NAs

NOTE: This code allows empty dataframes to pass through
without errors

Args:
filePathList (List[str]): list of filepath(s)

Raises:
ValueError: First line fields doesn't match second line fields in file

Returns:
pd.DataFrame: mutation data
"""
with open(filePathList[0], "r") as maf_f:
firstline = maf_f.readline()
if firstline.startswith("#"):
Expand All @@ -370,34 +392,43 @@ def _get_dataframe(self, filePathList):
"Number of fields in a line do not match the "
"expected number of columns"
)

read_csv_params = {
"filepath_or_buffer": filePathList[0],
"sep": "\t",
"comment": "#",
# Keep the value 'NA'
"keep_default_na": False,
"na_values": [
"-1.#IND",
"1.#QNAN",
"1.#IND",
"-1.#QNAN",
"#N/A N/A",
"NaN",
"#N/A",
"N/A",
"#NA",
"NULL",
"-NaN",
"nan",
"-nan",
"",
],
"keep_default_na": False,
# This is to check if people write files
# with R, quote=T
"quoting": 3,
# Retain completely blank lines so that
# validator will cause the file to fail
"skip_blank_lines": False,
}

mutationdf = transform._convert_df_with_mixed_dtypes(read_csv_params)

mutationdf = transform._convert_values_to_na(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👀 I did not think about this at all... Good catch!

input_df=mutationdf,
values_to_replace=["NA", "nan", "NaN"],
columns_to_convert=[
col
for col in mutationdf.columns
if col.upper() not in self._allele_cols
],
)
return mutationdf
52 changes: 49 additions & 3 deletions genie_registry/vcf.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import logging
import os
from typing import List

import pandas as pd

from genie.example_filetype_format import FileTypeFormat
from genie import process_functions, validate
from genie import process_functions, transform, validate

logger = logging.getLogger(__name__)

Expand All @@ -28,7 +29,25 @@ def _validateFilename(self, filePath):
endswith_vcf = basename.endswith(".vcf")
assert startswith_genie and endswith_vcf

def _get_dataframe(self, filePathList):
def _get_dataframe(self, filePathList: List[str]) -> pd.DataFrame:
"""Get mutation dataframe

1) Looks for the line in the file starting with #CHROM, that will be
the header line (columns).

2) When reading in the data, we keep the 'NA', 'nan', and 'NaN'
as strings in the data because these are valid allele values
then convert the ones in the non-allele columns back to actual NAs

Args:
filePathList (List[str]): list of filepath(s)

Raises:
ValueError: when line with #CHROM doesn't exist in file

Returns:
pd.DataFrame: mutation data
"""
headers = None
filepath = filePathList[0]
with open(filepath, "r") as vcffile:
Expand All @@ -38,10 +57,37 @@ def _get_dataframe(self, filePathList):
break
if headers is not None:
vcfdf = pd.read_csv(
filepath, sep="\t", comment="#", header=None, names=headers
filepath,
sep="\t",
comment="#",
header=None,
names=headers,
keep_default_na=False,
na_values=[
"-1.#IND",
"1.#QNAN",
"1.#IND",
"-1.#QNAN",
"#N/A N/A",
"#N/A",
"N/A",
"#NA",
"NULL",
"-NaN",
"-nan",
"",
],
)
else:
raise ValueError("Your vcf must start with the header #CHROM")

vcfdf = transform._convert_values_to_na(
input_df=vcfdf,
values_to_replace=["NA", "nan", "NaN"],
columns_to_convert=[
col for col in vcfdf.columns if col not in self._allele_cols
],
)
return vcfdf

def process_steps(self, df):
Expand Down
Loading
Loading