Merge pull request #65 from sbslee/0.35.0-dev

0.35.0 dev
sbslee · Jul 11, 2022 · 192fa56 · 192fa56
2 parents de8ce31 + 12e103e
commit 192fa56
Show file tree

Hide file tree

Showing 4 changed files with 244 additions and 30 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -1,6 +1,15 @@
 Changelog
 *********
 
+0.35.0 (2022-07-12)
+-------------------
+
+* Fix bug in :meth:`pyvcf.VcfFrame.pseudophase` method.
+* Add new methods :meth:`pyvcf.VcfFrame.diploidize` and :meth:`pyvcf.gt_diploidize`.
+* Update :meth:`pyvcf.VcfFrame.get_af` method to handle situations where there are multiple records with the same ``REF`` allele.
+* Add new method :meth:`pymaf.MafFrame.plot_regplot_tmb`.
+* Rename :meth:`pyvcf.VcfFrame.plot_regplot` method to :meth:`pyvcf.VcfFrame.plot_regplot_tmb` and :meth:`pymaf.MafFrame.plot_regplot` method to :meth:`pymaf.MafFrame.plot_regplot_gene`.
+
 0.34.0 (2022-06-08)
 -------------------
 

diff --git a/fuc/api/pymaf.py b/fuc/api/pymaf.py
@@ -1295,7 +1295,7 @@ def plot_genepair(
 
         return ax
 
-    def plot_regplot(
+    def plot_regplot_gene(
         self, af, group_col, a, b, a_size=None, b_size=None, genes=None,
         count=10, to_csv=None, ax=None, figsize=None, **kwargs
     ):
@@ -1353,7 +1353,7 @@ def plot_regplot(
             >>> annot_file = '~/fuc-data/tcga-laml/tcga_laml_annot.tsv'
             >>> mf = pymaf.MafFrame.from_file(maf_file)
             >>> af = common.AnnFrame.from_file(annot_file, sample_col=0)
-            >>> mf.plot_regplot(af, 'FAB_classification', 'M1', 'M2')
+            >>> mf.plot_regplot_gene(af, 'FAB_classification', 'M1', 'M2')
             Results for M2 ~ M1:
             R^2 = 0.43
               P = 3.96e-02
@@ -1398,6 +1398,93 @@ def plot_regplot(
 
         return ax
 
+    def plot_regplot_tmb(
+        self, af, subject_col, group_col, a, b, ax=None, figsize=None,
+        **kwargs
+    ):
+        """
+        Create a scatter plot with a linear regression model fit visualizing
+        correlation between TMB in two sample groups.
+
+        The method will automatically calculate and print summary statistics
+        including R-squared and p-value.
+
+        Parameters
+        ----------
+        af : AnnFrame
+            AnnFrame containing sample annotation data.
+        subject_col : str
+            AnnFrame column containing sample subject information.
+        group_col : str
+            AnnFrame column containing sample group information.
+        a, b : str
+            Sample group names.
+        ax : matplotlib.axes.Axes, optional
+            Pre-existing axes for the plot. Otherwise, crete a new one.
+        figsize : tuple, optional
+            Width, height in inches. Format: (float, float).
+        kwargs
+            Other keyword arguments will be passed down to
+            :meth:`seaborn.regplot`.
+
+        Returns
+        -------
+        matplotlib.axes.Axes
+            The matplotlib axes containing the plot.
+
+        See Also
+        --------
+        fuc.api.pyvcf.VcfFrame.plot_regplot_tmb
+            Similar method for the :meth:`fuc.api.pyvcf.VcfFrame` class.
+
+        Examples
+        --------
+
+        .. plot::
+
+            >>> import matplotlib.pyplot as plt
+            >>> from fuc import common, pymaf, pyvcf
+            >>> common.load_dataset('pyvcf')
+            >>> vcf_file = '~/fuc-data/pyvcf/normal-tumor.vcf'
+            >>> annot_file = '~/fuc-data/pyvcf/normal-tumor-annot.tsv'
+            >>> vf = pyvcf.VcfFrame.from_file(vcf_file)
+            >>> af = common.AnnFrame.from_file(annot_file, sample_col='Sample')
+            >>> mf = pymaf.MafFrame.from_vcf(vf)
+            >>> mf.plot_regplot_tmb(af, 'Patient', 'Tissue', 'Normal', 'Tumor')
+            Results for Tumor ~ Normal:
+            R^2 = 0.01
+              P = 7.17e-01
+            >>> plt.tight_layout()
+        """
+        d = self.df.value_counts('Tumor_Sample_Barcode').to_dict()
+
+        df = af.df[af.df[group_col].isin([a, b])][[group_col, subject_col]]
+        df = df.reset_index().pivot(index=subject_col, columns=group_col,
+            values=af.df.index.name)
+
+        def one_row(r):
+            for col in [a, b]:
+                if r[col] in d:
+                    r[col] = d[r[col]]
+                else:
+                    r[col] = 0
+            return r
+
+        df = df.apply(one_row, axis=1)
+
+        if ax is None:
+            fig, ax = plt.subplots(figsize=figsize)
+
+        sns.regplot(x=a, y=b, data=df, ax=ax, **kwargs)
+
+        # Print summary statistics including R-squared and p-value.
+        results = smf.ols(f'{b} ~ {a}', data=df).fit()
+        print(f"Results for {b} ~ {a}:")
+        print(f'R^2 = {results.rsquared:.2f}')
+        print(f'  P = {results.f_pvalue:.2e}')
+
+        return ax
+
     def plot_interactions(
         self, count=10, cmap=None, ax=None, figsize=None, **kwargs
     ):

diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py
@@ -447,6 +447,54 @@ def rescue_filtered_variants(vfs, format='GT'):
     merged_vf = merge(filtered_vfs, how='outer', format=format)
     return merged_vf
 
+def gt_diploidize(g):
+    """
+    For given genotype, return its diploid form.
+
+    This method will ignore diploid genotypes (e.g. 0/1) and is therefore
+    only effective for haploid genotypes (e.g. genotypes from the X
+    chromosome for males).
+
+    Parameters
+    ----------
+    g : str
+        Sample genotype.
+
+    Returns
+    -------
+    bool
+        Diploid genotype.
+
+    See Also
+    --------
+    VcfFrame.diploidize
+        Diploidize VcfFrame.
+
+    Examples
+    --------
+
+    >>> from fuc import pyvcf
+    >>> pyvcf.gt_diploidize('0')
+    '0/0'
+    >>> pyvcf.gt_diploidize('1')
+    '0/1'
+    >>> pyvcf.gt_diploidize('.')
+    './.'
+    >>> pyvcf.gt_diploidize('1:34')
+    '0/1:34'
+    >>> pyvcf.gt_diploidize('0/1:34')
+    '0/1:34'
+    >>> pyvcf.gt_diploidize('./.')
+    './.'
+    """
+    if gt_ploidy(g) != 1:
+        return g
+    gt = g.split(':')[0]
+    if gt == '.':
+        return './' + g
+    else:
+        return '0/' + g
+
 def gt_miss(g):
     """
     For given genotype, return True if it has missing value.
@@ -700,9 +748,9 @@ def gt_pseudophase(g):
     --------
 
     >>> from fuc import pyvcf
-    >>> pyvcf.pseudophase('0/1')
+    >>> pyvcf.gt_pseudophase('0/1')
     '0|1'
-    >>> pyvcf.pseudophase('0/0:34:10,24')
+    >>> pyvcf.gt_pseudophase('0/0:34:10,24')
     '0|0:34:10,24'
     """
     l = g.split(':')
@@ -3472,7 +3520,9 @@ def plot_tmb(
 
         return ax
 
-    def plot_regplot(self, a, b, ax=None, figsize=None, **kwargs):
+    def plot_regplot_tmb(
+        self, a, b, ax=None, figsize=None, **kwargs
+    ):
         """
         Create a scatter plot with a linear regression model fit visualizing
         correlation between TMB in two sample groups.
@@ -3497,6 +3547,11 @@ def plot_regplot(self, a, b, ax=None, figsize=None, **kwargs):
         matplotlib.axes.Axes
             The matplotlib axes containing the plot.
 
+        See Also
+        --------
+        fuc.api.pymaf.MafFrame.plot_regplot_tmb
+            Similar method for the :meth:`fuc.api.pymaf.MafFrame` class.
+
         Examples
         --------
 
@@ -3512,7 +3567,7 @@ def plot_regplot(self, a, b, ax=None, figsize=None, **kwargs):
             >>> normal.name = 'Normal'
             >>> tumor = af.df[af.df.Tissue == 'Tumor'].index
             >>> tumor.name = 'Tumor'
-            >>> vf.plot_regplot(normal, tumor)
+            >>> vf.plot_regplot_tmb(normal, tumor)
             Results for B ~ A:
             R^2 = 0.01
             P = 7.17e-01
@@ -6378,6 +6433,53 @@ def compare(self, other):
         df = df[['Locus', 'Sample', 'Self', 'Other']]
         return df
 
+    def diploidize(self):
+        """
+        Diploidize VcfFrame.
+
+        Returns
+        -------
+        VcfFrame
+            Diploidized VcfFrame.
+
+        See Also
+        --------
+        gt_diploidize
+            For given genotype, return its diploid form.
+
+        Examples
+        --------
+
+        >>> from fuc import pyvcf
+        >>> data = {
+        ...     'CHROM': ['chrX', 'chrX'],
+        ...     'POS': [100, 101],
+        ...     'ID': ['.', '.'],
+        ...     'REF': ['G', 'T'],
+        ...     'ALT': ['A', 'C'],
+        ...     'QUAL': ['.', '.'],
+        ...     'FILTER': ['.', '.'],
+        ...     'INFO': ['.', '.'],
+        ...     'FORMAT': ['GT', 'GT'],
+        ...     'Male': ['0', '1'],
+        ...     'Female': ['0/0', '0/1'],
+        ... }
+        >>> vf = pyvcf.VcfFrame.from_dict([], data)
+        >>> vf.df
+          CHROM  POS ID REF ALT QUAL FILTER INFO FORMAT Male Female
+        0  chrX  100  .   G   A    .      .    .     GT    0    0/0
+        1  chrX  101  .   T   C    .      .    .     GT    1    0/1
+        >>> vf.diploidize().df
+          CHROM  POS ID REF ALT QUAL FILTER INFO FORMAT Male Female
+        0  chrX  100  .   G   A    .      .    .     GT  0/0    0/0
+        1  chrX  101  .   T   C    .      .    .     GT  0/1    0/1
+        """
+        def one_row(r):
+            r[9:] = r[9:].apply(gt_diploidize)
+            return r
+        df = self.df.apply(one_row, axis=1)
+        return self.__class__(self.copy_meta(), df)
+
     def fetch(self, variant):
         """
         Fetch the VCF row that matches specified variant.
@@ -6463,7 +6565,7 @@ def pseudophase(self):
         1  chr2  101  .   T   C    .      .    .     GT  1|1
         """
         def one_row(r):
-            r[9:] = r[9:].apply(pseudophase)
+            r[9:] = r[9:].apply(gt_pseudophase)
             return r
         df = self.df.apply(one_row, axis=1)
         return self.__class__(self.copy_meta(), df)
@@ -6472,7 +6574,10 @@ def get_af(self, sample, variant):
         """
         Get allele fraction for a pair of sample and variant.
 
-        The method will return ``numpy.nan`` if the value is missing.
+        The method will return ``numpy.nan`` when:
+
+        1. variant is absent, or
+        2. variant is present but there is no ``AF`` in the ``FORMAT`` column
 
         Parameters
         ----------
@@ -6491,32 +6596,48 @@ def get_af(self, sample, variant):
 
         >>> from fuc import pyvcf, common
         >>> data = {
-        ...     'CHROM': ['chr1', 'chr1', 'chr1', 'chr1'],
-        ...     'POS': [100, 101, 102, 103],
-        ...     'ID': ['.', '.', '.', '.'],
-        ...     'REF': ['A', 'G', 'A', 'C'],
-        ...     'ALT': ['C', 'T', 'G', 'G,A'],
-        ...     'QUAL': ['.', '.', '.', '.'],
-        ...     'FILTER': ['.', '.', '.', '.'],
-        ...     'INFO': ['.', '.', '.', '.'],
-        ...     'FORMAT': ['GT:AD:AF', 'GT:AD:AF', 'GT:AF', 'GT:AD:AF'],
-        ...     'A': ['0/1:12,15:0.444,0.556', '0/0:32,1:0.970,0.030', '0/1:.', './.:.:.'],
-        ...     'B': ['0/1:13,17:0.433,0.567', '0/1:14,15:0.483,0.517', './.:.', '1/2:0,11,17:0.000,0.393,0.607'],
+        ...     'CHROM': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'],
+        ...     'POS': [100, 100, 101, 102, 103],
+        ...     'ID': ['.', '.', '.', '.', '.'],
+        ...     'REF': ['A', 'A', 'G', 'A', 'C'],
+        ...     'ALT': ['C', 'T', 'T', 'G', 'G,A'],
+        ...     'QUAL': ['.', '.', '.', '.', '.'],
+        ...     'FILTER': ['.', '.', '.', '.', '.'],
+        ...     'INFO': ['.', '.', '.', '.', '.'],
+        ...     'FORMAT': ['GT:AD:AF', 'GT:AD:AF', 'GT:AD:AF', 'GT:AF', 'GT:AD:AF'],
+        ...     'A': ['0/1:12,15:0.444,0.556', '0/0:31,0:1.000,0.000', '0/0:32,1:0.970,0.030', '0/1:.', './.:.:.'],
+        ...     'B': ['0/0:29,0:1.000,0.000', '0/1:13,17:0.433,0.567', '0/1:14,15:0.483,0.517', './.:.', '1/2:0,11,17:0.000,0.393,0.607'],
         ... }
         >>> vf = pyvcf.VcfFrame.from_dict([], data)
         >>> vf.df
           CHROM  POS ID REF  ALT QUAL FILTER INFO    FORMAT                      A                              B
-        0  chr1  100  .   A    C    .      .    .  GT:AD:AF  0/1:12,15:0.444,0.556          0/1:13,17:0.433,0.567
-        1  chr1  101  .   G    T    .      .    .  GT:AD:AF   0/0:32,1:0.970,0.030          0/1:14,15:0.483,0.517
-        2  chr1  102  .   A    G    .      .    .     GT:AF                  0/1:.                          ./.:.
-        3  chr1  103  .   C  G,A    .      .    .  GT:AD:AF                ./.:.:.  1/2:0,11,17:0.000,0.393,0.607
+        0  chr1  100  .   A    C    .      .    .  GT:AD:AF  0/1:12,15:0.444,0.556           0/0:29,0:1.000,0.000
+        1  chr1  100  .   A    T    .      .    .  GT:AD:AF   0/0:31,0:1.000,0.000          0/1:13,17:0.433,0.567
+        2  chr1  101  .   G    T    .      .    .  GT:AD:AF   0/0:32,1:0.970,0.030          0/1:14,15:0.483,0.517
+        3  chr1  102  .   A    G    .      .    .     GT:AF                  0/1:.                          ./.:.
+        4  chr1  103  .   C  G,A    .      .    .  GT:AD:AF                ./.:.:.  1/2:0,11,17:0.000,0.393,0.607
         >>> vf.get_af('A', 'chr1-100-A-C')
         0.556
-        >>> vf.get_af('B', 'chr1-102-A-G')
+        >>> vf.get_af('A', 'chr1-100-A-T')
+        0.0
+        >>> vf.get_af('B', 'chr1-100-A-T')
+        0.567
+        >>> vf.get_af('B', 'chr1-100-A-G') # does not exist
+        nan
+        >>> vf.get_af('B', 'chr1-102-A-G') # missing AF data
         nan
+        >>> vf.get_af('B', 'chr1-103-C-A') # multiallelic locus
+        0.607
         """
         chrom, pos, ref, alt = common.parse_variant(variant)
-        r = self.df[(self.df.CHROM == chrom) & (self.df.POS == pos) & (self.df.REF == ref)]
+
+        i = self.df.apply(lambda r: ((r.CHROM == chrom) & (r.POS == pos) &
+                (r.REF == ref) & (alt in r.ALT.split(','))), axis=1)
+
+        if i.any():
+            r = self.df[i]
+        else:
+            return np.nan
 
         try:
             i = r.FORMAT.values[0].split(':').index('AF')
@@ -6525,10 +6646,7 @@ def get_af(self, sample, variant):
 
         alts = r.ALT.values[0].split(',')
 
-        if alt in alts:
-            j = r.ALT.values[0].split(',').index(alt)
-        else:
-            raise ValueError(f'ALT allele not found, possible choices: {alts}')
+        j = r.ALT.values[0].split(',').index(alt)
 
         field = r[sample].values[0].split(':')[i]
 

diff --git a/fuc/version.py b/fuc/version.py
@@ -1 +1 @@
-__version__ = '0.34.0'
+__version__ = '0.35.0'