Skip to content

Commit

Permalink
Merge pull request #65 from sbslee/0.35.0-dev
Browse files Browse the repository at this point in the history
0.35.0 dev
  • Loading branch information
sbslee authored Jul 11, 2022
2 parents de8ce31 + 12e103e commit 192fa56
Show file tree
Hide file tree
Showing 4 changed files with 244 additions and 30 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
Changelog
*********

0.35.0 (2022-07-12)
-------------------

* Fix bug in :meth:`pyvcf.VcfFrame.pseudophase` method.
* Add new methods :meth:`pyvcf.VcfFrame.diploidize` and :meth:`pyvcf.gt_diploidize`.
* Update :meth:`pyvcf.VcfFrame.get_af` method to handle situations where there are multiple records with the same ``REF`` allele.
* Add new method :meth:`pymaf.MafFrame.plot_regplot_tmb`.
* Rename :meth:`pyvcf.VcfFrame.plot_regplot` method to :meth:`pyvcf.VcfFrame.plot_regplot_tmb` and :meth:`pymaf.MafFrame.plot_regplot` method to :meth:`pymaf.MafFrame.plot_regplot_gene`.

0.34.0 (2022-06-08)
-------------------

Expand Down
91 changes: 89 additions & 2 deletions fuc/api/pymaf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1295,7 +1295,7 @@ def plot_genepair(

return ax

def plot_regplot(
def plot_regplot_gene(
self, af, group_col, a, b, a_size=None, b_size=None, genes=None,
count=10, to_csv=None, ax=None, figsize=None, **kwargs
):
Expand Down Expand Up @@ -1353,7 +1353,7 @@ def plot_regplot(
>>> annot_file = '~/fuc-data/tcga-laml/tcga_laml_annot.tsv'
>>> mf = pymaf.MafFrame.from_file(maf_file)
>>> af = common.AnnFrame.from_file(annot_file, sample_col=0)
>>> mf.plot_regplot(af, 'FAB_classification', 'M1', 'M2')
>>> mf.plot_regplot_gene(af, 'FAB_classification', 'M1', 'M2')
Results for M2 ~ M1:
R^2 = 0.43
P = 3.96e-02
Expand Down Expand Up @@ -1398,6 +1398,93 @@ def plot_regplot(

return ax

def plot_regplot_tmb(
self, af, subject_col, group_col, a, b, ax=None, figsize=None,
**kwargs
):
"""
Create a scatter plot with a linear regression model fit visualizing
correlation between TMB in two sample groups.
The method will automatically calculate and print summary statistics
including R-squared and p-value.
Parameters
----------
af : AnnFrame
AnnFrame containing sample annotation data.
subject_col : str
AnnFrame column containing sample subject information.
group_col : str
AnnFrame column containing sample group information.
a, b : str
Sample group names.
ax : matplotlib.axes.Axes, optional
Pre-existing axes for the plot. Otherwise, crete a new one.
figsize : tuple, optional
Width, height in inches. Format: (float, float).
kwargs
Other keyword arguments will be passed down to
:meth:`seaborn.regplot`.
Returns
-------
matplotlib.axes.Axes
The matplotlib axes containing the plot.
See Also
--------
fuc.api.pyvcf.VcfFrame.plot_regplot_tmb
Similar method for the :meth:`fuc.api.pyvcf.VcfFrame` class.
Examples
--------
.. plot::
>>> import matplotlib.pyplot as plt
>>> from fuc import common, pymaf, pyvcf
>>> common.load_dataset('pyvcf')
>>> vcf_file = '~/fuc-data/pyvcf/normal-tumor.vcf'
>>> annot_file = '~/fuc-data/pyvcf/normal-tumor-annot.tsv'
>>> vf = pyvcf.VcfFrame.from_file(vcf_file)
>>> af = common.AnnFrame.from_file(annot_file, sample_col='Sample')
>>> mf = pymaf.MafFrame.from_vcf(vf)
>>> mf.plot_regplot_tmb(af, 'Patient', 'Tissue', 'Normal', 'Tumor')
Results for Tumor ~ Normal:
R^2 = 0.01
P = 7.17e-01
>>> plt.tight_layout()
"""
d = self.df.value_counts('Tumor_Sample_Barcode').to_dict()

df = af.df[af.df[group_col].isin([a, b])][[group_col, subject_col]]
df = df.reset_index().pivot(index=subject_col, columns=group_col,
values=af.df.index.name)

def one_row(r):
for col in [a, b]:
if r[col] in d:
r[col] = d[r[col]]
else:
r[col] = 0
return r

df = df.apply(one_row, axis=1)

if ax is None:
fig, ax = plt.subplots(figsize=figsize)

sns.regplot(x=a, y=b, data=df, ax=ax, **kwargs)

# Print summary statistics including R-squared and p-value.
results = smf.ols(f'{b} ~ {a}', data=df).fit()
print(f"Results for {b} ~ {a}:")
print(f'R^2 = {results.rsquared:.2f}')
print(f' P = {results.f_pvalue:.2e}')

return ax

def plot_interactions(
self, count=10, cmap=None, ax=None, figsize=None, **kwargs
):
Expand Down
172 changes: 145 additions & 27 deletions fuc/api/pyvcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,54 @@ def rescue_filtered_variants(vfs, format='GT'):
merged_vf = merge(filtered_vfs, how='outer', format=format)
return merged_vf

def gt_diploidize(g):
"""
For given genotype, return its diploid form.
This method will ignore diploid genotypes (e.g. 0/1) and is therefore
only effective for haploid genotypes (e.g. genotypes from the X
chromosome for males).
Parameters
----------
g : str
Sample genotype.
Returns
-------
bool
Diploid genotype.
See Also
--------
VcfFrame.diploidize
Diploidize VcfFrame.
Examples
--------
>>> from fuc import pyvcf
>>> pyvcf.gt_diploidize('0')
'0/0'
>>> pyvcf.gt_diploidize('1')
'0/1'
>>> pyvcf.gt_diploidize('.')
'./.'
>>> pyvcf.gt_diploidize('1:34')
'0/1:34'
>>> pyvcf.gt_diploidize('0/1:34')
'0/1:34'
>>> pyvcf.gt_diploidize('./.')
'./.'
"""
if gt_ploidy(g) != 1:
return g
gt = g.split(':')[0]
if gt == '.':
return './' + g
else:
return '0/' + g

def gt_miss(g):
"""
For given genotype, return True if it has missing value.
Expand Down Expand Up @@ -700,9 +748,9 @@ def gt_pseudophase(g):
--------
>>> from fuc import pyvcf
>>> pyvcf.pseudophase('0/1')
>>> pyvcf.gt_pseudophase('0/1')
'0|1'
>>> pyvcf.pseudophase('0/0:34:10,24')
>>> pyvcf.gt_pseudophase('0/0:34:10,24')
'0|0:34:10,24'
"""
l = g.split(':')
Expand Down Expand Up @@ -3472,7 +3520,9 @@ def plot_tmb(

return ax

def plot_regplot(self, a, b, ax=None, figsize=None, **kwargs):
def plot_regplot_tmb(
self, a, b, ax=None, figsize=None, **kwargs
):
"""
Create a scatter plot with a linear regression model fit visualizing
correlation between TMB in two sample groups.
Expand All @@ -3497,6 +3547,11 @@ def plot_regplot(self, a, b, ax=None, figsize=None, **kwargs):
matplotlib.axes.Axes
The matplotlib axes containing the plot.
See Also
--------
fuc.api.pymaf.MafFrame.plot_regplot_tmb
Similar method for the :meth:`fuc.api.pymaf.MafFrame` class.
Examples
--------
Expand All @@ -3512,7 +3567,7 @@ def plot_regplot(self, a, b, ax=None, figsize=None, **kwargs):
>>> normal.name = 'Normal'
>>> tumor = af.df[af.df.Tissue == 'Tumor'].index
>>> tumor.name = 'Tumor'
>>> vf.plot_regplot(normal, tumor)
>>> vf.plot_regplot_tmb(normal, tumor)
Results for B ~ A:
R^2 = 0.01
P = 7.17e-01
Expand Down Expand Up @@ -6378,6 +6433,53 @@ def compare(self, other):
df = df[['Locus', 'Sample', 'Self', 'Other']]
return df

def diploidize(self):
"""
Diploidize VcfFrame.
Returns
-------
VcfFrame
Diploidized VcfFrame.
See Also
--------
gt_diploidize
For given genotype, return its diploid form.
Examples
--------
>>> from fuc import pyvcf
>>> data = {
... 'CHROM': ['chrX', 'chrX'],
... 'POS': [100, 101],
... 'ID': ['.', '.'],
... 'REF': ['G', 'T'],
... 'ALT': ['A', 'C'],
... 'QUAL': ['.', '.'],
... 'FILTER': ['.', '.'],
... 'INFO': ['.', '.'],
... 'FORMAT': ['GT', 'GT'],
... 'Male': ['0', '1'],
... 'Female': ['0/0', '0/1'],
... }
>>> vf = pyvcf.VcfFrame.from_dict([], data)
>>> vf.df
CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Male Female
0 chrX 100 . G A . . . GT 0 0/0
1 chrX 101 . T C . . . GT 1 0/1
>>> vf.diploidize().df
CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Male Female
0 chrX 100 . G A . . . GT 0/0 0/0
1 chrX 101 . T C . . . GT 0/1 0/1
"""
def one_row(r):
r[9:] = r[9:].apply(gt_diploidize)
return r
df = self.df.apply(one_row, axis=1)
return self.__class__(self.copy_meta(), df)

def fetch(self, variant):
"""
Fetch the VCF row that matches specified variant.
Expand Down Expand Up @@ -6463,7 +6565,7 @@ def pseudophase(self):
1 chr2 101 . T C . . . GT 1|1
"""
def one_row(r):
r[9:] = r[9:].apply(pseudophase)
r[9:] = r[9:].apply(gt_pseudophase)
return r
df = self.df.apply(one_row, axis=1)
return self.__class__(self.copy_meta(), df)
Expand All @@ -6472,7 +6574,10 @@ def get_af(self, sample, variant):
"""
Get allele fraction for a pair of sample and variant.
The method will return ``numpy.nan`` if the value is missing.
The method will return ``numpy.nan`` when:
1. variant is absent, or
2. variant is present but there is no ``AF`` in the ``FORMAT`` column
Parameters
----------
Expand All @@ -6491,32 +6596,48 @@ def get_af(self, sample, variant):
>>> from fuc import pyvcf, common
>>> data = {
... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1'],
... 'POS': [100, 101, 102, 103],
... 'ID': ['.', '.', '.', '.'],
... 'REF': ['A', 'G', 'A', 'C'],
... 'ALT': ['C', 'T', 'G', 'G,A'],
... 'QUAL': ['.', '.', '.', '.'],
... 'FILTER': ['.', '.', '.', '.'],
... 'INFO': ['.', '.', '.', '.'],
... 'FORMAT': ['GT:AD:AF', 'GT:AD:AF', 'GT:AF', 'GT:AD:AF'],
... 'A': ['0/1:12,15:0.444,0.556', '0/0:32,1:0.970,0.030', '0/1:.', './.:.:.'],
... 'B': ['0/1:13,17:0.433,0.567', '0/1:14,15:0.483,0.517', './.:.', '1/2:0,11,17:0.000,0.393,0.607'],
... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'],
... 'POS': [100, 100, 101, 102, 103],
... 'ID': ['.', '.', '.', '.', '.'],
... 'REF': ['A', 'A', 'G', 'A', 'C'],
... 'ALT': ['C', 'T', 'T', 'G', 'G,A'],
... 'QUAL': ['.', '.', '.', '.', '.'],
... 'FILTER': ['.', '.', '.', '.', '.'],
... 'INFO': ['.', '.', '.', '.', '.'],
... 'FORMAT': ['GT:AD:AF', 'GT:AD:AF', 'GT:AD:AF', 'GT:AF', 'GT:AD:AF'],
... 'A': ['0/1:12,15:0.444,0.556', '0/0:31,0:1.000,0.000', '0/0:32,1:0.970,0.030', '0/1:.', './.:.:.'],
... 'B': ['0/0:29,0:1.000,0.000', '0/1:13,17:0.433,0.567', '0/1:14,15:0.483,0.517', './.:.', '1/2:0,11,17:0.000,0.393,0.607'],
... }
>>> vf = pyvcf.VcfFrame.from_dict([], data)
>>> vf.df
CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B
0 chr1 100 . A C . . . GT:AD:AF 0/1:12,15:0.444,0.556 0/1:13,17:0.433,0.567
1 chr1 101 . G T . . . GT:AD:AF 0/0:32,1:0.970,0.030 0/1:14,15:0.483,0.517
2 chr1 102 . A G . . . GT:AF 0/1:. ./.:.
3 chr1 103 . C G,A . . . GT:AD:AF ./.:.:. 1/2:0,11,17:0.000,0.393,0.607
0 chr1 100 . A C . . . GT:AD:AF 0/1:12,15:0.444,0.556 0/0:29,0:1.000,0.000
1 chr1 100 . A T . . . GT:AD:AF 0/0:31,0:1.000,0.000 0/1:13,17:0.433,0.567
2 chr1 101 . G T . . . GT:AD:AF 0/0:32,1:0.970,0.030 0/1:14,15:0.483,0.517
3 chr1 102 . A G . . . GT:AF 0/1:. ./.:.
4 chr1 103 . C G,A . . . GT:AD:AF ./.:.:. 1/2:0,11,17:0.000,0.393,0.607
>>> vf.get_af('A', 'chr1-100-A-C')
0.556
>>> vf.get_af('B', 'chr1-102-A-G')
>>> vf.get_af('A', 'chr1-100-A-T')
0.0
>>> vf.get_af('B', 'chr1-100-A-T')
0.567
>>> vf.get_af('B', 'chr1-100-A-G') # does not exist
nan
>>> vf.get_af('B', 'chr1-102-A-G') # missing AF data
nan
>>> vf.get_af('B', 'chr1-103-C-A') # multiallelic locus
0.607
"""
chrom, pos, ref, alt = common.parse_variant(variant)
r = self.df[(self.df.CHROM == chrom) & (self.df.POS == pos) & (self.df.REF == ref)]

i = self.df.apply(lambda r: ((r.CHROM == chrom) & (r.POS == pos) &
(r.REF == ref) & (alt in r.ALT.split(','))), axis=1)

if i.any():
r = self.df[i]
else:
return np.nan

try:
i = r.FORMAT.values[0].split(':').index('AF')
Expand All @@ -6525,10 +6646,7 @@ def get_af(self, sample, variant):

alts = r.ALT.values[0].split(',')

if alt in alts:
j = r.ALT.values[0].split(',').index(alt)
else:
raise ValueError(f'ALT allele not found, possible choices: {alts}')
j = r.ALT.values[0].split(',').index(alt)

field = r[sample].values[0].split(':')[i]

Expand Down
2 changes: 1 addition & 1 deletion fuc/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.34.0'
__version__ = '0.35.0'

0 comments on commit 192fa56

Please sign in to comment.