diff --git a/CHANGELOG.rst b/CHANGELOG.rst index abc8678..e809a45 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,15 @@ Changelog ********* +0.35.0 (2022-07-12) +------------------- + +* Fix bug in :meth:`pyvcf.VcfFrame.pseudophase` method. +* Add new methods :meth:`pyvcf.VcfFrame.diploidize` and :meth:`pyvcf.gt_diploidize`. +* Update :meth:`pyvcf.VcfFrame.get_af` method to handle situations where there are multiple records with the same ``REF`` allele. +* Add new method :meth:`pymaf.MafFrame.plot_regplot_tmb`. +* Rename :meth:`pyvcf.VcfFrame.plot_regplot` method to :meth:`pyvcf.VcfFrame.plot_regplot_tmb` and :meth:`pymaf.MafFrame.plot_regplot` method to :meth:`pymaf.MafFrame.plot_regplot_gene`. + 0.34.0 (2022-06-08) ------------------- diff --git a/fuc/api/pymaf.py b/fuc/api/pymaf.py index b666736..cefbaa3 100644 --- a/fuc/api/pymaf.py +++ b/fuc/api/pymaf.py @@ -1295,7 +1295,7 @@ def plot_genepair( return ax - def plot_regplot( + def plot_regplot_gene( self, af, group_col, a, b, a_size=None, b_size=None, genes=None, count=10, to_csv=None, ax=None, figsize=None, **kwargs ): @@ -1353,7 +1353,7 @@ def plot_regplot( >>> annot_file = '~/fuc-data/tcga-laml/tcga_laml_annot.tsv' >>> mf = pymaf.MafFrame.from_file(maf_file) >>> af = common.AnnFrame.from_file(annot_file, sample_col=0) - >>> mf.plot_regplot(af, 'FAB_classification', 'M1', 'M2') + >>> mf.plot_regplot_gene(af, 'FAB_classification', 'M1', 'M2') Results for M2 ~ M1: R^2 = 0.43 P = 3.96e-02 @@ -1398,6 +1398,93 @@ def plot_regplot( return ax + def plot_regplot_tmb( + self, af, subject_col, group_col, a, b, ax=None, figsize=None, + **kwargs + ): + """ + Create a scatter plot with a linear regression model fit visualizing + correlation between TMB in two sample groups. + + The method will automatically calculate and print summary statistics + including R-squared and p-value. + + Parameters + ---------- + af : AnnFrame + AnnFrame containing sample annotation data. + subject_col : str + AnnFrame column containing sample subject information. + group_col : str + AnnFrame column containing sample group information. + a, b : str + Sample group names. + ax : matplotlib.axes.Axes, optional + Pre-existing axes for the plot. Otherwise, crete a new one. + figsize : tuple, optional + Width, height in inches. Format: (float, float). + kwargs + Other keyword arguments will be passed down to + :meth:`seaborn.regplot`. + + Returns + ------- + matplotlib.axes.Axes + The matplotlib axes containing the plot. + + See Also + -------- + fuc.api.pyvcf.VcfFrame.plot_regplot_tmb + Similar method for the :meth:`fuc.api.pyvcf.VcfFrame` class. + + Examples + -------- + + .. plot:: + + >>> import matplotlib.pyplot as plt + >>> from fuc import common, pymaf, pyvcf + >>> common.load_dataset('pyvcf') + >>> vcf_file = '~/fuc-data/pyvcf/normal-tumor.vcf' + >>> annot_file = '~/fuc-data/pyvcf/normal-tumor-annot.tsv' + >>> vf = pyvcf.VcfFrame.from_file(vcf_file) + >>> af = common.AnnFrame.from_file(annot_file, sample_col='Sample') + >>> mf = pymaf.MafFrame.from_vcf(vf) + >>> mf.plot_regplot_tmb(af, 'Patient', 'Tissue', 'Normal', 'Tumor') + Results for Tumor ~ Normal: + R^2 = 0.01 + P = 7.17e-01 + >>> plt.tight_layout() + """ + d = self.df.value_counts('Tumor_Sample_Barcode').to_dict() + + df = af.df[af.df[group_col].isin([a, b])][[group_col, subject_col]] + df = df.reset_index().pivot(index=subject_col, columns=group_col, + values=af.df.index.name) + + def one_row(r): + for col in [a, b]: + if r[col] in d: + r[col] = d[r[col]] + else: + r[col] = 0 + return r + + df = df.apply(one_row, axis=1) + + if ax is None: + fig, ax = plt.subplots(figsize=figsize) + + sns.regplot(x=a, y=b, data=df, ax=ax, **kwargs) + + # Print summary statistics including R-squared and p-value. + results = smf.ols(f'{b} ~ {a}', data=df).fit() + print(f"Results for {b} ~ {a}:") + print(f'R^2 = {results.rsquared:.2f}') + print(f' P = {results.f_pvalue:.2e}') + + return ax + def plot_interactions( self, count=10, cmap=None, ax=None, figsize=None, **kwargs ): diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index b5f74f8..4efdb28 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -447,6 +447,54 @@ def rescue_filtered_variants(vfs, format='GT'): merged_vf = merge(filtered_vfs, how='outer', format=format) return merged_vf +def gt_diploidize(g): + """ + For given genotype, return its diploid form. + + This method will ignore diploid genotypes (e.g. 0/1) and is therefore + only effective for haploid genotypes (e.g. genotypes from the X + chromosome for males). + + Parameters + ---------- + g : str + Sample genotype. + + Returns + ------- + bool + Diploid genotype. + + See Also + -------- + VcfFrame.diploidize + Diploidize VcfFrame. + + Examples + -------- + + >>> from fuc import pyvcf + >>> pyvcf.gt_diploidize('0') + '0/0' + >>> pyvcf.gt_diploidize('1') + '0/1' + >>> pyvcf.gt_diploidize('.') + './.' + >>> pyvcf.gt_diploidize('1:34') + '0/1:34' + >>> pyvcf.gt_diploidize('0/1:34') + '0/1:34' + >>> pyvcf.gt_diploidize('./.') + './.' + """ + if gt_ploidy(g) != 1: + return g + gt = g.split(':')[0] + if gt == '.': + return './' + g + else: + return '0/' + g + def gt_miss(g): """ For given genotype, return True if it has missing value. @@ -700,9 +748,9 @@ def gt_pseudophase(g): -------- >>> from fuc import pyvcf - >>> pyvcf.pseudophase('0/1') + >>> pyvcf.gt_pseudophase('0/1') '0|1' - >>> pyvcf.pseudophase('0/0:34:10,24') + >>> pyvcf.gt_pseudophase('0/0:34:10,24') '0|0:34:10,24' """ l = g.split(':') @@ -3472,7 +3520,9 @@ def plot_tmb( return ax - def plot_regplot(self, a, b, ax=None, figsize=None, **kwargs): + def plot_regplot_tmb( + self, a, b, ax=None, figsize=None, **kwargs + ): """ Create a scatter plot with a linear regression model fit visualizing correlation between TMB in two sample groups. @@ -3497,6 +3547,11 @@ def plot_regplot(self, a, b, ax=None, figsize=None, **kwargs): matplotlib.axes.Axes The matplotlib axes containing the plot. + See Also + -------- + fuc.api.pymaf.MafFrame.plot_regplot_tmb + Similar method for the :meth:`fuc.api.pymaf.MafFrame` class. + Examples -------- @@ -3512,7 +3567,7 @@ def plot_regplot(self, a, b, ax=None, figsize=None, **kwargs): >>> normal.name = 'Normal' >>> tumor = af.df[af.df.Tissue == 'Tumor'].index >>> tumor.name = 'Tumor' - >>> vf.plot_regplot(normal, tumor) + >>> vf.plot_regplot_tmb(normal, tumor) Results for B ~ A: R^2 = 0.01 P = 7.17e-01 @@ -6378,6 +6433,53 @@ def compare(self, other): df = df[['Locus', 'Sample', 'Self', 'Other']] return df + def diploidize(self): + """ + Diploidize VcfFrame. + + Returns + ------- + VcfFrame + Diploidized VcfFrame. + + See Also + -------- + gt_diploidize + For given genotype, return its diploid form. + + Examples + -------- + + >>> from fuc import pyvcf + >>> data = { + ... 'CHROM': ['chrX', 'chrX'], + ... 'POS': [100, 101], + ... 'ID': ['.', '.'], + ... 'REF': ['G', 'T'], + ... 'ALT': ['A', 'C'], + ... 'QUAL': ['.', '.'], + ... 'FILTER': ['.', '.'], + ... 'INFO': ['.', '.'], + ... 'FORMAT': ['GT', 'GT'], + ... 'Male': ['0', '1'], + ... 'Female': ['0/0', '0/1'], + ... } + >>> vf = pyvcf.VcfFrame.from_dict([], data) + >>> vf.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Male Female + 0 chrX 100 . G A . . . GT 0 0/0 + 1 chrX 101 . T C . . . GT 1 0/1 + >>> vf.diploidize().df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Male Female + 0 chrX 100 . G A . . . GT 0/0 0/0 + 1 chrX 101 . T C . . . GT 0/1 0/1 + """ + def one_row(r): + r[9:] = r[9:].apply(gt_diploidize) + return r + df = self.df.apply(one_row, axis=1) + return self.__class__(self.copy_meta(), df) + def fetch(self, variant): """ Fetch the VCF row that matches specified variant. @@ -6463,7 +6565,7 @@ def pseudophase(self): 1 chr2 101 . T C . . . GT 1|1 """ def one_row(r): - r[9:] = r[9:].apply(pseudophase) + r[9:] = r[9:].apply(gt_pseudophase) return r df = self.df.apply(one_row, axis=1) return self.__class__(self.copy_meta(), df) @@ -6472,7 +6574,10 @@ def get_af(self, sample, variant): """ Get allele fraction for a pair of sample and variant. - The method will return ``numpy.nan`` if the value is missing. + The method will return ``numpy.nan`` when: + + 1. variant is absent, or + 2. variant is present but there is no ``AF`` in the ``FORMAT`` column Parameters ---------- @@ -6491,32 +6596,48 @@ def get_af(self, sample, variant): >>> from fuc import pyvcf, common >>> data = { - ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1'], - ... 'POS': [100, 101, 102, 103], - ... 'ID': ['.', '.', '.', '.'], - ... 'REF': ['A', 'G', 'A', 'C'], - ... 'ALT': ['C', 'T', 'G', 'G,A'], - ... 'QUAL': ['.', '.', '.', '.'], - ... 'FILTER': ['.', '.', '.', '.'], - ... 'INFO': ['.', '.', '.', '.'], - ... 'FORMAT': ['GT:AD:AF', 'GT:AD:AF', 'GT:AF', 'GT:AD:AF'], - ... 'A': ['0/1:12,15:0.444,0.556', '0/0:32,1:0.970,0.030', '0/1:.', './.:.:.'], - ... 'B': ['0/1:13,17:0.433,0.567', '0/1:14,15:0.483,0.517', './.:.', '1/2:0,11,17:0.000,0.393,0.607'], + ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'], + ... 'POS': [100, 100, 101, 102, 103], + ... 'ID': ['.', '.', '.', '.', '.'], + ... 'REF': ['A', 'A', 'G', 'A', 'C'], + ... 'ALT': ['C', 'T', 'T', 'G', 'G,A'], + ... 'QUAL': ['.', '.', '.', '.', '.'], + ... 'FILTER': ['.', '.', '.', '.', '.'], + ... 'INFO': ['.', '.', '.', '.', '.'], + ... 'FORMAT': ['GT:AD:AF', 'GT:AD:AF', 'GT:AD:AF', 'GT:AF', 'GT:AD:AF'], + ... 'A': ['0/1:12,15:0.444,0.556', '0/0:31,0:1.000,0.000', '0/0:32,1:0.970,0.030', '0/1:.', './.:.:.'], + ... 'B': ['0/0:29,0:1.000,0.000', '0/1:13,17:0.433,0.567', '0/1:14,15:0.483,0.517', './.:.', '1/2:0,11,17:0.000,0.393,0.607'], ... } >>> vf = pyvcf.VcfFrame.from_dict([], data) >>> vf.df CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B - 0 chr1 100 . A C . . . GT:AD:AF 0/1:12,15:0.444,0.556 0/1:13,17:0.433,0.567 - 1 chr1 101 . G T . . . GT:AD:AF 0/0:32,1:0.970,0.030 0/1:14,15:0.483,0.517 - 2 chr1 102 . A G . . . GT:AF 0/1:. ./.:. - 3 chr1 103 . C G,A . . . GT:AD:AF ./.:.:. 1/2:0,11,17:0.000,0.393,0.607 + 0 chr1 100 . A C . . . GT:AD:AF 0/1:12,15:0.444,0.556 0/0:29,0:1.000,0.000 + 1 chr1 100 . A T . . . GT:AD:AF 0/0:31,0:1.000,0.000 0/1:13,17:0.433,0.567 + 2 chr1 101 . G T . . . GT:AD:AF 0/0:32,1:0.970,0.030 0/1:14,15:0.483,0.517 + 3 chr1 102 . A G . . . GT:AF 0/1:. ./.:. + 4 chr1 103 . C G,A . . . GT:AD:AF ./.:.:. 1/2:0,11,17:0.000,0.393,0.607 >>> vf.get_af('A', 'chr1-100-A-C') 0.556 - >>> vf.get_af('B', 'chr1-102-A-G') + >>> vf.get_af('A', 'chr1-100-A-T') + 0.0 + >>> vf.get_af('B', 'chr1-100-A-T') + 0.567 + >>> vf.get_af('B', 'chr1-100-A-G') # does not exist + nan + >>> vf.get_af('B', 'chr1-102-A-G') # missing AF data nan + >>> vf.get_af('B', 'chr1-103-C-A') # multiallelic locus + 0.607 """ chrom, pos, ref, alt = common.parse_variant(variant) - r = self.df[(self.df.CHROM == chrom) & (self.df.POS == pos) & (self.df.REF == ref)] + + i = self.df.apply(lambda r: ((r.CHROM == chrom) & (r.POS == pos) & + (r.REF == ref) & (alt in r.ALT.split(','))), axis=1) + + if i.any(): + r = self.df[i] + else: + return np.nan try: i = r.FORMAT.values[0].split(':').index('AF') @@ -6525,10 +6646,7 @@ def get_af(self, sample, variant): alts = r.ALT.values[0].split(',') - if alt in alts: - j = r.ALT.values[0].split(',').index(alt) - else: - raise ValueError(f'ALT allele not found, possible choices: {alts}') + j = r.ALT.values[0].split(',').index(alt) field = r[sample].values[0].split(':')[i] diff --git a/fuc/version.py b/fuc/version.py index cac7112..2670d05 100644 --- a/fuc/version.py +++ b/fuc/version.py @@ -1 +1 @@ -__version__ = '0.34.0' +__version__ = '0.35.0'