From fe3500d1e352d9b513f6bf7baa79171737ba13d0 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Tue, 12 Jul 2022 09:56:20 +0900 Subject: [PATCH 1/9] Bump up version number --- CHANGELOG.rst | 3 +++ fuc/version.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index e809a45..7ef83c5 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,9 @@ Changelog ********* +0.36.0 (in development) +----------------------- + 0.35.0 (2022-07-12) ------------------- diff --git a/fuc/version.py b/fuc/version.py index 2670d05..aae5aca 100644 --- a/fuc/version.py +++ b/fuc/version.py @@ -1 +1 @@ -__version__ = '0.35.0' +__version__ = '0.36.0' From f1af96423a0dce30c8dfc465c2e4ace36e171c1e Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Thu, 28 Jul 2022 09:56:33 +0900 Subject: [PATCH 2/9] Update ``pyvcf`` submodule to accept "sites-only" VCF --- CHANGELOG.rst | 2 ++ data/vcf/3.vcf | 7 +++++++ fuc/api/pyvcf.py | 16 ++++++++++++++-- test.py | 4 ++++ 4 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 data/vcf/3.vcf diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 7ef83c5..19c78a7 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,6 +4,8 @@ Changelog 0.36.0 (in development) ----------------------- +* Update ``pyvcf`` submodule to accept "sites-only" VCF. + 0.35.0 (2022-07-12) ------------------- diff --git a/data/vcf/3.vcf b/data/vcf/3.vcf new file mode 100644 index 0000000..711330f --- /dev/null +++ b/data/vcf/3.vcf @@ -0,0 +1,7 @@ +##fileformat=VCFv4.2 +#CHROM POS ID REF ALT QUAL FILTER INFO +chr1 100 . A "T,C" . . . +chr1 101 . G T . . . +chr2 1055 . T G . . . +chr2 3345 . A C . . . +chr2 5594 . T G . . . \ No newline at end of file diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index 4efdb28..7a412fe 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -86,7 +86,8 @@ do not contain the FORMAT column or sample-specific information. 
These are called "sites-only" VCF files, and normally represent genetic variation that has been observed in a large population. Generally, information about the -population of origin should be included in the header. +population of origin should be included in the header. Note that the pyvcf +submodule supports these sites-only VCF files as well. There are several reserved keywords in the INFO and FORMAT columns that are standards across the community. Popular keywords are listed below: @@ -1577,6 +1578,8 @@ class VcfFrame: """ Class for storing VCF data. + Sites-only VCF files are supported. + Parameters ---------- meta : list @@ -1624,7 +1627,16 @@ class VcfFrame: def _check_df(self, df): df = df.reset_index(drop=True) - df = df.astype(HEADERS) + headers = HEADERS.copy() + # Handle "sites-only" VCF. + if 'FORMAT' not in df.columns: + del headers['FORMAT'] + if set(df.columns) != set(headers): + raise ValueError("The input appears to be a sites-only VCF " + "because it's missing the FORMAT column; " + "however, it contains one or more incorrect " + f"columns: {df.columns.to_list()}.") + df = df.astype(headers) return df def __init__(self, meta, df): diff --git a/test.py b/test.py index f55be54..0054d78 100644 --- a/test.py +++ b/test.py @@ -51,6 +51,10 @@ def test_subset(self): vf = vf.subset(['Sarah', 'John']) self.assertEqual(len(vf.samples), 2) + def test_sites_only(self): + vf = pyvcf.VcfFrame.from_file(vcf_file3) + self.assertEqual(vf.shape, (5, 0)) + class TestPybed(unittest.TestCase): def test_intersect(self): From ef3039aed1190345310431f56cd55ffc712803a3 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Mon, 1 Aug 2022 08:36:57 +0900 Subject: [PATCH 3/9] Update docs --- README.rst | 8 ++++++++ docs/create.py | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/README.rst b/README.rst index d7f087e..812b0fa 100644 --- a/README.rst +++ b/README.rst @@ -57,6 +57,14 @@ Your contributions (e.g. 
feature ideas, pull requests) are most welcome. | Email: sbstevenlee@gmail.com | License: MIT License +Citation +======== + +If you use fuc in a published analysis, please report the program version +and cite the following article: + +Lee et al., 2022. `ClinPharmSeq: A targeted sequencing panel for clinical pharmacogenetics implementation `__. PLOS ONE. + Installation ============ diff --git a/docs/create.py b/docs/create.py index 27d9cf9..a215862 100644 --- a/docs/create.py +++ b/docs/create.py @@ -85,6 +85,14 @@ | Email: sbstevenlee@gmail.com | License: MIT License +Citation +======== + +If you use fuc in a published analysis, please report the program version +and cite the following article: + +Lee et al., 2022. `ClinPharmSeq: A targeted sequencing panel for clinical pharmacogenetics implementation `__. PLOS ONE. + Installation ============ From d0427227eb0f20b614340b5460eb4e2c9a7905d0 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Mon, 8 Aug 2022 08:58:44 +0900 Subject: [PATCH 4/9] Add new method `pyvcf.VcfFrame.filter_gsa` --- CHANGELOG.rst | 1 + fuc/api/pyvcf.py | 81 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 19c78a7..211b728 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -5,6 +5,7 @@ Changelog ----------------------- * Update ``pyvcf`` submodule to accept "sites-only" VCF. +* Add new method :meth:`pyvcf.VcfFrame.filter_gsa`. 0.35.0 (2022-07-12) ------------------- diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index 7a412fe..edf688b 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -4368,6 +4368,87 @@ def f(r): return self.__class__(self.copy_meta(), self.copy_df()) return self.__class__(self.copy_meta(), self.df[i]) + def filter_gsa(self, opposite=False, as_index=False): + """ + Filter rows specific to Illumina's GSA array. + + This function will remove variants that are specific to Illumina's + Infinium Global Screening Array (GSA). 
More specifically, variants + are removed if they contain one of the characters {'I', 'D', 'N', + '.'} as either REF or ALT. + + Parameters + ---------- + opposite : bool, default: False + If True, return rows that don't meet the said criteria. + as_index : bool, default: False + If True, return boolean index array instead of VcfFrame. + + Returns + ------- + VcfFrame or pandas.Series + Filtered VcfFrame or boolean index array. + + Examples + -------- + Assume we have the following data: + + >>> from fuc import pyvcf + >>> data = { + ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1'], + ... 'POS': [100, 101, 102, 103], + ... 'ID': ['.', '.', '.', '.'], + ... 'REF': ['D', 'N', 'A', 'C'], + ... 'ALT': ['I', '.', '.', 'A'], + ... 'QUAL': ['.', '.', '.', '.'], + ... 'FILTER': ['.', '.', '.', '.'], + ... 'INFO': ['.', '.', '.', '.'], + ... 'FORMAT': ['GT', 'GT', 'GT', 'GT'], + ... 'Steven': ['0/1', '0/0', './.', '0/1'], + ... } + >>> vf = pyvcf.VcfFrame.from_dict([], data) + >>> vf.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven + 0 chr1 100 . D I . . . GT 0/1 + 1 chr1 101 . N . . . . GT 0/0 + 2 chr1 102 . A . . . . GT ./. + 3 chr1 103 . C A . . . GT 0/1 + + We can remove rows that are GSA-specific: + + >>> vf.filter_gsa().df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven + 0 chr1 103 . C A . . . GT 0/1 + + We can also select those rows: + + >>> vf.filter_gsa(opposite=True).df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven + 0 chr1 100 . D I . . . GT 0/1 + 1 chr1 101 . N . . . . GT 0/0 + 2 chr1 102 . A . . . . GT ./. 
+ + Finally, we can return boolean index array from the filtering: + + >>> vf.filter_gsa(as_index=True) + 0 False + 1 False + 2 False + 3 True + dtype: bool + """ + def one_row(r): + alleles = ['I', 'D', '.', 'N'] + return r.REF in alleles or r.ALT in alleles + i = ~self.df.apply(one_row, axis=1) + if opposite: + i = ~i + if as_index: + return i + if i.empty: + return self.__class__(self.copy_meta(), self.copy_df()) + return self.__class__(self.copy_meta(), self.df[i]) + def filter_indel(self, opposite=False, as_index=False): """ Filter rows with indel. From 8771b4166ba1b4ab314dc1e7d66df4997e318fdc Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Mon, 8 Aug 2022 10:06:29 +0900 Subject: [PATCH 5/9] Add new method `pyvcf.VcfFrame.duplicated` --- CHANGELOG.rst | 1 + fuc/api/pyvcf.py | 67 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 211b728..92cb0c4 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,6 +6,7 @@ Changelog * Update ``pyvcf`` submodule to accept "sites-only" VCF. * Add new method :meth:`pyvcf.VcfFrame.filter_gsa`. +* Add new method :meth:`pyvcf.VcfFrame.duplicated`. 0.35.0 (2022-07-12) ------------------- diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index edf688b..8211b5b 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -5945,6 +5945,73 @@ def rename(self, names, indicies=None): vf.df.columns = columns return vf + def duplicated(self, subset=None, keep='first'): + """ + Return boolean Series denoting duplicate rows in VcfFrame. + + This method essentially wraps the :meth:`pandas.DataFrame.duplicated` + method. + + Considering certain columns is optional. + + Parameters + ---------- + subset : column label or sequence of labels, optional + Only consider certain columns for identifying duplicates, by + default use all of the columns. + keep : {'first', 'last', False}, default 'first' + Determines which duplicates (if any) to keep. 
+ + - ``first`` : Mark duplicates as ``True`` except for the first + occurrence. + - ``last`` : Mark duplicates as ``True`` except for the last + occurrence. + - False : Mark all duplicates as ``True``. + + Returns + ------- + Series + Boolean series for each duplicated rows. + + Examples + -------- + + >>> from fuc import pyvcf + >>> data = { + ... 'CHROM': ['chr1', 'chr1', 'chr2', 'chr2'], + ... 'POS': [100, 100, 200, 200], + ... 'ID': ['.', '.', '.', '.'], + ... 'REF': ['A', 'A', 'C', 'C'], + ... 'ALT': ['C', 'T', 'G', 'G,A'], + ... 'QUAL': ['.', '.', '.', '.'], + ... 'FILTER': ['.', '.', '.', '.'], + ... 'INFO': ['.', '.', '.', '.'], + ... 'FORMAT': ['GT', 'GT', 'GT', 'GT'], + ... 'A': ['0/1', './.', '0/1', './.'], + ... 'B': ['./.', '0/1', './.', '1/2'], + ... } + >>> vf = pyvcf.VcfFrame.from_dict([], data) + >>> vf.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B + 0 chr1 100 . A C . . . GT 0/1 ./. + 1 chr1 100 . A T . . . GT ./. 0/1 + 2 chr2 200 . C G . . . GT 0/1 ./. + 3 chr2 200 . C G,A . . . GT ./. 1/2 + >>> vf.duplicated(['CHROM', 'POS', 'REF']) + 0 False + 1 True + 2 False + 3 True + dtype: bool + >>> vf.duplicated(['CHROM', 'POS', 'REF'], keep='last') + 0 True + 1 False + 2 True + 3 False + dtype: bool + """ + return self.df.duplicated(subset=subset, keep=keep) + def drop_duplicates(self, subset=None, keep='first'): """ Return VcfFrame with duplicate rows removed. From 9b6bc1498427e867d1d8e3766d4c2e946b219d96 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Mon, 8 Aug 2022 15:01:03 +0900 Subject: [PATCH 6/9] Update docs --- CHANGELOG.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 92cb0c4..089ce6c 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,6 +4,7 @@ Changelog 0.36.0 (in development) ----------------------- +* ``fuc`` now has a citation! 
Please refer to the publication “`ClinPharmSeq: A targeted sequencing panel for clinical pharmacogenetics implementation `__” by Lee et al., 2022 (Steven is the first author). Fore more details, see the Citation section in README. * Update ``pyvcf`` submodule to accept "sites-only" VCF. * Add new method :meth:`pyvcf.VcfFrame.filter_gsa`. * Add new method :meth:`pyvcf.VcfFrame.duplicated`. From dd92ed8994ee7a044c8b127dc14afe16cb0fa108 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Tue, 9 Aug 2022 09:04:18 +0900 Subject: [PATCH 7/9] Update `pymaf.MafFrame.plot_regplot_tmb`: * Add new optional argument ``to_csv`` to :meth:`pymaf.MafFrame.plot_regplot_tmb` method. --- CHANGELOG.rst | 3 ++- fuc/api/pymaf.py | 8 +++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 089ce6c..38a4010 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -8,6 +8,7 @@ Changelog * Update ``pyvcf`` submodule to accept "sites-only" VCF. * Add new method :meth:`pyvcf.VcfFrame.filter_gsa`. * Add new method :meth:`pyvcf.VcfFrame.duplicated`. +* Add new optional argument ``to_csv`` to :meth:`pymaf.MafFrame.plot_regplot_tmb` method. 0.35.0 (2022-07-12) ------------------- @@ -41,7 +42,7 @@ Changelog 0.32.0 (2022-04-02) ------------------- -* Add new optional argument ``filter_off`` for :class:`pykallisto.KallistoFrame` constructor, which is useful for generating a simple count or tpm matrix. +* Add new optional argument ``filter_off`` to :class:`pykallisto.KallistoFrame` constructor, which is useful for generating a simple count or tpm matrix. * Add new optional argument ``--dir-path`` to :command:`vcf-call` command for storing intermediate files. * Add new optional argument ``--gap_frac`` to :command:`vcf-call` command so that users can control indel calling sensitivity. 
* Add new optional argument ``--group-samples`` to :command:`vcf-call` command so that users can group samples into populations and apply the HWE assumption within but not across the populations. diff --git a/fuc/api/pymaf.py b/fuc/api/pymaf.py index cefbaa3..d3ab996 100644 --- a/fuc/api/pymaf.py +++ b/fuc/api/pymaf.py @@ -1400,7 +1400,7 @@ def plot_regplot_gene( def plot_regplot_tmb( self, af, subject_col, group_col, a, b, ax=None, figsize=None, - **kwargs + to_csv=None, **kwargs ): """ Create a scatter plot with a linear regression model fit visualizing @@ -1419,6 +1419,8 @@ def plot_regplot_tmb( AnnFrame column containing sample group information. a, b : str Sample group names. + to_csv : str, optional + Write the plot's data to a CSV file. ax : matplotlib.axes.Axes, optional Pre-existing axes for the plot. Otherwise, crete a new one. figsize : tuple, optional @@ -1483,6 +1485,10 @@ def one_row(r): print(f'R^2 = {results.rsquared:.2f}') print(f' P = {results.f_pvalue:.2e}') + # Write the DataFrame to a CSV file. + if to_csv is not None: + df.to_csv(to_csv) + return ax def plot_interactions( From c4b6a426e99309394b4193734123c3577f66c646 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Wed, 10 Aug 2022 15:07:48 +0900 Subject: [PATCH 8/9] Update `pymaf.MafFrame.plot_mutated_matched`: * Add new optional argument ``count`` to :meth:`pymaf.MafFrame.plot_mutated_matched` method. --- CHANGELOG.rst | 1 + fuc/api/pymaf.py | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 38a4010..204b048 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -9,6 +9,7 @@ Changelog * Add new method :meth:`pyvcf.VcfFrame.filter_gsa`. * Add new method :meth:`pyvcf.VcfFrame.duplicated`. * Add new optional argument ``to_csv`` to :meth:`pymaf.MafFrame.plot_regplot_tmb` method. +* Add new optional argument ``count`` to :meth:`pymaf.MafFrame.plot_mutated_matched` method. 
0.35.0 (2022-07-12) ------------------- diff --git a/fuc/api/pymaf.py b/fuc/api/pymaf.py index d3ab996..aaffa10 100644 --- a/fuc/api/pymaf.py +++ b/fuc/api/pymaf.py @@ -1811,8 +1811,8 @@ def plot_mutated( return ax def plot_mutated_matched( - self, af, patient_col, group_col, group_order, ax=None, figsize=None, - **kwargs + self, af, patient_col, group_col, group_order, count=10, ax=None, + figsize=None, **kwargs ): """ Create a bar plot visualizing the mutation prevalence of top @@ -1828,6 +1828,8 @@ def plot_mutated_matched( AnnFrame column containing sample group information. group_order : list List of sample group names. + count : int, default: 10 + Number of top mutated genes to display. ax : matplotlib.axes.Axes, optional Pre-existing axes for the plot. Otherwise, crete a new one. figsize : tuple, optional @@ -1841,7 +1843,7 @@ def plot_mutated_matched( matplotlib.axes.Axes The matplotlib axes containing the plot. """ - df = self.matrix_waterfall_matched(af, patient_col, group_col, group_order) + df = self.matrix_waterfall_matched(af, patient_col, group_col, group_order, count=count) df = df.applymap(lambda x: 0 if x == 'None' else 1) s = df.sum(axis=1) / len(df.columns) * 100 s.name = 'Count' From b1da2a1f4f2d052814d020efcddb95bfc1d700b6 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Fri, 12 Aug 2022 07:57:14 +0900 Subject: [PATCH 9/9] Update docs --- CHANGELOG.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 204b048..b444d25 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,8 +1,8 @@ Changelog ********* -0.36.0 (in development) ------------------------ +0.36.0 (2022-08-12) +------------------- * ``fuc`` now has a citation! Please refer to the publication “`ClinPharmSeq: A targeted sequencing panel for clinical pharmacogenetics implementation `__” by Lee et al., 2022 (Steven is the first author). Fore more details, see the Citation section in README. 
* Update ``pyvcf`` submodule to accept "sites-only" VCF.