From fe3500d1e352d9b513f6bf7baa79171737ba13d0 Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Tue, 12 Jul 2022 09:56:20 +0900
Subject: [PATCH 1/9] Bump up version number

---
 CHANGELOG.rst  | 3 +++
 fuc/version.py | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index e809a45..7ef83c5 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -1,6 +1,9 @@
 Changelog
 *********
 
+0.36.0 (in development)
+-----------------------
+
 0.35.0 (2022-07-12)
 -------------------
 
diff --git a/fuc/version.py b/fuc/version.py
index 2670d05..aae5aca 100644
--- a/fuc/version.py
+++ b/fuc/version.py
@@ -1 +1 @@
-__version__ = '0.35.0'
+__version__ = '0.36.0'

From f1af96423a0dce30c8dfc465c2e4ace36e171c1e Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Thu, 28 Jul 2022 09:56:33 +0900
Subject: [PATCH 2/9] Update ``pyvcf`` submodule to accept "sites-only" VCF

---
 CHANGELOG.rst    |  2 ++
 data/vcf/3.vcf   |  7 +++++++
 fuc/api/pyvcf.py | 16 ++++++++++++++--
 test.py          |  4 ++++
 4 files changed, 27 insertions(+), 2 deletions(-)
 create mode 100644 data/vcf/3.vcf

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 7ef83c5..19c78a7 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -4,6 +4,8 @@ Changelog
 0.36.0 (in development)
 -----------------------
 
+* Update ``pyvcf`` submodule to accept "sites-only" VCF.
+
 0.35.0 (2022-07-12)
 -------------------
 
diff --git a/data/vcf/3.vcf b/data/vcf/3.vcf
new file mode 100644
index 0000000..711330f
--- /dev/null
+++ b/data/vcf/3.vcf
@@ -0,0 +1,7 @@
+##fileformat=VCFv4.2							
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+chr1	100	.	A	T,C	.	.	.
+chr1	101	.	G	T	.	.	.
+chr2	1055	.	T	G	.	.	.
+chr2	3345	.	A	C	.	.	.
+chr2	5594	.	T	G	.	.	.
\ No newline at end of file
diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py
index 4efdb28..7a412fe 100644
--- a/fuc/api/pyvcf.py
+++ b/fuc/api/pyvcf.py
@@ -86,7 +86,8 @@
 do not contain the FORMAT column or sample-specific information. These are
 called "sites-only" VCF files, and normally represent genetic variation that
 has been observed in a large population. Generally, information about the
-population of origin should be included in the header.
+population of origin should be included in the header. Note that the pyvcf
+submodule supports these sites-only VCF files as well.
 
 There are several reserved keywords in the INFO and FORMAT columns that are
 standards across the community. Popular keywords are listed below:
@@ -1577,6 +1578,8 @@ class VcfFrame:
     """
     Class for storing VCF data.
 
+    Sites-only VCF files are supported.
+
     Parameters
     ----------
     meta : list
@@ -1624,7 +1627,16 @@ class VcfFrame:
 
     def _check_df(self, df):
         df = df.reset_index(drop=True)
-        df = df.astype(HEADERS)
+        headers = HEADERS.copy()
+        # Handle "sites-only" VCF.
+        if 'FORMAT' not in df.columns:
+            del headers['FORMAT']
+            if set(df.columns) != set(headers):
+                raise ValueError("The input appears to be a sites-only VCF "
+                                 "because it's missing the FORMAT column; "
+                                 "however, it contains one or more incorrect "
+                                 f"columns: {df.columns.to_list()}.")
+        df = df.astype(headers)
         return df
 
     def __init__(self, meta, df):
diff --git a/test.py b/test.py
index f55be54..0054d78 100644
--- a/test.py
+++ b/test.py
@@ -51,6 +51,10 @@ def test_subset(self):
         vf = vf.subset(['Sarah', 'John'])
         self.assertEqual(len(vf.samples), 2)
 
+    def test_sites_only(self):
+        vf = pyvcf.VcfFrame.from_file(vcf_file3)
+        self.assertEqual(vf.shape, (5, 0))
+
 class TestPybed(unittest.TestCase):
 
     def test_intersect(self):

From ef3039aed1190345310431f56cd55ffc712803a3 Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Mon, 1 Aug 2022 08:36:57 +0900
Subject: [PATCH 3/9] Update docs

---
 README.rst     | 8 ++++++++
 docs/create.py | 8 ++++++++
 2 files changed, 16 insertions(+)

diff --git a/README.rst b/README.rst
index d7f087e..812b0fa 100644
--- a/README.rst
+++ b/README.rst
@@ -57,6 +57,14 @@ Your contributions (e.g. feature ideas, pull requests) are most welcome.
 | Email: sbstevenlee@gmail.com
 | License: MIT License
 
+Citation
+========
+
+If you use fuc in a published analysis, please report the program version
+and cite the following article:
+
+Lee et al., 2022. `ClinPharmSeq: A targeted sequencing panel for clinical pharmacogenetics implementation <https://doi.org/10.1371/journal.pone.0272129>`__. PLOS ONE.
+
 Installation
 ============
 
diff --git a/docs/create.py b/docs/create.py
index 27d9cf9..a215862 100644
--- a/docs/create.py
+++ b/docs/create.py
@@ -85,6 +85,14 @@
 | Email: sbstevenlee@gmail.com
 | License: MIT License
 
+Citation
+========
+
+If you use fuc in a published analysis, please report the program version
+and cite the following article:
+
+Lee et al., 2022. `ClinPharmSeq: A targeted sequencing panel for clinical pharmacogenetics implementation <https://doi.org/10.1371/journal.pone.0272129>`__. PLOS ONE.
+
 Installation
 ============
 

From d0427227eb0f20b614340b5460eb4e2c9a7905d0 Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Mon, 8 Aug 2022 08:58:44 +0900
Subject: [PATCH 4/9] Add new method `pyvcf.VcfFrame.filter_gsa`

---
 CHANGELOG.rst    |  1 +
 fuc/api/pyvcf.py | 81 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 82 insertions(+)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 19c78a7..211b728 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -5,6 +5,7 @@ Changelog
 -----------------------
 
 * Update ``pyvcf`` submodule to accept "sites-only" VCF.
+* Add new method :meth:`pyvcf.VcfFrame.filter_gsa`.
 
 0.35.0 (2022-07-12)
 -------------------
diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py
index 7a412fe..edf688b 100644
--- a/fuc/api/pyvcf.py
+++ b/fuc/api/pyvcf.py
@@ -4368,6 +4368,87 @@ def f(r):
             return self.__class__(self.copy_meta(), self.copy_df())
         return self.__class__(self.copy_meta(), self.df[i])
 
+    def filter_gsa(self, opposite=False, as_index=False):
+        """
+        Filter rows specific to Illumina's GSA array.
+
+        This function will remove variants that are specific to Illumina's
+        Infinium Global Screening Array (GSA). More specifically, variants
+        are removed if either REF or ALT is exactly one of the values
+        {'I', 'D', 'N', '.'}.
+
+        Parameters
+        ----------
+        opposite : bool, default: False
+            If True, return rows that don't meet the said criteria.
+        as_index : bool, default: False
+            If True, return boolean index array instead of VcfFrame.
+
+        Returns
+        -------
+        VcfFrame or pandas.Series
+            Filtered VcfFrame or boolean index array.
+
+        Examples
+        --------
+        Assume we have the following data:
+
+        >>> from fuc import pyvcf
+        >>> data = {
+        ...     'CHROM': ['chr1', 'chr1', 'chr1', 'chr1'],
+        ...     'POS': [100, 101, 102, 103],
+        ...     'ID': ['.', '.', '.', '.'],
+        ...     'REF': ['D', 'N', 'A', 'C'],
+        ...     'ALT': ['I', '.', '.', 'A'],
+        ...     'QUAL': ['.', '.', '.', '.'],
+        ...     'FILTER': ['.', '.', '.', '.'],
+        ...     'INFO': ['.', '.', '.', '.'],
+        ...     'FORMAT': ['GT', 'GT', 'GT', 'GT'],
+        ...     'Steven': ['0/1', '0/0', './.', '0/1'],
+        ... }
+        >>> vf = pyvcf.VcfFrame.from_dict([], data)
+        >>> vf.df
+          CHROM  POS ID REF ALT QUAL FILTER INFO FORMAT Steven
+        0  chr1  100  .   D   I    .      .    .     GT    0/1
+        1  chr1  101  .   N   .    .      .    .     GT    0/0
+        2  chr1  102  .   A   .    .      .    .     GT    ./.
+        3  chr1  103  .   C   A    .      .    .     GT    0/1
+
+        We can remove rows that are GSA-specific:
+
+        >>> vf.filter_gsa().df
+          CHROM  POS ID REF ALT QUAL FILTER INFO FORMAT Steven
+        0  chr1  103  .   C   A    .      .    .     GT    0/1
+
+        We can also select those rows:
+
+        >>> vf.filter_gsa(opposite=True).df
+          CHROM  POS ID REF ALT QUAL FILTER INFO FORMAT Steven
+        0  chr1  100  .   D   I    .      .    .     GT    0/1
+        1  chr1  101  .   N   .    .      .    .     GT    0/0
+        2  chr1  102  .   A   .    .      .    .     GT    ./.
+
+        Finally, we can return boolean index array from the filtering:
+
+        >>> vf.filter_gsa(as_index=True)
+        0    False
+        1    False
+        2    False
+        3     True
+        dtype: bool
+        """
+        def one_row(r):
+            alleles = ['I', 'D', '.', 'N']
+            return r.REF in alleles or r.ALT in alleles
+        i = ~self.df.apply(one_row, axis=1)
+        if opposite:
+            i = ~i
+        if as_index:
+            return i
+        if i.empty:
+            return self.__class__(self.copy_meta(), self.copy_df())
+        return self.__class__(self.copy_meta(), self.df[i])
+
     def filter_indel(self, opposite=False, as_index=False):
         """
         Filter rows with indel.

From 8771b4166ba1b4ab314dc1e7d66df4997e318fdc Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Mon, 8 Aug 2022 10:06:29 +0900
Subject: [PATCH 5/9] Add new method `pyvcf.VcfFrame.duplicated`

---
 CHANGELOG.rst    |  1 +
 fuc/api/pyvcf.py | 67 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 211b728..92cb0c4 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -6,6 +6,7 @@ Changelog
 
 * Update ``pyvcf`` submodule to accept "sites-only" VCF.
 * Add new method :meth:`pyvcf.VcfFrame.filter_gsa`.
+* Add new method :meth:`pyvcf.VcfFrame.duplicated`.
 
 0.35.0 (2022-07-12)
 -------------------
diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py
index edf688b..8211b5b 100644
--- a/fuc/api/pyvcf.py
+++ b/fuc/api/pyvcf.py
@@ -5945,6 +5945,73 @@ def rename(self, names, indicies=None):
         vf.df.columns = columns
         return vf
 
+    def duplicated(self, subset=None, keep='first'):
+        """
+        Return boolean Series denoting duplicate rows in VcfFrame.
+
+        This method essentially wraps the :meth:`pandas.DataFrame.duplicated`
+        method.
+
+        Considering certain columns is optional.
+
+        Parameters
+        ----------
+        subset : column label or sequence of labels, optional
+            Only consider certain columns for identifying duplicates, by
+            default use all of the columns.
+        keep : {'first', 'last', False}, default 'first'
+            Determines which duplicates (if any) to keep.
+
+            - ``first`` : Mark duplicates as ``True`` except for the first
+              occurrence.
+            - ``last`` : Mark duplicates as ``True`` except for the last
+              occurrence.
+            - False : Mark all duplicates as ``True``.
+
+        Returns
+        -------
+        Series
+            Boolean Series marking each duplicated row.
+
+        Examples
+        --------
+
+        >>> from fuc import pyvcf
+        >>> data = {
+        ...     'CHROM': ['chr1', 'chr1', 'chr2', 'chr2'],
+        ...     'POS': [100, 100, 200, 200],
+        ...     'ID': ['.', '.', '.', '.'],
+        ...     'REF': ['A', 'A', 'C', 'C'],
+        ...     'ALT': ['C', 'T', 'G', 'G,A'],
+        ...     'QUAL': ['.', '.', '.', '.'],
+        ...     'FILTER': ['.', '.', '.', '.'],
+        ...     'INFO': ['.', '.', '.', '.'],
+        ...     'FORMAT': ['GT', 'GT', 'GT', 'GT'],
+        ...     'A': ['0/1', './.', '0/1', './.'],
+        ...     'B': ['./.', '0/1', './.', '1/2'],
+        ... }
+        >>> vf = pyvcf.VcfFrame.from_dict([], data)
+        >>> vf.df
+          CHROM  POS ID REF  ALT QUAL FILTER INFO FORMAT    A    B
+        0  chr1  100  .   A    C    .      .    .     GT  0/1  ./.
+        1  chr1  100  .   A    T    .      .    .     GT  ./.  0/1
+        2  chr2  200  .   C    G    .      .    .     GT  0/1  ./.
+        3  chr2  200  .   C  G,A    .      .    .     GT  ./.  1/2
+        >>> vf.duplicated(['CHROM', 'POS', 'REF'])
+        0    False
+        1     True
+        2    False
+        3     True
+        dtype: bool
+        >>> vf.duplicated(['CHROM', 'POS', 'REF'], keep='last')
+        0     True
+        1    False
+        2     True
+        3    False
+        dtype: bool
+        """
+        return self.df.duplicated(subset=subset, keep=keep)
+
     def drop_duplicates(self, subset=None, keep='first'):
         """
         Return VcfFrame with duplicate rows removed.

From 9b6bc1498427e867d1d8e3766d4c2e946b219d96 Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Mon, 8 Aug 2022 15:01:03 +0900
Subject: [PATCH 6/9] Update docs

---
 CHANGELOG.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 92cb0c4..089ce6c 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -4,6 +4,7 @@ Changelog
 0.36.0 (in development)
 -----------------------
 
+* ``fuc`` now has a citation! Please refer to the publication “`ClinPharmSeq: A targeted sequencing panel for clinical pharmacogenetics implementation <https://doi.org/10.1371/journal.pone.0272129>`__” by Lee et al., 2022 (Steven is the first author). Fore more details, see the Citation section in README.
 * Update ``pyvcf`` submodule to accept "sites-only" VCF.
 * Add new method :meth:`pyvcf.VcfFrame.filter_gsa`.
 * Add new method :meth:`pyvcf.VcfFrame.duplicated`.

From dd92ed8994ee7a044c8b127dc14afe16cb0fa108 Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Tue, 9 Aug 2022 09:04:18 +0900
Subject: [PATCH 7/9] Update `pymaf.MafFrame.plot_regplot_tmb`:

* Add new optional argument ``to_csv`` to
:meth:`pymaf.MafFrame.plot_regplot_tmb` method.
---
 CHANGELOG.rst    | 3 ++-
 fuc/api/pymaf.py | 8 +++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 089ce6c..38a4010 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -8,6 +8,7 @@ Changelog
 * Update ``pyvcf`` submodule to accept "sites-only" VCF.
 * Add new method :meth:`pyvcf.VcfFrame.filter_gsa`.
 * Add new method :meth:`pyvcf.VcfFrame.duplicated`.
+* Add new optional argument ``to_csv`` to :meth:`pymaf.MafFrame.plot_regplot_tmb` method.
 
 0.35.0 (2022-07-12)
 -------------------
@@ -41,7 +42,7 @@ Changelog
 0.32.0 (2022-04-02)
 -------------------
 
-* Add new optional argument ``filter_off`` for :class:`pykallisto.KallistoFrame` constructor, which is useful for generating a simple count or tpm matrix.
+* Add new optional argument ``filter_off`` to :class:`pykallisto.KallistoFrame` constructor, which is useful for generating a simple count or tpm matrix.
 * Add new optional argument ``--dir-path`` to :command:`vcf-call` command for storing intermediate files.
 * Add new optional argument ``--gap_frac`` to :command:`vcf-call` command so that users can control indel calling sensitivity.
 * Add new optional argument ``--group-samples`` to :command:`vcf-call` command so that users can group samples into populations and apply the HWE assumption within but not across the populations.
diff --git a/fuc/api/pymaf.py b/fuc/api/pymaf.py
index cefbaa3..d3ab996 100644
--- a/fuc/api/pymaf.py
+++ b/fuc/api/pymaf.py
@@ -1400,7 +1400,7 @@ def plot_regplot_gene(
 
     def plot_regplot_tmb(
         self, af, subject_col, group_col, a, b, ax=None, figsize=None,
-        **kwargs
+        to_csv=None, **kwargs
     ):
         """
         Create a scatter plot with a linear regression model fit visualizing
@@ -1419,6 +1419,8 @@ def plot_regplot_tmb(
             AnnFrame column containing sample group information.
         a, b : str
             Sample group names.
+        to_csv : str, optional
+            Write the plot's data to a CSV file.
         ax : matplotlib.axes.Axes, optional
             Pre-existing axes for the plot. Otherwise, crete a new one.
         figsize : tuple, optional
@@ -1483,6 +1485,10 @@ def one_row(r):
         print(f'R^2 = {results.rsquared:.2f}')
         print(f'  P = {results.f_pvalue:.2e}')
 
+        # Write the DataFrame to a CSV file.
+        if to_csv is not None:
+            df.to_csv(to_csv)
+
         return ax
 
     def plot_interactions(

From c4b6a426e99309394b4193734123c3577f66c646 Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Wed, 10 Aug 2022 15:07:48 +0900
Subject: [PATCH 8/9] Update `pymaf.MafFrame.plot_mutated_matched`:

* Add new optional argument ``count`` to
:meth:`pymaf.MafFrame.plot_mutated_matched` method.
---
 CHANGELOG.rst    | 1 +
 fuc/api/pymaf.py | 8 +++++---
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 38a4010..204b048 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -9,6 +9,7 @@ Changelog
 * Add new method :meth:`pyvcf.VcfFrame.filter_gsa`.
 * Add new method :meth:`pyvcf.VcfFrame.duplicated`.
 * Add new optional argument ``to_csv`` to :meth:`pymaf.MafFrame.plot_regplot_tmb` method.
+* Add new optional argument ``count`` to :meth:`pymaf.MafFrame.plot_mutated_matched` method.
 
 0.35.0 (2022-07-12)
 -------------------
diff --git a/fuc/api/pymaf.py b/fuc/api/pymaf.py
index d3ab996..aaffa10 100644
--- a/fuc/api/pymaf.py
+++ b/fuc/api/pymaf.py
@@ -1811,8 +1811,8 @@ def plot_mutated(
         return ax
 
     def plot_mutated_matched(
-        self, af, patient_col, group_col, group_order, ax=None, figsize=None,
-        **kwargs
+        self, af, patient_col, group_col, group_order, count=10, ax=None,
+        figsize=None, **kwargs
     ):
         """
         Create a bar plot visualizing the mutation prevalence of top
@@ -1828,6 +1828,8 @@ def plot_mutated_matched(
             AnnFrame column containing sample group information.
         group_order : list
             List of sample group names.
+        count : int, default: 10
+            Number of top mutated genes to display.
         ax : matplotlib.axes.Axes, optional
             Pre-existing axes for the plot. Otherwise, crete a new one.
         figsize : tuple, optional
@@ -1841,7 +1843,7 @@ def plot_mutated_matched(
         matplotlib.axes.Axes
             The matplotlib axes containing the plot.
         """
-        df = self.matrix_waterfall_matched(af, patient_col, group_col, group_order)
+        df = self.matrix_waterfall_matched(af, patient_col, group_col, group_order, count=count)
         df = df.applymap(lambda x: 0 if x == 'None' else 1)
         s = df.sum(axis=1) / len(df.columns) * 100
         s.name = 'Count'

From b1da2a1f4f2d052814d020efcddb95bfc1d700b6 Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Fri, 12 Aug 2022 07:57:14 +0900
Subject: [PATCH 9/9] Update docs

---
 CHANGELOG.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 204b048..b444d25 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -1,8 +1,8 @@
 Changelog
 *********
 
-0.36.0 (in development)
------------------------
+0.36.0 (2022-08-12)
+-------------------
 
 * ``fuc`` now has a citation! Please refer to the publication “`ClinPharmSeq: A targeted sequencing panel for clinical pharmacogenetics implementation <https://doi.org/10.1371/journal.pone.0272129>`__” by Lee et al., 2022 (Steven is the first author). Fore more details, see the Citation section in README.
 * Update ``pyvcf`` submodule to accept "sites-only" VCF.