From c2ee025d1ec69d6980507579ef8aa4a9704422f9 Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Thu, 10 Jun 2021 16:18:12 +0900
Subject: [PATCH 01/10] Add pyvcf.VcfFrame.add_af

---
 CHANGELOG.rst    |  5 ++++
 README.rst       |  1 +
 docs/create.py   |  1 +
 fuc/api/pyvcf.py | 65 ++++++++++++++++++++++++++++++++++++++++++++++++
 fuc/version.py   |  2 +-
 5 files changed, 73 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 18679d3..005e738 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -1,6 +1,11 @@
 Changelog
 *********
 
+0.12.0 (in development)
+-----------------------
+
+* Add new method :meth:`pyvcf.VcfFrame.add_af`.
+
 0.11.0 (2021-06-10)
 -------------------
 
diff --git a/README.rst b/README.rst
index adb03b2..c2f0c57 100644
--- a/README.rst
+++ b/README.rst
@@ -245,6 +245,7 @@ The following packages are required to run fuc:
    biopython
    lxml
    matplotlib
+   matplotlib-venn
    numpy
    pandas
    pyranges
diff --git a/docs/create.py b/docs/create.py
index f639cfa..598b398 100644
--- a/docs/create.py
+++ b/docs/create.py
@@ -273,6 +273,7 @@
    biopython
    lxml
    matplotlib
+   matplotlib-venn
    numpy
    pandas
    pyranges
diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py
index e9f62d6..a9c49a3 100644
--- a/fuc/api/pyvcf.py
+++ b/fuc/api/pyvcf.py
@@ -743,6 +743,71 @@ def shape(self):
         """tuple : Dimensionality of VcfFrame (variants, samples)."""
         return (self.df.shape[0], len(self.samples))
 
+    def add_af(self, decimals=3):
+        """Compute AF using AD and add it to the FORMAT field.
+
+        Parameters
+        ----------
+        decimals : int, default: 3
+            Number of decimals to display.
+
+        Returns
+        -------
+        VcfFrame
+            Updated VcfFrame.
+
+        Examples
+        --------
+        Assume we have the following data:
+
+        >>> from fuc import pyvcf
+        >>> data = {
+        ...     'CHROM': ['chr1', 'chr1', 'chr2', 'chr2'],
+        ...     'POS': [100, 100, 200, 200],
+        ...     'ID': ['.', '.', '.', '.'],
+        ...     'REF': ['A', 'A', 'C', 'C'],
+        ...     'ALT': ['C', 'T', 'G', 'G,A'],
+        ...     'QUAL': ['.', '.', '.', '.'],
+        ...     'FILTER': ['.', '.', '.', '.'],
+        ...     'INFO': ['.', '.', '.', '.'],
+        ...     'FORMAT': ['GT:AD', 'GT:AD', 'GT:AD', 'GT:AD'],
+        ...     'Steven': ['0/1:12,15', '0/0:32,1', '0/1:16,12', './.:.'],
+        ...     'Sara': ['0/1:13,17', '0/1:14,15', './.:.', '1/2:0,11,17'],
+        ... }
+        >>> vf = pyvcf.VcfFrame.from_dict([], data)
+        >>> vf.df
+          CHROM  POS ID REF  ALT QUAL FILTER INFO FORMAT     Steven         Sara
+        0  chr1  100  .   A    C    .      .    .  GT:AD  0/1:12,15    0/1:13,17
+        1  chr1  100  .   A    T    .      .    .  GT:AD   0/0:32,1    0/1:14,15
+        2  chr2  200  .   C    G    .      .    .  GT:AD  0/1:16,12        ./.:.
+        3  chr2  200  .   C  G,A    .      .    .  GT:AD      ./.:.  1/2:0,11,17
+
+        We can add the AF subfield to our genotype data:
+
+        >>> vf.add_af().df
+          CHROM  POS ID REF  ... INFO    FORMAT           Steven               Sara
+        0  chr1  100  .   A  ...    .  GT:AD:AF  0/1:12,15:0.556    0/1:13,17:0.567
+        1  chr1  100  .   A  ...    .  GT:AD:AF   0/0:32,1:0.030    0/1:14,15:0.517
+        2  chr2  200  .   C  ...    .  GT:AD:AF  0/1:16,12:0.429            ./.:.:.
+        3  chr2  200  .   C  ...    .  GT:AD:AF          ./.:.:.  1/2:0,11,17:0.607
+        """
+        def one_row(r):
+            i = r.FORMAT.split(':').index('AD')
+            def one_gt(g):
+                ad = g.split(':')[i]
+                if ad == '.':
+                    return f'{g}:.'
+                depths = [int(x) for x in ad.split(',')]
+                total = sum(depths)
+                af = max(depths[1:]) / total
+                return f'{g}:{af:.{decimals}f}'
+            r.iloc[9:] = r.iloc[9:].apply(one_gt)
+            r.FORMAT += ':AF'
+            return r
+        df = self.df.apply(one_row, axis=1)
+        vf = self.__class__(self.copy_meta(), df)
+        return vf
+
     def add_dp(self):
         """Compute DP using AD and add it to the FORMAT field.
 
diff --git a/fuc/version.py b/fuc/version.py
index f323a57..2c7bffb 100644
--- a/fuc/version.py
+++ b/fuc/version.py
@@ -1 +1 @@
-__version__ = '0.11.0'
+__version__ = '0.12.0'

From 4e038dcf4804af0aa8eb2bef61c15fee7690b717 Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Thu, 10 Jun 2021 17:18:45 +0900
Subject: [PATCH 02/10] Update API

---
 fuc/api/pymaf.py | 66 ++++++++++++++++++++++++------------------------
 fuc/api/pyvep.py | 36 ++++++++++++++++++++++++++
 2 files changed, 69 insertions(+), 33 deletions(-)

diff --git a/fuc/api/pymaf.py b/fuc/api/pymaf.py
index 67f9f18..783ef2b 100644
--- a/fuc/api/pymaf.py
+++ b/fuc/api/pymaf.py
@@ -11,39 +11,39 @@
 protein change. However, most of the analysis in pymaf uses the
 following fields:
 
-+------+------------------------+----------------------+-------------------------------+
-| No.  | Name                   | Description          | Examples                      |
-+======+========================+======================+===============================+
-| 1    | Hugo_Symbol            | HUGO gene symbol     | 'TP53', 'Unknown'             |
-+------+------------------------+----------------------+-------------------------------+
-| 2    | Entrez_Gene_Id         | Entrez or Ensembl ID | 0, 8714                       |
-+------+------------------------+----------------------+-------------------------------+
-| 3    | Center                 | Sequencing center    | '.', 'genome.wustl.edu'       |
-+------+------------------------+----------------------+-------------------------------+
-| 4    | NCBI_Build             | Genome assembly      | '37', 'GRCh38'                |
-+------+------------------------+----------------------+-------------------------------+
-| 5    | Chromosome             | Chromosome name      | 'chr1'                        |
-+------+------------------------+----------------------+-------------------------------+
-| 6    | Start_Position         | Start coordinate     | 119031351                     |
-+------+------------------------+----------------------+-------------------------------+
-| 7    | End_Position           | End coordinate       | 44079555                      |
-+------+------------------------+----------------------+-------------------------------+
-| 8    | Strand                 | Genomic strand       | '+', '-'                      |
-+------+------------------------+----------------------+-------------------------------+
-| 9    | Variant_Classification | Translational effect | 'Missense_Mutation', 'Silent' |
-+------+------------------------+----------------------+-------------------------------+
-| 10   | Variant_Type           | Mutation type        | 'SNP', 'INS', 'DEL'           |
-+------+------------------------+----------------------+-------------------------------+
-| 11   | Reference_Allele       | Reference allele     | 'T', '-', 'ACAA'              |
-+------+------------------------+----------------------+-------------------------------+
-| 12   | Tumor_Seq_Allele1      | First tumor allele   | 'A', '-', 'TCA'               |
-+------+------------------------+----------------------+-------------------------------+
-| 13   | Tumor_Seq_Allele2      | Second tumor allele  | 'A', '-', 'TCA'               |
-+------+------------------------+----------------------+-------------------------------+
-| 14   | Tumor_Sample_Barcode   | Sample ID            | 'TCGA-AB-3002'                |
-+------+------------------------+----------------------+-------------------------------+
-| 15   | Protein_Change         | Protein change       | 'p.L558Q'                     |
-+------+------------------------+----------------------+-------------------------------+
++-----+------------------------+----------------------+-------------------------------+
+| No. | Name                   | Description          | Examples                      |
++=====+========================+======================+===============================+
+| 1   | Hugo_Symbol            | HUGO gene symbol     | 'TP53', 'Unknown'             |
++-----+------------------------+----------------------+-------------------------------+
+| 2   | Entrez_Gene_Id         | Entrez or Ensembl ID | 0, 8714                       |
++-----+------------------------+----------------------+-------------------------------+
+| 3   | Center                 | Sequencing center    | '.', 'genome.wustl.edu'       |
++-----+------------------------+----------------------+-------------------------------+
+| 4   | NCBI_Build             | Genome assembly      | '37', 'GRCh38'                |
++-----+------------------------+----------------------+-------------------------------+
+| 5   | Chromosome             | Chromosome name      | 'chr1'                        |
++-----+------------------------+----------------------+-------------------------------+
+| 6   | Start_Position         | Start coordinate     | 119031351                     |
++-----+------------------------+----------------------+-------------------------------+
+| 7   | End_Position           | End coordinate       | 44079555                      |
++-----+------------------------+----------------------+-------------------------------+
+| 8   | Strand                 | Genomic strand       | '+', '-'                      |
++-----+------------------------+----------------------+-------------------------------+
+| 9   | Variant_Classification | Translational effect | 'Missense_Mutation', 'Silent' |
++-----+------------------------+----------------------+-------------------------------+
+| 10  | Variant_Type           | Mutation type        | 'SNP', 'INS', 'DEL'           |
++-----+------------------------+----------------------+-------------------------------+
+| 11  | Reference_Allele       | Reference allele     | 'T', '-', 'ACAA'              |
++-----+------------------------+----------------------+-------------------------------+
+| 12  | Tumor_Seq_Allele1      | First tumor allele   | 'A', '-', 'TCA'               |
++-----+------------------------+----------------------+-------------------------------+
+| 13  | Tumor_Seq_Allele2      | Second tumor allele  | 'A', '-', 'TCA'               |
++-----+------------------------+----------------------+-------------------------------+
+| 14  | Tumor_Sample_Barcode   | Sample ID            | 'TCGA-AB-3002'                |
++-----+------------------------+----------------------+-------------------------------+
+| 15  | Protein_Change         | Protein change       | 'p.L558Q'                     |
++-----+------------------------+----------------------+-------------------------------+
 """
 
 import pandas as pd
diff --git a/fuc/api/pyvep.py b/fuc/api/pyvep.py
index 3a724dc..b487acb 100644
--- a/fuc/api/pyvep.py
+++ b/fuc/api/pyvep.py
@@ -8,6 +8,42 @@
 `web interface <https://asia.ensembl.org/Tools/VEP>`_ with
 “RefSeq transcripts” as the transcript database and the filtering option
 “Show one selected consequence per variant”.
+
+A typical VEP-annotated VCF file contains many fields ranging from gene
+symbol to protein change. However, most of the analysis in pyvep uses the
+following fields:
+
++-----+--------------------+---------------------+----------------------------------------------+
+| No. | Name               | Description         | Examples                                     |
++=====+====================+=====================+==============================================+
+| 1   | Allele             | Variant allele      | 'C', 'T', 'CG'                               |
++-----+--------------------+---------------------+----------------------------------------------+
+| 2   | Consequence        | Consequence type    | 'missense_variant', 'stop_gained'            |
++-----+--------------------+---------------------+----------------------------------------------+
+| 3   | IMPACT             | Consequence impact  | 'MODERATE', 'HIGH'                           |
++-----+--------------------+---------------------+----------------------------------------------+
+| 4   | SYMBOL             | Gene symbol         | 'TP53', 'APC'                                |
++-----+--------------------+---------------------+----------------------------------------------+
+| 5   | Gene               | Ensembl ID          | 7157, 55294                                  |
++-----+--------------------+---------------------+----------------------------------------------+
+| 6   | BIOTYPE            | Transcript biotype  | 'protein_coding', 'pseudogene', 'miRNA'      |
++-----+--------------------+---------------------+----------------------------------------------+
+| 7   | EXON               | Exon number         | '8/17', '17/17'                              |
++-----+--------------------+---------------------+----------------------------------------------+
+| 8   | Protein_position   | Amino acid position | 234, 1510                                    |
++-----+--------------------+---------------------+----------------------------------------------+
+| 9   | Amino_acids        | Amino acid changes  | 'R/\*', 'A/X', 'V/A'                         |
++-----+--------------------+---------------------+----------------------------------------------+
+| 10  | Existing_variation | Variant identifier  | 'rs17851045', 'rs786201856&COSV57325157'     |
++-----+--------------------+---------------------+----------------------------------------------+
+| 11  | STRAND             | Genomic strand      | 1, -1                                        |
++-----+--------------------+---------------------+----------------------------------------------+
+| 12  | SIFT               | SIFT prediction     | 'deleterious(0)', 'tolerated(0.6)'           |
++-----+--------------------+---------------------+----------------------------------------------+
+| 13  | PolyPhen           | PolyPhen prediction | 'benign(0.121)', 'probably_damaging(0.999)'  |
++-----+--------------------+---------------------+----------------------------------------------+
+| 14  | CLIN_SIG           | ClinVar prediction  | 'pathogenic', 'likely_pathogenic&pathogenic' |
++-----+--------------------+---------------------+----------------------------------------------+
 """
 
 import re

From a6692dc5245770a74f61ccff9eb8fab9dc300c5f Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Fri, 11 Jun 2021 13:30:56 +0900
Subject: [PATCH 03/10] Update CLI

---
 CHANGELOG.rst        |  2 ++
 README.rst           |  1 +
 docs/cli.rst         | 23 ++++++++++++++
 fuc/api/pyvcf.py     | 75 ++++++++++++++++++++++++++++++++++++++++++++
 fuc/api/pyvep.py     | 31 +++++++-----------
 fuc/cli/vep_query.py | 30 ++++++++++++++++++
 6 files changed, 143 insertions(+), 19 deletions(-)
 create mode 100644 fuc/cli/vep_query.py

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 005e738..78b28ae 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -5,6 +5,8 @@ Changelog
 -----------------------
 
 * Add new method :meth:`pyvcf.VcfFrame.add_af`.
+* Add new method :meth:`pyvcf.VcfFrame.extract`.
+* Add new command :command:`vep_query`.
 
 0.11.0 (2021-06-10)
 -------------------
diff --git a/README.rst b/README.rst
index c2f0c57..14e71da 100644
--- a/README.rst
+++ b/README.rst
@@ -308,6 +308,7 @@ For getting help on CLI:
        vcf_merge    [VCF] merge two or more VCF files
        vcf_slice    [VCF] slice a VCF file
        vcf_vcf2bed  [VCF] convert a VCF file to a BED file
+       vep_query    [VEP] filter a VEP-annotated VCF file
    
    optional arguments:
      -h, --help     show this help message and exit
diff --git a/docs/cli.rst b/docs/cli.rst
index d1d489e..db63edd 100644
--- a/docs/cli.rst
+++ b/docs/cli.rst
@@ -38,6 +38,7 @@ For getting help on CLI:
        vcf_merge    [VCF] merge two or more VCF files
        vcf_slice    [VCF] slice a VCF file
        vcf_vcf2bed  [VCF] convert a VCF file to a BED file
+       vep_query    [VEP] filter a VEP-annotated VCF file
    
    optional arguments:
      -h, --help     show this help message and exit
@@ -481,3 +482,25 @@ vcf_vcf2bed
    optional arguments:
      -h, --help  show this help message and exit
 
+vep_query
+=========
+
+.. code-block:: console
+
+   $ fuc vep_query -h
+   usage: fuc vep_query [-h] vcf expr
+   
+   This command will filter a VEP-annotated VCF file. It essentially wraps the `pandas.DataFrame.query` method. For details on the method, please visit its documentation page (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html#pandas-dataframe-query).
+   
+   examples:
+     $ fuc vep_query in.vcf 'SYMBOL == "TP53"' > out.vcf
+     $ fuc vep_query in.vcf 'Consequence in ["splice_donor_variant", "stop_gained"]' > out.vcf
+     $ fuc vep_query in.vcf '(SYMBOL == "TP53") and (Consequence.str.contains("stop_gained"))' > out.vcf
+   
+   positional arguments:
+     vcf         VEP-annotated VCF file
+     expr        query expression to evaluate
+   
+   optional arguments:
+     -h, --help  show this help message and exit
+
diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py
index a9c49a3..b7883af 100644
--- a/fuc/api/pyvcf.py
+++ b/fuc/api/pyvcf.py
@@ -3685,3 +3685,78 @@ def one_row(r):
         df = df.drop_duplicates()
         bf = pybed.BedFrame.from_frame([], df)
         return bf
+
+    def extract(self, k, func=None, as_nan=False):
+        """
+        Create a DataFrame containing analysis-ready data for the given
+        genotype key.
+
+        Parameters
+        ----------
+        k : str
+            Genotype key such as 'DP' and 'AD'.
+        func : function, optional
+            Function to apply to each of the returned values. For example, if
+            the goal is to extract AD only for reference alleles, one can use
+            ``func=lambda x: x.split(',')[0]``
+        as_nan : bool, default: False
+            If True, missing values will be returned as ``NaN``.
+
+        Returns
+        -------
+        pandas.DataFrame
+            DataFrame containing requested data.
+
+        Examples
+        --------
+
+        >>> from fuc import pyvcf
+        >>> data = {
+        ...     'CHROM': ['chr1', 'chr1'],
+        ...     'POS': [100, 101],
+        ...     'ID': ['.', '.'],
+        ...     'REF': ['A', 'T'],
+        ...     'ALT': ['A', 'T'],
+        ...     'QUAL': ['.', '.'],
+        ...     'FILTER': ['.', '.'],
+        ...     'INFO': ['.', '.'],
+        ...     'FORMAT': ['GT:AD:DP', 'GT:AD:DP'],
+        ...     'A': ['0/1:15,13:28', '0/0:28,1:29'],
+        ...     'B': ['./.:.:.', '1/1:0,26:26'],
+        ... }
+        >>> vf = pyvcf.VcfFrame.from_dict([], data)
+        >>> vf.df
+          CHROM  POS ID REF ALT QUAL FILTER INFO    FORMAT             A            B
+        0  chr1  100  .   A   A    .      .    .  GT:AD:DP  0/1:15,13:28      ./.:.:.
+        1  chr1  101  .   T   T    .      .    .  GT:AD:DP   0/0:28,1:29  1/1:0,26:26
+        >>> vf.extract('GT')
+             A    B
+        0  0/1  ./.
+        1  0/0  1/1
+        >>> vf.extract('GT', as_nan=True)
+             A    B
+        0  0/1  NaN
+        1  0/0  1/1
+        >>> vf.extract('AD')
+               A     B
+        0  15,13     .
+        1   28,1  0,26
+        >>> vf.extract('AD', as_nan=True, func=lambda x: float(x.split(',')[1]))
+              A     B
+        0  13.0   NaN
+        1   1.0  26.0
+        """
+        def one_row(r):
+            i = r.FORMAT.split(':').index(k)
+            def one_gt(g):
+                v = g.split(':')[i]
+                if as_nan and not i and gt_miss(v):
+                    return np.nan
+                if as_nan and v == '.':
+                    return np.nan
+                if func is not None:
+                    return func(v)
+                return v
+            return r[9:].apply(one_gt)
+        df = self.df.apply(one_row, axis=1)
+        return df
diff --git a/fuc/api/pyvep.py b/fuc/api/pyvep.py
index b487acb..dad9382 100644
--- a/fuc/api/pyvep.py
+++ b/fuc/api/pyvep.py
@@ -48,6 +48,7 @@
 
 import re
 import pandas as pd
+import numpy as np
 from . import pyvcf
 
 SEVERITIY = [
@@ -546,26 +547,18 @@ def get_index(vf, target):
     headers = annot_names(vf)
     return headers.index(target)
 
-def get_table(vf):
-    """Write the VcfFrame as a tab-delimited text file."""
-    df = vf.df.copy()
-    headers = annot_names(vf)
-    def func(r):
-        ann = row_firstann(r)
-        if ann:
-            s = ['.' if x == '' else x for x in ann.split('|')]
-        else:
-            s = ['.' for x in headers]
-        s = pd.Series(s, index=headers)
-        s = pd.concat([r[:9], s, r[9:]])
-        return s
-    df = df.apply(func, axis=1)
-    return df
+def to_frame(vf):
+    """
+    Create a DataFrame containing analysis-ready annotation data.
 
-def write_table(vf, fn):
-    """Write the VcfFrame as a tab-delimited text file."""
-    df = get_table(vf)
-    df.to_csv(fn, sep='\t', index=False)
+    Returns
+    -------
+    pandas.DataFrame
+        DataFrame containing annotation data.
+    """
+    df = vf.df.apply(lambda r: pd.Series(row_firstann(r).split('|')), axis=1)
+    df.columns = annot_names(vf)
+    return df
 
 def pick_result(vf, mode='mostsevere'):
     """Return a new VcfFrame after picking one result per row.
diff --git a/fuc/cli/vep_query.py b/fuc/cli/vep_query.py
new file mode 100644
index 0000000..e89fdfd
--- /dev/null
+++ b/fuc/cli/vep_query.py
@@ -0,0 +1,30 @@
+from .. import api
+from argparse import RawTextHelpFormatter
+
+command = api.common.script_name(__file__)
+
+description = f"""
+This command will filter a VEP-annotated VCF file. It essentially wraps the `pandas.DataFrame.query` method. For details on the method, please visit its documentation page (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html#pandas-dataframe-query).
+
+examples:
+  $ fuc {command} in.vcf 'SYMBOL == "TP53"' > out.vcf
+  $ fuc {command} in.vcf 'Consequence in ["splice_donor_variant", "stop_gained"]' > out.vcf
+  $ fuc {command} in.vcf '(SYMBOL == "TP53") and (Consequence.str.contains("stop_gained"))' > out.vcf
+"""
+
+def create_parser(subparsers):
+    parser = subparsers.add_parser(
+        command,
+        help='[VEP] filter a VEP-annotated VCF file',
+        description=description,
+        formatter_class=RawTextHelpFormatter,
+    )
+    parser.add_argument('vcf', help='VEP-annotated VCF file')
+    parser.add_argument('expr', help='query expression to evaluate')
+
+def main(args):
+    vf = api.pyvcf.VcfFrame.from_file(args.vcf)
+    df = api.pyvep.to_frame(vf)
+    i = df.query(args.expr).index
+    filtered_vf = api.pyvcf.VcfFrame(vf.copy_meta(), vf.df.iloc[i])
+    print(filtered_vf.to_string())

From abb7ffd08f7d9be9180a78ab655949c14ff3e1bd Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Fri, 11 Jun 2021 13:32:06 +0900
Subject: [PATCH 04/10] Update CHANGELOG.rst

---
 CHANGELOG.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 78b28ae..103e31c 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -6,7 +6,7 @@ Changelog
 
 * Add new method :meth:`pyvcf.VcfFrame.add_af`.
 * Add new method :meth:`pyvcf.VcfFrame.extract`.
-* Add new command :command:`vep_query`.
+* :issue:`19`: Add new command :command:`vep_query`.
 
 0.11.0 (2021-06-10)
 -------------------

From 572f6cb01fd33c8321cf98ad81326cc67ecc0afd Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Fri, 11 Jun 2021 15:47:36 +0900
Subject: [PATCH 05/10] Update API

---
 CHANGELOG.rst        |   2 +
 docs/cli.rst         |  12 +++-
 fuc/api/pyvep.py     | 142 ++++++++++++++++++++++++++++---------------
 fuc/cli/bam_slice.py |   2 +-
 fuc/cli/vep_query.py |  20 ++++--
 5 files changed, 121 insertions(+), 57 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 103e31c..dbd5db2 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -6,6 +6,8 @@ Changelog
 
 * Add new method :meth:`pyvcf.VcfFrame.add_af`.
 * Add new method :meth:`pyvcf.VcfFrame.extract`.
+* Deprecate methods :meth:`pyvep.filter_af/biotype`.
+* Add new method :meth:`pyvep.filter_query`.
 * :issue:`19`: Add new command :command:`vep_query`.
 
 0.11.0 (2021-06-10)
diff --git a/docs/cli.rst b/docs/cli.rst
index db63edd..7fa6920 100644
--- a/docs/cli.rst
+++ b/docs/cli.rst
@@ -118,7 +118,7 @@ bam_slice
    
    optional arguments:
      -h, --help  show this help message and exit
-     --no_index  use to this flag to skip indexing
+     --no_index  use this flag to skip indexing
 
 bed_intxn
 =========
@@ -488,14 +488,18 @@ vep_query
 .. code-block:: console
 
    $ fuc vep_query -h
-   usage: fuc vep_query [-h] vcf expr
+   usage: fuc vep_query [-h] [--opposite] [--as_zero] vcf expr
    
-   This command will filter a VEP-annotated VCF file. It essentially wraps the `pandas.DataFrame.query` method. For details on the method, please visit its documentation page (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html#pandas-dataframe-query).
+   This command will filter a VEP-annotated VCF file. It essentially wraps the `pandas.DataFrame.query` method. For details on query expression, please visit the method's documentation page (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html#pandas-dataframe-query).
    
    examples:
      $ fuc vep_query in.vcf 'SYMBOL == "TP53"' > out.vcf
+     $ fuc vep_query in.vcf 'SYMBOL != "TP53"' > out.vcf
+     $ fuc vep_query in.vcf 'SYMBOL == "TP53"' --opposite > out.vcf
      $ fuc vep_query in.vcf 'Consequence in ["splice_donor_variant", "stop_gained"]' > out.vcf
      $ fuc vep_query in.vcf '(SYMBOL == "TP53") and (Consequence.str.contains("stop_gained"))' > out.vcf
+     $ fuc vep_query in.vcf 'gnomAD_AF < 0.001' > out.vcf
+     $ fuc vep_query in.vcf 'gnomAD_AF < 0.001' --as_zero > out.vcf
    
    positional arguments:
      vcf         VEP-annotated VCF file
@@ -503,4 +507,6 @@ vep_query
    
    optional arguments:
      -h, --help  show this help message and exit
+     --opposite  use this flag to return records that don’t meet the said criteria
+     --as_zero   use this flag to treat missing values as zero instead of NaN
 
diff --git a/fuc/api/pyvep.py b/fuc/api/pyvep.py
index dad9382..4aeb209 100644
--- a/fuc/api/pyvep.py
+++ b/fuc/api/pyvep.py
@@ -51,6 +51,69 @@
 import numpy as np
 from . import pyvcf
 
+DATA_TYPES = {
+    'Allele': 'str',
+    'Consequence': 'str',
+    'IMPACT': 'str',
+    'SYMBOL': 'str',
+    'Gene': 'str',
+    'Feature_type': 'str',
+    'Feature': 'str',
+    'BIOTYPE': 'str',
+    'EXON': 'str',
+    'INTRON': 'str',
+    'HGVSc': 'str',
+    'HGVSp': 'str',
+    'cDNA_position': 'str',
+    'CDS_position': 'str',
+    'Protein_position': 'str',
+    'Amino_acids': 'str',
+    'Codons': 'str',
+    'Existing_variation': 'str',
+    'DISTANCE': 'str',
+    'STRAND': 'str',
+    'FLAGS': 'str',
+    'SYMBOL_SOURCE': 'str',
+    'HGNC_ID': 'str',
+    'MANE_SELECT': 'str',
+    'MANE_PLUS_CLINICAL': 'str',
+    'TSL': 'str',
+    'APPRIS': 'str',
+    'REFSEQ_MATCH': 'str',
+    'REFSEQ_OFFSET': 'str',
+    'GIVEN_REF': 'str',
+    'USED_REF': 'str',
+    'BAM_EDIT': 'str',
+    'SIFT': 'str',
+    'PolyPhen': 'str',
+    'AF': 'float64',
+    'AFR_AF': 'float64',
+    'AMR_AF': 'float64',
+    'EAS_AF': 'float64',
+    'EUR_AF': 'float64',
+    'SAS_AF': 'float64',
+    'AA_AF': 'float64',
+    'EA_AF': 'float64',
+    'gnomAD_AF': 'float64',
+    'gnomAD_AFR_AF': 'float64',
+    'gnomAD_AMR_AF': 'float64',
+    'gnomAD_ASJ_AF': 'float64',
+    'gnomAD_EAS_AF': 'float64',
+    'gnomAD_FIN_AF': 'float64',
+    'gnomAD_NFE_AF': 'float64',
+    'gnomAD_OTH_AF': 'float64',
+    'gnomAD_SAS_AF': 'float64',
+    'CLIN_SIG': 'str',
+    'SOMATIC': 'str',
+    'PHENO': 'str',
+    'PUBMED': 'str',
+    'MOTIF_NAME': 'str',
+    'MOTIF_POS': 'str',
+    'HIGH_INF_POS': 'str',
+    'MOTIF_SCORE_CHANGE': 'str',
+    'TRANSCRIPTION_FACTORS': 'str',
+}
+
 SEVERITIY = [
     'transcript_ablation',
     'splice_acceptor_variant',
@@ -547,17 +610,30 @@ def get_index(vf, target):
     headers = annot_names(vf)
     return headers.index(target)
 
-def to_frame(vf):
+def to_frame(vf, as_zero=False):
     """
-    Create a DataFrame containing analysis-ready annotation data.
+    Create a DataFrame containing analysis-ready VEP annotation data.
+
+    Parameters
+    ----------
+    vf : VcfFrame
+        VcfFrame object.
+    as_zero : bool, default: False
+        If True, missing values will be converted to zeros instead of ``NaN``.
 
     Returns
     -------
     pandas.DataFrame
-        DataFrame containing annotation data.
+        DataFrame containing VEP annotation data.
     """
-    df = vf.df.apply(lambda r: pd.Series(row_firstann(r).split('|')), axis=1)
+    f = lambda r: pd.Series(row_firstann(r).split('|'))
+    df = vf.df.apply(f, axis=1)
+    if as_zero:
+        df = df.replace('', 0)
+    else:
+        df = df.replace('', np.nan)
     df.columns = annot_names(vf)
+    df = df.astype(DATA_TYPES)
     return df
 
 def pick_result(vf, mode='mostsevere'):
@@ -566,7 +642,7 @@ def pick_result(vf, mode='mostsevere'):
     Parameters
     ----------
     vf : VcfFrame
-        VcfFrame.
+        VcfFrame object.
     mode : {'mostsevere', 'firstann'}, default: 'mostsevere'
         Selection mode:
 
@@ -644,40 +720,6 @@ def annot_names(vf):
             l = re.search(r'Format: (.*?)">', vf.meta[i]).group(1).split('|')
     return l
 
-def filter_af(vf, name, threshold, opposite=None, as_index=False):
-    """Select rows whose selected AF is below threshold.
-
-    Parameters
-    ----------
-    name : str
-        Name of the consequence annotation representing AF (e.g. 'gnomAD_AF').
-    threshold : float
-        Minimum AF.
-    opposite : bool, default: False
-        If True, return rows that don't meet the said criteria.
-    as_index : bool, default: False
-        If True, return boolean index array instead of VcfFrame.
-
-    Returns
-    -------
-    VcfFrame or pandas.Series
-        Filtered VcfFrame or boolean index array.
-    """
-    i = annot_names(vf).index(name)
-    def one_row(r):
-        af = row_firstann(r).split('|')[i]
-        if not af:
-            af = 0
-        return float(af) < threshold
-    i = vf.df.apply(one_row, axis=1)
-    if opposite:
-        i = ~i
-    if as_index:
-        return i
-    df = vf.df[i].reset_index(drop=True)
-    vf = vf.__class__(vf.copy_meta(), df)
-    return vf
-
 def filter_lof(vf, opposite=None, as_index=False):
     """Select rows whose conseuqence annotation is deleterious and/or LoF.
 
@@ -720,30 +762,32 @@ def one_row(r):
     vf = vf.__class__(vf.copy_meta(), df)
     return vf
 
-def filter_biotype(vf, value, opposite=None, as_index=False):
-    """Select rows whose BIOTYPE matches the given value.
+def filter_query(vf, expr, opposite=None, as_index=False, as_zero=False):
+    """
+    Select rows that satisfy the query expression.
+
+    This method essentially wraps the :meth:`pandas.DataFrame.query` method.
 
     Parameters
     ----------
+    expr : str
+        The query string to evaluate.
     opposite : bool, default: False
         If True, return rows that don't meet the said criteria.
     as_index : bool, default: False
         If True, return boolean index array instead of VcfFrame.
+    as_zero : bool, default: False
+        If True, missing values will be converted to zeros instead of ``NaN``.
 
     Returns
     -------
     VcfFrame or pandas.Series
         Filtered VcfFrame or boolean index array.
     """
-    i = annot_names(vf).index('BIOTYPE')
-    def one_row(r):
-        l = row_firstann(r).split('|')
-        return l[i] == value
-    i = vf.df.apply(one_row, axis=1)
+    df = to_frame(vf, as_zero=as_zero)
+    i = vf.df.index.isin(df.query(expr).index)
     if opposite:
         i = ~i
     if as_index:
         return i
-    df = vf.df[i]
-    vf = vf.__class__(vf.copy_meta(), df)
-    return vf
+    return vf.__class__(vf.copy_meta(), vf.df[i])
diff --git a/fuc/cli/bam_slice.py b/fuc/cli/bam_slice.py
index 73501cd..f851e35 100644
--- a/fuc/cli/bam_slice.py
+++ b/fuc/cli/bam_slice.py
@@ -25,7 +25,7 @@ def create_parser(subparsers):
     parser.add_argument(
         '--no_index',
         action='store_true',
-        help='use to this flag to skip indexing'
+        help='use this flag to skip indexing'
     )
 
 def main(args):
diff --git a/fuc/cli/vep_query.py b/fuc/cli/vep_query.py
index e89fdfd..2d1ec70 100644
--- a/fuc/cli/vep_query.py
+++ b/fuc/cli/vep_query.py
@@ -4,12 +4,16 @@
 command = api.common.script_name(__file__)
 
 description = f"""
-This command will filter a VEP-annotated VCF file. It essentially wraps the `pandas.DataFrame.query` method. For details on the method, please visit its documentation page (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html#pandas-dataframe-query).
+This command will filter a VEP-annotated VCF file. It essentially wraps the `pandas.DataFrame.query` method. For details on query expression, please visit the method's documentation page (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html#pandas-dataframe-query).
 
 examples:
   $ fuc {command} in.vcf 'SYMBOL == "TP53"' > out.vcf
+  $ fuc {command} in.vcf 'SYMBOL != "TP53"' > out.vcf
+  $ fuc {command} in.vcf 'SYMBOL == "TP53"' --opposite > out.vcf
   $ fuc {command} in.vcf 'Consequence in ["splice_donor_variant", "stop_gained"]' > out.vcf
   $ fuc {command} in.vcf '(SYMBOL == "TP53") and (Consequence.str.contains("stop_gained"))' > out.vcf
+  $ fuc {command} in.vcf 'gnomAD_AF < 0.001' > out.vcf
+  $ fuc {command} in.vcf 'gnomAD_AF < 0.001' --as_zero > out.vcf
 """
 
 def create_parser(subparsers):
@@ -21,10 +25,18 @@ def create_parser(subparsers):
     )
     parser.add_argument('vcf', help='VEP-annotated VCF file')
     parser.add_argument('expr', help='query expression to evaluate')
+    parser.add_argument(
+        '--opposite',
+        action='store_true',
+        help='use this flag to return records that don’t meet the said criteria'
+    )
+    parser.add_argument(
+        '--as_zero',
+        action='store_true',
+        help='use this flag to treat missing values as zero instead of NaN'
+    )
 
 def main(args):
     vf = api.pyvcf.VcfFrame.from_file(args.vcf)
-    df = api.pyvep.to_frame(vf)
-    i = df.query(args.expr).index
-    filtered_vf = api.pyvcf.VcfFrame(vf.copy_meta(), vf.df.iloc[i])
+    filtered_vf = api.pyvep.filter_query(vf, args.expr, opposite=args.opposite, as_zero=args.as_zero)
     print(filtered_vf.to_string())

From f2a4726e5caccda0b3b9e1a452c7b91f1428a8e1 Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Fri, 11 Jun 2021 15:56:48 +0900
Subject: [PATCH 06/10] Update API

---
 CHANGELOG.rst    |   2 +-
 fuc/api/pyvep.py | 134 +----------------------------------------------
 2 files changed, 3 insertions(+), 133 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index dbd5db2..c1ba0b9 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -6,7 +6,7 @@ Changelog
 
 * Add new method :meth:`pyvcf.VcfFrame.add_af`.
 * Add new method :meth:`pyvcf.VcfFrame.extract`.
-* Deprecate methods :meth:`pyvep.filter_af/biotype`.
+* Deprecate methods :meth:`pyvep.filter_af/biotype/nothas/impact`.
 * Add new method :meth:`pyvep.filter_query`.
 * :issue:`19`: Add new command :command:`vep_query`.
 
diff --git a/fuc/api/pyvep.py b/fuc/api/pyvep.py
index 4aeb209..0553992 100644
--- a/fuc/api/pyvep.py
+++ b/fuc/api/pyvep.py
@@ -257,33 +257,6 @@ def row_mostsevere(r):
     f = lambda x: SEVERITIY.index(x.split('|')[1].split('&')[0])
     return sorted(results.split(','), key=f)[0]
 
-def filter_nothas(vf, s, include=False):
-    """Filter out rows based on the VEP annotations.
-
-    Parameters
-    ----------
-    vf : VcfFrame
-        Input VcfFrame.
-    targets : list
-        List of annotations (e.g. ['missense_variant', 'stop_gained']).
-    include : bool, default: False
-        If True, include only such rows instead of excluding them.
-
-    Returns
-    -------
-    vf : VcfFrame
-        Filtered VcfFrame.
-    """
-    def func(r):
-        ann = row_firstann(r)
-        return s not in ann
-    i = vf.df.apply(func, axis=1)
-    if include:
-        i = ~i
-    df = vf.df[i].reset_index(drop=True)
-    vf = vf.__class__(vf.copy_meta(), df)
-    return vf
-
 def filter_clinsig(vf, whitelist=None, blacklist=None, opposite=False,
                    index=False):
     """Select rows based on the given CLIN_SIG values.
@@ -390,7 +363,8 @@ def filter_clinsig(vf, whitelist=None, blacklist=None, opposite=False,
         blacklist = []
     def func(r):
         ann = row_firstann(r)
-        values = ann.split('|')[get_index(vf, 'CLIN_SIG')].split('&')
+        i = annot_names(vf).index('CLIN_SIG')
+        values = ann.split('|')[i].split('&')
         if not list(set(values) & set(whitelist)):
             return False
         if list(set(values) & set(blacklist)):
@@ -405,105 +379,6 @@ def func(r):
     vf = vf.__class__(vf.copy_meta(), df)
     return vf
 
-def filter_impact(vf, values, opposite=False, index=False):
-    """Select rows based on the given IMPACT values.
-
-    List of IMPACT values:
-
-        - LOW
-        - MODERATE
-        - HIGH
-        - MODIFIER
-
-    Parameters
-    ----------
-    vf : VcfFrame
-        VcfFrame.
-    values : list, default: None
-        If any one of the IMPACT values is present, select the row.
-    opposite : bool, default: False
-        If True, return rows that don't meet the said criteria.
-    index : bool, default: False
-        If True, return boolean index array instead of VcfFrame.
-
-    Returns
-    -------
-    VcfFrame or pandas.Series
-        Filtered VcfFrame or boolean index array.
-
-    Examples
-    --------
-    Assume we have the following data:
-
-    >>> meta = [
-    ...     '##fileformat=VCFv4.1',
-    ...     '##VEP="v104" time="2021-05-20 10:50:12" cache="/net/isilonP/public/ro/ensweb-data/latest/tools/grch37/e104/vep/cache/homo_sapiens/104_GRCh37" db="homo_sapiens_core_104_37@hh-mysql-ens-grch37-web" 1000genomes="phase3" COSMIC="92" ClinVar="202012" HGMD-PUBLIC="20204" assembly="GRCh37.p13" dbSNP="154" gencode="GENCODE 19" genebuild="2011-04" gnomAD="r2.1" polyphen="2.2.2" regbuild="1.0" sift="sift5.2.2"',
-    ...     '##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|SYMBOL_SOURCE|HGNC_ID|MANE_SELECT|MANE_PLUS_CLINICAL|TSL|APPRIS|SIFT|PolyPhen|AF|CLIN_SIG|SOMATIC|PHENO|PUBMED|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|TRANSCRIPTION_FACTORS">'
-    ... ]
-    >>> data = {
-    ...     'CHROM': ['chr1', 'chr1', 'chr2', 'chr2'],
-    ...     'POS': [100, 101, 200, 201],
-    ...     'ID': ['.', '.', '.', '.'],
-    ...     'REF': ['G', 'C', 'CAG', 'C'],
-    ...     'ALT': ['T', 'T', 'C', 'T'],
-    ...     'QUAL': ['.', '.', '.', '.'],
-    ...     'FILTER': ['.', '.', '.', '.'],
-    ...     'INFO': [
-    ...         'CSQ=T|missense_variant|MODERATE|MTOR|ENSG00000198793|Transcript|ENST00000361445.4|protein_coding|47/58||||6721|6644|2215|S/Y|tCt/tAt|rs587777894&COSV63868278&COSV63868313||-1||HGNC|3942|||||deleterious(0)|possibly_damaging(0.876)||likely_pathogenic&pathogenic|0&1&1|1&1&1|26619011&27159400&24631838&26018084&27830187|||||',
-    ...         'CSQ=T|synonymous_variant|LOW|MTOR|ENSG00000198793|Transcript|ENST00000361445.4|protein_coding|49/58||||6986|6909|2303|L|ctG/ctA|rs11121691&COSV63870864||-1||HGNC|3942|||||||0.2206|benign|0&1|1&1|24996771|||||',
-    ...         'CSQ=-|frameshift_variant|HIGH|BRCA2|ENSG00000139618|Transcript|ENST00000380152.3|protein_coding|18/27||||8479-8480|8246-8247|2749|Q/X|cAG/c|rs80359701||1||HGNC|1101||||||||pathogenic||1|26467025&26295337&15340362|||||',
-    ...         'CSQ=T|missense_variant|MODERATE|MTOR|ENSG00000198793|Transcript|ENST00000361445.4|protein_coding|30/58||||4516|4439|1480|R/H|cGc/cAc|rs780930764&COSV63868373||-1||HGNC|3942|||||tolerated(0.13)|benign(0)||likely_benign|0&1|1&1||||||'
-    ...     ],
-    ...     'FORMAT': ['GT', 'GT', 'GT', 'GT'],
-    ...     'Steven': ['0/1', '0/1', '0/1', '0/1'],
-    ... }
-    >>> vf = pyvcf.VcfFrame.from_dict(meta, data)
-    >>> pyvep.parseann(vf, ['IMPACT']).df
-      CHROM  POS ID  REF ALT QUAL FILTER      INFO FORMAT Steven
-    0  chr1  100  .    G   T    .      .  MODERATE     GT    0/1
-    1  chr1  101  .    C   T    .      .       LOW     GT    0/1
-    2  chr2  200  .  CAG   C    .      .      HIGH     GT    0/1
-    3  chr2  201  .    C   T    .      .  MODERATE     GT    0/1
-
-    We can select rows with either LOW or HIGH:
-
-    >>> temp_vf = pyvep.filter_impact(vf, ['LOW', 'HIGH'])
-    >>> pyvep.parseann(temp_vf, ['IMPACT']).df
-      CHROM  POS ID REF ALT QUAL FILTER      INFO FORMAT Steven
-    0  chr1  100  .   G   T    .      .  MODERATE     GT    0/1
-    1  chr2  201  .   C   T    .      .  MODERATE     GT    0/1
-
-    We can also remove those rows:
-
-    >>> temp_vf = pyvep.filter_impact(vf, ['LOW', 'HIGH'], opposite=True)
-    >>> pyvep.parseann(temp_vf, ['IMPACT']).df
-      CHROM  POS ID REF ALT QUAL FILTER      INFO FORMAT Steven
-    0  chr1  100  .   G   T    .      .  MODERATE     GT    0/1
-    1  chr2  201  .   C   T    .      .  MODERATE     GT    0/1
-
-    Finally, we can return boolean index array from the filtering:
-
-    >>> pyvep.filter_impact(vf, ['LOW', 'HIGH'], index=True)
-    0     True
-    1    False
-    2    False
-    3     True
-    dtype: bool
-
-    """
-    def func(r):
-        ann = row_firstann(r)
-        impact = ann.split('|')[get_index(vf, 'IMPACT')]
-        return impact in values
-    i = vf.df.apply(func, axis=1)
-    if opposite:
-        i = ~i
-    if index:
-        return i
-    df = vf.df[i].reset_index(drop=True)
-    vf = vf.__class__(vf.copy_meta(), df)
-    return vf
-
 def parseann(vf, targets, sep=' | ', as_series=False):
     """Parse variant annotations in VcfFrame.
 
@@ -605,11 +480,6 @@ def func(r):
     new_vf.df.INFO = s
     return new_vf
 
-def get_index(vf, target):
-    """Return the index of the target field (e.g. CLIN_SIG)."""
-    headers = annot_names(vf)
-    return headers.index(target)
-
 def to_frame(vf, as_zero=False):
     """
     Create a DataFrame containing analysis-ready VEP annotation data.

From 376cd8ef060e7a11f59a9f61ffc88e32795d8372 Mon Sep 17 00:00:00 2001
From: "Seung-been \"Steven\" Lee" <sbstevenlee@gmail.com>
Date: Fri, 11 Jun 2021 17:20:05 +0900
Subject: [PATCH 07/10] Update API

---
 fuc/api/pyvep.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fuc/api/pyvep.py b/fuc/api/pyvep.py
index 0553992..5f99eec 100644
--- a/fuc/api/pyvep.py
+++ b/fuc/api/pyvep.py
@@ -51,6 +51,8 @@
 import numpy as np
 from . import pyvcf
 
+# https://m.ensembl.org/info/docs/tools/vep/vep_formats.html
+
 DATA_TYPES = {
     'Allele': 'str',
     'Consequence': 'str',

From 61ef904c98639ceed7051e832ac55545cd78d5b1 Mon Sep 17 00:00:00 2001
From: Seung-been Lee <sbstevenlee@gmail.com>
Date: Sat, 12 Jun 2021 10:46:20 +0900
Subject: [PATCH 08/10] Update API

---
 CHANGELOG.rst                        |  2 +-
 README.rst                           |  8 +++++++-
 docs/cli.rst                         | 28 ++++++++++++++--------------
 docs/create.py                       |  6 ++++++
 fuc/cli/{vep_query.py => vcf_vep.py} |  6 +++---
 5 files changed, 31 insertions(+), 19 deletions(-)
 rename fuc/cli/{vep_query.py => vcf_vep.py} (76%)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index c1ba0b9..56f3a02 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -8,7 +8,7 @@ Changelog
 * Add new method :meth:`pyvcf.VcfFrame.extract`.
 * Deprecate methods :meth:`pyvep.filter_af/biotype/nothas/impact`.
 * Add new method :meth:`pyvep.filter_query`.
-* :issue:`19`: Add new command :command:`vep_query`.
+* :issue:`19`: Add new command :command:`vcf_vep`.
 
 0.11.0 (2021-06-10)
 -------------------
diff --git a/README.rst b/README.rst
index 14e71da..e91ce79 100644
--- a/README.rst
+++ b/README.rst
@@ -116,6 +116,12 @@ To merge VCF files:
 
    $ fuc vcf_merge 1.vcf 2.vcf 3.vcf > merged.vcf
 
+To filter a VCF file annotated by Ensemble VEP:
+
+.. code-block:: console
+
+   $ fuc vcf_vep in.vcf 'SYMBOL == "TP53"' > out.vcf
+
 API Examples
 ============
 
@@ -308,7 +314,7 @@ For getting help on CLI:
        vcf_merge    [VCF] merge two or more VCF files
        vcf_slice    [VCF] slice a VCF file
        vcf_vcf2bed  [VCF] convert a VCF file to a BED file
-       vep_query    [VEP] filter a VEP-annotated VCF file
+       vcf_vep      [VCF] filter a VCF file annotated by Ensemble VEP
    
    optional arguments:
      -h, --help     show this help message and exit
diff --git a/docs/cli.rst b/docs/cli.rst
index 7fa6920..66e121a 100644
--- a/docs/cli.rst
+++ b/docs/cli.rst
@@ -38,7 +38,7 @@ For getting help on CLI:
        vcf_merge    [VCF] merge two or more VCF files
        vcf_slice    [VCF] slice a VCF file
        vcf_vcf2bed  [VCF] convert a VCF file to a BED file
-       vep_query    [VEP] filter a VEP-annotated VCF file
+       vcf_vep      [VCF] filter a VCF file annotated by Ensemble VEP
    
    optional arguments:
      -h, --help     show this help message and exit
@@ -482,27 +482,27 @@ vcf_vcf2bed
    optional arguments:
      -h, --help  show this help message and exit
 
-vep_query
-=========
+vcf_vep
+=======
 
 .. code-block:: console
 
-   $ fuc vep_query -h
-   usage: fuc vep_query [-h] [--opposite] [--as_zero] vcf expr
+   $ fuc vcf_vep -h
+   usage: fuc vcf_vep [-h] [--opposite] [--as_zero] vcf expr
    
-   This command will filter a VEP-annotated VCF file. It essentially wraps the `pandas.DataFrame.query` method. For details on query expression, please visit the method's documentation page (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html#pandas-dataframe-query).
+   This command will filter a VCF file annotated by Ensemble VEP. It essentially wraps the `pandas.DataFrame.query` method. For details on query expression, please visit the method's documentation page (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html#pandas-dataframe-query).
    
    examples:
-     $ fuc vep_query in.vcf 'SYMBOL == "TP53"' > out.vcf
-     $ fuc vep_query in.vcf 'SYMBOL != "TP53"' > out.vcf
-     $ fuc vep_query in.vcf 'SYMBOL == "TP53"' --opposite > out.vcf
-     $ fuc vep_query in.vcf 'Consequence in ["splice_donor_variant", "stop_gained"]' > out.vcf
-     $ fuc vep_query in.vcf '(SYMBOL == "TP53") and (Consequence.str.contains("stop_gained"))' > out.vcf
-     $ fuc vep_query in.vcf 'gnomAD_AF < 0.001' > out.vcf
-     $ fuc vep_query in.vcf 'gnomAD_AF < 0.001' --as_zero > out.vcf
+     $ fuc vcf_vep in.vcf 'SYMBOL == "TP53"' > out.vcf
+     $ fuc vcf_vep in.vcf 'SYMBOL != "TP53"' > out.vcf
+     $ fuc vcf_vep in.vcf 'SYMBOL == "TP53"' --opposite > out.vcf
+     $ fuc vcf_vep in.vcf 'Consequence in ["splice_donor_variant", "stop_gained"]' > out.vcf
+     $ fuc vcf_vep in.vcf '(SYMBOL == "TP53") and (Consequence.str.contains("stop_gained"))' > out.vcf
+     $ fuc vcf_vep in.vcf 'gnomAD_AF < 0.001' > out.vcf
+     $ fuc vcf_vep in.vcf 'gnomAD_AF < 0.001' --as_zero > out.vcf
    
    positional arguments:
-     vcf         VEP-annotated VCF file
+     vcf         Ensemble VEP-annotated VCF file
      expr        query expression to evaluate
    
    optional arguments:
diff --git a/docs/create.py b/docs/create.py
index 598b398..b737564 100644
--- a/docs/create.py
+++ b/docs/create.py
@@ -144,6 +144,12 @@
 
    $ fuc vcf_merge 1.vcf 2.vcf 3.vcf > merged.vcf
 
+To filter a VCF file annotated by Ensemble VEP:
+
+.. code-block:: console
+
+   $ fuc vcf_vep in.vcf 'SYMBOL == "TP53"' > out.vcf
+
 API Examples
 ============
 
diff --git a/fuc/cli/vep_query.py b/fuc/cli/vcf_vep.py
similarity index 76%
rename from fuc/cli/vep_query.py
rename to fuc/cli/vcf_vep.py
index 2d1ec70..ca94f3b 100644
--- a/fuc/cli/vep_query.py
+++ b/fuc/cli/vcf_vep.py
@@ -4,7 +4,7 @@
 command = api.common.script_name(__file__)
 
 description = f"""
-This command will filter a VEP-annotated VCF file. It essentially wraps the `pandas.DataFrame.query` method. For details on query expression, please visit the method's documentation page (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html#pandas-dataframe-query).
+This command will filter a VCF file annotated by Ensemble VEP. It essentially wraps the `pandas.DataFrame.query` method. For details on query expression, please visit the method's documentation page (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html#pandas-dataframe-query).
 
 examples:
   $ fuc {command} in.vcf 'SYMBOL == "TP53"' > out.vcf
@@ -19,11 +19,11 @@
 def create_parser(subparsers):
     parser = subparsers.add_parser(
         command,
-        help='[VEP] filter a VEP-annotated VCF file',
+        help='[VCF] filter a VCF file annotated by Ensemble VEP',
         description=description,
         formatter_class=RawTextHelpFormatter,
     )
-    parser.add_argument('vcf', help='VEP-annotated VCF file')
+    parser.add_argument('vcf', help='Ensemble VEP-annotated VCF file')
     parser.add_argument('expr', help='query expression to evaluate')
     parser.add_argument(
         '--opposite',

From f771f571379a88060c1144b0d55fdb4ff955899b Mon Sep 17 00:00:00 2001
From: Seung-been Lee <sbstevenlee@gmail.com>
Date: Sat, 12 Jun 2021 19:04:42 +0900
Subject: [PATCH 09/10] Update API

---
 CHANGELOG.rst         |   7 +-
 README.rst            |  51 ++++------
 conda.yml             |   1 +
 docs/api.rst          |   2 +-
 docs/create.py        |  49 ++++------
 docs/requirements.txt |   1 +
 fuc/api/common.py     |   2 +
 fuc/api/pyvcf.py      | 215 ++++++++++++++++++++++++++++++------------
 setup.py              |   2 +-
 9 files changed, 204 insertions(+), 126 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 56f3a02..a34e9ca 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -1,14 +1,17 @@
 Changelog
 *********
 
-0.12.0 (in development)
------------------------
+0.12.0 (2021-06-12)
+-------------------
 
 * Add new method :meth:`pyvcf.VcfFrame.add_af`.
 * Add new method :meth:`pyvcf.VcfFrame.extract`.
 * Deprecate methods :meth:`pyvep.filter_af/biotype/nothas/impact`.
 * Add new method :meth:`pyvep.filter_query`.
 * :issue:`19`: Add new command :command:`vcf_vep`.
+* Rename :meth:`pyvcf.VcfFrame.plot_histplot` to :meth:`pyvcf.VcfFrame.plot_tmb`.
+* Add ``scipy`` package as dependency for performing statistical analysis.
+* Add new method :meth:`pyvcf.VcfFrame.plot_hist`.
 
 0.11.0 (2021-06-10)
 -------------------
diff --git a/README.rst b/README.rst
index e91ce79..7f6771b 100644
--- a/README.rst
+++ b/README.rst
@@ -161,40 +161,26 @@ To create a Venn diagram showing genotype concordance between groups:
 
 .. image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/plot_comparison.png
 
-To create a histogram of tumor mutational burden (TMB) distribution:
+To create various figures for normal-tumor analysis:
 
 .. code:: python3
 
-    >>> from fuc import pyvcf
-    >>> vcf_data = {
-    ...     'CHROM': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'],
-    ...     'POS': [100, 101, 102, 103, 103],
-    ...     'ID': ['.', '.', '.', '.', '.'],
-    ...     'REF': ['T', 'T', 'T', 'T', 'T'],
-    ...     'ALT': ['C', 'C', 'C', 'C', 'C'],
-    ...     'QUAL': ['.', '.', '.', '.', '.'],
-    ...     'FILTER': ['.', '.', '.', '.', '.'],
-    ...     'INFO': ['.', '.', '.', '.', '.'],
-    ...     'FORMAT': ['GT', 'GT', 'GT', 'GT', 'GT'],
-    ...     'Steven_N': ['0/0', '0/0', '0/1', '0/0', '0/0'],
-    ...     'Steven_T': ['0/0', '0/1', '0/1', '0/1', '0/1'],
-    ...     'Sara_N': ['0/0', '0/1', '0/0', '0/0', '0/0'],
-    ...     'Sara_T': ['0/0', '0/0', '1/1', '1/1', '0/1'],
-    ...     'John_N': ['0/0', '0/0', '0/0', '0/0', '0/0'],
-    ...     'John_T': ['0/1', '0/0', '1/1', '1/1', '0/1'],
-    ...     'Rachel_N': ['0/0', '0/0', '0/0', '0/0', '0/0'],
-    ...     'Rachel_T': ['0/1', '0/1', '0/0', '0/1', '0/1'],
-    ... }
-    >>> annot_data = {
-    ...     'Sample': ['Steven_N', 'Steven_T', 'Sara_N', 'Sara_T', 'John_N', 'John_T', 'Rachel_N', 'Rachel_T'],
-    ...     'Subject': ['Steven', 'Steven', 'Sara', 'Sara', 'John', 'John', 'Rachel', 'Rachel'],
-    ...     'Type': ['Normal', 'Tumor', 'Normal', 'Tumor', 'Normal', 'Tumor', 'Normal', 'Tumor'],
-    ... }
-    >>> vf = pyvcf.VcfFrame.from_dict([], vcf_data)
-    >>> af = pyvcf.AnnFrame.from_dict(annot_data, 'Sample')
-    >>> vf.plot_histplot(hue='Type', af=af)
-
-.. image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/plot_histplot.png
+    >>> import matplotlib.pyplot as plt
+    >>> from fuc import common, pyvcf
+    >>> common.load_dataset('pyvcf')
+    >>> vf = pyvcf.VcfFrame.from_file('~/fuc-data/pyvcf/normal-tumor.vcf')
+    >>> af = pyvcf.AnnFrame.from_file('~/fuc-data/pyvcf/normal-tumor-annot.tsv', 'Sample')
+    >>> normal = af.df[af.df.Tissue == 'Normal'].index
+    >>> tumor = af.df[af.df.Tissue == 'Tumor'].index
+    >>> fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(2, 2, figsize=(10, 10))
+    >>> vf.plot_tmb(ax=ax1)
+    >>> vf.plot_tmb(ax=ax2, af=af, hue='Tissue')
+    >>> vf.plot_hist('DP', ax=ax3, af=af, hue='Tissue')
+    >>> vf.plot_regplot(normal, tumor, ax=ax4)
+    >>> plt.tight_layout()
+    >>> plt.savefig('normal-tumor.png')
+
+.. image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/normal-tumor.png
 
 MAF
 ---
@@ -256,6 +242,7 @@ The following packages are required to run fuc:
    pandas
    pyranges
    pysam
+   scipy
    seaborn
 
 There are various ways you can install fuc. The recommended way is via conda:
@@ -335,7 +322,7 @@ Below is the list of submodules available in API:
 - **pyfq** : The pyfq submodule is designed for working with FASTQ files. It implements ``pyfq.FqFrame`` which stores FASTQ data as ``pandas.DataFrame`` to allow fast computation and easy manipulation.
 - **pymaf** : The pymaf submodule is designed for working with MAF files. It implements ``pymaf.MafFrame`` which stores MAF data as ``pandas.DataFrame`` to allow fast computation and easy manipulation. The ``pymaf.MafFrame`` class also contains many useful plotting methods such as ``MafFrame.plot_oncoplot`` and ``MafFrame.plot_summary``. The submodule strictly adheres to the standard `MAF specification <https://docs.gdc.cancer.gov/Data/File_Formats/MAF_Format/>`_.
 - **pysnpeff** : The pysnpeff submodule is designed for parsing VCF annotation data from the `SnpEff <https://pcingola.github.io/SnpEff/>`_ program. It should be used with ``pyvcf.VcfFrame``.
-- **pyvcf** : The pyvcf submodule is designed for working with VCF files. It implements ``pyvcf.VcfFrame`` which stores VCF data as ``pandas.DataFrame`` to allow fast computation and easy manipulation. The ``pyvcf.VcfFrame`` class also contains many useful plotting methods such as ``VcfFrame.plot_comparison`` and ``VcfFrame.plot_regplot``. The submodule strictly adheres to the standard `VCF specification <https://samtools.github.io/hts-specs/VCFv4.3.pdf>`_.
+- **pyvcf** : The pyvcf submodule is designed for working with VCF files. It implements ``pyvcf.VcfFrame`` which stores VCF data as ``pandas.DataFrame`` to allow fast computation and easy manipulation. The ``pyvcf.VcfFrame`` class also contains many useful plotting methods such as ``VcfFrame.plot_comparison`` and ``VcfFrame.plot_tmb``. The submodule strictly adheres to the standard `VCF specification <https://samtools.github.io/hts-specs/VCFv4.3.pdf>`_.
 - **pyvep** : The pyvep submodule is designed for parsing VCF annotation data from the `Ensembl VEP <https://asia.ensembl.org/info/docs/tools/vep/index.html>`_ program. It should be used with ``pyvcf.VcfFrame``.
 
 For getting help on a specific module (e.g. pyvcf):
diff --git a/conda.yml b/conda.yml
index 3db176e..2c58966 100644
--- a/conda.yml
+++ b/conda.yml
@@ -14,6 +14,7 @@ dependencies:
   - pandas
   - pyranges
   - pysam
+  - scipy
   - seaborn
   - sphinx-issues
   - sphinx_rtd_theme
diff --git a/docs/api.rst b/docs/api.rst
index bc52021..f51e2c1 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -18,7 +18,7 @@ Below is the list of submodules available in API:
 - **pyfq** : The pyfq submodule is designed for working with FASTQ files. It implements ``pyfq.FqFrame`` which stores FASTQ data as ``pandas.DataFrame`` to allow fast computation and easy manipulation.
 - **pymaf** : The pymaf submodule is designed for working with MAF files. It implements ``pymaf.MafFrame`` which stores MAF data as ``pandas.DataFrame`` to allow fast computation and easy manipulation. The ``pymaf.MafFrame`` class also contains many useful plotting methods such as ``MafFrame.plot_oncoplot`` and ``MafFrame.plot_summary``. The submodule strictly adheres to the standard `MAF specification <https://docs.gdc.cancer.gov/Data/File_Formats/MAF_Format/>`_.
 - **pysnpeff** : The pysnpeff submodule is designed for parsing VCF annotation data from the `SnpEff <https://pcingola.github.io/SnpEff/>`_ program. It should be used with ``pyvcf.VcfFrame``.
-- **pyvcf** : The pyvcf submodule is designed for working with VCF files. It implements ``pyvcf.VcfFrame`` which stores VCF data as ``pandas.DataFrame`` to allow fast computation and easy manipulation. The ``pyvcf.VcfFrame`` class also contains many useful plotting methods such as ``VcfFrame.plot_comparison`` and ``VcfFrame.plot_regplot``. The submodule strictly adheres to the standard `VCF specification <https://samtools.github.io/hts-specs/VCFv4.3.pdf>`_.
+- **pyvcf** : The pyvcf submodule is designed for working with VCF files. It implements ``pyvcf.VcfFrame`` which stores VCF data as ``pandas.DataFrame`` to allow fast computation and easy manipulation. The ``pyvcf.VcfFrame`` class also contains many useful plotting methods such as ``VcfFrame.plot_comparison`` and ``VcfFrame.plot_tmb``. The submodule strictly adheres to the standard `VCF specification <https://samtools.github.io/hts-specs/VCFv4.3.pdf>`_.
 - **pyvep** : The pyvep submodule is designed for parsing VCF annotation data from the `Ensembl VEP <https://asia.ensembl.org/info/docs/tools/vep/index.html>`_ program. It should be used with ``pyvcf.VcfFrame``.
 
 For getting help on a specific module (e.g. pyvcf):
diff --git a/docs/create.py b/docs/create.py
index b737564..374bff6 100644
--- a/docs/create.py
+++ b/docs/create.py
@@ -189,40 +189,26 @@
 
 .. image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/plot_comparison.png
 
-To create a histogram of tumor mutational burden (TMB) distribution:
+To create various figures for normal-tumor analysis:
 
 .. code:: python3
 
-    >>> from fuc import pyvcf
-    >>> vcf_data = {{
-    ...     'CHROM': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'],
-    ...     'POS': [100, 101, 102, 103, 103],
-    ...     'ID': ['.', '.', '.', '.', '.'],
-    ...     'REF': ['T', 'T', 'T', 'T', 'T'],
-    ...     'ALT': ['C', 'C', 'C', 'C', 'C'],
-    ...     'QUAL': ['.', '.', '.', '.', '.'],
-    ...     'FILTER': ['.', '.', '.', '.', '.'],
-    ...     'INFO': ['.', '.', '.', '.', '.'],
-    ...     'FORMAT': ['GT', 'GT', 'GT', 'GT', 'GT'],
-    ...     'Steven_N': ['0/0', '0/0', '0/1', '0/0', '0/0'],
-    ...     'Steven_T': ['0/0', '0/1', '0/1', '0/1', '0/1'],
-    ...     'Sara_N': ['0/0', '0/1', '0/0', '0/0', '0/0'],
-    ...     'Sara_T': ['0/0', '0/0', '1/1', '1/1', '0/1'],
-    ...     'John_N': ['0/0', '0/0', '0/0', '0/0', '0/0'],
-    ...     'John_T': ['0/1', '0/0', '1/1', '1/1', '0/1'],
-    ...     'Rachel_N': ['0/0', '0/0', '0/0', '0/0', '0/0'],
-    ...     'Rachel_T': ['0/1', '0/1', '0/0', '0/1', '0/1'],
-    ... }}
-    >>> annot_data = {{
-    ...     'Sample': ['Steven_N', 'Steven_T', 'Sara_N', 'Sara_T', 'John_N', 'John_T', 'Rachel_N', 'Rachel_T'],
-    ...     'Subject': ['Steven', 'Steven', 'Sara', 'Sara', 'John', 'John', 'Rachel', 'Rachel'],
-    ...     'Type': ['Normal', 'Tumor', 'Normal', 'Tumor', 'Normal', 'Tumor', 'Normal', 'Tumor'],
-    ... }}
-    >>> vf = pyvcf.VcfFrame.from_dict([], vcf_data)
-    >>> af = pyvcf.AnnFrame.from_dict(annot_data, 'Sample')
-    >>> vf.plot_histplot(hue='Type', af=af)
-
-.. image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/plot_histplot.png
+    >>> import matplotlib.pyplot as plt
+    >>> from fuc import common, pyvcf
+    >>> common.load_dataset('pyvcf')
+    >>> vf = pyvcf.VcfFrame.from_file('~/fuc-data/pyvcf/normal-tumor.vcf')
+    >>> af = pyvcf.AnnFrame.from_file('~/fuc-data/pyvcf/normal-tumor-annot.tsv', 'Sample')
+    >>> normal = af.df[af.df.Tissue == 'Normal'].index
+    >>> tumor = af.df[af.df.Tissue == 'Tumor'].index
+    >>> fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(2, 2, figsize=(10, 10))
+    >>> vf.plot_tmb(ax=ax1)
+    >>> vf.plot_tmb(ax=ax2, af=af, hue='Tissue')
+    >>> vf.plot_hist('DP', ax=ax3, af=af, hue='Tissue')
+    >>> vf.plot_regplot(normal, tumor, ax=ax4)
+    >>> plt.tight_layout()
+    >>> plt.savefig('normal-tumor.png')
+
+.. image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/normal-tumor.png
 
 MAF
 ---
@@ -284,6 +270,7 @@
    pandas
    pyranges
    pysam
+   scipy
    seaborn
 
 There are various ways you can install fuc. The recommended way is via conda:
diff --git a/docs/requirements.txt b/docs/requirements.txt
index cb841c9..497ebda 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -6,4 +6,5 @@ matplotlib-venn
 numpy
 pandas
 pysam
+scipy
 seaborn
diff --git a/fuc/api/common.py b/fuc/api/common.py
index 8c524ca..a192358 100644
--- a/fuc/api/common.py
+++ b/fuc/api/common.py
@@ -73,6 +73,8 @@ def load_dataset(name, force=False):
         ],
         'pyvcf': [
             'plot_comparison.vcf',
+            'normal-tumor.vcf',
+            'normal-tumor-annot.tsv',
         ],
     }
     base_url = ('https://raw.githubusercontent.com/sbslee/fuc-data/main')
diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py
index b7883af..7c61343 100644
--- a/fuc/api/pyvcf.py
+++ b/fuc/api/pyvcf.py
@@ -3,7 +3,7 @@
 ``pyvcf.VcfFrame`` which stores VCF data as ``pandas.DataFrame`` to allow
 fast computation and easy manipulation. The ``pyvcf.VcfFrame`` class also
 contains many useful plotting methods such as ``VcfFrame.plot_comparison``
-and ``VcfFrame.plot_regplot``. The submodule strictly adheres to the
+and ``VcfFrame.plot_tmb``. The submodule strictly adheres to the
 standard `VCF specification
 <https://samtools.github.io/hts-specs/VCFv4.3.pdf>`_.
 
@@ -58,6 +58,7 @@
 from matplotlib_venn import venn2, venn3
 import os
 import seaborn as sns
+import scipy.stats as stats
 
 CONTIGS = [
     '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13',
@@ -666,6 +667,60 @@ def from_file(cls, fn, sample_col, sep='\t'):
         df = df.set_index(sample_col)
         return cls(df)
 
+def simulate_genotype(
+    p=0.5, noise_scale=0.05, dp_show=True, dp_loc=30, dp_scale=10,
+    ad_show=True, ad_loc=0.5, ad_scale=0.05, af_show=True
+):
+    lower, upper = 0, 1
+    mu, sigma = p, noise_scale
+    X = stats.truncnorm((lower - mu) / sigma, (upper - mu) / sigma, loc=mu, scale=sigma)
+    has_var = np.random.binomial(1, X.rvs(1)) == 1
+
+    if has_var:
+        result = '0/1'
+    else:
+        result = '0/0'
+
+    dp = np.random.normal(loc=dp_loc, scale=dp_scale)
+    dp = round(abs(dp))
+
+    if has_var:
+        alt = round(abs(np.random.normal(loc=ad_loc, scale=ad_scale)) * dp)
+        ref = dp - alt
+        ad = f'{ref},{alt}'
+    else:
+        ad = f'{dp},0'
+
+    if has_var:
+        af = round(alt / (ref + alt), 3)
+    else:
+        af = 0
+
+    if dp_show:
+        result += f':{dp}'
+
+    if ad_show:
+        result += f':{ad}'
+
+    if af_show:
+        result += f':{af}'
+
+    return result
+
+def simulate_sample(
+    n, p=0.5, noise_scale=0.1, dp_show=True, dp_loc=30, dp_scale=10,
+    ad_show=True, ad_loc=0.5, ad_scale=0.05, af_show=True
+):
+    l = []
+    for i in range(n):
+        genotype = simulate_genotype(
+            p=p, noise_scale=noise_scale, dp_show=dp_show,
+            dp_loc=dp_loc, dp_scale=dp_scale, ad_show=ad_show,
+            ad_loc=ad_loc, ad_scale=ad_scale
+        )
+        l.append(genotype)
+    return l
+
 class VcfFrame:
     """Class for storing VCF data.
 
@@ -1782,18 +1837,96 @@ def _plot_comparison_three(self, a, b, c, venn_kws):
         out = venn3(subsets=n[:-1], **venn_kws)
         return out
 
-    def plot_histplot(
+    def plot_hist(
+        self, k, af=None, hue=None, kde=True, ax=None, figsize=None, **kwargs
+    ):
+        """
+        Create a histogram showing AD/AF/DP distribution.
+
+        Parameters
+        ----------
+        k : {'AD', 'AF', 'DP'}
+            Genotype key.
+        af : pyvcf.AnnFrame
+            AnnFrame containing sample annotation data (requires ``hue``).
+        hue : list, optional
+            Grouping variable that will produce multiple histograms with
+            different colors (requires ``af``).
+        kde : bool, default: True
+            Compute a kernel density estimate to smooth the distribution.
+        ax : matplotlib.axes.Axes, optional
+            Pre-existing axes for the plot. Otherwise, crete a new one.
+        figsize : tuple, optional
+            Width, height in inches. Format: (float, float).
+        kwargs
+            Other keyword arguments will be passed down to
+            :meth:`seaborn.histplot`.
+
+        Returns
+        -------
+        matplotlib.axes.Axes
+            The matplotlib axes containing the plot.
+
+        Examples
+        --------
+        Below is a simple example:
+
+        .. plot::
+            :context: close-figs
+
+            >>> from fuc import common, pyvcf
+            >>> common.load_dataset('pyvcf')
+            >>> vf = pyvcf.VcfFrame.from_file('~/fuc-data/pyvcf/normal-tumor.vcf')
+            >>> af = pyvcf.AnnFrame.from_file('~/fuc-data/pyvcf/normal-tumor-annot.tsv', 'Sample')
+            >>> vf.plot_hist('DP')
+
+        We can draw multiple histograms with hue mapping:
+
+        .. plot::
+            :context: close-figs
+
+            >>> vf.plot_hist('DP', af=af, hue='Tissue')
+
+        We can show AF instead of DP:
+
+        .. plot::
+            :context: close-figs
+
+            >>> vf.plot_hist('AF')
+        """
+        d = {
+            'AD': lambda x: float(x.split(',')[1]),
+            'AF': lambda x: float(x),
+            'DP': lambda x: int(x),
+        }
+        df = self.extract(k, as_nan=True, func=d[k])
+        df = df.T
+        id_vars = ['index']
+        if hue is not None:
+            df = pd.concat([df, af.df[hue]], axis=1, join='inner')
+            id_vars.append(hue)
+        df = df.reset_index()
+        df = pd.melt(df, id_vars=id_vars)
+        df = df.dropna()
+        df = df.rename(columns={'value': k})
+        if ax is None:
+            fig, ax = plt.subplots(figsize=figsize)
+        sns.histplot(data=df, x=k, hue=hue, kde=kde, ax=ax, **kwargs)
+        return ax
+
+    def plot_tmb(
         self, af=None, hue=None, kde=True, ax=None, figsize=None, **kwargs
     ):
-        """Create a histogram of TMB distribution.
+        """
+        Create a histogram showing TMB distribution.
 
         Parameters
         ----------
         af : pyvcf.AnnFrame
-            AnnFrame.
+            AnnFrame containing sample annotation data (requires ``hue``).
         hue : list, optional
-            Grouping variable that will produce bars with different colors.
-            Requires the `af` option.
+            Grouping variable that will produce multiple histograms with
+            different colors (requires ``af``).
         kde : bool, default: True
             Compute a kernel density estimate to smooth the distribution.
         ax : matplotlib.axes.Axes, optional
@@ -1811,43 +1944,23 @@ def plot_histplot(
 
         Examples
         --------
+        Below is a simple example:
 
         .. plot::
             :context: close-figs
 
-            >>> from fuc import pyvcf
-            >>> vcf_data = {
-            ...     'CHROM': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'],
-            ...     'POS': [100, 101, 102, 103, 103],
-            ...     'ID': ['.', '.', '.', '.', '.'],
-            ...     'REF': ['T', 'T', 'T', 'T', 'T'],
-            ...     'ALT': ['C', 'C', 'C', 'C', 'C'],
-            ...     'QUAL': ['.', '.', '.', '.', '.'],
-            ...     'FILTER': ['.', '.', '.', '.', '.'],
-            ...     'INFO': ['.', '.', '.', '.', '.'],
-            ...     'FORMAT': ['GT', 'GT', 'GT', 'GT', 'GT'],
-            ...     'Steven_N': ['0/0', '0/0', '0/1', '0/0', '0/0'],
-            ...     'Steven_T': ['0/0', '0/1', '0/1', '0/1', '0/1'],
-            ...     'Sara_N': ['0/0', '0/1', '0/0', '0/0', '0/0'],
-            ...     'Sara_T': ['0/0', '0/0', '1/1', '1/1', '0/1'],
-            ...     'John_N': ['0/0', '0/0', '0/0', '0/0', '0/0'],
-            ...     'John_T': ['0/1', '0/0', '1/1', '1/1', '0/1'],
-            ...     'Rachel_N': ['0/0', '0/0', '0/0', '0/0', '0/0'],
-            ...     'Rachel_T': ['0/1', '0/1', '0/0', '0/1', '0/1'],
-            ... }
-            >>> annot_data = {
-            ...     'Sample': ['Steven_N', 'Steven_T', 'Sara_N', 'Sara_T', 'John_N', 'John_T', 'Rachel_N', 'Rachel_T'],
-            ...     'Subject': ['Steven', 'Steven', 'Sara', 'Sara', 'John', 'John', 'Rachel', 'Rachel'],
-            ...     'Type': ['Normal', 'Tumor', 'Normal', 'Tumor', 'Normal', 'Tumor', 'Normal', 'Tumor'],
-            ... }
-            >>> vf = pyvcf.VcfFrame.from_dict([], vcf_data)
-            >>> af = pyvcf.AnnFrame.from_dict(annot_data, 'Sample')
-            >>> vf.plot_histplot()
+            >>> from fuc import common, pyvcf
+            >>> common.load_dataset('pyvcf')
+            >>> vf = pyvcf.VcfFrame.from_file('~/fuc-data/pyvcf/normal-tumor.vcf')
+            >>> af = pyvcf.AnnFrame.from_file('~/fuc-data/pyvcf/normal-tumor-annot.tsv', 'Sample')
+            >>> vf.plot_tmb()
+
+        We can draw multiple histograms with hue mapping:
 
         .. plot::
             :context: close-figs
 
-            >>> vf.plot_histplot(hue='Type', af=af)
+            >>> vf.plot_tmb(af=af, hue='Tissue')
         """
         s = self.df.iloc[:, 9:].applymap(gt_hasvar).sum()
         s.name = 'TMB'
@@ -1857,7 +1970,7 @@ def plot_histplot(
             df = pd.concat([af.df, s], axis=1, join='inner')
         if ax is None:
             fig, ax = plt.subplots(figsize=figsize)
-        sns.histplot(data=df, x='TMB', hue=hue, kde=kde, **kwargs)
+        sns.histplot(data=df, x='TMB', ax=ax, hue=hue, kde=kde, **kwargs)
         return ax
 
     def plot_regplot(self, a, b, ax=None, figsize=None, **kwargs):
@@ -1885,29 +1998,13 @@ def plot_regplot(self, a, b, ax=None, figsize=None, **kwargs):
 
         .. plot::
 
-            >>> from fuc import pyvcf
-            >>> data = {
-            ...     'CHROM': ['chr1', 'chr1', 'chr1'],
-            ...     'POS': [100, 101, 102],
-            ...     'ID': ['.', '.', '.'],
-            ...     'REF': ['G', 'T', 'T'],
-            ...     'ALT': ['A', 'C', 'C'],
-            ...     'QUAL': ['.', '.', '.'],
-            ...     'FILTER': ['.', '.', '.'],
-            ...     'INFO': ['.', '.', '.'],
-            ...     'FORMAT': ['GT', 'GT', 'GT'],
-            ...     'Steven_A': ['0/1', '0/1', '0/1'],
-            ...     'Steven_B': ['0/1', '0/1', '0/1'],
-            ...     'Sara_A': ['0/0', '1/1', '1/1'],
-            ...     'Sara_B': ['0/0', '1/1', '1/1'],
-            ...     'John_A': ['0/0', '0/0', '1/1'],
-            ...     'John_B': ['0/0', '0/0', '1/1'],
-            ... }
-            >>> vf = pyvcf.VcfFrame.from_dict([], data)
-            >>> vf.df
-            >>> a = ['Steven_A', 'Sara_A', 'John_A']
-            >>> b = ['Steven_B', 'Sara_B', 'John_B']
-            >>> vf.plot_regplot(a, b)
+            >>> from fuc import common, pyvcf
+            >>> common.load_dataset('pyvcf')
+            >>> vf = pyvcf.VcfFrame.from_file('~/fuc-data/pyvcf/normal-tumor.vcf')
+            >>> af = pyvcf.AnnFrame.from_file('~/fuc-data/pyvcf/normal-tumor-annot.tsv', 'Sample')
+            >>> normal = af.df[af.df.Tissue == 'Normal'].index
+            >>> tumor = af.df[af.df.Tissue == 'Tumor'].index
+            >>> vf.plot_regplot(normal, tumor)
         """
         s = self.df.iloc[:, 9:].applymap(gt_hasvar).sum()
         if ax is None:
diff --git a/setup.py b/setup.py
index dbc32f3..32d7c33 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@
 
 requirements = [
     'biopython', 'lxml', 'matplotlib', 'matplotlib-venn', 'numpy', 'pandas',
-    'pyranges', 'pysam', 'seaborn'
+    'pyranges', 'pysam', 'scipy', 'seaborn'
 ]
 
 setup(

From b280e5902dbac67185eeae6f4dddaf68eb6b9f5c Mon Sep 17 00:00:00 2001
From: Seung-been Lee <sbstevenlee@gmail.com>
Date: Sat, 12 Jun 2021 19:11:04 +0900
Subject: [PATCH 10/10] Update API

---
 README.rst     | 1 -
 docs/create.py | 1 -
 2 files changed, 2 deletions(-)

diff --git a/README.rst b/README.rst
index 7f6771b..dcc0fa5 100644
--- a/README.rst
+++ b/README.rst
@@ -178,7 +178,6 @@ To create various figures for normal-tumor analysis:
     >>> vf.plot_hist('DP', ax=ax3, af=af, hue='Tissue')
     >>> vf.plot_regplot(normal, tumor, ax=ax4)
     >>> plt.tight_layout()
-    >>> plt.savefig('normal-tumor.png')
 
 .. image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/normal-tumor.png
 
diff --git a/docs/create.py b/docs/create.py
index 374bff6..ba1d8d2 100644
--- a/docs/create.py
+++ b/docs/create.py
@@ -206,7 +206,6 @@
     >>> vf.plot_hist('DP', ax=ax3, af=af, hue='Tissue')
     >>> vf.plot_regplot(normal, tumor, ax=ax4)
     >>> plt.tight_layout()
-    >>> plt.savefig('normal-tumor.png')
 
 .. image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/normal-tumor.png