diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f6e5866..18679d3 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,16 @@ Changelog ********* +0.11.0 (2021-06-10) +------------------- + +* :issue:`16`: Add new method :meth:`pyvcf.VcfFrame.cfilter_empty`. +* Add new methods :meth:`pyvep.filter_af/lof`. +* Add ``matplotlib-venn`` package as dependency for plotting Venn diagrams. +* Add new methods :meth:`pyvcf.plot_comparison/regplot/histplot`. +* :issue:`17`: Add new method :meth:`pyvep.filter_biotype`. +* Add new class :class:`pyvcf.AnnFrame`. + 0.10.0 (2021-06-03) ------------------- diff --git a/README.rst b/README.rst index cb15882..adb03b2 100644 --- a/README.rst +++ b/README.rst @@ -56,24 +56,36 @@ Your contributions (e.g. feature ideas, pull requests) are most welcome. CLI Examples ============ +SAM/BAM/CRAM +------------ + To print the header of a BAM file: .. code-block:: console $ fuc bam_head example.bam +BED +--- + To find intersection between BED files: .. code-block:: console $ fuc bed_intxn 1.bed 2.bed 3.bed > intersect.bed +FASTQ +----- + To count sequence reads in a FASTQ file: .. code-block:: console $ fuc fq_count example.fastq +FUC +--- + To check whether a file exists in the operating system: .. code-block:: console @@ -86,12 +98,18 @@ To find all VCF files within the current directory recursively: $ fuc fuc_find . vcf +TABLE +----- + To merge two tab-delimited files: .. code-block:: console $ fuc tbl_merge left.txt right.txt > merged.txt +VCF +--- + To merge VCF files: .. code-block:: console @@ -101,6 +119,9 @@ To merge VCF files: API Examples ============ +VCF +--- + To filter a VCF file based on a BED file: .. code:: python3 @@ -119,11 +140,63 @@ To remove indels from a VCF file: >>> filtered_vf = vf.filter_indel() >>> filtered_vf.to_file('no_indels.vcf') +To create a Venn diagram showing genotype concordance between groups: + +.. 
code:: python3 + + >>> from fuc import pyvcf, common + >>> common.load_dataset('pyvcf') + >>> f = '~/fuc-data/pyvcf/plot_comparison.vcf' + >>> vf = pyvcf.VcfFrame.from_file(f) + >>> a = ['Steven_A', 'John_A', 'Sara_A'] + >>> b = ['Steven_B', 'John_B', 'Sara_B'] + >>> c = ['Steven_C', 'John_C', 'Sara_C'] + >>> vf.plot_comparison(a, b, c) + +.. image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/plot_comparison.png + +To create a histogram of tumor mutational burden (TMB) distribution: + +.. code:: python3 + + >>> from fuc import pyvcf + >>> vcf_data = { + ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'], + ... 'POS': [100, 101, 102, 103, 103], + ... 'ID': ['.', '.', '.', '.', '.'], + ... 'REF': ['T', 'T', 'T', 'T', 'T'], + ... 'ALT': ['C', 'C', 'C', 'C', 'C'], + ... 'QUAL': ['.', '.', '.', '.', '.'], + ... 'FILTER': ['.', '.', '.', '.', '.'], + ... 'INFO': ['.', '.', '.', '.', '.'], + ... 'FORMAT': ['GT', 'GT', 'GT', 'GT', 'GT'], + ... 'Steven_N': ['0/0', '0/0', '0/1', '0/0', '0/0'], + ... 'Steven_T': ['0/0', '0/1', '0/1', '0/1', '0/1'], + ... 'Sara_N': ['0/0', '0/1', '0/0', '0/0', '0/0'], + ... 'Sara_T': ['0/0', '0/0', '1/1', '1/1', '0/1'], + ... 'John_N': ['0/0', '0/0', '0/0', '0/0', '0/0'], + ... 'John_T': ['0/1', '0/0', '1/1', '1/1', '0/1'], + ... 'Rachel_N': ['0/0', '0/0', '0/0', '0/0', '0/0'], + ... 'Rachel_T': ['0/1', '0/1', '0/0', '0/1', '0/1'], + ... } + >>> annot_data = { + ... 'Sample': ['Steven_N', 'Steven_T', 'Sara_N', 'Sara_T', 'John_N', 'John_T', 'Rachel_N', 'Rachel_T'], + ... 'Subject': ['Steven', 'Steven', 'Sara', 'Sara', 'John', 'John', 'Rachel', 'Rachel'], + ... 'Type': ['Normal', 'Tumor', 'Normal', 'Tumor', 'Normal', 'Tumor', 'Normal', 'Tumor'], + ... } + >>> vf = pyvcf.VcfFrame.from_dict([], vcf_data) + >>> af = pyvcf.AnnFrame.from_dict(annot_data, 'Sample') + >>> vf.plot_histplot(hue='Type', af=af) + +.. 
image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/plot_histplot.png + +MAF +--- + To create an oncoplot with a MAF file: .. code:: python3 - >>> import matplotlib.pyplot as plt >>> from fuc import common, pymaf >>> common.load_dataset('tcga-laml') >>> f = '~/fuc-data/tcga-laml/tcga_laml.maf.gz' @@ -140,7 +213,6 @@ To create a summary figure for a MAF file: .. code:: python3 - >>> import matplotlib.pyplot as plt >>> from fuc import common, pymaf >>> common.load_dataset('tcga-laml') >>> f = '~/fuc-data/tcga-laml/tcga_laml.maf.gz' @@ -149,11 +221,13 @@ To create a summary figure for a MAF file: .. image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/maf_summary.png +SAM/BAM/CRAM +------------ + To create read depth profile of a region from a CRAM file: .. code:: python3 - >>> import matplotlib.pyplot as plt >>> from fuc import pycov >>> cf = pycov.CovFrame.from_file('HG00525.final.cram', zero=True, ... region='chr12:21161194-21239796', names=['HG00525']) @@ -197,7 +271,7 @@ Finally, you can clone the GitHub repository and then install fuc this way: $ cd fuc $ pip install . -The nice thing about this approach is that you will have access to development versions that are not available in Anaconda or PyPI. For example, you can access a development branch with the ``git checkout`` command. +The nice thing about this approach is that you will have access to development versions that are not available in Anaconda or PyPI. For example, you can access a development branch with the ``git checkout`` command. When you do this, please make sure your environment already has all the dependencies installed. Getting Help ============ @@ -251,10 +325,10 @@ Below is the list of submodules available in API: - **pybed** : The pybed submodule is designed for working with BED files. It implements ``pybed.BedFrame`` which stores BED data as ``pandas.DataFrame`` via the `pyranges `_ package to allow fast computation and easy manipulation. 
The submodule strictly adheres to the standard `BED specification `_. - **pycov** : The pycov submodule is designed for working with depth of coverage data from sequence alingment files (SAM/BAM/CRAM). It implements ``pycov.CovFrame`` which stores read depth data as ``pandas.DataFrame`` via the `pysam `_ package to allow fast computation and easy manipulation. - **pyfq** : The pyfq submodule is designed for working with FASTQ files. It implements ``pyfq.FqFrame`` which stores FASTQ data as ``pandas.DataFrame`` to allow fast computation and easy manipulation. -- **pymaf** : The pymaf submodule is designed for working with MAF files. It implements ``pymaf.MafFrame`` which stores MAF data as ``pandas.DataFrame`` to allow fast computation and easy manipulation. The class also contains many useful plotting methods such as ``MafFrame.plot_varcls`` and ``MafFrame.plot_waterfall``. The submodule strictly adheres to the standard `MAF specification `_. -- **pysnpeff** : The pysnpeff submodule is designed for parsing VCF annotation data from the `SnpEff `_ program. It is designed to be used with ``pyvcf.VcfFrame``. -- **pyvcf** : The pyvcf submodule is designed for working with VCF files. It implements ``pyvcf.VcfFrame`` class which stores VCF data as ``pandas.DataFrame`` to allow fast computation and easy manipulation. The submodule strictly adheres to the standard `VCF specification `_. -- **pyvep** : The pyvep submodule is designed for parsing VCF annotation data from the `Ensembl VEP `_. It is designed to be used with ``pyvcf.VcfFrame``. +- **pymaf** : The pymaf submodule is designed for working with MAF files. It implements ``pymaf.MafFrame`` which stores MAF data as ``pandas.DataFrame`` to allow fast computation and easy manipulation. The ``pymaf.MafFrame`` class also contains many useful plotting methods such as ``MafFrame.plot_oncoplot`` and ``MafFrame.plot_summary``. The submodule strictly adheres to the standard `MAF specification `_. 
+- **pysnpeff** : The pysnpeff submodule is designed for parsing VCF annotation data from the `SnpEff `_ program. It should be used with ``pyvcf.VcfFrame``. +- **pyvcf** : The pyvcf submodule is designed for working with VCF files. It implements ``pyvcf.VcfFrame`` which stores VCF data as ``pandas.DataFrame`` to allow fast computation and easy manipulation. The ``pyvcf.VcfFrame`` class also contains many useful plotting methods such as ``VcfFrame.plot_comparison`` and ``VcfFrame.plot_regplot``. The submodule strictly adheres to the standard `VCF specification `_. +- **pyvep** : The pyvep submodule is designed for parsing VCF annotation data from the `Ensembl VEP `_ program. It should be used with ``pyvcf.VcfFrame``. For getting help on a specific module (e.g. pyvcf): diff --git a/conda.yml b/conda.yml index c58df7e..3db176e 100644 --- a/conda.yml +++ b/conda.yml @@ -8,6 +8,7 @@ dependencies: - cython - lxml - matplotlib + - matplotlib-venn - notebook - numpy - pandas diff --git a/docs/api.rst b/docs/api.rst index 80e80fe..bc52021 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -16,10 +16,10 @@ Below is the list of submodules available in API: - **pybed** : The pybed submodule is designed for working with BED files. It implements ``pybed.BedFrame`` which stores BED data as ``pandas.DataFrame`` via the `pyranges `_ package to allow fast computation and easy manipulation. The submodule strictly adheres to the standard `BED specification `_. - **pycov** : The pycov submodule is designed for working with depth of coverage data from sequence alingment files (SAM/BAM/CRAM). It implements ``pycov.CovFrame`` which stores read depth data as ``pandas.DataFrame`` via the `pysam `_ package to allow fast computation and easy manipulation. - **pyfq** : The pyfq submodule is designed for working with FASTQ files. It implements ``pyfq.FqFrame`` which stores FASTQ data as ``pandas.DataFrame`` to allow fast computation and easy manipulation. 
-- **pymaf** : The pymaf submodule is designed for working with MAF files. It implements ``pymaf.MafFrame`` which stores MAF data as ``pandas.DataFrame`` to allow fast computation and easy manipulation. The class also contains many useful plotting methods such as ``MafFrame.plot_varcls`` and ``MafFrame.plot_waterfall``. The submodule strictly adheres to the standard `MAF specification `_. -- **pysnpeff** : The pysnpeff submodule is designed for parsing VCF annotation data from the `SnpEff `_ program. It is designed to be used with ``pyvcf.VcfFrame``. -- **pyvcf** : The pyvcf submodule is designed for working with VCF files. It implements ``pyvcf.VcfFrame`` class which stores VCF data as ``pandas.DataFrame`` to allow fast computation and easy manipulation. The submodule strictly adheres to the standard `VCF specification `_. -- **pyvep** : The pyvep submodule is designed for parsing VCF annotation data from the `Ensembl VEP `_. It is designed to be used with ``pyvcf.VcfFrame``. +- **pymaf** : The pymaf submodule is designed for working with MAF files. It implements ``pymaf.MafFrame`` which stores MAF data as ``pandas.DataFrame`` to allow fast computation and easy manipulation. The ``pymaf.MafFrame`` class also contains many useful plotting methods such as ``MafFrame.plot_oncoplot`` and ``MafFrame.plot_summary``. The submodule strictly adheres to the standard `MAF specification `_. +- **pysnpeff** : The pysnpeff submodule is designed for parsing VCF annotation data from the `SnpEff `_ program. It should be used with ``pyvcf.VcfFrame``. +- **pyvcf** : The pyvcf submodule is designed for working with VCF files. It implements ``pyvcf.VcfFrame`` which stores VCF data as ``pandas.DataFrame`` to allow fast computation and easy manipulation. The ``pyvcf.VcfFrame`` class also contains many useful plotting methods such as ``VcfFrame.plot_comparison`` and ``VcfFrame.plot_regplot``. The submodule strictly adheres to the standard `VCF specification `_. 
+- **pyvep** : The pyvep submodule is designed for parsing VCF annotation data from the `Ensembl VEP `_ program. It should be used with ``pyvcf.VcfFrame``. For getting help on a specific module (e.g. pyvcf): diff --git a/docs/conf.py b/docs/conf.py index 16d6ad4..ee990f2 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -49,6 +49,8 @@ issues_github_path = 'sbslee/fuc' +napoleon_use_param = False + # Include the example source for plots in API docs plot_include_source = True plot_formats = [('png', 90)] diff --git a/docs/create.py b/docs/create.py index 0069e54..f639cfa 100644 --- a/docs/create.py +++ b/docs/create.py @@ -84,24 +84,36 @@ CLI Examples ============ +SAM/BAM/CRAM +------------ + To print the header of a BAM file: .. code-block:: console $ fuc bam_head example.bam +BED +--- + To find intersection between BED files: .. code-block:: console $ fuc bed_intxn 1.bed 2.bed 3.bed > intersect.bed +FASTQ +----- + To count sequence reads in a FASTQ file: .. code-block:: console $ fuc fq_count example.fastq +FUC +--- + To check whether a file exists in the operating system: .. code-block:: console @@ -114,12 +126,18 @@ $ fuc fuc_find . vcf +TABLE +----- + To merge two tab-delimited files: .. code-block:: console $ fuc tbl_merge left.txt right.txt > merged.txt +VCF +--- + To merge VCF files: .. code-block:: console @@ -129,6 +147,9 @@ API Examples ============ +VCF +--- + To filter a VCF file based on a BED file: .. code:: python3 @@ -147,11 +168,63 @@ >>> filtered_vf = vf.filter_indel() >>> filtered_vf.to_file('no_indels.vcf') +To create a Venn diagram showing genotype concordance between groups: + +.. code:: python3 + + >>> from fuc import pyvcf, common + >>> common.load_dataset('pyvcf') + >>> f = '~/fuc-data/pyvcf/plot_comparison.vcf' + >>> vf = pyvcf.VcfFrame.from_file(f) + >>> a = ['Steven_A', 'John_A', 'Sara_A'] + >>> b = ['Steven_B', 'John_B', 'Sara_B'] + >>> c = ['Steven_C', 'John_C', 'Sara_C'] + >>> vf.plot_comparison(a, b, c) + +.. 
image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/plot_comparison.png + +To create a histogram of tumor mutational burden (TMB) distribution: + +.. code:: python3 + + >>> from fuc import pyvcf + >>> vcf_data = {{ + ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'], + ... 'POS': [100, 101, 102, 103, 103], + ... 'ID': ['.', '.', '.', '.', '.'], + ... 'REF': ['T', 'T', 'T', 'T', 'T'], + ... 'ALT': ['C', 'C', 'C', 'C', 'C'], + ... 'QUAL': ['.', '.', '.', '.', '.'], + ... 'FILTER': ['.', '.', '.', '.', '.'], + ... 'INFO': ['.', '.', '.', '.', '.'], + ... 'FORMAT': ['GT', 'GT', 'GT', 'GT', 'GT'], + ... 'Steven_N': ['0/0', '0/0', '0/1', '0/0', '0/0'], + ... 'Steven_T': ['0/0', '0/1', '0/1', '0/1', '0/1'], + ... 'Sara_N': ['0/0', '0/1', '0/0', '0/0', '0/0'], + ... 'Sara_T': ['0/0', '0/0', '1/1', '1/1', '0/1'], + ... 'John_N': ['0/0', '0/0', '0/0', '0/0', '0/0'], + ... 'John_T': ['0/1', '0/0', '1/1', '1/1', '0/1'], + ... 'Rachel_N': ['0/0', '0/0', '0/0', '0/0', '0/0'], + ... 'Rachel_T': ['0/1', '0/1', '0/0', '0/1', '0/1'], + ... }} + >>> annot_data = {{ + ... 'Sample': ['Steven_N', 'Steven_T', 'Sara_N', 'Sara_T', 'John_N', 'John_T', 'Rachel_N', 'Rachel_T'], + ... 'Subject': ['Steven', 'Steven', 'Sara', 'Sara', 'John', 'John', 'Rachel', 'Rachel'], + ... 'Type': ['Normal', 'Tumor', 'Normal', 'Tumor', 'Normal', 'Tumor', 'Normal', 'Tumor'], + ... }} + >>> vf = pyvcf.VcfFrame.from_dict([], vcf_data) + >>> af = pyvcf.AnnFrame.from_dict(annot_data, 'Sample') + >>> vf.plot_histplot(hue='Type', af=af) + +.. image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/plot_histplot.png + +MAF +--- + To create an oncoplot with a MAF file: .. code:: python3 - >>> import matplotlib.pyplot as plt >>> from fuc import common, pymaf >>> common.load_dataset('tcga-laml') >>> f = '~/fuc-data/tcga-laml/tcga_laml.maf.gz' @@ -168,7 +241,6 @@ .. 
code:: python3 - >>> import matplotlib.pyplot as plt >>> from fuc import common, pymaf >>> common.load_dataset('tcga-laml') >>> f = '~/fuc-data/tcga-laml/tcga_laml.maf.gz' @@ -177,11 +249,13 @@ .. image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/maf_summary.png +SAM/BAM/CRAM +------------ + To create read depth profile of a region from a CRAM file: .. code:: python3 - >>> import matplotlib.pyplot as plt >>> from fuc import pycov >>> cf = pycov.CovFrame.from_file('HG00525.final.cram', zero=True, ... region='chr12:21161194-21239796', names=['HG00525']) @@ -225,7 +299,7 @@ $ cd fuc $ pip install . -The nice thing about this approach is that you will have access to development versions that are not available in Anaconda or PyPI. For example, you can access a development branch with the ``git checkout`` command. +The nice thing about this approach is that you will have access to development versions that are not available in Anaconda or PyPI. For example, you can access a development branch with the ``git checkout`` command. When you do this, please make sure your environment already has all the dependencies installed. 
Getting Help ============ diff --git a/docs/requirements.txt b/docs/requirements.txt index 6d20e7a..cb841c9 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -2,6 +2,7 @@ sphinx_rtd_theme sphinx_issues autodocsumm matplotlib +matplotlib-venn numpy pandas pysam diff --git a/fuc/api/common.py b/fuc/api/common.py index e087619..8c524ca 100644 --- a/fuc/api/common.py +++ b/fuc/api/common.py @@ -69,8 +69,11 @@ def load_dataset(name, force=False): 'tcga-laml': [ 'tcga_cohort.txt.gz', 'tcga_laml.maf.gz', - 'tcga_laml_annot.tsv' - ] + 'tcga_laml_annot.tsv', + ], + 'pyvcf': [ + 'plot_comparison.vcf', + ], } base_url = ('https://raw.githubusercontent.com/sbslee/fuc-data/main') for f in datasets[name]: diff --git a/fuc/api/pymaf.py b/fuc/api/pymaf.py index 6c60ffe..67f9f18 100644 --- a/fuc/api/pymaf.py +++ b/fuc/api/pymaf.py @@ -1,11 +1,49 @@ """ The pymaf submodule is designed for working with MAF files. It implements ``pymaf.MafFrame`` which stores MAF data as ``pandas.DataFrame`` to allow -fast computation and easy manipulation. The class also contains many useful -plotting methods such as ``MafFrame.plot_varcls`` and -``MafFrame.plot_waterfall``. The submodule strictly adheres to the +fast computation and easy manipulation. The ``pymaf.MafFrame`` class also +contains many useful plotting methods such as ``MafFrame.plot_oncoplot`` and +``MafFrame.plot_summary``. The submodule strictly adheres to the standard `MAF specification `_. + +A typical MAF file contains many fields ranging from gene symbol to +protein change. However, most of the analysis in pymaf uses the +following fields: + ++------+------------------------+----------------------+-------------------------------+ +| No. 
| Name | Description | Examples | ++======+========================+======================+===============================+ +| 1 | Hugo_Symbol | HUGO gene symbol | 'TP53', 'Unknown' | ++------+------------------------+----------------------+-------------------------------+ +| 2 | Entrez_Gene_Id | Entrez or Ensembl ID | 0, 8714 | ++------+------------------------+----------------------+-------------------------------+ +| 3 | Center | Sequencing center | '.', 'genome.wustl.edu' | ++------+------------------------+----------------------+-------------------------------+ +| 4 | NCBI_Build | Genome assembly | '37', 'GRCh38' | ++------+------------------------+----------------------+-------------------------------+ +| 5 | Chromosome | Chromosome name | 'chr1' | ++------+------------------------+----------------------+-------------------------------+ +| 6 | Start_Position | Start coordinate | 119031351 | ++------+------------------------+----------------------+-------------------------------+ +| 7 | End_Position | End coordinate | 44079555 | ++------+------------------------+----------------------+-------------------------------+ +| 8 | Strand | Genomic strand | '+', '-' | ++------+------------------------+----------------------+-------------------------------+ +| 9 | Variant_Classification | Translational effect | 'Missense_Mutation', 'Silent' | ++------+------------------------+----------------------+-------------------------------+ +| 10 | Variant_Type | Mutation type | 'SNP', 'INS', 'DEL' | ++------+------------------------+----------------------+-------------------------------+ +| 11 | Reference_Allele | Reference allele | 'T', '-', 'ACAA' | ++------+------------------------+----------------------+-------------------------------+ +| 12 | Tumor_Seq_Allele1 | First tumor allele | 'A', '-', 'TCA' | ++------+------------------------+----------------------+-------------------------------+ +| 13 | Tumor_Seq_Allele2 | Second tumor allele | 'A', '-', 'TCA' | 
++------+------------------------+----------------------+-------------------------------+ +| 14 | Tumor_Sample_Barcode | Sample ID | 'TCGA-AB-3002' | ++------+------------------------+----------------------+-------------------------------+ +| 15 | Protein_Change | Protein change | 'p.L558Q' | ++------+------------------------+----------------------+-------------------------------+ """ import pandas as pd @@ -196,37 +234,131 @@ def legend_handles(name='regular'): return handles class AnnFrame: - """Class for storing annotation data. + """Class for storing sample annotation data. + + This class stores sample annotation data as pandas.DataFrame with sample + names as index. + + Note that an AnnFrame can have a different set of samples than its + accompanying MafFrame. Parameters ---------- df : pandas.DataFrame - DataFrame containing annotation data. The index must be - 'Tumor_Sample_Barcode'. + DataFrame containing sample annotation data. The index must be + sample names. See Also -------- + AnnFrame.from_dict + Construct AnnFrame from dict of array-like or dicts. AnnFrame.from_file - Construct AnnFrame from an annotation file. + Construct AnnFrame from a delimited text file. + + Examples + -------- + + >>> import pandas as pd + >>> from fuc import pymaf + >>> data = { + ... 'Tumor_Sample_Barcode': ['Steven_N', 'Steven_T', 'Sara_N', 'Sara_T'], + ... 'Subject': ['Steven', 'Steven', 'Sara', 'Sara'], + ... 'Type': ['Normal', 'Tumor', 'Normal', 'Tumor'], + ... 'Age': [30, 30, 57, 57] + ... } + >>> df = pd.DataFrame(data) + >>> df = df.set_index('Tumor_Sample_Barcode') + >>> af = pymaf.AnnFrame(df) + >>> af.df + Subject Type Age + Tumor_Sample_Barcode + Steven_N Steven Normal 30 + Steven_T Steven Tumor 30 + Sara_N Sara Normal 57 + Sara_T Sara Tumor 57 """ def __init__(self, df): - self.df = df + self._df = self._check_df(df) + + def _check_df(self, df): + if type(df.index) == pd.RangeIndex: + m = "Index must be sample names, not 'pandas.RangeIndex'." 
+ raise ValueError(m) + return df + + @property + def df(self): + """pandas.DataFrame : DataFrame containing sample annotation data.""" + return self._df + + @df.setter + def df(self, value): + self._df = self._check_df(value) + + @classmethod + def from_dict(cls, data, sample_col='Tumor_Sample_Barcode'): + """Construct AnnFrame from dict of array-like or dicts. + + The dictionary must have at least one column that represents sample + names which are used as index for pandas.DataFrame. + + Parameters + ---------- + data : dict + Of the form {field : array-like} or {field : dict}. + sample_col : str, default: 'Tumor_Sample_Barcode' + Column containing sample names. + + Returns + ------- + AnnFrame + AnnFrame object. + + See Also + -------- + AnnFrame + AnnFrame object creation using constructor. + AnnFrame.from_file + Construct AnnFrame from a delimited text file. + + Examples + -------- + + >>> from fuc import pymaf + >>> data = { + ... 'Tumor_Sample_Barcode': ['Steven_Normal', 'Steven_Tumor', 'Sara_Normal', 'Sara_Tumor'], + ... 'Subject': ['Steven', 'Steven', 'Sara', 'Sara'], + ... 'Type': ['Normal', 'Tumor', 'Normal', 'Tumor'], + ... 'Age': [30, 30, 57, 57] + ... } + >>> af = pymaf.AnnFrame.from_dict(data) + >>> af.df + Subject Type Age + Tumor_Sample_Barcode + Steven_Normal Steven Normal 30 + Steven_Tumor Steven Tumor 30 + Sara_Normal Sara Normal 57 + Sara_Tumor Sara Tumor 57 + """ + df = pd.DataFrame(data) + df = df.set_index(sample_col) + return cls(df) @classmethod - def from_file(cls, fn, sep='\t', sample_col=None): - """Construct AnnFrame from an annotation file. + def from_file(cls, fn, sample_col='Tumor_Sample_Barcode', sep='\t'): + """Construct AnnFrame from a delimited text file. - The input text file must contain a column that corresponds to - 'Tumor_Sample_Barcode'. + The text file must have at least one column that represents + sample names which are used as index for pandas.DataFrame. 
Parameters ---------- fn : str - Annotation file path (zipped or unzipped). - sep : str, default: '\t' + Text file path (zipped or unzipped). + sample_col : str, default: 'Tumor_Sample_Barcode' + Column containing sample names. + sep : str, default: '\\\\t' Delimiter to use. - sample_col : str, optional - If provided, use this column as 'Tumor_Sample_Barcode'. Returns ------- @@ -237,23 +369,44 @@ def from_file(cls, fn, sep='\t', sample_col=None): -------- AnnFrame AnnFrame object creation using constructor. - """ + AnnFrame.from_dict + Construct AnnFrame from dict of array-like or dicts. + + Examples + -------- + >>> from fuc import pymaf + >>> af1 = pymaf.AnnFrame.from_file('sample-annot-1.tsv') + >>> af2 = pymaf.AnnFrame.from_file('sample-annot-2.csv', sample_col='SampleID', sep=',') + """ df = pd.read_table(fn, sep=sep) - if sample_col is not None: - df = df.rename(columns={sample_col: 'Tumor_Sample_Barcode'}) - df = df.set_index('Tumor_Sample_Barcode') + df = df.set_index(sample_col) return cls(df) - def legend_handles(self, col, numeric=False, segments=5, samples=None, decimals=0, cmap='Pastel1'): + def legend_handles( + self, col, samples=None, numeric=False, segments=5, decimals=0, + cmap='Pastel1' + ): """Return legend handles for the given column. + In the case of a numeric column, use ``numeric=True`` which will + divide the values into equal-sized intervals, with the number of + intervals determined by the `segments` option. + Parameters ---------- col : str Column name. samples : list, optional If provided, these samples will be used to create legend handles. + numeric : bool, default: False + If True, the column will be treated as numeric. + segments : int, default: 5 + If ``numeric`` is True, the numbers will be divided + into this number of equal-sized segments. + decimals : int, default: 0 + If ``numeric`` is True, the numbers will be rounded up to this + number of decimals. cmap : str, default: 'Pastel1' Color map. 
@@ -262,6 +415,11 @@ def legend_handles(self, col, numeric=False, segments=5, samples=None, decimals= list Legend handles. + See Also + -------- + AnnFrame.plot_annot + Create a 1D categorical heatmap for the given column. + Examples -------- @@ -273,26 +431,42 @@ def legend_handles(self, col, numeric=False, segments=5, samples=None, decimals= >>> f = '~/fuc-data/tcga-laml/tcga_laml_annot.tsv' >>> af = pymaf.AnnFrame.from_file(f) >>> fig, ax = plt.subplots() - >>> handles1 = af.legend_handles('FAB_classification') - >>> handles2 = af.legend_handles('Overall_Survival_Status') - >>> leg1 = ax.legend(handles=handles1, title='FAB_classification', loc='center left') - >>> leg2 = ax.legend(handles=handles2, title='Overall_Survival_Status', loc='center right') + >>> handles1 = af.legend_handles('FAB_classification', + ... cmap='Dark2') + >>> handles2 = af.legend_handles('days_to_last_followup', + ... numeric=True, + ... cmap='viridis') + >>> handles3 = af.legend_handles('Overall_Survival_Status') + >>> leg1 = ax.legend(handles=handles1, + ... title='FAB_classification', + ... loc='center left') + >>> leg2 = ax.legend(handles=handles2, + ... title='days_to_last_followup', + ... loc='center') + >>> leg3 = ax.legend(handles=handles3, + ... title='Overall_Survival_Status', + ... 
loc='center right') >>> ax.add_artist(leg1) >>> ax.add_artist(leg2) + >>> ax.add_artist(leg3) >>> plt.tight_layout() """ - s, l = self._get_col(col, numeric=numeric, segments=segments, samples=samples) + s, l = self._get_col(col, numeric=numeric, segments=segments, + samples=samples) colors = plt.cm.get_cmap(cmap)(np.linspace(0, 1, len(l))) handles = [] for i, label in enumerate(l): handles.append(mpatches.Patch(color=colors[i], label=label)) return handles - def _get_col(self, col, numeric=False, segments=5, samples=None, decimals=0): + def _get_col( + self, col, numeric=False, segments=5, samples=None, decimals=0 + ): s = self.df[col] s = s.replace([np.inf, -np.inf], np.nan) if numeric: - boundaries = list(np.linspace(s.min(), s.max(), segments+1, endpoint=True)) + boundaries = list(np.linspace(s.min(), s.max(), + segments+1, endpoint=True)) intervals = list(zip(boundaries[:-1], boundaries[1:])) def f(x): if pd.isna(x): @@ -308,10 +482,66 @@ def f(x): return s, l def plot_annot( - self, col, numeric=False, segments=5, samples=None, decimals=0, cmap='Pastel1', ax=None, - figsize=None, **kwargs + self, col, samples=None, numeric=False, segments=5, decimals=0, + cmap='Pastel1', ax=None, figsize=None, **kwargs ): - s, l = self._get_col(col, numeric=numeric, segments=segments, samples=samples) + """Create a 1D categorical heatmap for the given column. + + In the case of a numeric column, use ``numeric=True`` which will + divide the values into equal-sized intervals, with the number of + intervals determined by the `segments` option. + + Parameters + ---------- + col : str + Column name. + samples : list, optional + If provided, these samples will be used to create legend handles. + numeric : bool, default: False + If True, the column will be treated as numeric. + segments : int, default: 5 + If ``numeric`` is True, the numbers will be divided + into this number of equal-sized segments. 
+ decimals : int, default: 0 + If ``numeric`` is True, the numbers will be rounded up to this + number of decimals. + cmap : str, default: 'Pastel1' + Color map. + + Returns + ------- + list + Legend handles. + + See Also + -------- + AnnFrame.legend_handles + Return legend handles for the given column. + + Examples + -------- + + .. plot:: + + >>> import matplotlib.pyplot as plt + >>> from fuc import common, pymaf + >>> common.load_dataset('tcga-laml') + >>> f = '~/fuc-data/tcga-laml/tcga_laml_annot.tsv' + >>> af = pymaf.AnnFrame.from_file(f) + >>> fig, [ax1, ax2, ax3] = plt.subplots(3, 1, figsize=(10, 5)) + >>> af.plot_annot('FAB_classification', ax=ax1, linewidths=1, cmap='Dark2') + >>> af.plot_annot('days_to_last_followup', ax=ax2, linewidths=1, cmap='viridis') + >>> af.plot_annot('Overall_Survival_Status', ax=ax3, linewidths=1) + >>> ax1.set_xlabel('') + >>> ax2.set_xlabel('') + >>> ax1.set_ylabel('') + >>> ax2.set_ylabel('') + >>> ax3.set_ylabel('') + >>> plt.tight_layout() + >>> plt.subplots_adjust(wspace=0.01, hspace=0.01) + """ + s, l = self._get_col(col, numeric=numeric, segments=segments, + samples=samples) d = {k: v for v, k in enumerate(l)} df = s.to_frame().applymap(lambda x: x if pd.isna(x) else d[x]) if ax is None: @@ -401,10 +631,22 @@ def from_file(cls, fn): def from_vcf(cls, vcf): """Construct MafFrame from a VCF file or VcfFrame. + The input VCF should already contain functional annotation data + from a tool such as Ensembl VEP, SnpEff, and ANNOVAR. The + recommended method is Ensembl VEP with "RefSeq transcripts" as the + transcript database and the filtering option "Show one selected + consequence per variant". + Parameters ---------- vcf : str or VcfFrame - VCF file path or VcfFrame. + VCF file or VcfFrame. + + Examples + -------- + + >>> from fuc import pymaf + >>> mf = pymaf.MafFrame.from_vcf('annotated.vcf') """ # Parse the input VCF.
if isinstance(vcf, str): diff --git a/fuc/api/pysnpeff.py b/fuc/api/pysnpeff.py index f81be48..f37ae2b 100644 --- a/fuc/api/pysnpeff.py +++ b/fuc/api/pysnpeff.py @@ -1,7 +1,7 @@ """ The pysnpeff submodule is designed for parsing VCF annotation data from -the `SnpEff `_ program. It is designed -to be used with ``pyvcf.VcfFrame``. +the `SnpEff `_ program. It should be +used with ``pyvcf.VcfFrame``. One VCF record can have several SnpEff annotations if, for example, the record is a multiallelic site or the variant is shared by diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index 3f8fa07..e9f62d6 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -1,18 +1,20 @@ """ The pyvcf submodule is designed for working with VCF files. It implements -``pyvcf.VcfFrame`` class which stores VCF data as -``pandas.DataFrame`` to allow fast computation and easy manipulation. -The submodule strictly adheres to the standard `VCF specification +``pyvcf.VcfFrame`` which stores VCF data as ``pandas.DataFrame`` to allow +fast computation and easy manipulation. The ``pyvcf.VcfFrame`` class also +contains many useful plotting methods such as ``VcfFrame.plot_comparison`` +and ``VcfFrame.plot_regplot``. The submodule strictly adheres to the +standard `VCF specification `_. -A VCF file contains metadata lines (prefixed with '##'), a header line -(prefixed with '#'), and genotype lines that begin with a chromosome +A typical VCF file contains metadata lines (prefixed with '##'), a header +line (prefixed with '#'), and genotype lines that begin with a chromosome identifier (e.g. 'chr1'). See the VCF specification above for an example VCF file. Genotype lines have nine required fields for storing variant information and variable-length fields for storing sample genotype data. For some -fields, missing values are tolerated and specified with a dot ('.'). +fields, missing values are tolerated and can be specified with a dot ('.'). 
The nine required fields are: +-----+--------+------------------------------------+------------------------+ @@ -38,10 +40,10 @@ +-----+--------+------------------------------------+------------------------+ There are several common, reserved genotype keywords that are standards -across the community. Currently, the module is aware of the +across the community. Currently, the pyvcf submodule is aware of the following: -* AD - Total read depth for each allele (R, Integer). +* AD - Total read depth for each allele (R, Integer) * AF - Allele fraction of the event in the tumor (1, Float) * DP - Read depth (1, Integer) """ @@ -52,6 +54,10 @@ from copy import deepcopy from Bio import bgzf from . import pybed +import matplotlib.pyplot as plt +from matplotlib_venn import venn2, venn3 +import os +import seaborn as sns CONTIGS = [ '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', @@ -510,7 +516,155 @@ def row_missval(r): m += ':.' return m -# -- VcfFrame ---------------------------------------------------------------- +class AnnFrame: + """Class for storing sample annotation data. + + This class stores sample annotation data as pandas.DataFrame with sample + names as index. + + Note that an AnnFrame can have a different set of samples than its + accompanying VcfFrame. + + Parameters + ---------- + df : pandas.DataFrame + DataFrame containing sample annotation data. Index must be + sample names. + + See Also + -------- + AnnFrame.from_dict + Construct AnnFrame from dict of array-like or dicts. + AnnFrame.from_file + Construct AnnFrame from a delimited text file. + + Examples + -------- + + >>> import pandas as pd + >>> from fuc import pyvcf + >>> data = { + ... 'Sample': ['Steven_N', 'Steven_T', 'Sara_N', 'Sara_T'], + ... 'Subject': ['Steven', 'Steven', 'Sara', 'Sara'], + ... 'Type': ['Normal', 'Tumor', 'Normal', 'Tumor'], + ... 'Age': [30, 30, 57, 57] + ... 
} + >>> df = pd.DataFrame(data) + >>> df = df.set_index('Sample') + >>> af = pyvcf.AnnFrame(df) + >>> af.df + Subject Type Age + Sample + Steven_N Steven Normal 30 + Steven_T Steven Tumor 30 + Sara_N Sara Normal 57 + Sara_T Sara Tumor 57 + """ + def _check_df(self, df): + if type(df.index) == pd.RangeIndex: + m = "Index must be sample names, not 'pandas.RangeIndex'." + raise ValueError(m) + return df + + def __init__(self, df): + self._df = self._check_df(df) + + @property + def df(self): + """pandas.DataFrame : DataFrame containing sample annotation data.""" + return self._df + + @df.setter + def df(self, value): + self._df = self._check_df(value) + + @classmethod + def from_dict(cls, data, sample_col): + """Construct AnnFrame from dict of array-like or dicts. + + The dictionary must have at least one column that represents sample + names which are used as index for pandas.DataFrame. + + Parameters + ---------- + data : dict + Of the form {field : array-like} or {field : dict}. + sample_col : str + Column containing sample names. + + Returns + ------- + AnnFrame + AnnFrame object. + + See Also + -------- + AnnFrame + AnnFrame object creation using constructor. + AnnFrame.from_file + Construct AnnFrame from a delimited text file. + + Examples + -------- + + >>> from fuc import pyvcf + >>> data = { + ... 'Sample': ['Steven_Normal', 'Steven_Tumor', 'Sara_Normal', 'Sara_Tumor'], + ... 'Subject': ['Steven', 'Steven', 'Sara', 'Sara'], + ... 'Type': ['Normal', 'Tumor', 'Normal', 'Tumor'], + ... 'Age': [30, 30, 57, 57] + ... } + >>> af = pyvcf.AnnFrame.from_dict(data, 'Sample') + >>> af.df + Subject Type Age + Sample + Steven_Normal Steven Normal 30 + Steven_Tumor Steven Tumor 30 + Sara_Normal Sara Normal 57 + Sara_Tumor Sara Tumor 57 + """ + df = pd.DataFrame(data) + df = df.set_index(sample_col) + return cls(df) + + @classmethod + def from_file(cls, fn, sample_col, sep='\t'): + """Construct AnnFrame from a delimited text file. 
+ + The text file must have at least one column that represents + sample names which are used as index for pandas.DataFrame. + + Parameters + ---------- + fn : str + Text file path (zipped or unzipped). + sample_col : str + Column containing sample names. + sep : str, default: '\\\\t' + Delimiter to use. + + Returns + ------- + AnnFrame + AnnFrame. + + See Also + -------- + AnnFrame + AnnFrame object creation using constructor. + AnnFrame.from_dict + Construct AnnFrame from dict of array-like or dicts. + + Examples + -------- + + >>> from fuc import pyvcf + >>> af1 = pyvcf.AnnFrame.from_file('sample-annot-1.tsv', 'Tumor_Sample_Barcode') + >>> af2 = pyvcf.AnnFrame.from_file('sample-annot-2.csv', 'SampleID', sep=',') + """ + df = pd.read_table(fn, sep=sep) + df = df.set_index(sample_col) + return cls(df) class VcfFrame: """Class for storing VCF data. @@ -561,33 +715,397 @@ def __init__(self, meta, df): self._meta = meta self._df = df.reset_index(drop=True) - @property - def meta(self): - """list : List of metadata lines.""" - return self._meta + @property + def meta(self): + """list : List of metadata lines.""" + return self._meta + + @meta.setter + def meta(self, value): + self._meta = value + + @property + def df(self): + """pandas.DataFrame : DataFrame containing VCF data.""" + return self._df + + @df.setter + def df(self, value): + self._df = value.reset_index(drop=True) + + @property + def samples(self): + """list : List of the sample names.""" + return self.df.columns[9:].to_list() + + @property + def shape(self): + """tuple : Dimensionality of VcfFrame (variants, samples).""" + return (self.df.shape[0], len(self.samples)) + + def add_dp(self): + """Compute DP using AD and add it to the FORMAT field. + + Returns + ------- + VcfFrame + Updated VcfFrame. + + Examples + -------- + Assume we have the following data: + + >>> from fuc import pyvcf + >>> data = { + ... 'CHROM': ['chr1', 'chr1', 'chr2', 'chr2'], + ... 'POS': [100, 100, 200, 200], + ... 
'ID': ['.', '.', '.', '.'], + ... 'REF': ['A', 'A', 'C', 'C'], + ... 'ALT': ['C', 'T', 'G', 'G,A'], + ... 'QUAL': ['.', '.', '.', '.'], + ... 'FILTER': ['.', '.', '.', '.'], + ... 'INFO': ['.', '.', '.', '.'], + ... 'FORMAT': ['GT:AD', 'GT:AD', 'GT:AD', 'GT:AD'], + ... 'Steven': ['0/1:12,15', '0/0:32,1', '0/1:16,12', './.:.'], + ... 'Sara': ['0/1:13,17', '0/1:14,15', './.:.', '1/2:0,11,17'], + ... } + >>> vf = pyvcf.VcfFrame.from_dict([], data) + >>> vf.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven Sara + 0 chr1 100 . A C . . . GT:AD 0/1:12,15 0/1:13,17 + 1 chr1 100 . A T . . . GT:AD 0/0:32,1 0/1:14,15 + 2 chr2 200 . C G . . . GT:AD 0/1:16,12 ./.:. + 3 chr2 200 . C G,A . . . GT:AD ./.:. 1/2:0,11,17 + + We can add the DP subfield to our genotype data: + + >>> vf.add_dp().df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven Sara + 0 chr1 100 . A C . . . GT:AD:DP 0/1:12,15:27 0/1:13,17:30 + 1 chr1 100 . A T . . . GT:AD:DP 0/0:32,1:33 0/1:14,15:29 + 2 chr2 200 . C G . . . GT:AD:DP 0/1:16,12:28 ./.:.:. + 3 chr2 200 . C G,A . . . GT:AD:DP ./.:.:. 1/2:0,11,17:28 + """ + def outfunc(r): + i = r.FORMAT.split(':').index('AD') + def infunc(x): + ad = x.split(':')[i].split(',') + dp = 0 + for depth in ad: + if depth == '.': + return f'{x}:.' + dp += int(depth) + return f'{x}:{dp}' + r.iloc[9:] = r.iloc[9:].apply(infunc) + r.FORMAT += ':DP' + return r + df = self.df.apply(outfunc, axis=1) + vf = self.__class__(self.copy_meta(), df) + return vf + + def add_flag(self, flag, order='last', index=None): + """Add the given flag to the INFO field. + + The default behavior is to add the flag to all rows in the VcfFrame. + + Parameters + ---------- + flag : str + INFO flag. + order : {'last', 'first', False}, default: 'last' + Determines the order in which the flag will be added. + + - ``last`` : Add to the end of the list. + - ``first`` : Add to the beginning of the list. + - ``False`` : Overwrite the existing field. 
+ + index : list or pandas.Series, optional + Boolean index array indicating which rows should be updated. + + Returns + ------- + VcfFrame + Updated VcfFrame. + + Examples + -------- + Assume we have the following data: + + >>> from fuc import pyvcf + >>> data = { + ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1'], + ... 'POS': [100, 101, 102, 103], + ... 'ID': ['.', '.', '.', '.'], + ... 'REF': ['G', 'T', 'A', 'C'], + ... 'ALT': ['A', 'C', 'T', 'A'], + ... 'QUAL': ['.', '.', '.', '.'], + ... 'FILTER': ['.', '.', '.', '.'], + ... 'INFO': ['.', 'DB', 'DB', '.'], + ... 'FORMAT': ['GT', 'GT', 'GT', 'GT'], + ... 'Steven': ['0/0', '0/1', '0/1', '1/1'], + ... } + >>> vf = pyvcf.VcfFrame.from_dict([], data) + >>> vf.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven + 0 chr1 100 . G A . . . GT 0/0 + 1 chr1 101 . T C . . DB GT 0/1 + 2 chr1 102 . A T . . DB GT 0/1 + 3 chr1 103 . C A . . . GT 1/1 + + We can add the SOMATIC flag to the INFO field: + + >>> vf.add_flag('SOMATIC').df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven + 0 chr1 100 . G A . . SOMATIC GT 0/0 + 1 chr1 101 . T C . . DB;SOMATIC GT 0/1 + 2 chr1 102 . A T . . DB;SOMATIC GT 0/1 + 3 chr1 103 . C A . . SOMATIC GT 1/1 + + Setting ``order='first'`` will append the flag at the beginning: + + >>> vf.add_flag('SOMATIC', order='first').df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven + 0 chr1 100 . G A . . SOMATIC GT 0/0 + 1 chr1 101 . T C . . SOMATIC;DB GT 0/1 + 2 chr1 102 . A T . . SOMATIC;DB GT 0/1 + 3 chr1 103 . C A . . SOMATIC GT 1/1 + + Setting ``order=False`` will overwrite the INFO field: + + >>> vf.add_flag('SOMATIC', order=False).df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven + 0 chr1 100 . G A . . SOMATIC GT 0/0 + 1 chr1 101 . T C . . SOMATIC GT 0/1 + 2 chr1 102 . A T . . SOMATIC GT 0/1 + 3 chr1 103 . C A . . 
SOMATIC GT 1/1 + + We can also specify which rows should be updated: + + >>> vf.add_flag('SOMATIC', index=[True, True, False, False]).df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven + 0 chr1 100 . G A . . SOMATIC GT 0/0 + 1 chr1 101 . T C . . DB;SOMATIC GT 0/1 + 2 chr1 102 . A T . . DB GT 0/1 + 3 chr1 103 . C A . . . GT 1/1 + """ + if index is None: + index = [True for i in range(self.shape[0])] + def f(r): + if not index[r.name]: + return r + if r.INFO == '.': + r.INFO = flag + elif not order: + r.INFO = flag + elif order == 'first': + r.INFO = f'{flag};{r.INFO}' + else: + r.INFO += f';{flag}' + return r + df = self.df.apply(f, axis=1) + vf = self.__class__(self.copy_meta(), df) + return vf + + def cfilter_empty(self, opposite=False, as_list=False): + """Remove samples whose genotype calls are all missing. + + Parameters + ---------- + opposite : bool, default: False + If True, return samples that don't meet the said criteria. + as_list : bool, default: False + If True, return a list of sample names instead of a VcfFrame. + + Returns + ------- + VcfFrame + Filtered VcfFrame. + + Examples + -------- + Assume we have the following data: + + >>> data = { + ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1'], + ... 'POS': [100, 101, 102, 103], + ... 'ID': ['.', '.', '.', '.'], + ... 'REF': ['G', 'T', 'G', 'T'], + ... 'ALT': ['A', 'C', 'C', 'C'], + ... 'QUAL': ['.', '.', '.', '.'], + ... 'FILTER': ['.', '.', '.', '.'], + ... 'INFO': ['.', '.', '.', '.'], + ... 'FORMAT': ['GT', 'GT', 'GT', 'GT'], + ... 'Steven': ['0/1', '1/1', '1/1', '1/1'], + ... 'Rachel': ['./.', './.', './.', './.'], + ... 'John': ['0/0', './.', '0/0', '0/0'], + ... 'Sara': ['./.', './.', './.', './.'], + ... } + >>> vf = pyvcf.VcfFrame.from_dict([], data) + >>> vf.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven Rachel John Sara + 0 chr1 100 . G A . . . GT 0/1 ./. 0/0 ./. + 1 chr1 101 . T C . . . GT 1/1 ./. ./. ./. + 2 chr1 102 . G C . . . GT 1/1 ./. 0/0 ./. + 3 chr1 103 . T C . . . 
GT 1/1 ./. 0/0 ./. + + We can remove samples whose genotypes are all missing: + + >>> vf.cfilter_empty().df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven John + 0 chr1 100 . G A . . . GT 0/1 0/0 + 1 chr1 101 . T C . . . GT 1/1 ./. + 2 chr1 102 . G C . . . GT 1/1 0/0 + 3 chr1 103 . T C . . . GT 1/1 0/0 + + We can also select those samples: + + >>> vf.cfilter_empty(opposite=True).df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Rachel Sara + 0 chr1 100 . G A . . . GT ./. ./. + 1 chr1 101 . T C . . . GT ./. ./. + 2 chr1 102 . G C . . . GT ./. ./. + 3 chr1 103 . T C . . . GT ./. ./. + + Finally, we can return a list of sample names from the filtering: + + >>> vf.cfilter_empty(as_list=True) + ['Steven', 'John'] + """ + f = lambda r: r[9:].apply(gt_miss) + s = self.df.apply(f, axis=1).all() + if opposite: + s = s[s == True] + else: + s = s[s == False] + l = s.index.to_list() + if as_list: + return l + return self.subset(l) + + def collapse(self): + """Collapse duplicate records in the VcfFrame. + + Duplicate records have the identical values for CHROM, POS, and REF. + They can result from merging two VCF files. + + .. note:: + The method will sort the order of ALT alleles. + + Returns + ------- + VcfFrame + Collapsed VcfFrame. + + Examples + -------- + Assume we have the following data: + + >>> from fuc import pyvcf + >>> data = { + ... 'CHROM': ['chr1', 'chr1', 'chr2', 'chr2'], + ... 'POS': [100, 100, 200, 200], + ... 'ID': ['.', '.', '.', '.'], + ... 'REF': ['A', 'A', 'C', 'C'], + ... 'ALT': ['C', 'T', 'G', 'G,A'], + ... 'QUAL': ['.', '.', '.', '.'], + ... 'FILTER': ['.', '.', '.', '.'], + ... 'INFO': ['.', '.', '.', '.'], + ... 'FORMAT': ['GT:AD', 'GT:AD', 'GT:AD', 'GT:AD'], + ... 'Steven': ['0/1:12,15', './.:.', '0/1:16,12', './.:.'], + ... 'Sara': ['./.:.', '0/1:14,15', './.:.', '1/2:0,11,17'], + ... } + >>> vf = pyvcf.VcfFrame.from_dict([], data) + >>> vf.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven Sara + 0 chr1 100 . A C . . . 
GT:AD 0/1:12,15 ./.:. + 1 chr1 100 . A T . . . GT:AD ./.:. 0/1:14,15 + 2 chr2 200 . C G . . . GT:AD 0/1:16,12 ./.:. + 3 chr2 200 . C G,A . . . GT:AD ./.:. 1/2:0,11,17 + + We collapse the VcfFrame: + + >>> vf.collapse().df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven Sara + 0 chr1 100 . A C,T . . . GT:AD 0/1:12,15,0 0/2:14,0,15 + 2 chr2 200 . C A,G . . . GT:AD 0/2:16,0,12 1/2:0,17,11 + """ + df = self.df.copy() + dup_idx = df.duplicated(['CHROM', 'POS', 'REF'], keep=False) + dups = {} + for i, r in df[dup_idx].iterrows(): + name = f'{r.CHROM}:{r.POS}:{r.REF}' + if name not in dups: + dups[name] = [] + dups[name].append(i) + + def collapse_one(df): + ref_allele = df.REF.unique()[0] + alt_alleles = [] + for i, r in df.iterrows(): + alt_alleles += r.ALT.split(',') + alt_alleles = sorted(list(set(alt_alleles)), + key=lambda x: (len(x), x)) + all_alleles = [ref_allele] + alt_alleles - @meta.setter - def meta(self, value): - self._meta = value + def infunc(x, r_all_alleles, index_map): + if gt_miss(x): + return '' + old_fields = x.split(':') + old_gt = old_fields[0] + new_gt = '/'.join([str(x) for x in + sorted([index_map[int(i)] for i in old_gt.split('/')]) + ]) + new_fields = [new_gt] + for old_field in old_fields[1:]: + old_subfields = old_field.split(',') + new_subfields = ['0' for x in all_alleles] + if len(old_subfields) == len(r_all_alleles): + for i, old_subfield in enumerate(old_subfields): + new_subfields[index_map[i]] = old_subfield + new_fields.append(','.join(new_subfields)) + else: + new_fields.append(old_field) + return ':'.join(new_fields) - @property - def df(self): - """pandas.DataFrame : DataFrame containing VCF data.""" - return self._df + def outfunc(r): + r_alt_alleles = r.ALT.split(',') + r_all_alleles = [r.REF] + r_alt_alleles + old_indicies = [i for i in range(len(r_all_alleles))] + new_indicies = [all_alleles.index(x) for x in r_all_alleles] + index_map = dict(zip(old_indicies, new_indicies)) + r[9:] = r[9:].apply(infunc, 
args=(r_all_alleles, index_map)) + return r - @df.setter - def df(self, value): - self._df = value.reset_index(drop=True) + df2 = df.apply(outfunc, axis=1) - @property - def samples(self): - """list : List of the sample names.""" - return self.df.columns[9:].to_list() + def raise_error(c): + if sum(c.values != '') > 1: + message = ('cannot collapse following ' + f'records:\n{df.loc[c.index]}') + raise ValueError(message) - @property - def shape(self): - """tuple : Dimensionality of VcfFrame (variants, samples).""" - return (self.df.shape[0], len(self.samples)) + df2.iloc[:, 9:].apply(raise_error) + df2 = df2.groupby(['CHROM', 'POS', 'REF']).agg(''.join) + df2 = df2.reset_index() + cols = list(df2) + cols[2], cols[3] = cols[3], cols[2] + df2 = df2[cols] + df2.ID = df.ID.unique()[0] + df2.ALT = ','.join(alt_alleles) + df2.QUAL = df.QUAL.unique()[0] + df2.FILTER = df.FILTER.unique()[0] + df2.INFO = df.INFO.unique()[0] + df2.FORMAT = df.FORMAT.unique()[0] + s = df2.squeeze() + s = s.replace('', row_missval(s)) + return s + + for name, i in dups.items(): + df.iloc[i] = collapse_one(df.iloc[i]) + df.drop_duplicates(subset=['CHROM', 'POS', 'REF'], inplace=True) + + vf = self.__class__(self.copy_meta(), df) + return vf @classmethod def from_dict(cls, meta, data): @@ -673,6 +1191,8 @@ def from_file(cls, fn, compression=False): """ meta = [] skip_rows = 0 + if fn.startswith('~'): + fn = os.path.expanduser(fn) if fn.endswith('.gz') or compression: f = bgzf.open(fn, 'rt') else: @@ -740,199 +1260,74 @@ def compare(self, a, b, c=None): We compare Steven and Sara: >>> vf.compare('Steven', 'Sara') - (0, 1, 2, 1) - - Next, we compare all three: - - >>> vf.compare('Steven', 'Sara', 'James') - (0, 0, 1, 1, 0, 1, 1, 0) - """ - if c is None: - result = self._compare_two(a, b) - else: - result = self._compare_three(a, b, c) - return result - - def _compare_two(self, a, b): - a = a if isinstance(a, str) else self.samples[a] - b = b if isinstance(b, str) else self.samples[b] - def 
func(r): - a_has = gt_hasvar(r[a]) - b_has = gt_hasvar(r[b]) - if a_has and not b_has: - return 'Ab' - elif not a_has and b_has: - return 'aB' - elif a_has and b_has: - return 'AB' - else: - return 'ab' - d = self.df.apply(func, axis=1).value_counts().to_dict() - Ab = d['Ab'] if 'Ab' in d else 0 - aB = d['aB'] if 'aB' in d else 0 - AB = d['AB'] if 'AB' in d else 0 - ab = d['ab'] if 'ab' in d else 0 - return (Ab, aB, AB, ab) - - def _compare_three(self, a, b, c): - a = a if isinstance(a, str) else self.samples[a] - b = b if isinstance(b, str) else self.samples[b] - c = c if isinstance(c, str) else self.samples[c] - def func(r): - a_has = gt_hasvar(r[a]) - b_has = gt_hasvar(r[b]) - c_has = gt_hasvar(r[c]) - if a_has and not b_has and not c_has: - return 'Abc' - elif not a_has and b_has and not c_has: - return 'aBc' - elif a_has and b_has and not c_has: - return 'ABc' - elif not a_has and not b_has and c_has: - return 'abC' - elif a_has and not b_has and c_has: - return 'AbC' - elif not a_has and b_has and c_has: - return 'aBC' - elif a_has and b_has and c_has: - return 'ABC' - else: - return 'abc' - d = self.df.apply(func, axis=1).value_counts().to_dict() - Abc = d['Abc'] if 'Abc' in d else 0 - aBc = d['aBc'] if 'aBc' in d else 0 - ABc = d['ABc'] if 'ABc' in d else 0 - abC = d['abC'] if 'abC' in d else 0 - AbC = d['AbC'] if 'AbC' in d else 0 - aBC = d['aBC'] if 'aBC' in d else 0 - ABC = d['ABC'] if 'ABC' in d else 0 - abc = d['abc'] if 'abc' in d else 0 - return (Abc, aBc, ABc, abC, AbC, aBC, ABC, abc) - - def collapse(self): - """Collapse duplicate records in the VcfFrame. - - Duplicate records have the identical values for CHROM, POS, and REF. - They can result from merging two VCF files. - - .. note:: - The method will sort the order of ALT alleles. - - Returns - ------- - VcfFrame - Collapsed VcfFrame. - - Examples - -------- - Assume we have the following data: - - >>> from fuc import pyvcf - >>> data = { - ... 'CHROM': ['chr1', 'chr1', 'chr2', 'chr2'], - ... 
'POS': [100, 100, 200, 200], - ... 'ID': ['.', '.', '.', '.'], - ... 'REF': ['A', 'A', 'C', 'C'], - ... 'ALT': ['C', 'T', 'G', 'G,A'], - ... 'QUAL': ['.', '.', '.', '.'], - ... 'FILTER': ['.', '.', '.', '.'], - ... 'INFO': ['.', '.', '.', '.'], - ... 'FORMAT': ['GT:AD', 'GT:AD', 'GT:AD', 'GT:AD'], - ... 'Steven': ['0/1:12,15', './.:.', '0/1:16,12', './.:.'], - ... 'Sara': ['./.:.', '0/1:14,15', './.:.', '1/2:0,11,17'], - ... } - >>> vf = pyvcf.VcfFrame.from_dict([], data) - >>> vf.df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven Sara - 0 chr1 100 . A C . . . GT:AD 0/1:12,15 ./.:. - 1 chr1 100 . A T . . . GT:AD ./.:. 0/1:14,15 - 2 chr2 200 . C G . . . GT:AD 0/1:16,12 ./.:. - 3 chr2 200 . C G,A . . . GT:AD ./.:. 1/2:0,11,17 - - We collapse the VcfFrame: - - >>> vf.collapse().df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven Sara - 0 chr1 100 . A C,T . . . GT:AD 0/1:12,15,0 0/2:14,0,15 - 2 chr2 200 . C A,G . . . GT:AD 0/2:16,0,12 1/2:0,17,11 - """ - df = self.df.copy() - dup_idx = df.duplicated(['CHROM', 'POS', 'REF'], keep=False) - dups = {} - for i, r in df[dup_idx].iterrows(): - name = f'{r.CHROM}:{r.POS}:{r.REF}' - if name not in dups: - dups[name] = [] - dups[name].append(i) - - def collapse_one(df): - ref_allele = df.REF.unique()[0] - alt_alleles = [] - for i, r in df.iterrows(): - alt_alleles += r.ALT.split(',') - alt_alleles = sorted(list(set(alt_alleles)), - key=lambda x: (len(x), x)) - all_alleles = [ref_allele] + alt_alleles - - def infunc(x, r_all_alleles, index_map): - if gt_miss(x): - return '' - old_fields = x.split(':') - old_gt = old_fields[0] - new_gt = '/'.join([str(x) for x in - sorted([index_map[int(i)] for i in old_gt.split('/')]) - ]) - new_fields = [new_gt] - for old_field in old_fields[1:]: - old_subfields = old_field.split(',') - new_subfields = ['0' for x in all_alleles] - if len(old_subfields) == len(r_all_alleles): - for i, old_subfield in enumerate(old_subfields): - new_subfields[index_map[i]] = old_subfield - 
new_fields.append(','.join(new_subfields)) - else: - new_fields.append(old_field) - return ':'.join(new_fields) - - def outfunc(r): - r_alt_alleles = r.ALT.split(',') - r_all_alleles = [r.REF] + r_alt_alleles - old_indicies = [i for i in range(len(r_all_alleles))] - new_indicies = [all_alleles.index(x) for x in r_all_alleles] - index_map = dict(zip(old_indicies, new_indicies)) - r[9:] = r[9:].apply(infunc, args=(r_all_alleles, index_map)) - return r - - df2 = df.apply(outfunc, axis=1) + (0, 1, 2, 1) - def raise_error(c): - if sum(c.values != '') > 1: - message = ('cannot collapse following ' - f'records:\n{df.loc[c.index]}') - raise ValueError(message) + Next, we compare all three: - df2.iloc[:, 9:].apply(raise_error) - df2 = df2.groupby(['CHROM', 'POS', 'REF']).agg(''.join) - df2 = df2.reset_index() - cols = list(df2) - cols[2], cols[3] = cols[3], cols[2] - df2 = df2[cols] - df2.ID = df.ID.unique()[0] - df2.ALT = ','.join(alt_alleles) - df2.QUAL = df.QUAL.unique()[0] - df2.FILTER = df.FILTER.unique()[0] - df2.INFO = df.INFO.unique()[0] - df2.FORMAT = df.FORMAT.unique()[0] - s = df2.squeeze() - s = s.replace('', row_missval(s)) - return s + >>> vf.compare('Steven', 'Sara', 'James') + (0, 0, 1, 1, 0, 1, 1, 0) + """ + if c is None: + result = self._compare_two(a, b) + else: + result = self._compare_three(a, b, c) + return result - for name, i in dups.items(): - df.iloc[i] = collapse_one(df.iloc[i]) - df.drop_duplicates(subset=['CHROM', 'POS', 'REF'], inplace=True) + def _compare_two(self, a, b): + a = a if isinstance(a, str) else self.samples[a] + b = b if isinstance(b, str) else self.samples[b] + def func(r): + a_has = gt_hasvar(r[a]) + b_has = gt_hasvar(r[b]) + if a_has and not b_has: + return 'Ab' + elif not a_has and b_has: + return 'aB' + elif a_has and b_has: + return 'AB' + else: + return 'ab' + d = self.df.apply(func, axis=1).value_counts().to_dict() + Ab = d['Ab'] if 'Ab' in d else 0 + aB = d['aB'] if 'aB' in d else 0 + AB = d['AB'] if 'AB' in d else 0 + ab 
= d['ab'] if 'ab' in d else 0 + return (Ab, aB, AB, ab) - vf = self.__class__(self.copy_meta(), df) - return vf + def _compare_three(self, a, b, c): + a = a if isinstance(a, str) else self.samples[a] + b = b if isinstance(b, str) else self.samples[b] + c = c if isinstance(c, str) else self.samples[c] + def func(r): + a_has = gt_hasvar(r[a]) + b_has = gt_hasvar(r[b]) + c_has = gt_hasvar(r[c]) + if a_has and not b_has and not c_has: + return 'Abc' + elif not a_has and b_has and not c_has: + return 'aBc' + elif a_has and b_has and not c_has: + return 'ABc' + elif not a_has and not b_has and c_has: + return 'abC' + elif a_has and not b_has and c_has: + return 'AbC' + elif not a_has and b_has and c_has: + return 'aBC' + elif a_has and b_has and c_has: + return 'ABC' + else: + return 'abc' + d = self.df.apply(func, axis=1).value_counts().to_dict() + Abc = d['Abc'] if 'Abc' in d else 0 + aBc = d['aBc'] if 'aBc' in d else 0 + ABc = d['ABc'] if 'ABc' in d else 0 + abC = d['abC'] if 'abC' in d else 0 + AbC = d['AbC'] if 'AbC' in d else 0 + aBC = d['aBC'] if 'aBC' in d else 0 + ABC = d['ABC'] if 'ABC' in d else 0 + abc = d['abc'] if 'abc' in d else 0 + return (Abc, aBc, ABc, abC, AbC, aBC, ABC, abc) def combine(self, a, b): """Combine the genotype data of Sample A and Sample B. @@ -1242,168 +1637,218 @@ def meta_keys(self): if '=>> from fuc import pyvcf - >>> data = { - ... 'CHROM': ['chr1', 'chr1', 'chr2', 'chr2'], - ... 'POS': [100, 100, 200, 200], - ... 'ID': ['.', '.', '.', '.'], - ... 'REF': ['A', 'A', 'C', 'C'], - ... 'ALT': ['C', 'T', 'G', 'G,A'], - ... 'QUAL': ['.', '.', '.', '.'], - ... 'FILTER': ['.', '.', '.', '.'], - ... 'INFO': ['.', '.', '.', '.'], - ... 'FORMAT': ['GT:AD', 'GT:AD', 'GT:AD', 'GT:AD'], - ... 'Steven': ['0/1:12,15', '0/0:32,1', '0/1:16,12', './.:.'], - ... 'Sara': ['0/1:13,17', '0/1:14,15', './.:.', '1/2:0,11,17'], - ... 
} - >>> vf = pyvcf.VcfFrame.from_dict([], data) - >>> vf.df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven Sara - 0 chr1 100 . A C . . . GT:AD 0/1:12,15 0/1:13,17 - 1 chr1 100 . A T . . . GT:AD 0/0:32,1 0/1:14,15 - 2 chr2 200 . C G . . . GT:AD 0/1:16,12 ./.:. - 3 chr2 200 . C G,A . . . GT:AD ./.:. 1/2:0,11,17 - We can add the DP subfield to our genotype data: + .. plot:: + :context: close-figs - >>> vf.add_dp().df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven Sara - 0 chr1 100 . A C . . . GT:AD:DP 0/1:12,15:27 0/1:13,17:30 - 1 chr1 100 . A T . . . GT:AD:DP 0/0:32,1:33 0/1:14,15:29 - 2 chr2 200 . C G . . . GT:AD:DP 0/1:16,12:28 ./.:.:. - 3 chr2 200 . C G,A . . . GT:AD:DP ./.:.:. 1/2:0,11,17:28 - """ - def outfunc(r): - i = r.FORMAT.split(':').index('AD') - def infunc(x): - ad = x.split(':')[i].split(',') - dp = 0 - for depth in ad: - if depth == '.': - return f'{x}:.' - dp += int(depth) - return f'{x}:{dp}' - r.iloc[9:] = r.iloc[9:].apply(infunc) - r.FORMAT += ':DP' - return r - df = self.df.apply(outfunc, axis=1) - vf = self.__class__(self.copy_meta(), df) - return vf + >>> from fuc import pyvcf, common + >>> common.load_dataset('pyvcf') + >>> f = '~/fuc-data/pyvcf/plot_comparison.vcf' + >>> vf = pyvcf.VcfFrame.from_file(f) + >>> a = ['Steven_A', 'John_A', 'Sara_A'] + >>> b = ['Steven_B', 'John_B', 'Sara_B'] + >>> c = ['Steven_C', 'John_C', 'Sara_C'] + >>> vf.plot_comparison(a, b) - def add_flag(self, flag, order='last', index=None): - """Add the given flag to the INFO field. + .. plot:: + :context: close-figs - The default behavior is to add the flag to all rows in the VcfFrame. 
+ >>> vf.plot_comparison(a, b, c) + """ + if len(a) != len(b): + raise ValueError('Groups A and B have different length.') + if c is not None and len(a) != len(c): + raise ValueError('Group C has unmatched length.') + if labels is None: + if c is None: + labels = ('A', 'B') + else: + labels = ('A', 'B', 'C') + if ax is None: + fig, ax = plt.subplots(figsize=figsize) + venn_kws = dict(ax=ax, alpha=0.5, set_labels=labels) + if c is None: + out = self._plot_comparison_two(a, b, venn_kws) + else: + out = self._plot_comparison_three(a, b, c, venn_kws) + return ax, out + + def _plot_comparison_two(self, a, b, venn_kws): + n = [0, 0, 0, 0] + for i in range(len(a)): + n = [x + y for x, y in zip(n, self.compare(a[i], b[i]))] + out = venn2(subsets=n[:-1], **venn_kws) + return out + + def _plot_comparison_three(self, a, b, c, venn_kws): + n = [0, 0, 0, 0, 0, 0, 0, 0] + for i in range(len(a)): + n = [x + y for x, y in zip(n, self.compare(a[i], b[i], c[i]))] + out = venn3(subsets=n[:-1], **venn_kws) + return out + + def plot_histplot( + self, af=None, hue=None, kde=True, ax=None, figsize=None, **kwargs + ): + """Create a histogram of TMB distribution. Parameters ---------- - flag : str - INFO flag. - order : {'last', 'first', False}, default: 'last' - Determines the order in which the flag will be added. - - - ``last`` : Add to the end of the list. - - ``first`` : Add to the beginning of the list. - - ``False`` : Overwrite the existing field. - - index : list or pandas.Series, optional - Boolean index array indicating which rows should be updated. + af : pyvcf.AnnFrame + AnnFrame. + hue : list, optional + Grouping variable that will produce bars with different colors. + Requires the `af` option. + kde : bool, default: True + Compute a kernel density estimate to smooth the distribution. + ax : matplotlib.axes.Axes, optional + Pre-existing axes for the plot. Otherwise, create a new one. + figsize : tuple, optional + Width, height in inches. Format: (float, float). 
+ kwargs + Other keyword arguments will be passed down to + :meth:`seaborn.histplot`. Returns ------- - VcfFrame - Updated VcfFrame. + matplotlib.axes.Axes + The matplotlib axes containing the plot. Examples -------- - Assume we have the following data: - - >>> from fuc import pyvcf - >>> data = { - ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1'], - ... 'POS': [100, 101, 102, 103], - ... 'ID': ['.', '.', '.', '.'], - ... 'REF': ['G', 'T', 'A', 'C'], - ... 'ALT': ['A', 'C', 'T', 'A'], - ... 'QUAL': ['.', '.', '.', '.'], - ... 'FILTER': ['.', '.', '.', '.'], - ... 'INFO': ['.', 'DB', 'DB', '.'], - ... 'FORMAT': ['GT', 'GT', 'GT', 'GT'], - ... 'Steven': ['0/0', '0/1', '0/1', '1/1'], - ... } - >>> vf = pyvcf.VcfFrame.from_dict([], data) - >>> vf.df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven - 0 chr1 100 . G A . . . GT 0/0 - 1 chr1 101 . T C . . DB GT 0/1 - 2 chr1 102 . A T . . DB GT 0/1 - 3 chr1 103 . C A . . . GT 1/1 - - We can add the SOMATIC flag to the INFO field: - - >>> vf.add_flag('SOMATIC').df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven - 0 chr1 100 . G A . . SOMATIC GT 0/0 - 1 chr1 101 . T C . . DB;SOMATIC GT 0/1 - 2 chr1 102 . A T . . DB;SOMATIC GT 0/1 - 3 chr1 103 . C A . . SOMATIC GT 1/1 - Setting ``order='first'`` will append the flag at the beginning: + .. plot:: + :context: close-figs + + >>> from fuc import pyvcf + >>> vcf_data = { + ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'], + ... 'POS': [100, 101, 102, 103, 103], + ... 'ID': ['.', '.', '.', '.', '.'], + ... 'REF': ['T', 'T', 'T', 'T', 'T'], + ... 'ALT': ['C', 'C', 'C', 'C', 'C'], + ... 'QUAL': ['.', '.', '.', '.', '.'], + ... 'FILTER': ['.', '.', '.', '.', '.'], + ... 'INFO': ['.', '.', '.', '.', '.'], + ... 'FORMAT': ['GT', 'GT', 'GT', 'GT', 'GT'], + ... 'Steven_N': ['0/0', '0/0', '0/1', '0/0', '0/0'], + ... 'Steven_T': ['0/0', '0/1', '0/1', '0/1', '0/1'], + ... 'Sara_N': ['0/0', '0/1', '0/0', '0/0', '0/0'], + ... 'Sara_T': ['0/0', '0/0', '1/1', '1/1', '0/1'], + ... 
'John_N': ['0/0', '0/0', '0/0', '0/0', '0/0'], + ... 'John_T': ['0/1', '0/0', '1/1', '1/1', '0/1'], + ... 'Rachel_N': ['0/0', '0/0', '0/0', '0/0', '0/0'], + ... 'Rachel_T': ['0/1', '0/1', '0/0', '0/1', '0/1'], + ... } + >>> annot_data = { + ... 'Sample': ['Steven_N', 'Steven_T', 'Sara_N', 'Sara_T', 'John_N', 'John_T', 'Rachel_N', 'Rachel_T'], + ... 'Subject': ['Steven', 'Steven', 'Sara', 'Sara', 'John', 'John', 'Rachel', 'Rachel'], + ... 'Type': ['Normal', 'Tumor', 'Normal', 'Tumor', 'Normal', 'Tumor', 'Normal', 'Tumor'], + ... } + >>> vf = pyvcf.VcfFrame.from_dict([], vcf_data) + >>> af = pyvcf.AnnFrame.from_dict(annot_data, 'Sample') + >>> vf.plot_histplot() + + .. plot:: + :context: close-figs + + >>> vf.plot_histplot(hue='Type', af=af) + """ + s = self.df.iloc[:, 9:].applymap(gt_hasvar).sum() + s.name = 'TMB' + if af is None: + df = s.to_frame() + else: + df = pd.concat([af.df, s], axis=1, join='inner') + if ax is None: + fig, ax = plt.subplots(figsize=figsize) + sns.histplot(data=df, x='TMB', hue=hue, kde=kde, **kwargs) + return ax - >>> vf.add_flag('SOMATIC', order='first').df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven - 0 chr1 100 . G A . . SOMATIC GT 0/0 - 1 chr1 101 . T C . . SOMATIC;DB GT 0/1 - 2 chr1 102 . A T . . SOMATIC;DB GT 0/1 - 3 chr1 103 . C A . . SOMATIC GT 1/1 + def plot_regplot(self, a, b, ax=None, figsize=None, **kwargs): + """Create a scatter plot showing TMB between paired samples. - Setting ``order=False`` will overwrite the INFO field: + Parameters + ---------- + a, b : list + Sample names. The lists must have the same shape. + ax : matplotlib.axes.Axes, optional + Pre-existing axes for the plot. Otherwise, create a new one. + figsize : tuple, optional + Width, height in inches. Format: (float, float). + kwargs + Other keyword arguments will be passed down to + :meth:`seaborn.regplot`. - >>> vf.add_flag('SOMATIC', order=False).df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven - 0 chr1 100 . G A . . 
SOMATIC GT 0/0 - 1 chr1 101 . T C . . SOMATIC GT 0/1 - 2 chr1 102 . A T . . SOMATIC GT 0/1 - 3 chr1 103 . C A . . SOMATIC GT 1/1 + Returns + ------- + matplotlib.axes.Axes + The matplotlib axes containing the plot. - We can also specify which rows should be updated: + Examples + -------- - >>> vf.add_flag('SOMATIC', index=[True, True, False, False]).df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven - 0 chr1 100 . G A . . SOMATIC GT 0/0 - 1 chr1 101 . T C . . DB;SOMATIC GT 0/1 - 2 chr1 102 . A T . . DB GT 0/1 - 3 chr1 103 . C A . . . GT 1/1 + .. plot:: + + >>> from fuc import pyvcf + >>> data = { + ... 'CHROM': ['chr1', 'chr1', 'chr1'], + ... 'POS': [100, 101, 102], + ... 'ID': ['.', '.', '.'], + ... 'REF': ['G', 'T', 'T'], + ... 'ALT': ['A', 'C', 'C'], + ... 'QUAL': ['.', '.', '.'], + ... 'FILTER': ['.', '.', '.'], + ... 'INFO': ['.', '.', '.'], + ... 'FORMAT': ['GT', 'GT', 'GT'], + ... 'Steven_A': ['0/1', '0/1', '0/1'], + ... 'Steven_B': ['0/1', '0/1', '0/1'], + ... 'Sara_A': ['0/0', '1/1', '1/1'], + ... 'Sara_B': ['0/0', '1/1', '1/1'], + ... 'John_A': ['0/0', '0/0', '1/1'], + ... 'John_B': ['0/0', '0/0', '1/1'], + ... } + >>> vf = pyvcf.VcfFrame.from_dict([], data) + >>> vf.df + >>> a = ['Steven_A', 'Sara_A', 'John_A'] + >>> b = ['Steven_B', 'Sara_B', 'John_B'] + >>> vf.plot_regplot(a, b) """ - if index is None: - index = [True for i in range(self.shape[0])] - def f(r): - if not index[r.name]: - return r - if r.INFO == '.': - r.INFO = flag - elif not order: - r.INFO = flag - elif order == 'first': - r.INFO = f'{flag};{r.INFO}' - else: - r.INFO += f';{flag}' - return r - df = self.df.apply(f, axis=1) - vf = self.__class__(self.copy_meta(), df) - return vf + s = self.df.iloc[:, 9:].applymap(gt_hasvar).sum() + if ax is None: + fig, ax = plt.subplots(figsize=figsize) + sns.regplot(x=s[a], y=s[b], ax=ax, **kwargs) + return ax def markmiss_ad(self, threshold, samples=None, full=False, as_nan=False): """Mark genotypes whose AD is below threshold as missing. 
@@ -1428,9 +1873,9 @@ def markmiss_ad(self, threshold, samples=None, full=False, as_nan=False): See Also -------- - markmiss_af + VcfFrame.markmiss_af Similar method using AF. - markmiss_dp + VcfFrame.markmiss_dp Similar method using DP. Examples @@ -1530,9 +1975,9 @@ def markmiss_af(self, threshold, samples=None, full=False, as_nan=False): See Also -------- - markmiss_ad + VcfFrame.markmiss_ad Similar method using AD. - markmiss_dp + VcfFrame.markmiss_dp Similar method using DP. Examples @@ -1632,9 +2077,9 @@ def markmiss_dp(self, threshold, samples=None, full=False, as_nan=False): See Also -------- - markmiss_ad + VcfFrame.markmiss_ad Similar method using allele depth. - markmiss_af + VcfFrame.markmiss_af Similar method using allele fraction. Examples @@ -1723,7 +2168,7 @@ def expand(self): See Also -------- - collapse + VcfFrame.collapse Collapse duplicate records in the VcfFrame. Examples @@ -1786,7 +2231,7 @@ def one_gt(g, i): data.append(s) return self.__class__(self.copy_meta(), pd.concat(data, axis=1).T) - def filter_bed(self, bed, opposite=False, index=False): + def filter_bed(self, bed, opposite=False, as_index=False): """Select rows that overlap with the given BED data. Parameters @@ -1795,7 +2240,7 @@ def filter_bed(self, bed, opposite=False, index=False): BedFrame or path to a BED file. opposite : bool, default: False If True, return rows that don't meet the said criteria. - index : bool, default: False + as_index : bool, default: False If True, return boolean index array instead of VcfFrame. 
Returns @@ -1855,7 +2300,7 @@ def filter_bed(self, bed, opposite=False, index=False): Finally, we can return boolean index array from the filtering: - >>> vf.filter_bed(bf, index=True) + >>> vf.filter_bed(bf, as_index=True) 0 True 1 False 2 True @@ -1871,18 +2316,18 @@ def filter_bed(self, bed, opposite=False, index=False): i = self.df.apply(f, axis=1) if opposite: i = ~i - if index: + if as_index: return i return self.__class__(self.copy_meta(), self.df[i]) - def filter_empty(self, opposite=False, index=False): + def filter_empty(self, opposite=False, as_index=False): """Remove rows with no genotype calls at all. Parameters ---------- opposite : bool, default: False If True, return rows that don't meet the said criteria. - index : bool, default: False + as_index : bool, default: False If True, return boolean index array instead of VcfFrame. Returns @@ -1933,7 +2378,7 @@ def filter_empty(self, opposite=False, index=False): Finally, we can return boolean index array from the filtering: - >>> vf.filter_empty(index=True) + >>> vf.filter_empty(as_index=True) 0 True 1 False 2 True @@ -1944,18 +2389,18 @@ def filter_empty(self, opposite=False, index=False): i = self.df.apply(f, axis=1) if opposite: i = ~i - if index: + if as_index: return i return self.__class__(self.copy_meta(), self.df[i]) - def filter_indel(self, opposite=False, index=False): + def filter_indel(self, opposite=False, as_index=False): """Remove rows with an indel. Parameters ---------- opposite : bool, default: False If True, return rows that don't meet the said criteria. - index : bool, default: False + as_index : bool, default: False If True, return boolean index array instead of VcfFrame. 
Returns @@ -2004,7 +2449,7 @@ def filter_indel(self, opposite=False, index=False): Finally, we can return boolean index array from the filtering: - >>> vf.filter_indel(index=True) + >>> vf.filter_indel(as_index=True) 0 True 1 False 2 False @@ -2014,11 +2459,11 @@ def filter_indel(self, opposite=False, index=False): i = ~self.df.apply(row_hasindel, axis=1) if opposite: i = ~i - if index: + if as_index: return i return self.__class__(self.copy_meta(), self.df[i]) - def filter_flagall(self, flags, opposite=False, index=False): + def filter_flagall(self, flags, opposite=False, as_index=False): """Select rows if all of the given INFO flags are present. Parameters @@ -2027,7 +2472,7 @@ def filter_flagall(self, flags, opposite=False, index=False): List of INFO flags. opposite : bool, default: False If True, return rows that don't meet the said criteria. - index : bool, default: False + as_index : bool, default: False If True, return boolean index array instead of VcfFrame. Returns @@ -2037,7 +2482,7 @@ def filter_flagall(self, flags, opposite=False, index=False): See Also -------- - filter_flagany + VcfFrame.filter_flagany Similar method that selects rows if any one of the given INFO flags is present. @@ -2082,7 +2527,7 @@ def filter_flagall(self, flags, opposite=False, index=False): Finally, we can return boolean index array from the filtering: - >>> vf.filter_flagall(['H2', 'DB'], index=True) + >>> vf.filter_flagall(['H2', 'DB'], as_index=True) 0 False 1 True 2 True @@ -2095,11 +2540,11 @@ def f(r): i = self.df.apply(f, axis=1) if opposite: i = ~i - if index: + if as_index: return i return self.__class__(self.copy_meta(), self.df[i]) - def filter_flagany(self, flags, opposite=False, index=False): + def filter_flagany(self, flags, opposite=False, as_index=False): """Select rows if any one of the given INFO flags is present. Parameters @@ -2108,7 +2553,7 @@ def filter_flagany(self, flags, opposite=False, index=False): List of INFO flags. 
opposite : bool, default: False If True, return rows that don't meet the said criteria. - index : bool, default: False + as_index : bool, default: False If True, return boolean index array instead of VcfFrame. Returns @@ -2118,7 +2563,7 @@ def filter_flagany(self, flags, opposite=False, index=False): See Also -------- - filter_flagall + VcfFrame.filter_flagall Similar method that selects rows if all of the given INFO flags are present. @@ -2163,7 +2608,7 @@ def filter_flagany(self, flags, opposite=False, index=False): Finally, we can return boolean index array from the filtering: - >>> vf.filter_flagany(['H2'], index=True) + >>> vf.filter_flagany(['H2'], as_index=True) 0 False 1 True 2 True @@ -2176,18 +2621,18 @@ def f(r): i = self.df.apply(f, axis=1) if opposite: i = ~i - if index: + if as_index: return i return self.__class__(self.copy_meta(), self.df[i]) - def filter_multialt(self, opposite=False, index=False): + def filter_multialt(self, opposite=False, as_index=False): """Remove rows with multiple ALT alleles. Parameters ---------- opposite : bool, default: False If True, return rows that don't meet the said criteria. - index : bool, default: False + as_index : bool, default: False If True, return boolean index array instead of VcfFrame. Returns @@ -2238,7 +2683,7 @@ def filter_multialt(self, opposite=False, index=False): Finally, we can return boolean index array from the filtering: - >>> vf.filter_multialt(index=True) + >>> vf.filter_multialt(as_index=True) 0 False 1 True 2 True @@ -2248,18 +2693,18 @@ def filter_multialt(self, opposite=False, index=False): i = self.df.apply(lambda r: ',' not in r.ALT, axis=1) if opposite: i = ~i - if index: + if as_index: return i return self.__class__(self.copy_meta(), self.df[i]) - def filter_pass(self, opposite=False, index=False): + def filter_pass(self, opposite=False, as_index=False): """Select rows with PASS in the FILTER field. 
Parameters ---------- opposite : bool, default: False If True, return rows that don't meet the said criteria. - index : bool, default: False + as_index : bool, default: False If True, return boolean index array instead of VcfFrame. Returns @@ -2308,7 +2753,7 @@ def filter_pass(self, opposite=False, index=False): Finally, we can return boolean index array from the filtering: - >>> vf.filter_pass(index=True) + >>> vf.filter_pass(as_index=True) 0 True 1 False 2 True @@ -2319,18 +2764,18 @@ def filter_pass(self, opposite=False, index=False): i = self.df.apply(f, axis=1) if opposite: i = ~i - if index: + if as_index: return i return self.__class__(self.copy_meta(), self.df[i]) - def filter_phased(self, opposite=False, index=False): + def filter_phased(self, opposite=False, as_index=False): """Remove rows with phased genotypes. Parameters ---------- opposite : bool, default: False If True, return rows that don't meet the said criteria. - index : bool, default: False + as_index : bool, default: False If True, return boolean index array instead of VcfFrame. Returns @@ -2379,7 +2824,7 @@ def filter_phased(self, opposite=False, index=False): Finally, we can return boolean index array from the filtering: - >>> vf.filter_phased(index=True) + >>> vf.filter_phased(as_index=True) 0 False 1 True 2 True @@ -2390,17 +2835,19 @@ def filter_phased(self, opposite=False, index=False): i = self.df.apply(f, axis=1) if opposite: i = ~i - if index: + if as_index: return i return self.__class__(self.copy_meta(), self.df[i]) - def filter_polyp(self, opposite=False, index=False): + def filter_polyp(self, opposite=False, as_index=False): """Remove rows with a polyploid genotype call. Parameters ---------- - include : bool, default: False - If True, include only such rows instead of excluding them. + opposite : bool, default: False + If True, return rows that don't meet the said criteria. + as_index : bool, default: False + If True, return boolean index array instead of VcfFrame. 
Returns ------- @@ -2448,7 +2895,7 @@ def filter_polyp(self, opposite=False, index=False): Finally, we can return boolean index array from the filtering: - >>> vf.filter_polyp(index=True) + >>> vf.filter_polyp(as_index=True) 0 False 1 True 2 False @@ -2459,11 +2906,11 @@ def filter_polyp(self, opposite=False, index=False): i = self.df.apply(f, axis=1) if opposite: i = ~i - if index: + if as_index: return i return self.__class__(self.copy_meta(), self.df[i]) - def filter_qual(self, threshold, opposite=False, index=False): + def filter_qual(self, threshold, opposite=False, as_index=False): """Select rows with minimum QUAL value. Parameters @@ -2472,7 +2919,7 @@ def filter_qual(self, threshold, opposite=False, index=False): Minimum QUAL value. opposite : bool, default: False If True, return rows that don't meet the said criteria. - index : bool, default: False + as_index : bool, default: False If True, return boolean index array instead of VcfFrame. Returns @@ -2522,7 +2969,7 @@ def filter_qual(self, threshold, opposite=False, index=False): Finally, we can return boolean index array from the filtering: - >>> vf.filter_qual(30, index=True) + >>> vf.filter_qual(30, as_index=True) 0 False 1 True 2 False @@ -2537,11 +2984,11 @@ def one_row(r): i = self.df.apply(one_row, axis=1) if opposite: i = ~i - if index: + if as_index: return i return self.__class__(self.copy_meta(), self.df[i]) - def filter_sampall(self, samples=None, opposite=False, index=False): + def filter_sampall(self, samples=None, opposite=False, as_index=False): """Select rows if all of the given samples have the variant. The default behavior is to use all samples in the VcfFrame. @@ -2552,7 +2999,7 @@ def filter_sampall(self, samples=None, opposite=False, index=False): List of sample names or indicies. opposite : bool, default: False If True, return rows that don't meet the said criteria. 
- index : bool, default: False + as_index : bool, default: False If True, return boolean index array instead of VcfFrame. Returns @@ -2562,7 +3009,7 @@ def filter_sampall(self, samples=None, opposite=False, index=False): See Also -------- - filter_sampany + VcfFrame.filter_sampany Similar method that selects rows if any one of the given samples has the variant. @@ -2617,7 +3064,7 @@ def filter_sampall(self, samples=None, opposite=False, index=False): Finally, we can return boolean index array from the filtering: - >>> vf.filter_sampall(index=True) + >>> vf.filter_sampall(as_index=True) 0 True 1 False 2 False @@ -2633,11 +3080,11 @@ def filter_sampall(self, samples=None, opposite=False, index=False): i = self.df.apply(f, axis=1) if opposite: i = ~i - if index: + if as_index: return i return self.__class__(self.copy_meta(), self.df[i]) - def filter_sampany(self, samples=None, opposite=False, index=False): + def filter_sampany(self, samples=None, opposite=False, as_index=False): """Select rows if any one of the given samples has the variant. The default behavior is to use all samples in the VcfFrame. @@ -2648,7 +3095,7 @@ def filter_sampany(self, samples=None, opposite=False, index=False): List of sample names or indicies. opposite : bool, default: False If True, return rows that don't meet the said criteria. - index : bool, default: False + as_index : bool, default: False If True, return boolean index array instead of VcfFrame. Returns @@ -2658,7 +3105,7 @@ def filter_sampany(self, samples=None, opposite=False, index=False): See Also -------- - filter_sampall + VcfFrame.filter_sampall Similar method that selects rows if all of the given samples have the variant. 
@@ -2712,7 +3159,7 @@ def filter_sampany(self, samples=None, opposite=False, index=False): Finally, we can return boolean index array from the filtering: - >>> vf.filter_sampany(index=True) + >>> vf.filter_sampany(as_index=True) 0 True 1 True 2 True @@ -2732,7 +3179,7 @@ def filter_sampany(self, samples=None, opposite=False, index=False): return i return self.__class__(self.copy_meta(), self.df[i]) - def filter_sampnum(self, threshold, opposite=False, index=False): + def filter_sampnum(self, threshold, opposite=False, as_index=False): """Select rows if the variant is prevalent enough. Parameters @@ -2741,7 +3188,7 @@ def filter_sampnum(self, threshold, opposite=False, index=False): Minimum number or fraction of samples with the variant. opposite : bool, default: False If True, return rows that don't meet the said criteria. - index : bool, default: False + as_index : bool, default: False If True, return boolean index array instead of VcfFrame. Returns @@ -2798,7 +3245,7 @@ def filter_sampnum(self, threshold, opposite=False, index=False): Finally, we can return boolean index array from the filtering: - >>> vf.filter_sampnum(2, index=True) + >>> vf.filter_sampnum(2, as_index=True) 0 True 1 True 2 False @@ -2813,7 +3260,7 @@ def f(r): i = self.df.apply(f, axis=1) if opposite: i = ~i - if index: + if as_index: return i vf = self.__class__(self.copy_meta(), self.df[i]) return vf diff --git a/fuc/api/pyvep.py b/fuc/api/pyvep.py index e176c58..3a724dc 100644 --- a/fuc/api/pyvep.py +++ b/fuc/api/pyvep.py @@ -1,7 +1,13 @@ """ The pyvep submodule is designed for parsing VCF annotation data from the -`Ensembl VEP `_. -It is designed to be used with ``pyvcf.VcfFrame``. +`Ensembl VEP `_ +program. It should be used with ``pyvcf.VcfFrame``. + +The input VCF should already contain functional annotation data from +Ensemble VEP. 
The recommended method is Ensemble VEP's +`web interface `_ with +“RefSeq transcripts” as the transcript database and the filtering option +“Show one selected consequence per variant”. """ import re @@ -47,14 +53,6 @@ 'intergenic_variant' ] -def _get_keys(vf): - """Return existing annotation keys (e.g. Allele, IMPACT).""" - l = [] - for i, line in enumerate(vf.meta): - if 'ID=CSQ' in line: - l = re.search(r'Format: (.*?)">', vf.meta[i]).group(1).split('|') - return l - def row_firstann(r): """Return the first result in the row. @@ -492,7 +490,7 @@ def parseann(vf, targets, sep=' | ', as_series=False): 3 MTOR dtype: object """ - _targets = [x if isinstance(x, int) else _get_keys(vf).index(x) for x in targets] + _targets = [x if isinstance(x, int) else annot_names(vf).index(x) for x in targets] def func(r): ann = row_firstann(r) if not ann: @@ -509,13 +507,13 @@ def func(r): def get_index(vf, target): """Return the index of the target field (e.g. CLIN_SIG).""" - headers = _get_keys(vf) + headers = annot_names(vf) return headers.index(target) def get_table(vf): """Write the VcfFrame as a tab-delimited text file.""" df = vf.df.copy() - headers = _get_keys(vf) + headers = annot_names(vf) def func(r): ann = row_firstann(r) if ann: @@ -597,3 +595,126 @@ def pick_result(vf, mode='mostsevere'): one_row = lambda r: pyvcf.row_updateinfo(r, 'CSQ', funcs[mode](r)) new_vf.df.INFO = vf.df.apply(one_row, axis=1) return new_vf + +def annot_names(vf): + """Return the list of avaialble consequence annotations in the VcfFrame. + + Parameters + ---------- + vf : VcfFrame + VcfFrame. + + Returns + ------- + list + List of consequence annotations. + """ + l = [] + for i, line in enumerate(vf.meta): + if 'ID=CSQ' in line: + l = re.search(r'Format: (.*?)">', vf.meta[i]).group(1).split('|') + return l + +def filter_af(vf, name, threshold, opposite=None, as_index=False): + """Select rows whose selected AF is below threshold. 
+ + Parameters + ---------- + name : str + Name of the consequence annotation representing AF (e.g. 'gnomAD_AF'). + threshold : float + Minimum AF. + opposite : bool, default: False + If True, return rows that don't meet the said criteria. + as_index : bool, default: False + If True, return boolean index array instead of VcfFrame. + + Returns + ------- + VcfFrame or pandas.Series + Filtered VcfFrame or boolean index array. + """ + i = annot_names(vf).index(name) + def one_row(r): + af = row_firstann(r).split('|')[i] + if not af: + af = 0 + return float(af) < threshold + i = vf.df.apply(one_row, axis=1) + if opposite: + i = ~i + if as_index: + return i + df = vf.df[i].reset_index(drop=True) + vf = vf.__class__(vf.copy_meta(), df) + return vf + +def filter_lof(vf, opposite=None, as_index=False): + """Select rows whose conseuqence annotation is deleterious and/or LoF. + + Parameters + ---------- + opposite : bool, default: False + If True, return rows that don't meet the said criteria. + as_index : bool, default: False + If True, return boolean index array instead of VcfFrame. + + Returns + ------- + VcfFrame or pandas.Series + Filtered VcfFrame or boolean index array. 
+ """ + consequence_index = annot_names(vf).index('Consequence') + impact_index = annot_names(vf).index('IMPACT') + polyphen_index = annot_names(vf).index('PolyPhen') + sift_index = annot_names(vf).index('SIFT') + def one_row(r): + l = row_firstann(r).split('|') + consequence = l[consequence_index] + impact = l[impact_index] + polyphen = l[polyphen_index] + sift = l[sift_index] + if impact not in ['HIGH', 'MODERATE']: + return False + if consequence == 'missense_variant': + if 'damaging' in polyphen or 'deleterious' in sift: + return True + else: + return False + return True + i = vf.df.apply(one_row, axis=1) + if opposite: + i = ~i + if as_index: + return i + df = vf.df[i] + vf = vf.__class__(vf.copy_meta(), df) + return vf + +def filter_biotype(vf, value, opposite=None, as_index=False): + """Select rows whose BIOTYPE matches the given value. + + Parameters + ---------- + opposite : bool, default: False + If True, return rows that don't meet the said criteria. + as_index : bool, default: False + If True, return boolean index array instead of VcfFrame. + + Returns + ------- + VcfFrame or pandas.Series + Filtered VcfFrame or boolean index array. + """ + i = annot_names(vf).index('BIOTYPE') + def one_row(r): + l = row_firstann(r).split('|') + return l[i] == value + i = vf.df.apply(one_row, axis=1) + if opposite: + i = ~i + if as_index: + return i + df = vf.df[i] + vf = vf.__class__(vf.copy_meta(), df) + return vf diff --git a/fuc/version.py b/fuc/version.py index 9d1bb72..f323a57 100644 --- a/fuc/version.py +++ b/fuc/version.py @@ -1 +1 @@ -__version__ = '0.10.0' +__version__ = '0.11.0' diff --git a/setup.py b/setup.py index 33161ea..dbc32f3 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ exec(open('fuc/version.py').read()) requirements = [ - 'biopython', 'lxml', 'matplotlib', 'numpy', 'pandas', + 'biopython', 'lxml', 'matplotlib', 'matplotlib-venn', 'numpy', 'pandas', 'pyranges', 'pysam', 'seaborn' ]