From c2ee025d1ec69d6980507579ef8aa4a9704422f9 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Thu, 10 Jun 2021 16:18:12 +0900 Subject: [PATCH 01/10] Add pyvcf.VcfFrame.add_af --- CHANGELOG.rst | 5 ++++ README.rst | 1 + docs/create.py | 1 + fuc/api/pyvcf.py | 65 ++++++++++++++++++++++++++++++++++++++++++++++++ fuc/version.py | 2 +- 5 files changed, 73 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 18679d3..005e738 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,11 @@ Changelog ********* +0.12.0 (in development) +----------------------- + +* Add new method :meth:`pyvcf.VcfFrame.add_af`. + 0.11.0 (2021-06-10) ------------------- diff --git a/README.rst b/README.rst index adb03b2..c2f0c57 100644 --- a/README.rst +++ b/README.rst @@ -245,6 +245,7 @@ The following packages are required to run fuc: biopython lxml matplotlib + matplotlib-venn numpy pandas pyranges diff --git a/docs/create.py b/docs/create.py index f639cfa..598b398 100644 --- a/docs/create.py +++ b/docs/create.py @@ -273,6 +273,7 @@ biopython lxml matplotlib + matplotlib-venn numpy pandas pyranges diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index e9f62d6..a9c49a3 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -743,6 +743,71 @@ def shape(self): """tuple : Dimensionality of VcfFrame (variants, samples).""" return (self.df.shape[0], len(self.samples)) + def add_af(self, decimals=3): + """Compute AF using AD and add it to the FORMAT field. + + Parameters + ---------- + decimals : int, default: 3 + Number of decimals to display. + + Returns + ------- + VcfFrame + Updated VcfFrame. + + Examples + -------- + Assume we have the following data: + + >>> from fuc import pyvcf + >>> data = { + ... 'CHROM': ['chr1', 'chr1', 'chr2', 'chr2'], + ... 'POS': [100, 100, 200, 200], + ... 'ID': ['.', '.', '.', '.'], + ... 'REF': ['A', 'A', 'C', 'C'], + ... 'ALT': ['C', 'T', 'G', 'G,A'], + ... 'QUAL': ['.', '.', '.', '.'], + ... 'FILTER': ['.', '.', '.', '.'], + ... 'INFO': ['.', '.', '.', '.'], + ... 'FORMAT': ['GT:AD', 'GT:AD', 'GT:AD', 'GT:AD'], + ... 'Steven': ['0/1:12,15', '0/0:32,1', '0/1:16,12', './.:.'], + ... 'Sara': ['0/1:13,17', '0/1:14,15', './.:.', '1/2:0,11,17'], + ... } + >>> vf = pyvcf.VcfFrame.from_dict([], data) + >>> vf.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven Sara + 0 chr1 100 . A C . . . GT:AD 0/1:12,15 0/1:13,17 + 1 chr1 100 . A T . . . GT:AD 0/0:32,1 0/1:14,15 + 2 chr2 200 . C G . . . GT:AD 0/1:16,12 ./.:. + 3 chr2 200 . C G,A . . . GT:AD ./.:. 1/2:0,11,17 + + We can add the AF subfield to our genotype data: + + >>> vf.add_af().df + CHROM POS ID REF ... INFO FORMAT Steven Sara + 0 chr1 100 . A ... . GT:AD:AF 0/1:12,15:0.556 0/1:13,17:0.567 + 1 chr1 100 . A ... . GT:AD:AF 0/0:32,1:0.030 0/1:14,15:0.517 + 2 chr2 200 . C ... . GT:AD:AF 0/1:16,12:0.429 ./.:.:. + 3 chr2 200 . C ... . GT:AD:AF ./.:.:. 1/2:0,11,17:0.607 + """ + def one_row(r): + i = r.FORMAT.split(':').index('AD') + def one_gt(g): + ad = g.split(':')[i] + if ad == '.': + return f'{g}:.' + depths = [int(x) for x in ad.split(',')] + total = sum(depths) + af = max(depths[1:]) / total + return f'{g}:{af:.{decimals}f}' + r.iloc[9:] = r.iloc[9:].apply(one_gt) + r.FORMAT += ':AF' + return r + df = self.df.apply(one_row, axis=1) + vf = self.__class__(self.copy_meta(), df) + return vf + def add_dp(self): """Compute DP using AD and add it to the FORMAT field. diff --git a/fuc/version.py b/fuc/version.py index f323a57..2c7bffb 100644 --- a/fuc/version.py +++ b/fuc/version.py @@ -1 +1 @@ -__version__ = '0.11.0' +__version__ = '0.12.0' From 4e038dcf4804af0aa8eb2bef61c15fee7690b717 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Thu, 10 Jun 2021 17:18:45 +0900 Subject: [PATCH 02/10] Update API --- fuc/api/pymaf.py | 66 ++++++++++++++++++++++++------------------------ fuc/api/pyvep.py | 36 ++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 33 deletions(-) diff --git a/fuc/api/pymaf.py b/fuc/api/pymaf.py index 67f9f18..783ef2b 100644 --- a/fuc/api/pymaf.py +++ b/fuc/api/pymaf.py @@ -11,39 +11,39 @@ protein change. However, most of the analysis in pymaf uses the following fields: -+------+------------------------+----------------------+-------------------------------+ -| No. | Name | Description | Examples | -+======+========================+======================+===============================+ -| 1 | Hugo_Symbol | HUGO gene symbol | 'TP53', 'Unknown' | -+------+------------------------+----------------------+-------------------------------+ -| 2 | Entrez_Gene_Id | Entrez or Ensembl ID | 0, 8714 | -+------+------------------------+----------------------+-------------------------------+ -| 3 | Center | Sequencing center | '.', 'genome.wustl.edu' | -+------+------------------------+----------------------+-------------------------------+ -| 4 | NCBI_Build | Genome assembly | '37', 'GRCh38' | -+------+------------------------+----------------------+-------------------------------+ -| 5 | Chromosome | Chromosome name | 'chr1' | -+------+------------------------+----------------------+-------------------------------+ -| 6 | Start_Position | Start coordinate | 119031351 | -+------+------------------------+----------------------+-------------------------------+ -| 7 | End_Position | End coordinate | 44079555 | -+------+------------------------+----------------------+-------------------------------+ -| 8 | Strand | Genomic strand | '+', '-' | -+------+------------------------+----------------------+-------------------------------+ -| 9 | Variant_Classification | Translational effect | 'Missense_Mutation', 'Silent' | -+------+------------------------+----------------------+-------------------------------+ -| 10 | Variant_Type | Mutation type | 'SNP', 'INS', 'DEL' | -+------+------------------------+----------------------+-------------------------------+ -| 11 | Reference_Allele | Reference allele | 'T', '-', 'ACAA' | -+------+------------------------+----------------------+-------------------------------+ -| 12 | Tumor_Seq_Allele1 | First tumor allele | 'A', '-', 'TCA' | -+------+------------------------+----------------------+-------------------------------+ -| 13 | Tumor_Seq_Allele2 | Second tumor allele | 'A', '-', 'TCA' | -+------+------------------------+----------------------+-------------------------------+ -| 14 | Tumor_Sample_Barcode | Sample ID | 'TCGA-AB-3002' | -+------+------------------------+----------------------+-------------------------------+ -| 15 | Protein_Change | Protein change | 'p.L558Q' | -+------+------------------------+----------------------+-------------------------------+ ++-----+------------------------+----------------------+-------------------------------+ +| No. | Name | Description | Examples | ++=====+========================+======================+===============================+ +| 1 | Hugo_Symbol | HUGO gene symbol | 'TP53', 'Unknown' | ++-----+------------------------+----------------------+-------------------------------+ +| 2 | Entrez_Gene_Id | Entrez or Ensembl ID | 0, 8714 | ++-----+------------------------+----------------------+-------------------------------+ +| 3 | Center | Sequencing center | '.', 'genome.wustl.edu' | ++-----+------------------------+----------------------+-------------------------------+ +| 4 | NCBI_Build | Genome assembly | '37', 'GRCh38' | ++-----+------------------------+----------------------+-------------------------------+ +| 5 | Chromosome | Chromosome name | 'chr1' | ++-----+------------------------+----------------------+-------------------------------+ +| 6 | Start_Position | Start coordinate | 119031351 | ++-----+------------------------+----------------------+-------------------------------+ +| 7 | End_Position | End coordinate | 44079555 | ++-----+------------------------+----------------------+-------------------------------+ +| 8 | Strand | Genomic strand | '+', '-' | ++-----+------------------------+----------------------+-------------------------------+ +| 9 | Variant_Classification | Translational effect | 'Missense_Mutation', 'Silent' | ++-----+------------------------+----------------------+-------------------------------+ +| 10 | Variant_Type | Mutation type | 'SNP', 'INS', 'DEL' | ++-----+------------------------+----------------------+-------------------------------+ +| 11 | Reference_Allele | Reference allele | 'T', '-', 'ACAA' | ++-----+------------------------+----------------------+-------------------------------+ +| 12 | Tumor_Seq_Allele1 | First tumor allele | 'A', '-', 'TCA' | ++-----+------------------------+----------------------+-------------------------------+ +| 13 | Tumor_Seq_Allele2 | Second tumor allele | 'A', '-', 'TCA' | ++-----+------------------------+----------------------+-------------------------------+ +| 14 | Tumor_Sample_Barcode | Sample ID | 'TCGA-AB-3002' | ++-----+------------------------+----------------------+-------------------------------+ +| 15 | Protein_Change | Protein change | 'p.L558Q' | ++-----+------------------------+----------------------+-------------------------------+ """ import pandas as pd diff --git a/fuc/api/pyvep.py b/fuc/api/pyvep.py index 3a724dc..b487acb 100644 --- a/fuc/api/pyvep.py +++ b/fuc/api/pyvep.py @@ -8,6 +8,42 @@ `web interface `_ with “RefSeq transcripts” as the transcript database and the filtering option “Show one selected consequence per variant”. + +A typical VEP-annotated VCF file contains many fields ranging from gene +symbol to protein change. However, most of the analysis in pyvep uses the +following fields: + ++-----+--------------------+---------------------+----------------------------------------------+ +| No. | Name | Description | Examples | ++=====+====================+=====================+==============================================+ +| 1 | Allele | Variant allele | 'C', 'T', 'CG' | ++-----+--------------------+---------------------+----------------------------------------------+ +| 2 | Consequence | Consequence type | 'missense_variant', 'stop_gained' | ++-----+--------------------+---------------------+----------------------------------------------+ +| 3 | IMPACT | Consequence impact | 'MODERATE', 'HIGH' | ++-----+--------------------+---------------------+----------------------------------------------+ +| 4 | SYMBOL | Gene symbol | 'TP53', 'APC' | ++-----+--------------------+---------------------+----------------------------------------------+ +| 5 | Gene | Ensembl ID | 7157, 55294 | ++-----+--------------------+---------------------+----------------------------------------------+ +| 6 | BIOTYPE | Transcript biotype | 'protein_coding', 'pseudogene', 'miRNA' | ++-----+--------------------+---------------------+----------------------------------------------+ +| 7 | EXON | Exon number | '8/17', '17/17' | ++-----+--------------------+---------------------+----------------------------------------------+ +| 8 | Protein_position | Amino acid position | 234, 1510 | ++-----+--------------------+---------------------+----------------------------------------------+ +| 9 | Amino_acids | Amino acid changes | 'R/\*', 'A/X', 'V/A' | ++-----+--------------------+---------------------+----------------------------------------------+ +| 10 | Existing_variation | Variant identifier | 'rs17851045', 'rs786201856&COSV57325157' | ++-----+--------------------+---------------------+----------------------------------------------+ +| 11 | STRAND | Genomic strand | 1, -1 | ++-----+--------------------+---------------------+----------------------------------------------+ +| 12 | SIFT | SIFT prediction | 'deleterious(0)', 'tolerated(0.6)' | ++-----+--------------------+---------------------+----------------------------------------------+ +| 13 | PolyPhen | PolyPhen prediction | 'benign(0.121)', 'probably_damaging(0.999)' | ++-----+--------------------+---------------------+----------------------------------------------+ +| 14 | CLIN_SIG | ClinVar prediction | 'pathogenic', 'likely_pathogenic&pathogenic' | ++-----+--------------------+---------------------+----------------------------------------------+ """ import re From a6692dc5245770a74f61ccff9eb8fab9dc300c5f Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Fri, 11 Jun 2021 13:30:56 +0900 Subject: [PATCH 03/10] Update CLI --- CHANGELOG.rst | 2 ++ README.rst | 1 + docs/cli.rst | 23 ++++++++++++++ fuc/api/pyvcf.py | 75 ++++++++++++++++++++++++++++++++++++++++++++ fuc/api/pyvep.py | 31 +++++++----------- fuc/cli/vep_query.py | 30 ++++++++++++++++++ 6 files changed, 143 insertions(+), 19 deletions(-) create mode 100644 fuc/cli/vep_query.py diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 005e738..78b28ae 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -5,6 +5,8 @@ Changelog ----------------------- * Add new method :meth:`pyvcf.VcfFrame.add_af`. +* Add new method :meth:`pyvcf.VcfFrame.extract`. +* Add new command :command:`vep_query`. 0.11.0 (2021-06-10) ------------------- diff --git a/README.rst b/README.rst index c2f0c57..14e71da 100644 --- a/README.rst +++ b/README.rst @@ -308,6 +308,7 @@ For getting help on CLI: vcf_merge [VCF] merge two or more VCF files vcf_slice [VCF] slice a VCF file vcf_vcf2bed [VCF] convert a VCF file to a BED file + vep_query [VEP] filter a VEP-annotated VCF file optional arguments: -h, --help show this help message and exit diff --git a/docs/cli.rst b/docs/cli.rst index d1d489e..db63edd 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -38,6 +38,7 @@ For getting help on CLI: vcf_merge [VCF] merge two or more VCF files vcf_slice [VCF] slice a VCF file vcf_vcf2bed [VCF] convert a VCF file to a BED file + vep_query [VEP] filter a VEP-annotated VCF file optional arguments: -h, --help show this help message and exit @@ -481,3 +482,25 @@ vcf_vcf2bed optional arguments: -h, --help show this help message and exit +vep_query +========= + +.. code-block:: console + + $ fuc vep_query -h + usage: fuc vep_query [-h] vcf expr + + This command will filter a VEP-annotated VCF file. It essentially wraps the `pandas.DataFrame.query` method. For details on the method, please visit its documentation page (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html#pandas-dataframe-query). + + examples: + $ fuc vep_query in.vcf 'SYMBOL == "TP53"' > out.vcf + $ fuc vep_query in.vcf 'Consequence in ["splice_donor_variant", "stop_gained"]' > out.vcf + $ fuc vep_query in.vcf '(SYMBOL == "TP53") and (Consequence.str.contains("stop_gained"))' > out.vcf + + positional arguments: + vcf VEP-annotated VCF file + expr query expression to evaluate + + optional arguments: + -h, --help show this help message and exit + diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index a9c49a3..b7883af 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -3685,3 +3685,78 @@ def one_row(r): df = df.drop_duplicates() bf = pybed.BedFrame.from_frame([], df) return bf + + def extract(self, k, func=None, as_nan=False): + """ + Create a DataFrame containing analysis-ready data for the given + genotype key. + + Parameters + ---------- + k : str + Genotype key such as 'DP' and 'AD'. + func : function, optional + Function to apply to each of the returned values. For example, if + the goal is to extract AD only for reference alleles, one can use + ``func=lambda x: x.split(',')[0]`` + as_nan : bool, default: False + If True, missing values will be returned as ``NaN``. + + Returns + ------- + pandas.DataFrame + DataFrame containing requested data. + + Examples + -------- + + >>> from fuc import pyvcf + >>> data = { + ... 'CHROM': ['chr1', 'chr1'], + ... 'POS': [100, 101], + ... 'ID': ['.', '.'], + ... 'REF': ['A', 'T'], + ... 'ALT': ['A', 'T'], + ... 'QUAL': ['.', '.'], + ... 'FILTER': ['.', '.'], + ... 'INFO': ['.', '.'], + ... 'FORMAT': ['GT:AD:DP', 'GT:AD:DP'], + ... 'A': ['0/1:15,13:28', '0/0:28,1:29'], + ... 'B': ['./.:.:.', '1/1:0,26:26'], + ... } + >>> vf = pyvcf.VcfFrame.from_dict([], data) + >>> vf.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B + 0 chr1 100 . A A . . . GT:AD:DP 0/1:15,13:28 ./.:.:. + 1 chr1 101 . T T . . . GT:AD:DP 0/0:28,1:29 1/1:0,26:26 + >>> vf.extract('GT') + A B + 0 0/1 ./. + 1 0/0 1/1 + >>> vf.extract('GT', as_nan=True) + A B + 0 0/1 NaN + 1 0/0 1/1 + >>> vf.extract('AD') + A B + 0 15,13 . + 1 28,1 0,26 + >>> vf.extract('AD', as_nan=True, func=lambda x: float(x.split(',')[1])) + A B + 0 13.0 NaN + 1 1.0 26.0 + """ + def one_row(r): + i = r.FORMAT.split(':').index(k) + def one_gt(g): + v = g.split(':')[i] + if as_nan and not i and gt_miss(v): + return np.nan + if as_nan and v == '.': + return np.nan + if func is not None: + return func(v) + return v + return r[9:].apply(one_gt) + df = self.df.apply(one_row, axis=1) + return df diff --git a/fuc/api/pyvep.py b/fuc/api/pyvep.py index b487acb..dad9382 100644 --- a/fuc/api/pyvep.py +++ b/fuc/api/pyvep.py @@ -48,6 +48,7 @@ import re import pandas as pd +import numpy as np from . import pyvcf SEVERITIY = [ @@ -546,26 +547,18 @@ def get_index(vf, target): headers = annot_names(vf) return headers.index(target) -def get_table(vf): - """Write the VcfFrame as a tab-delimited text file.""" - df = vf.df.copy() - headers = annot_names(vf) - def func(r): - ann = row_firstann(r) - if ann: - s = ['.' if x == '' else x for x in ann.split('|')] - else: - s = ['.' for x in headers] - s = pd.Series(s, index=headers) - s = pd.concat([r[:9], s, r[9:]]) - return s - df = df.apply(func, axis=1) - return df +def to_frame(vf): + """ + Create a DataFrame containing analysis-ready annotation data. -def write_table(vf, fn): - """Write the VcfFrame as a tab-delimited text file.""" - df = get_table(vf) - df.to_csv(fn, sep='\t', index=False) + Returns + ------- + pandas.DataFrame + DataFrame containing annotation data. + """ + df = vf.df.apply(lambda r: pd.Series(row_firstann(r).split('|')), axis=1) + df.columns = annot_names(vf) + return df def pick_result(vf, mode='mostsevere'): """Return a new VcfFrame after picking one result per row. diff --git a/fuc/cli/vep_query.py b/fuc/cli/vep_query.py new file mode 100644 index 0000000..e89fdfd --- /dev/null +++ b/fuc/cli/vep_query.py @@ -0,0 +1,30 @@ +from .. import api +from argparse import RawTextHelpFormatter + +command = api.common.script_name(__file__) + +description = f""" +This command will filter a VEP-annotated VCF file. It essentially wraps the `pandas.DataFrame.query` method. For details on the method, please visit its documentation page (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html#pandas-dataframe-query). + +examples: + $ fuc {command} in.vcf 'SYMBOL == "TP53"' > out.vcf + $ fuc {command} in.vcf 'Consequence in ["splice_donor_variant", "stop_gained"]' > out.vcf + $ fuc {command} in.vcf '(SYMBOL == "TP53") and (Consequence.str.contains("stop_gained"))' > out.vcf +""" + +def create_parser(subparsers): + parser = subparsers.add_parser( + command, + help='[VEP] filter a VEP-annotated VCF file', + description=description, + formatter_class=RawTextHelpFormatter, + ) + parser.add_argument('vcf', help='VEP-annotated VCF file') + parser.add_argument('expr', help='query expression to evaluate') + +def main(args): + vf = api.pyvcf.VcfFrame.from_file(args.vcf) + df = api.pyvep.to_frame(vf) + i = df.query(args.expr).index + filtered_vf = api.pyvcf.VcfFrame(vf.copy_meta(), vf.df.iloc[i]) + print(filtered_vf.to_string()) From abb7ffd08f7d9be9180a78ab655949c14ff3e1bd Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Fri, 11 Jun 2021 13:32:06 +0900 Subject: [PATCH 04/10] Update CHANGELOG.rst --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 78b28ae..103e31c 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,7 +6,7 @@ Changelog * Add new method :meth:`pyvcf.VcfFrame.add_af`. * Add new method :meth:`pyvcf.VcfFrame.extract`. -* Add new command :command:`vep_query`. +* :issue:`19`: Add new command :command:`vep_query`. 0.11.0 (2021-06-10) ------------------- From 572f6cb01fd33c8321cf98ad81326cc67ecc0afd Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Fri, 11 Jun 2021 15:47:36 +0900 Subject: [PATCH 05/10] Update API --- CHANGELOG.rst | 2 + docs/cli.rst | 12 +++- fuc/api/pyvep.py | 142 ++++++++++++++++++++++++++++--------------- fuc/cli/bam_slice.py | 2 +- fuc/cli/vep_query.py | 20 ++++-- 5 files changed, 121 insertions(+), 57 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 103e31c..dbd5db2 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,6 +6,8 @@ Changelog * Add new method :meth:`pyvcf.VcfFrame.add_af`. * Add new method :meth:`pyvcf.VcfFrame.extract`. +* Deprecate methods :meth:`pyvep.filter_af/biotype`. +* Add new method :meth:`pyvep.filter_query`. * :issue:`19`: Add new command :command:`vep_query`. 0.11.0 (2021-06-10) diff --git a/docs/cli.rst b/docs/cli.rst index db63edd..7fa6920 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -118,7 +118,7 @@ bam_slice optional arguments: -h, --help show this help message and exit - --no_index use to this flag to skip indexing + --no_index use this flag to skip indexing bed_intxn ========= @@ -488,14 +488,18 @@ vep_query .. code-block:: console $ fuc vep_query -h - usage: fuc vep_query [-h] vcf expr + usage: fuc vep_query [-h] [--opposite] [--as_zero] vcf expr - This command will filter a VEP-annotated VCF file. It essentially wraps the `pandas.DataFrame.query` method. For details on the method, please visit its documentation page (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html#pandas-dataframe-query). + This command will filter a VEP-annotated VCF file. It essentially wraps the `pandas.DataFrame.query` method. For details on query expression, please visit the method's documentation page (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html#pandas-dataframe-query). examples: $ fuc vep_query in.vcf 'SYMBOL == "TP53"' > out.vcf + $ fuc vep_query in.vcf 'SYMBOL != "TP53"' > out.vcf + $ fuc vep_query in.vcf 'SYMBOL == "TP53"' --opposite > out.vcf $ fuc vep_query in.vcf 'Consequence in ["splice_donor_variant", "stop_gained"]' > out.vcf $ fuc vep_query in.vcf '(SYMBOL == "TP53") and (Consequence.str.contains("stop_gained"))' > out.vcf + $ fuc vep_query in.vcf 'gnomAD_AF < 0.001' > out.vcf + $ fuc vep_query in.vcf 'gnomAD_AF < 0.001' --as_zero > out.vcf positional arguments: vcf VEP-annotated VCF file @@ -503,4 +507,6 @@ vep_query optional arguments: -h, --help show this help message and exit + --opposite use this flag to return records that don’t meet the said criteria + --as_zero use this flag to treat missing values as zero instead of NaN diff --git a/fuc/api/pyvep.py b/fuc/api/pyvep.py index dad9382..4aeb209 100644 --- a/fuc/api/pyvep.py +++ b/fuc/api/pyvep.py @@ -51,6 +51,69 @@ import numpy as np from . import pyvcf +DATA_TYPES = { + 'Allele': 'str', + 'Consequence': 'str', + 'IMPACT': 'str', + 'SYMBOL': 'str', + 'Gene': 'str', + 'Feature_type': 'str', + 'Feature': 'str', + 'BIOTYPE': 'str', + 'EXON': 'str', + 'INTRON': 'str', + 'HGVSc': 'str', + 'HGVSp': 'str', + 'cDNA_position': 'str', + 'CDS_position': 'str', + 'Protein_position': 'str', + 'Amino_acids': 'str', + 'Codons': 'str', + 'Existing_variation': 'str', + 'DISTANCE': 'str', + 'STRAND': 'str', + 'FLAGS': 'str', + 'SYMBOL_SOURCE': 'str', + 'HGNC_ID': 'str', + 'MANE_SELECT': 'str', + 'MANE_PLUS_CLINICAL': 'str', + 'TSL': 'str', + 'APPRIS': 'str', + 'REFSEQ_MATCH': 'str', + 'REFSEQ_OFFSET': 'str', + 'GIVEN_REF': 'str', + 'USED_REF': 'str', + 'BAM_EDIT': 'str', + 'SIFT': 'str', + 'PolyPhen': 'str', + 'AF': 'float64', + 'AFR_AF': 'float64', + 'AMR_AF': 'float64', + 'EAS_AF': 'float64', + 'EUR_AF': 'float64', + 'SAS_AF': 'float64', + 'AA_AF': 'float64', + 'EA_AF': 'float64', + 'gnomAD_AF': 'float64', + 'gnomAD_AFR_AF': 'float64', + 'gnomAD_AMR_AF': 'float64', + 'gnomAD_ASJ_AF': 'float64', + 'gnomAD_EAS_AF': 'float64', + 'gnomAD_FIN_AF': 'float64', + 'gnomAD_NFE_AF': 'float64', + 'gnomAD_OTH_AF': 'float64', + 'gnomAD_SAS_AF': 'float64', + 'CLIN_SIG': 'str', + 'SOMATIC': 'str', + 'PHENO': 'str', + 'PUBMED': 'str', + 'MOTIF_NAME': 'str', + 'MOTIF_POS': 'str', + 'HIGH_INF_POS': 'str', + 'MOTIF_SCORE_CHANGE': 'str', + 'TRANSCRIPTION_FACTORS': 'str', +} + SEVERITIY = [ 'transcript_ablation', 'splice_acceptor_variant', @@ -547,17 +610,30 @@ def get_index(vf, target): headers = annot_names(vf) return headers.index(target) -def to_frame(vf): +def to_frame(vf, as_zero=False): """ - Create a DataFrame containing analysis-ready annotation data. + Create a DataFrame containing analysis-ready VEP annotation data. + + Parameters + ---------- + vf : VcfFrame + VcfFrame object. + as_zero : bool, default: False + If True, missing values will be converted to zeros instead of ``NaN``. Returns ------- pandas.DataFrame - DataFrame containing annotation data. + DataFrame containing VEP annotation data. """ - df = vf.df.apply(lambda r: pd.Series(row_firstann(r).split('|')), axis=1) + f = lambda r: pd.Series(row_firstann(r).split('|')) + df = vf.df.apply(f, axis=1) + if as_zero: + df = df.replace('', 0) + else: + df = df.replace('', np.nan) df.columns = annot_names(vf) + df = df.astype(DATA_TYPES) return df def pick_result(vf, mode='mostsevere'): @@ -566,7 +642,7 @@ def pick_result(vf, mode='mostsevere'): Parameters ---------- vf : VcfFrame - VcfFrame. + VcfFrame object. mode : {'mostsevere', 'firstann'}, default: 'mostsevere' Selection mode: @@ -644,40 +720,6 @@ def annot_names(vf): l = re.search(r'Format: (.*?)">', vf.meta[i]).group(1).split('|') return l -def filter_af(vf, name, threshold, opposite=None, as_index=False): - """Select rows whose selected AF is below threshold. - - Parameters - ---------- - name : str - Name of the consequence annotation representing AF (e.g. 'gnomAD_AF'). - threshold : float - Minimum AF. - opposite : bool, default: False - If True, return rows that don't meet the said criteria. - as_index : bool, default: False - If True, return boolean index array instead of VcfFrame. - - Returns - ------- - VcfFrame or pandas.Series - Filtered VcfFrame or boolean index array. - """ - i = annot_names(vf).index(name) - def one_row(r): - af = row_firstann(r).split('|')[i] - if not af: - af = 0 - return float(af) < threshold - i = vf.df.apply(one_row, axis=1) - if opposite: - i = ~i - if as_index: - return i - df = vf.df[i].reset_index(drop=True) - vf = vf.__class__(vf.copy_meta(), df) - return vf - def filter_lof(vf, opposite=None, as_index=False): """Select rows whose conseuqence annotation is deleterious and/or LoF. @@ -720,30 +762,32 @@ def one_row(r): vf = vf.__class__(vf.copy_meta(), df) return vf -def filter_biotype(vf, value, opposite=None, as_index=False): - """Select rows whose BIOTYPE matches the given value. +def filter_query(vf, expr, opposite=None, as_index=False, as_zero=False): + """ + Select rows that satisfy the query expression. + + This method essentially wraps the :meth:`pandas.DataFrame.query` method. Parameters ---------- + expr : str + The query string to evaluate. opposite : bool, default: False If True, return rows that don't meet the said criteria. as_index : bool, default: False If True, return boolean index array instead of VcfFrame. + as_zero : bool, default: False + If True, missing values will be converted to zeros instead of ``NaN``. Returns ------- VcfFrame or pandas.Series Filtered VcfFrame or boolean index array. """ - i = annot_names(vf).index('BIOTYPE') - def one_row(r): - l = row_firstann(r).split('|') - return l[i] == value - i = vf.df.apply(one_row, axis=1) + df = to_frame(vf, as_zero=as_zero) + i = vf.df.index.isin(df.query(expr).index) if opposite: i = ~i if as_index: return i - df = vf.df[i] - vf = vf.__class__(vf.copy_meta(), df) - return vf + return vf.__class__(vf.copy_meta(), vf.df[i]) diff --git a/fuc/cli/bam_slice.py b/fuc/cli/bam_slice.py index 73501cd..f851e35 100644 --- a/fuc/cli/bam_slice.py +++ b/fuc/cli/bam_slice.py @@ -25,7 +25,7 @@ def create_parser(subparsers): parser.add_argument( '--no_index', action='store_true', - help='use to this flag to skip indexing' + help='use this flag to skip indexing' ) def main(args): diff --git a/fuc/cli/vep_query.py b/fuc/cli/vep_query.py index e89fdfd..2d1ec70 100644 --- a/fuc/cli/vep_query.py +++ b/fuc/cli/vep_query.py @@ -4,12 +4,16 @@ command = api.common.script_name(__file__) description = f""" -This command will filter a VEP-annotated VCF file. It essentially wraps the `pandas.DataFrame.query` method. For details on the method, please visit its documentation page (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html#pandas-dataframe-query). +This command will filter a VEP-annotated VCF file. It essentially wraps the `pandas.DataFrame.query` method. For details on query expression, please visit the method's documentation page (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html#pandas-dataframe-query). examples: $ fuc {command} in.vcf 'SYMBOL == "TP53"' > out.vcf + $ fuc {command} in.vcf 'SYMBOL != "TP53"' > out.vcf + $ fuc {command} in.vcf 'SYMBOL == "TP53"' --opposite > out.vcf $ fuc {command} in.vcf 'Consequence in ["splice_donor_variant", "stop_gained"]' > out.vcf $ fuc {command} in.vcf '(SYMBOL == "TP53") and (Consequence.str.contains("stop_gained"))' > out.vcf + $ fuc {command} in.vcf 'gnomAD_AF < 0.001' > out.vcf + $ fuc {command} in.vcf 'gnomAD_AF < 0.001' --as_zero > out.vcf """ def create_parser(subparsers): @@ -21,10 +25,18 @@ def create_parser(subparsers): ) parser.add_argument('vcf', help='VEP-annotated VCF file') parser.add_argument('expr', help='query expression to evaluate') + parser.add_argument( + '--opposite', + action='store_true', + help='use this flag to return records that don’t meet the said criteria' + ) + parser.add_argument( + '--as_zero', + action='store_true', + help='use this flag to treat missing values as zero instead of NaN' + ) def main(args): vf = api.pyvcf.VcfFrame.from_file(args.vcf) - df = api.pyvep.to_frame(vf) - i = df.query(args.expr).index - filtered_vf = api.pyvcf.VcfFrame(vf.copy_meta(), vf.df.iloc[i]) + filtered_vf = api.pyvep.filter_query(vf, args.expr, opposite=args.opposite, as_zero=args.as_zero) print(filtered_vf.to_string()) From f2a4726e5caccda0b3b9e1a452c7b91f1428a8e1 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Fri, 11 Jun 2021 15:56:48 +0900 Subject: [PATCH 06/10] Update API --- CHANGELOG.rst | 2 +- fuc/api/pyvep.py | 134 +---------------------------------------------- 2 files changed, 3 insertions(+), 133 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index dbd5db2..c1ba0b9 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,7 +6,7 @@ Changelog * Add new method :meth:`pyvcf.VcfFrame.add_af`. * Add new method :meth:`pyvcf.VcfFrame.extract`. -* Deprecate methods :meth:`pyvep.filter_af/biotype`. +* Deprecate methods :meth:`pyvep.filter_af/biotype/nothas/impact`. * Add new method :meth:`pyvep.filter_query`. * :issue:`19`: Add new command :command:`vep_query`. diff --git a/fuc/api/pyvep.py b/fuc/api/pyvep.py index 4aeb209..0553992 100644 --- a/fuc/api/pyvep.py +++ b/fuc/api/pyvep.py @@ -257,33 +257,6 @@ def row_mostsevere(r): f = lambda x: SEVERITIY.index(x.split('|')[1].split('&')[0]) return sorted(results.split(','), key=f)[0] -def filter_nothas(vf, s, include=False): - """Filter out rows based on the VEP annotations. - - Parameters - ---------- - vf : VcfFrame - Input VcfFrame. - targets : list - List of annotations (e.g. ['missense_variant', 'stop_gained']). - include : bool, default: False - If True, include only such rows instead of excluding them. - - Returns - ------- - vf : VcfFrame - Filtered VcfFrame. - """ - def func(r): - ann = row_firstann(r) - return s not in ann - i = vf.df.apply(func, axis=1) - if include: - i = ~i - df = vf.df[i].reset_index(drop=True) - vf = vf.__class__(vf.copy_meta(), df) - return vf - def filter_clinsig(vf, whitelist=None, blacklist=None, opposite=False, index=False): """Select rows based on the given CLIN_SIG values. @@ -390,7 +363,8 @@ def filter_clinsig(vf, whitelist=None, blacklist=None, opposite=False, blacklist = [] def func(r): ann = row_firstann(r) - values = ann.split('|')[get_index(vf, 'CLIN_SIG')].split('&') + i = annot_names(vf).index('CLIN_SIG') + values = ann.split('|')[i].split('&') if not list(set(values) & set(whitelist)): return False if list(set(values) & set(blacklist)): @@ -405,105 +379,6 @@ def func(r): vf = vf.__class__(vf.copy_meta(), df) return vf -def filter_impact(vf, values, opposite=False, index=False): - """Select rows based on the given IMPACT values. - - List of IMPACT values: - - - LOW - - MODERATE - - HIGH - - MODIFIER - - Parameters - ---------- - vf : VcfFrame - VcfFrame. - values : list, default: None - If any one of the IMPACT values is present, select the row. - opposite : bool, default: False - If True, return rows that don't meet the said criteria. - index : bool, default: False - If True, return boolean index array instead of VcfFrame. - - Returns - ------- - VcfFrame or pandas.Series - Filtered VcfFrame or boolean index array. - - Examples - -------- - Assume we have the following data: - - >>> meta = [ - ... '##fileformat=VCFv4.1', - ... '##VEP="v104" time="2021-05-20 10:50:12" cache="/net/isilonP/public/ro/ensweb-data/latest/tools/grch37/e104/vep/cache/homo_sapiens/104_GRCh37" db="homo_sapiens_core_104_37@hh-mysql-ens-grch37-web" 1000genomes="phase3" COSMIC="92" ClinVar="202012" HGMD-PUBLIC="20204" assembly="GRCh37.p13" dbSNP="154" gencode="GENCODE 19" genebuild="2011-04" gnomAD="r2.1" polyphen="2.2.2" regbuild="1.0" sift="sift5.2.2"', - ... '##INFO=' - ... ] - >>> data = { - ... 'CHROM': ['chr1', 'chr1', 'chr2', 'chr2'], - ... 'POS': [100, 101, 200, 201], - ... 'ID': ['.', '.', '.', '.'], - ... 'REF': ['G', 'C', 'CAG', 'C'], - ... 'ALT': ['T', 'T', 'C', 'T'], - ... 'QUAL': ['.', '.', '.', '.'], - ... 'FILTER': ['.', '.', '.', '.'], - ... 'INFO': [ - ... 'CSQ=T|missense_variant|MODERATE|MTOR|ENSG00000198793|Transcript|ENST00000361445.4|protein_coding|47/58||||6721|6644|2215|S/Y|tCt/tAt|rs587777894&COSV63868278&COSV63868313||-1||HGNC|3942|||||deleterious(0)|possibly_damaging(0.876)||likely_pathogenic&pathogenic|0&1&1|1&1&1|26619011&27159400&24631838&26018084&27830187|||||', - ... 'CSQ=T|synonymous_variant|LOW|MTOR|ENSG00000198793|Transcript|ENST00000361445.4|protein_coding|49/58||||6986|6909|2303|L|ctG/ctA|rs11121691&COSV63870864||-1||HGNC|3942|||||||0.2206|benign|0&1|1&1|24996771|||||', - ... 'CSQ=-|frameshift_variant|HIGH|BRCA2|ENSG00000139618|Transcript|ENST00000380152.3|protein_coding|18/27||||8479-8480|8246-8247|2749|Q/X|cAG/c|rs80359701||1||HGNC|1101||||||||pathogenic||1|26467025&26295337&15340362|||||', - ... 'CSQ=T|missense_variant|MODERATE|MTOR|ENSG00000198793|Transcript|ENST00000361445.4|protein_coding|30/58||||4516|4439|1480|R/H|cGc/cAc|rs780930764&COSV63868373||-1||HGNC|3942|||||tolerated(0.13)|benign(0)||likely_benign|0&1|1&1||||||' - ... ], - ... 'FORMAT': ['GT', 'GT', 'GT', 'GT'], - ... 'Steven': ['0/1', '0/1', '0/1', '0/1'], - ... } - >>> vf = pyvcf.VcfFrame.from_dict(meta, data) - >>> pyvep.parseann(vf, ['IMPACT']).df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven - 0 chr1 100 . G T . . MODERATE GT 0/1 - 1 chr1 101 . C T . . LOW GT 0/1 - 2 chr2 200 . CAG C . . HIGH GT 0/1 - 3 chr2 201 . C T . . MODERATE GT 0/1 - - We can select rows with either LOW or HIGH: - - >>> temp_vf = pyvep.filter_impact(vf, ['LOW', 'HIGH']) - >>> pyvep.parseann(temp_vf, ['IMPACT']).df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven - 0 chr1 100 . G T . . MODERATE GT 0/1 - 1 chr2 201 . C T . . MODERATE GT 0/1 - - We can also remove those rows: - - >>> temp_vf = pyvep.filter_impact(vf, ['LOW', 'HIGH'], opposite=True) - >>> pyvep.parseann(temp_vf, ['IMPACT']).df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven - 0 chr1 100 . G T . . MODERATE GT 0/1 - 1 chr2 201 . C T . . MODERATE GT 0/1 - - Finally, we can return boolean index array from the filtering: - - >>> pyvep.filter_impact(vf, ['LOW', 'HIGH'], index=True) - 0 True - 1 False - 2 False - 3 True - dtype: bool - - """ - def func(r): - ann = row_firstann(r) - impact = ann.split('|')[get_index(vf, 'IMPACT')] - return impact in values - i = vf.df.apply(func, axis=1) - if opposite: - i = ~i - if index: - return i - df = vf.df[i].reset_index(drop=True) - vf = vf.__class__(vf.copy_meta(), df) - return vf - def parseann(vf, targets, sep=' | ', as_series=False): """Parse variant annotations in VcfFrame. @@ -605,11 +480,6 @@ def func(r): new_vf.df.INFO = s return new_vf -def get_index(vf, target): - """Return the index of the target field (e.g. CLIN_SIG).""" - headers = annot_names(vf) - return headers.index(target) - def to_frame(vf, as_zero=False): """ Create a DataFrame containing analysis-ready VEP annotation data. From 376cd8ef060e7a11f59a9f61ffc88e32795d8372 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Fri, 11 Jun 2021 17:20:05 +0900 Subject: [PATCH 07/10] Update API --- fuc/api/pyvep.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fuc/api/pyvep.py b/fuc/api/pyvep.py index 0553992..5f99eec 100644 --- a/fuc/api/pyvep.py +++ b/fuc/api/pyvep.py @@ -51,6 +51,8 @@ import numpy as np from . import pyvcf +# https://m.ensembl.org/info/docs/tools/vep/vep_formats.html + DATA_TYPES = { 'Allele': 'str', 'Consequence': 'str', From 61ef904c98639ceed7051e832ac55545cd78d5b1 Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Sat, 12 Jun 2021 10:46:20 +0900 Subject: [PATCH 08/10] Update API --- CHANGELOG.rst | 2 +- README.rst | 8 +++++++- docs/cli.rst | 28 ++++++++++++++-------------- docs/create.py | 6 ++++++ fuc/cli/{vep_query.py => vcf_vep.py} | 6 +++--- 5 files changed, 31 insertions(+), 19 deletions(-) rename fuc/cli/{vep_query.py => vcf_vep.py} (76%) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index c1ba0b9..56f3a02 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -8,7 +8,7 @@ Changelog * Add new method :meth:`pyvcf.VcfFrame.extract`. * Deprecate methods :meth:`pyvep.filter_af/biotype/nothas/impact`. * Add new method :meth:`pyvep.filter_query`. -* :issue:`19`: Add new command :command:`vep_query`. +* :issue:`19`: Add new command :command:`vcf_vep`. 0.11.0 (2021-06-10) ------------------- diff --git a/README.rst b/README.rst index 14e71da..e91ce79 100644 --- a/README.rst +++ b/README.rst @@ -116,6 +116,12 @@ To merge VCF files: $ fuc vcf_merge 1.vcf 2.vcf 3.vcf > merged.vcf +To filter a VCF file annotated by Ensemble VEP: + +.. code-block:: console + + $ fuc vcf_vep in.vcf 'SYMBOL == "TP53"' > out.vcf + API Examples ============ @@ -308,7 +314,7 @@ For getting help on CLI: vcf_merge [VCF] merge two or more VCF files vcf_slice [VCF] slice a VCF file vcf_vcf2bed [VCF] convert a VCF file to a BED file - vep_query [VEP] filter a VEP-annotated VCF file + vcf_vep [VCF] filter a VCF file annotated by Ensemble VEP optional arguments: -h, --help show this help message and exit diff --git a/docs/cli.rst b/docs/cli.rst index 7fa6920..66e121a 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -38,7 +38,7 @@ For getting help on CLI: vcf_merge [VCF] merge two or more VCF files vcf_slice [VCF] slice a VCF file vcf_vcf2bed [VCF] convert a VCF file to a BED file - vep_query [VEP] filter a VEP-annotated VCF file + vcf_vep [VCF] filter a VCF file annotated by Ensemble VEP optional arguments: -h, --help show this help message and exit @@ -482,27 +482,27 @@ vcf_vcf2bed optional arguments: -h, --help show this help message and exit -vep_query -========= +vcf_vep +======= .. code-block:: console - $ fuc vep_query -h - usage: fuc vep_query [-h] [--opposite] [--as_zero] vcf expr + $ fuc vcf_vep -h + usage: fuc vcf_vep [-h] [--opposite] [--as_zero] vcf expr - This command will filter a VEP-annotated VCF file. It essentially wraps the `pandas.DataFrame.query` method. For details on query expression, please visit the method's documentation page (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html#pandas-dataframe-query). + This command will filter a VCF file annotated by Ensemble VEP. It essentially wraps the `pandas.DataFrame.query` method. For details on query expression, please visit the method's documentation page (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html#pandas-dataframe-query). examples: - $ fuc vep_query in.vcf 'SYMBOL == "TP53"' > out.vcf - $ fuc vep_query in.vcf 'SYMBOL != "TP53"' > out.vcf - $ fuc vep_query in.vcf 'SYMBOL == "TP53"' --opposite > out.vcf - $ fuc vep_query in.vcf 'Consequence in ["splice_donor_variant", "stop_gained"]' > out.vcf - $ fuc vep_query in.vcf '(SYMBOL == "TP53") and (Consequence.str.contains("stop_gained"))' > out.vcf - $ fuc vep_query in.vcf 'gnomAD_AF < 0.001' > out.vcf - $ fuc vep_query in.vcf 'gnomAD_AF < 0.001' --as_zero > out.vcf + $ fuc vcf_vep in.vcf 'SYMBOL == "TP53"' > out.vcf + $ fuc vcf_vep in.vcf 'SYMBOL != "TP53"' > out.vcf + $ fuc vcf_vep in.vcf 'SYMBOL == "TP53"' --opposite > out.vcf + $ fuc vcf_vep in.vcf 'Consequence in ["splice_donor_variant", "stop_gained"]' > out.vcf + $ fuc vcf_vep in.vcf '(SYMBOL == "TP53") and (Consequence.str.contains("stop_gained"))' > out.vcf + $ fuc vcf_vep in.vcf 'gnomAD_AF < 0.001' > out.vcf + $ fuc vcf_vep in.vcf 'gnomAD_AF < 0.001' --as_zero > out.vcf positional arguments: - vcf VEP-annotated VCF file + vcf Ensemble VEP-annotated VCF file expr query expression to evaluate optional arguments: diff --git a/docs/create.py b/docs/create.py index 598b398..b737564 100644 --- a/docs/create.py +++ b/docs/create.py @@ -144,6 +144,12 @@ $ fuc vcf_merge 1.vcf 2.vcf 3.vcf > merged.vcf +To filter a VCF file annotated by Ensemble VEP: + +.. code-block:: console + + $ fuc vcf_vep in.vcf 'SYMBOL == "TP53"' > out.vcf + API Examples ============ diff --git a/fuc/cli/vep_query.py b/fuc/cli/vcf_vep.py similarity index 76% rename from fuc/cli/vep_query.py rename to fuc/cli/vcf_vep.py index 2d1ec70..ca94f3b 100644 --- a/fuc/cli/vep_query.py +++ b/fuc/cli/vcf_vep.py @@ -4,7 +4,7 @@ command = api.common.script_name(__file__) description = f""" -This command will filter a VEP-annotated VCF file. It essentially wraps the `pandas.DataFrame.query` method. For details on query expression, please visit the method's documentation page (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html#pandas-dataframe-query). +This command will filter a VCF file annotated by Ensemble VEP. It essentially wraps the `pandas.DataFrame.query` method. For details on query expression, please visit the method's documentation page (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html#pandas-dataframe-query). examples: $ fuc {command} in.vcf 'SYMBOL == "TP53"' > out.vcf @@ -19,11 +19,11 @@ def create_parser(subparsers): parser = subparsers.add_parser( command, - help='[VEP] filter a VEP-annotated VCF file', + help='[VCF] filter a VCF file annotated by Ensemble VEP', description=description, formatter_class=RawTextHelpFormatter, ) - parser.add_argument('vcf', help='VEP-annotated VCF file') + parser.add_argument('vcf', help='Ensemble VEP-annotated VCF file') parser.add_argument('expr', help='query expression to evaluate') parser.add_argument( '--opposite', From f771f571379a88060c1144b0d55fdb4ff955899b Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Sat, 12 Jun 2021 19:04:42 +0900 Subject: [PATCH 09/10] Update API --- CHANGELOG.rst | 7 +- README.rst | 51 ++++------ conda.yml | 1 + docs/api.rst | 2 +- docs/create.py | 49 ++++------ docs/requirements.txt | 1 + fuc/api/common.py | 2 + fuc/api/pyvcf.py | 215 ++++++++++++++++++++++++++++++------------ setup.py | 2 +- 9 files changed, 204 insertions(+), 126 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 56f3a02..a34e9ca 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,14 +1,17 @@ Changelog ********* -0.12.0 (in development) ------------------------ +0.12.0 (2021-06-12) +------------------- * Add new method :meth:`pyvcf.VcfFrame.add_af`. * Add new method :meth:`pyvcf.VcfFrame.extract`. * Deprecate methods :meth:`pyvep.filter_af/biotype/nothas/impact`. * Add new method :meth:`pyvep.filter_query`. * :issue:`19`: Add new command :command:`vcf_vep`. +* Rename :meth:`pyvcf.VcfFrame.plot_histplot` to :meth:`pyvcf.VcfFrame.plot_tmb`. +* Add ``scipy`` package as dependency for performing statistical analysis. +* Add new method :meth:`pyvcf.VcfFrame.plot_hist`. 0.11.0 (2021-06-10) ------------------- diff --git a/README.rst b/README.rst index e91ce79..7f6771b 100644 --- a/README.rst +++ b/README.rst @@ -161,40 +161,26 @@ To create a Venn diagram showing genotype concordance between groups: .. image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/plot_comparison.png -To create a histogram of tumor mutational burden (TMB) distribution: +To create various figures for normal-tumor analysis: .. code:: python3 - >>> from fuc import pyvcf - >>> vcf_data = { - ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'], - ... 'POS': [100, 101, 102, 103, 103], - ... 'ID': ['.', '.', '.', '.', '.'], - ... 'REF': ['T', 'T', 'T', 'T', 'T'], - ... 'ALT': ['C', 'C', 'C', 'C', 'C'], - ... 'QUAL': ['.', '.', '.', '.', '.'], - ... 'FILTER': ['.', '.', '.', '.', '.'], - ... 'INFO': ['.', '.', '.', '.', '.'], - ... 'FORMAT': ['GT', 'GT', 'GT', 'GT', 'GT'], - ... 'Steven_N': ['0/0', '0/0', '0/1', '0/0', '0/0'], - ... 'Steven_T': ['0/0', '0/1', '0/1', '0/1', '0/1'], - ... 'Sara_N': ['0/0', '0/1', '0/0', '0/0', '0/0'], - ... 'Sara_T': ['0/0', '0/0', '1/1', '1/1', '0/1'], - ... 'John_N': ['0/0', '0/0', '0/0', '0/0', '0/0'], - ... 'John_T': ['0/1', '0/0', '1/1', '1/1', '0/1'], - ... 'Rachel_N': ['0/0', '0/0', '0/0', '0/0', '0/0'], - ... 'Rachel_T': ['0/1', '0/1', '0/0', '0/1', '0/1'], - ... } - >>> annot_data = { - ... 'Sample': ['Steven_N', 'Steven_T', 'Sara_N', 'Sara_T', 'John_N', 'John_T', 'Rachel_N', 'Rachel_T'], - ... 'Subject': ['Steven', 'Steven', 'Sara', 'Sara', 'John', 'John', 'Rachel', 'Rachel'], - ... 'Type': ['Normal', 'Tumor', 'Normal', 'Tumor', 'Normal', 'Tumor', 'Normal', 'Tumor'], - ... } - >>> vf = pyvcf.VcfFrame.from_dict([], vcf_data) - >>> af = pyvcf.AnnFrame.from_dict(annot_data, 'Sample') - >>> vf.plot_histplot(hue='Type', af=af) - -.. image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/plot_histplot.png + >>> import matplotlib.pyplot as plt + >>> from fuc import common, pyvcf + >>> common.load_dataset('pyvcf') + >>> vf = pyvcf.VcfFrame.from_file('~/fuc-data/pyvcf/normal-tumor.vcf') + >>> af = pyvcf.AnnFrame.from_file('~/fuc-data/pyvcf/normal-tumor-annot.tsv', 'Sample') + >>> normal = af.df[af.df.Tissue == 'Normal'].index + >>> tumor = af.df[af.df.Tissue == 'Tumor'].index + >>> fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(2, 2, figsize=(10, 10)) + >>> vf.plot_tmb(ax=ax1) + >>> vf.plot_tmb(ax=ax2, af=af, hue='Tissue') + >>> vf.plot_hist('DP', ax=ax3, af=af, hue='Tissue') + >>> vf.plot_regplot(normal, tumor, ax=ax4) + >>> plt.tight_layout() + >>> plt.savefig('normal-tumor.png') + +.. image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/normal-tumor.png MAF --- @@ -256,6 +242,7 @@ The following packages are required to run fuc: pandas pyranges pysam + scipy seaborn There are various ways you can install fuc. The recommended way is via conda: @@ -335,7 +322,7 @@ Below is the list of submodules available in API: - **pyfq** : The pyfq submodule is designed for working with FASTQ files. It implements ``pyfq.FqFrame`` which stores FASTQ data as ``pandas.DataFrame`` to allow fast computation and easy manipulation. - **pymaf** : The pymaf submodule is designed for working with MAF files. It implements ``pymaf.MafFrame`` which stores MAF data as ``pandas.DataFrame`` to allow fast computation and easy manipulation. The ``pymaf.MafFrame`` class also contains many useful plotting methods such as ``MafFrame.plot_oncoplot`` and ``MafFrame.plot_summary``. The submodule strictly adheres to the standard `MAF specification `_. - **pysnpeff** : The pysnpeff submodule is designed for parsing VCF annotation data from the `SnpEff `_ program. It should be used with ``pyvcf.VcfFrame``. -- **pyvcf** : The pyvcf submodule is designed for working with VCF files. It implements ``pyvcf.VcfFrame`` which stores VCF data as ``pandas.DataFrame`` to allow fast computation and easy manipulation. The ``pyvcf.VcfFrame`` class also contains many useful plotting methods such as ``VcfFrame.plot_comparison`` and ``VcfFrame.plot_regplot``. The submodule strictly adheres to the standard `VCF specification `_. +- **pyvcf** : The pyvcf submodule is designed for working with VCF files. It implements ``pyvcf.VcfFrame`` which stores VCF data as ``pandas.DataFrame`` to allow fast computation and easy manipulation. The ``pyvcf.VcfFrame`` class also contains many useful plotting methods such as ``VcfFrame.plot_comparison`` and ``VcfFrame.plot_tmb``. The submodule strictly adheres to the standard `VCF specification `_. - **pyvep** : The pyvep submodule is designed for parsing VCF annotation data from the `Ensembl VEP `_ program. It should be used with ``pyvcf.VcfFrame``. For getting help on a specific module (e.g. pyvcf): diff --git a/conda.yml b/conda.yml index 3db176e..2c58966 100644 --- a/conda.yml +++ b/conda.yml @@ -14,6 +14,7 @@ dependencies: - pandas - pyranges - pysam + - scipy - seaborn - sphinx-issues - sphinx_rtd_theme diff --git a/docs/api.rst b/docs/api.rst index bc52021..f51e2c1 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -18,7 +18,7 @@ Below is the list of submodules available in API: - **pyfq** : The pyfq submodule is designed for working with FASTQ files. It implements ``pyfq.FqFrame`` which stores FASTQ data as ``pandas.DataFrame`` to allow fast computation and easy manipulation. - **pymaf** : The pymaf submodule is designed for working with MAF files. It implements ``pymaf.MafFrame`` which stores MAF data as ``pandas.DataFrame`` to allow fast computation and easy manipulation. The ``pymaf.MafFrame`` class also contains many useful plotting methods such as ``MafFrame.plot_oncoplot`` and ``MafFrame.plot_summary``. The submodule strictly adheres to the standard `MAF specification `_. - **pysnpeff** : The pysnpeff submodule is designed for parsing VCF annotation data from the `SnpEff `_ program. It should be used with ``pyvcf.VcfFrame``. -- **pyvcf** : The pyvcf submodule is designed for working with VCF files. It implements ``pyvcf.VcfFrame`` which stores VCF data as ``pandas.DataFrame`` to allow fast computation and easy manipulation. The ``pyvcf.VcfFrame`` class also contains many useful plotting methods such as ``VcfFrame.plot_comparison`` and ``VcfFrame.plot_regplot``. The submodule strictly adheres to the standard `VCF specification `_. +- **pyvcf** : The pyvcf submodule is designed for working with VCF files. It implements ``pyvcf.VcfFrame`` which stores VCF data as ``pandas.DataFrame`` to allow fast computation and easy manipulation. The ``pyvcf.VcfFrame`` class also contains many useful plotting methods such as ``VcfFrame.plot_comparison`` and ``VcfFrame.plot_tmb``. The submodule strictly adheres to the standard `VCF specification `_. - **pyvep** : The pyvep submodule is designed for parsing VCF annotation data from the `Ensembl VEP `_ program. It should be used with ``pyvcf.VcfFrame``. For getting help on a specific module (e.g. pyvcf): diff --git a/docs/create.py b/docs/create.py index b737564..374bff6 100644 --- a/docs/create.py +++ b/docs/create.py @@ -189,40 +189,26 @@ .. image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/plot_comparison.png -To create a histogram of tumor mutational burden (TMB) distribution: +To create various figures for normal-tumor analysis: .. code:: python3 - >>> from fuc import pyvcf - >>> vcf_data = {{ - ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'], - ... 'POS': [100, 101, 102, 103, 103], - ... 'ID': ['.', '.', '.', '.', '.'], - ... 'REF': ['T', 'T', 'T', 'T', 'T'], - ... 'ALT': ['C', 'C', 'C', 'C', 'C'], - ... 'QUAL': ['.', '.', '.', '.', '.'], - ... 'FILTER': ['.', '.', '.', '.', '.'], - ... 'INFO': ['.', '.', '.', '.', '.'], - ... 'FORMAT': ['GT', 'GT', 'GT', 'GT', 'GT'], - ... 'Steven_N': ['0/0', '0/0', '0/1', '0/0', '0/0'], - ... 'Steven_T': ['0/0', '0/1', '0/1', '0/1', '0/1'], - ... 'Sara_N': ['0/0', '0/1', '0/0', '0/0', '0/0'], - ... 'Sara_T': ['0/0', '0/0', '1/1', '1/1', '0/1'], - ... 'John_N': ['0/0', '0/0', '0/0', '0/0', '0/0'], - ... 'John_T': ['0/1', '0/0', '1/1', '1/1', '0/1'], - ... 'Rachel_N': ['0/0', '0/0', '0/0', '0/0', '0/0'], - ... 'Rachel_T': ['0/1', '0/1', '0/0', '0/1', '0/1'], - ... }} - >>> annot_data = {{ - ... 'Sample': ['Steven_N', 'Steven_T', 'Sara_N', 'Sara_T', 'John_N', 'John_T', 'Rachel_N', 'Rachel_T'], - ... 'Subject': ['Steven', 'Steven', 'Sara', 'Sara', 'John', 'John', 'Rachel', 'Rachel'], - ... 'Type': ['Normal', 'Tumor', 'Normal', 'Tumor', 'Normal', 'Tumor', 'Normal', 'Tumor'], - ... }} - >>> vf = pyvcf.VcfFrame.from_dict([], vcf_data) - >>> af = pyvcf.AnnFrame.from_dict(annot_data, 'Sample') - >>> vf.plot_histplot(hue='Type', af=af) - -.. image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/plot_histplot.png + >>> import matplotlib.pyplot as plt + >>> from fuc import common, pyvcf + >>> common.load_dataset('pyvcf') + >>> vf = pyvcf.VcfFrame.from_file('~/fuc-data/pyvcf/normal-tumor.vcf') + >>> af = pyvcf.AnnFrame.from_file('~/fuc-data/pyvcf/normal-tumor-annot.tsv', 'Sample') + >>> normal = af.df[af.df.Tissue == 'Normal'].index + >>> tumor = af.df[af.df.Tissue == 'Tumor'].index + >>> fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(2, 2, figsize=(10, 10)) + >>> vf.plot_tmb(ax=ax1) + >>> vf.plot_tmb(ax=ax2, af=af, hue='Tissue') + >>> vf.plot_hist('DP', ax=ax3, af=af, hue='Tissue') + >>> vf.plot_regplot(normal, tumor, ax=ax4) + >>> plt.tight_layout() + >>> plt.savefig('normal-tumor.png') + +.. image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/normal-tumor.png MAF --- @@ -284,6 +270,7 @@ pandas pyranges pysam + scipy seaborn There are various ways you can install fuc. The recommended way is via conda: diff --git a/docs/requirements.txt b/docs/requirements.txt index cb841c9..497ebda 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -6,4 +6,5 @@ matplotlib-venn numpy pandas pysam +scipy seaborn diff --git a/fuc/api/common.py b/fuc/api/common.py index 8c524ca..a192358 100644 --- a/fuc/api/common.py +++ b/fuc/api/common.py @@ -73,6 +73,8 @@ def load_dataset(name, force=False): ], 'pyvcf': [ 'plot_comparison.vcf', + 'normal-tumor.vcf', + 'normal-tumor-annot.tsv', ], } base_url = ('https://raw.githubusercontent.com/sbslee/fuc-data/main') diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index b7883af..7c61343 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -3,7 +3,7 @@ ``pyvcf.VcfFrame`` which stores VCF data as ``pandas.DataFrame`` to allow fast computation and easy manipulation. The ``pyvcf.VcfFrame`` class also contains many useful plotting methods such as ``VcfFrame.plot_comparison`` -and ``VcfFrame.plot_regplot``. The submodule strictly adheres to the +and ``VcfFrame.plot_tmb``. The submodule strictly adheres to the standard `VCF specification `_. @@ -58,6 +58,7 @@ from matplotlib_venn import venn2, venn3 import os import seaborn as sns +import scipy.stats as stats CONTIGS = [ '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', @@ -666,6 +667,60 @@ def from_file(cls, fn, sample_col, sep='\t'): df = df.set_index(sample_col) return cls(df) +def simulate_genotype( + p=0.5, noise_scale=0.05, dp_show=True, dp_loc=30, dp_scale=10, + ad_show=True, ad_loc=0.5, ad_scale=0.05, af_show=True +): + lower, upper = 0, 1 + mu, sigma = p, noise_scale + X = stats.truncnorm((lower - mu) / sigma, (upper - mu) / sigma, loc=mu, scale=sigma) + has_var = np.random.binomial(1, X.rvs(1)) == 1 + + if has_var: + result = '0/1' + else: + result = '0/0' + + dp = np.random.normal(loc=dp_loc, scale=dp_scale) + dp = round(abs(dp)) + + if has_var: + alt = round(abs(np.random.normal(loc=ad_loc, scale=ad_scale)) * dp) + ref = dp - alt + ad = f'{ref},{alt}' + else: + ad = f'{dp},0' + + if has_var: + af = round(alt / (ref + alt), 3) + else: + af = 0 + + if dp_show: + result += f':{dp}' + + if ad_show: + result += f':{ad}' + + if af_show: + result += f':{af}' + + return result + +def simulate_sample( + n, p=0.5, noise_scale=0.1, dp_show=True, dp_loc=30, dp_scale=10, + ad_show=True, ad_loc=0.5, ad_scale=0.05, af_show=True +): + l = [] + for i in range(n): + genotype = simulate_genotype( + p=p, noise_scale=noise_scale, dp_show=dp_show, + dp_loc=dp_loc, dp_scale=dp_scale, ad_show=ad_show, + ad_loc=ad_loc, ad_scale=ad_scale + ) + l.append(genotype) + return l + class VcfFrame: """Class for storing VCF data. @@ -1782,18 +1837,96 @@ def _plot_comparison_three(self, a, b, c, venn_kws): out = venn3(subsets=n[:-1], **venn_kws) return out - def plot_histplot( + def plot_hist( + self, k, af=None, hue=None, kde=True, ax=None, figsize=None, **kwargs + ): + """ + Create a histogram showing AD/AF/DP distribution. + + Parameters + ---------- + k : {'AD', 'AF', 'DP'} + Genotype key. + af : pyvcf.AnnFrame + AnnFrame containing sample annotation data (requires ``hue``). + hue : list, optional + Grouping variable that will produce multiple histograms with + different colors (requires ``af``). + kde : bool, default: True + Compute a kernel density estimate to smooth the distribution. + ax : matplotlib.axes.Axes, optional + Pre-existing axes for the plot. Otherwise, crete a new one. + figsize : tuple, optional + Width, height in inches. Format: (float, float). + kwargs + Other keyword arguments will be passed down to + :meth:`seaborn.histplot`. + + Returns + ------- + matplotlib.axes.Axes + The matplotlib axes containing the plot. + + Examples + -------- + Below is a simple example: + + .. plot:: + :context: close-figs + + >>> from fuc import common, pyvcf + >>> common.load_dataset('pyvcf') + >>> vf = pyvcf.VcfFrame.from_file('~/fuc-data/pyvcf/normal-tumor.vcf') + >>> af = pyvcf.AnnFrame.from_file('~/fuc-data/pyvcf/normal-tumor-annot.tsv', 'Sample') + >>> vf.plot_hist('DP') + + We can draw multiple histograms with hue mapping: + + .. plot:: + :context: close-figs + + >>> vf.plot_hist('DP', af=af, hue='Tissue') + + We can show AF instead of DP: + + .. plot:: + :context: close-figs + + >>> vf.plot_hist('AF') + """ + d = { + 'AD': lambda x: float(x.split(',')[1]), + 'AF': lambda x: float(x), + 'DP': lambda x: int(x), + } + df = self.extract(k, as_nan=True, func=d[k]) + df = df.T + id_vars = ['index'] + if hue is not None: + df = pd.concat([df, af.df[hue]], axis=1, join='inner') + id_vars.append(hue) + df = df.reset_index() + df = pd.melt(df, id_vars=id_vars) + df = df.dropna() + df = df.rename(columns={'value': k}) + if ax is None: + fig, ax = plt.subplots(figsize=figsize) + sns.histplot(data=df, x=k, hue=hue, kde=kde, ax=ax, **kwargs) + return ax + + def plot_tmb( self, af=None, hue=None, kde=True, ax=None, figsize=None, **kwargs ): - """Create a histogram of TMB distribution. + """ + Create a histogram showing TMB distribution. Parameters ---------- af : pyvcf.AnnFrame - AnnFrame. + AnnFrame containing sample annotation data (requires ``hue``). hue : list, optional - Grouping variable that will produce bars with different colors. - Requires the `af` option. + Grouping variable that will produce multiple histograms with + different colors (requires ``af``). kde : bool, default: True Compute a kernel density estimate to smooth the distribution. ax : matplotlib.axes.Axes, optional @@ -1811,43 +1944,23 @@ def plot_histplot( Examples -------- + Below is a simple example: .. plot:: :context: close-figs - >>> from fuc import pyvcf - >>> vcf_data = { - ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'], - ... 'POS': [100, 101, 102, 103, 103], - ... 'ID': ['.', '.', '.', '.', '.'], - ... 'REF': ['T', 'T', 'T', 'T', 'T'], - ... 'ALT': ['C', 'C', 'C', 'C', 'C'], - ... 'QUAL': ['.', '.', '.', '.', '.'], - ... 'FILTER': ['.', '.', '.', '.', '.'], - ... 'INFO': ['.', '.', '.', '.', '.'], - ... 'FORMAT': ['GT', 'GT', 'GT', 'GT', 'GT'], - ... 'Steven_N': ['0/0', '0/0', '0/1', '0/0', '0/0'], - ... 'Steven_T': ['0/0', '0/1', '0/1', '0/1', '0/1'], - ... 'Sara_N': ['0/0', '0/1', '0/0', '0/0', '0/0'], - ... 'Sara_T': ['0/0', '0/0', '1/1', '1/1', '0/1'], - ... 'John_N': ['0/0', '0/0', '0/0', '0/0', '0/0'], - ... 'John_T': ['0/1', '0/0', '1/1', '1/1', '0/1'], - ... 'Rachel_N': ['0/0', '0/0', '0/0', '0/0', '0/0'], - ... 'Rachel_T': ['0/1', '0/1', '0/0', '0/1', '0/1'], - ... } - >>> annot_data = { - ... 'Sample': ['Steven_N', 'Steven_T', 'Sara_N', 'Sara_T', 'John_N', 'John_T', 'Rachel_N', 'Rachel_T'], - ... 'Subject': ['Steven', 'Steven', 'Sara', 'Sara', 'John', 'John', 'Rachel', 'Rachel'], - ... 'Type': ['Normal', 'Tumor', 'Normal', 'Tumor', 'Normal', 'Tumor', 'Normal', 'Tumor'], - ... } - >>> vf = pyvcf.VcfFrame.from_dict([], vcf_data) - >>> af = pyvcf.AnnFrame.from_dict(annot_data, 'Sample') - >>> vf.plot_histplot() + >>> from fuc import common, pyvcf + >>> common.load_dataset('pyvcf') + >>> vf = pyvcf.VcfFrame.from_file('~/fuc-data/pyvcf/normal-tumor.vcf') + >>> af = pyvcf.AnnFrame.from_file('~/fuc-data/pyvcf/normal-tumor-annot.tsv', 'Sample') + >>> vf.plot_tmb() + + We can draw multiple histograms with hue mapping: .. plot:: :context: close-figs - >>> vf.plot_histplot(hue='Type', af=af) + >>> vf.plot_tmb(af=af, hue='Tissue') """ s = self.df.iloc[:, 9:].applymap(gt_hasvar).sum() s.name = 'TMB' @@ -1857,7 +1970,7 @@ def plot_histplot( df = pd.concat([af.df, s], axis=1, join='inner') if ax is None: fig, ax = plt.subplots(figsize=figsize) - sns.histplot(data=df, x='TMB', hue=hue, kde=kde, **kwargs) + sns.histplot(data=df, x='TMB', ax=ax, hue=hue, kde=kde, **kwargs) return ax def plot_regplot(self, a, b, ax=None, figsize=None, **kwargs): @@ -1885,29 +1998,13 @@ def plot_regplot(self, a, b, ax=None, figsize=None, **kwargs): .. plot:: - >>> from fuc import pyvcf - >>> data = { - ... 'CHROM': ['chr1', 'chr1', 'chr1'], - ... 'POS': [100, 101, 102], - ... 'ID': ['.', '.', '.'], - ... 'REF': ['G', 'T', 'T'], - ... 'ALT': ['A', 'C', 'C'], - ... 'QUAL': ['.', '.', '.'], - ... 'FILTER': ['.', '.', '.'], - ... 'INFO': ['.', '.', '.'], - ... 'FORMAT': ['GT', 'GT', 'GT'], - ... 'Steven_A': ['0/1', '0/1', '0/1'], - ... 'Steven_B': ['0/1', '0/1', '0/1'], - ... 'Sara_A': ['0/0', '1/1', '1/1'], - ... 'Sara_B': ['0/0', '1/1', '1/1'], - ... 'John_A': ['0/0', '0/0', '1/1'], - ... 'John_B': ['0/0', '0/0', '1/1'], - ... } - >>> vf = pyvcf.VcfFrame.from_dict([], data) - >>> vf.df - >>> a = ['Steven_A', 'Sara_A', 'John_A'] - >>> b = ['Steven_B', 'Sara_B', 'John_B'] - >>> vf.plot_regplot(a, b) + >>> from fuc import common, pyvcf + >>> common.load_dataset('pyvcf') + >>> vf = pyvcf.VcfFrame.from_file('~/fuc-data/pyvcf/normal-tumor.vcf') + >>> af = pyvcf.AnnFrame.from_file('~/fuc-data/pyvcf/normal-tumor-annot.tsv', 'Sample') + >>> normal = af.df[af.df.Tissue == 'Normal'].index + >>> tumor = af.df[af.df.Tissue == 'Tumor'].index + >>> vf.plot_regplot(normal, tumor) """ s = self.df.iloc[:, 9:].applymap(gt_hasvar).sum() if ax is None: diff --git a/setup.py b/setup.py index dbc32f3..32d7c33 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ requirements = [ 'biopython', 'lxml', 'matplotlib', 'matplotlib-venn', 'numpy', 'pandas', - 'pyranges', 'pysam', 'seaborn' + 'pyranges', 'pysam', 'scipy', 'seaborn' ] setup( From b280e5902dbac67185eeae6f4dddaf68eb6b9f5c Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Sat, 12 Jun 2021 19:11:04 +0900 Subject: [PATCH 10/10] Update API --- README.rst | 1 - docs/create.py | 1 - 2 files changed, 2 deletions(-) diff --git a/README.rst b/README.rst index 7f6771b..dcc0fa5 100644 --- a/README.rst +++ b/README.rst @@ -178,7 +178,6 @@ To create various figures for normal-tumor analysis: >>> vf.plot_hist('DP', ax=ax3, af=af, hue='Tissue') >>> vf.plot_regplot(normal, tumor, ax=ax4) >>> plt.tight_layout() - >>> plt.savefig('normal-tumor.png') .. image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/normal-tumor.png diff --git a/docs/create.py b/docs/create.py index 374bff6..ba1d8d2 100644 --- a/docs/create.py +++ b/docs/create.py @@ -206,7 +206,6 @@ >>> vf.plot_hist('DP', ax=ax3, af=af, hue='Tissue') >>> vf.plot_regplot(normal, tumor, ax=ax4) >>> plt.tight_layout() - >>> plt.savefig('normal-tumor.png') .. image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/normal-tumor.png