From 29e668a58711c489cbf8cd5c6a2ddad1887c4bfd Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Sat, 5 Feb 2022 18:40:57 +0900 Subject: [PATCH 01/22] Bump up version number --- CHANGELOG.rst | 3 +++ fuc/version.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index a1187a3..dae6157 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,9 @@ Changelog ********* +0.31.0 (in development) +----------------------- + 0.30.0 (2022-02-05) ------------------- diff --git a/fuc/version.py b/fuc/version.py index e187e0a..c3d10d7 100644 --- a/fuc/version.py +++ b/fuc/version.py @@ -1 +1 @@ -__version__ = '0.30.0' +__version__ = '0.31.0' From fc0ec14294b7c388da47b4e4a79b068c4e05ce89 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Wed, 9 Feb 2022 15:16:05 +0900 Subject: [PATCH 02/22] Fix bug in `pykallisto.KallistoFrame.compute_fold_change` --- CHANGELOG.rst | 2 ++ fuc/api/pykallisto.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index dae6157..915a55e 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,6 +4,8 @@ Changelog 0.31.0 (in development) ----------------------- +* Fix bug in :meth:`pykallisto.KallistoFrame.compute_fold_change` method. 
+ 0.30.0 (2022-02-05) ------------------- diff --git a/fuc/api/pykallisto.py b/fuc/api/pykallisto.py index fac4f17..1e8ed63 100644 --- a/fuc/api/pykallisto.py +++ b/fuc/api/pykallisto.py @@ -163,7 +163,7 @@ def compute_fold_change(self, group, genes, unit='tpm', flip=False): df = getattr(self, f'df_gene_{unit}') df = df[df.index.isin(genes)].T df = df.merge(self.metadata[group], left_index=True, right_index=True) - df = df.groupby('lauren').mean().T + df = df.groupby(group).mean().T a, b = df.columns if flip: a, b = b, a From 9a04852bf920fcff4de2d5378e3503bcce363621 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Fri, 11 Feb 2022 17:19:32 +0900 Subject: [PATCH 03/22] Add `pyvcf.call` and `vcf-call` --- CHANGELOG.rst | 1 + README.rst | 7 +++- docs/cli.rst | 43 ++++++++++++++++++++- fuc/api/pyvcf.py | 89 ++++++++++++++++++++++++++++++++++++++++++- fuc/cli/fuc_undetm.py | 4 +- fuc/cli/ngs_fq2bam.py | 3 +- fuc/cli/vcf_call.py | 66 ++++++++++++++++++++++++++++++++ 7 files changed, 205 insertions(+), 8 deletions(-) create mode 100644 fuc/cli/vcf_call.py diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 915a55e..463d6ec 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -5,6 +5,7 @@ Changelog ----------------------- * Fix bug in :meth:`pykallisto.KallistoFrame.compute_fold_change` method. +* Add new method :meth:`pyvcf.call` and new command :command:`vcf-call`. 0.30.0 (2022-02-05) ------------------- diff --git a/README.rst b/README.rst index 4c7621e..b2246a7 100644 --- a/README.rst +++ b/README.rst @@ -131,13 +131,15 @@ For getting help on the fuc CLI: fuc-exist Check whether certain files exist. fuc-find Retrieve absolute paths of files whose name matches a specified pattern, optionally recursively. - fuc-undetm Compute top unknown barcodes using undertermined FASTQ from bcl2fastq. + fuc-undetm Compute top unknown barcodes using undertermined FASTQ from + bcl2fastq. maf-maf2vcf Convert a MAF file to a VCF file. 
maf-oncoplt Create an oncoplot with a MAF file. maf-sumplt Create a summary plot with a MAF file. maf-vcf2maf Convert a VCF file to a MAF file. ngs-bam2fq Pipeline for converting BAM files to FASTQ files. - ngs-fq2bam Pipeline for converting FASTQ files to analysis-ready BAM files. + ngs-fq2bam Pipeline for converting FASTQ files to analysis-ready BAM + files. ngs-hc Pipeline for germline short variant discovery. ngs-m2 Pipeline for somatic short variant discovery. ngs-pon Pipeline for constructing a panel of normals (PoN). @@ -148,6 +150,7 @@ For getting help on the fuc CLI: tabix-slice Slice a GFF/BED/SAM/VCF file with Tabix. tbl-merge Merge two table files. tbl-sum Summarize a table file. + vcf-call Perform variant calling and filtering for BAM files. vcf-filter Filter a VCF file. vcf-index Index a VCF file. vcf-merge Merge two or more VCF files. diff --git a/docs/cli.rst b/docs/cli.rst index ae904c5..65f2bb8 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -37,13 +37,15 @@ For getting help on the fuc CLI: fuc-exist Check whether certain files exist. fuc-find Retrieve absolute paths of files whose name matches a specified pattern, optionally recursively. - fuc-undetm Compute top unknown barcodes using undertermined FASTQ from bcl2fastq. + fuc-undetm Compute top unknown barcodes using undertermined FASTQ from + bcl2fastq. maf-maf2vcf Convert a MAF file to a VCF file. maf-oncoplt Create an oncoplot with a MAF file. maf-sumplt Create a summary plot with a MAF file. maf-vcf2maf Convert a VCF file to a MAF file. ngs-bam2fq Pipeline for converting BAM files to FASTQ files. - ngs-fq2bam Pipeline for converting FASTQ files to analysis-ready BAM files. + ngs-fq2bam Pipeline for converting FASTQ files to analysis-ready BAM + files. ngs-hc Pipeline for germline short variant discovery. ngs-m2 Pipeline for somatic short variant discovery. ngs-pon Pipeline for constructing a panel of normals (PoN). 
@@ -54,6 +56,7 @@ For getting help on the fuc CLI: tabix-slice Slice a GFF/BED/SAM/VCF file with Tabix. tbl-merge Merge two table files. tbl-sum Summarize a table file. + vcf-call Perform variant calling and filtering for BAM files. vcf-filter Filter a VCF file. vcf-index Index a VCF file. vcf-merge Merge two or more VCF files. @@ -1210,6 +1213,42 @@ tbl-sum [Example] Summarize a table: $ fuc tbl-sum table.tsv +vcf-call +======== + +.. code-block:: text + + $ fuc vcf-call -h + usage: fuc vcf-call [-h] [--regions TEXT [TEXT ...]] [--min-mq INT] + [--max-depth INT] + fasta bam [bam ...] + + Perform variant calling and filtering for BAM files. + + Positional arguments: + fasta FASTA file. + bam One or more BAM files. + + Optional arguments: + -h, --help Show this help message and exit. + --regions TEXT [TEXT ...] + Only call variants in given regions. Each region must + have the format chrom:start-end and be a half-open + interval with (start, end]. This means, for example, + chr1:100-103 will extract positions 101, 102, and + 103. Alternatively, you can provide a BED file + (compressed or uncompressed) to specify regions. Note + that the 'chr' prefix in contig names (e.g. 'chr1' + vs. '1') will be automatically added or removed as + necessary to match the input VCF's contig names. + --min-mq INT Minimum mapping quality for an alignment to be used + (default: 1). + --max-depth INT At a position, read maximally this number of reads + per input file (default: 250). + + [Example] Specify regions manually: + $ fuc vcf-call ref.fa in1.bam in2.bam -r chr1:100-200 chr2:300-400 > out.vcf + vcf-filter ========== diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index 418e658..ad1bcd6 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -58,9 +58,11 @@ import os import re +import sys import gzip from copy import deepcopy import warnings +import tempfile from . 
import pybed, common, pymaf @@ -72,7 +74,7 @@ import matplotlib.pyplot as plt from matplotlib_venn import venn2, venn3 import seaborn as sns -from pysam import VariantFile +from pysam import VariantFile, bcftools from io import StringIO HEADERS = { @@ -135,6 +137,91 @@ '#AD_FRAC_ALT': ['AD', lambda x: np.nan if sum([int(y) for y in x.split(',')]) == 0 else sum([int(y) for y in x.split(',')[1:]]) / sum([int(y) for y in x.split(',')]), True], } +def call( + fasta, bams, path=None, regions=None, min_mq=1, max_depth=250 +): + """ + Perform variant calling and filtering for BAM files. + + This method will run a fully customizable, bcftools-based pipeline for + calling and filtering variants. + + Parameters + ---------- + fasta : str + FASTA file. + bams : str or list + One or more BAM files. + path : str, optional + Output VCF file. Writes to stdout when ``path='-'``. If None is + provided the result is returned as a string. + regions : str, list, or pybed.BedFrame + Only call variants in given regions. Each region must have the format + chrom:start-end and be a half-open interval with (start, end]. This + means, for example, chr1:100-103 will extract positions 101, 102, and + 103. Alternatively, you can provide a BED file (compressed or + uncompressed) to specify regions. Note that the 'chr' prefix in + contig names (e.g. 'chr1' vs. '1') will be automatically added or + removed as necessary to match the input VCF's contig names. + min_mq : int, default: 1 + Minimum mapping quality for an alignment to be used. + max_depth : int, default: 250 + At a position, read maximally this number of reads per input file. + + Returns + ------- + str + VcfFrame object. + """ + if isinstance(bams, str): + bams = [bams] + + # Parse target regions, if provided. 
+ if regions is not None: + if isinstance(regions, str): + regions = [regions] + if '.bed' in regions[0]: + parsed_regions = ['-R', regions[0]] + else: + parsed_regions = ['-r'] + regions + + with tempfile.TemporaryDirectory() as t: + # Step 1: Get genotype likelihoods. + args = ['-Ou', '-a', 'AD'] + args += ['-q', str(min_mq)] + args += ['--max-depth', str(max_depth)] + args += ['-f', fasta] + if parsed_regions is not None: + args += parsed_regions + args += bams + results = bcftools.mpileup(*args) + with open(f'{t}/likelihoods.bcf', 'wb') as f: + f.write(results) + + # Step 2: Call variants. + args = [f'{t}/likelihoods.bcf', '-Oz', '-mv'] + results = bcftools.call(*args) + with open('calls.bcf', 'wb') as f: + f.write(results) + + # Step 3: Normalize indels. + args = ['calls.bcf', '-Ob', '-f', fasta] + results = bcftools.norm(*args) + with open('calls.normalized.bcf', 'wb') as f: + f.write(results) + + # Step 4: Filter variant. + args = ['calls.normalized.bcf', '-Ov', '--IndelGap', '5'] + results = bcftools.filter(*args) + + if path is None: + return results + elif path == '-': + sys.stdout.write(results) + else: + with open(path, 'w') as f: + f.write(results) + def rescue_filtered_variants(vfs, format='GT'): """ Rescue filtered variants if they are PASS in at least one of the input diff --git a/fuc/cli/fuc_undetm.py b/fuc/cli/fuc_undetm.py index 3bc82d9..a257419 100644 --- a/fuc/cli/fuc_undetm.py +++ b/fuc/cli/fuc_undetm.py @@ -20,8 +20,8 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Compute top unknown barcodes using undertermined FASTQ ' - 'from bcl2fastq.', + help='Compute top unknown barcodes using undertermined FASTQ from \n' + 'bcl2fastq.' 
) parser.add_argument( 'fastq', diff --git a/fuc/cli/ngs_fq2bam.py b/fuc/cli/ngs_fq2bam.py index c207b0a..8ae3588 100644 --- a/fuc/cli/ngs_fq2bam.py +++ b/fuc/cli/ngs_fq2bam.py @@ -54,7 +54,8 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Pipeline for converting FASTQ files to analysis-ready BAM files.', + help='Pipeline for converting FASTQ files to analysis-ready BAM \n' + 'files.' ) parser.add_argument( 'manifest', diff --git a/fuc/cli/vcf_call.py b/fuc/cli/vcf_call.py new file mode 100644 index 0000000..2e9e9d2 --- /dev/null +++ b/fuc/cli/vcf_call.py @@ -0,0 +1,66 @@ +import sys + +from .. import api + +description = f""" +Perform variant calling and filtering for BAM files. +""" + +epilog = f""" +[Example] Specify regions manually: + $ fuc {api.common._script_name()} ref.fa in1.bam in2.bam -r chr1:100-200 chr2:300-400 > out.vcf +""" + +def create_parser(subparsers): + parser = api.common._add_parser( + subparsers, + api.common._script_name(), + description=description, + epilog=epilog, + help='Perform variant calling and filtering for BAM files.', + ) + parser.add_argument( + 'fasta', + help="FASTA file." + ) + parser.add_argument( + 'bam', + nargs='+', + help="One or more BAM files." + ) + parser.add_argument( + '--regions', + nargs='+', + metavar='TEXT', + help="Only call variants in given regions. Each region must \n" + "have the format chrom:start-end and be a half-open \n" + "interval with (start, end]. This means, for example, \n" + "chr1:100-103 will extract positions 101, 102, and \n" + "103. Alternatively, you can provide a BED file \n" + "(compressed or uncompressed) to specify regions. Note \n" + "that the 'chr' prefix in contig names (e.g. 'chr1' \n" + "vs. '1') will be automatically added or removed as \n" + "necessary to match the input VCF's contig names." 
+ ) + parser.add_argument( + '--min-mq', + metavar='INT', + type=int, + default=1, + help="Minimum mapping quality for an alignment to be used \n" + "(default: 1)." + ) + parser.add_argument( + '--max-depth', + metavar='INT', + type=int, + default=250, + help="At a position, read maximally this number of reads \n" + "per input file (default: 250)." + ) + +def main(args): + api.pyvcf.call( + args.fasta, args.bam, path='-', regions=args.regions, + min_mq=args.min_mq, max_depth=args.max_depth + ) From 4bd6f383538799ac16e7cffd6d67e0c1f4f77bb0 Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Fri, 11 Feb 2022 20:25:39 +0900 Subject: [PATCH 04/22] Update docs --- docs/cli.rst | 11 ++++++++--- fuc/api/pyvcf.py | 16 ++++++++-------- fuc/cli/vcf_call.py | 13 +++++++++---- 3 files changed, 25 insertions(+), 15 deletions(-) diff --git a/docs/cli.rst b/docs/cli.rst index 65f2bb8..5abf537 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -1221,13 +1221,18 @@ vcf-call $ fuc vcf-call -h usage: fuc vcf-call [-h] [--regions TEXT [TEXT ...]] [--min-mq INT] [--max-depth INT] - fasta bam [bam ...] + fasta bams [bams ...] Perform variant calling and filtering for BAM files. + This command will run a fully customizable, bcftools-based pipeline for + calling and filtering variants. + Positional arguments: - fasta FASTA file. - bam One or more BAM files. + fasta Reference FASTA file. + bams One or more input BAM files. Alternatively, you can + provide a text file (.txt, .tsv, .csv, or .list) + containing one BAM file per line. Optional arguments: -h, --help Show this help message and exit. diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index ad1bcd6..959fa5e 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -149,9 +149,10 @@ def call( Parameters ---------- fasta : str - FASTA file. + Reference FASTA file. bams : str or list - One or more BAM files. + One or more input BAM files. 
Alternatively, you can provide a text + file (.txt, .tsv, .csv, or .list) containing one BAM file per line. path : str, optional Output VCF file. Writes to stdout when ``path='-'``. If None is provided the result is returned as a string. @@ -173,8 +174,7 @@ def call( str VcfFrame object. """ - if isinstance(bams, str): - bams = [bams] + bams = common.parse_list_or_file(bams) # Parse target regions, if provided. if regions is not None: @@ -201,17 +201,17 @@ def call( # Step 2: Call variants. args = [f'{t}/likelihoods.bcf', '-Oz', '-mv'] results = bcftools.call(*args) - with open('calls.bcf', 'wb') as f: + with open(f'{t}/calls.bcf', 'wb') as f: f.write(results) # Step 3: Normalize indels. - args = ['calls.bcf', '-Ob', '-f', fasta] + args = [f'{t}/calls.bcf', '-Ob', '-f', fasta] results = bcftools.norm(*args) - with open('calls.normalized.bcf', 'wb') as f: + with open(f'{t}/calls.normalized.bcf', 'wb') as f: f.write(results) # Step 4: Filter variant. - args = ['calls.normalized.bcf', '-Ov', '--IndelGap', '5'] + args = [f'{t}/calls.normalized.bcf', '-Ov', '--IndelGap', '5'] results = bcftools.filter(*args) if path is None: diff --git a/fuc/cli/vcf_call.py b/fuc/cli/vcf_call.py index 2e9e9d2..68d7f9f 100644 --- a/fuc/cli/vcf_call.py +++ b/fuc/cli/vcf_call.py @@ -4,6 +4,9 @@ description = f""" Perform variant calling and filtering for BAM files. + +This command will run a fully customizable, bcftools-based pipeline for +calling and filtering variants. """ epilog = f""" @@ -21,12 +24,14 @@ def create_parser(subparsers): ) parser.add_argument( 'fasta', - help="FASTA file." + help="Reference FASTA file." ) parser.add_argument( - 'bam', + 'bams', nargs='+', - help="One or more BAM files." + help="One or more input BAM files. Alternatively, you can \n" + "provide a text file (.txt, .tsv, .csv, or .list) \n" + "containing one BAM file per line." 
) parser.add_argument( '--regions', @@ -61,6 +66,6 @@ def create_parser(subparsers): def main(args): api.pyvcf.call( - args.fasta, args.bam, path='-', regions=args.regions, + args.fasta, args.bams, path='-', regions=args.regions, min_mq=args.min_mq, max_depth=args.max_depth ) From f544745e51d0f0a1b12e0bf82d377838a728453e Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Fri, 11 Feb 2022 20:49:22 +0900 Subject: [PATCH 05/22] Update `pycov.CovFrame.from_bam` and `bam-depth`: * Combine optional arguments ``bam`` and ``fn`` into single positional argument ``bams`` for :meth:`pycov.CovFrame.from_bam` method. The same goes for :command:`bam-depth` command (combine ``--bam`` and ``--fn`` into ``bams``). --- CHANGELOG.rst | 1 + docs/cli.rst | 23 ++++++++++++----------- fuc/api/pycov.py | 43 +++++++++++++------------------------------ fuc/cli/bam_depth.py | 16 +++++----------- 4 files changed, 31 insertions(+), 52 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 463d6ec..3ecd7bf 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,6 +6,7 @@ Changelog * Fix bug in :meth:`pykallisto.KallistoFrame.compute_fold_change` method. * Add new method :meth:`pyvcf.call` and new command :command:`vcf-call`. +* Combine optional arguments ``bam`` and ``fn`` into single positional argument ``bams`` for :meth:`pycov.CovFrame.from_bam` method. The same goes for :command:`bam-depth` command (combine ``--bam`` and ``--fn`` into ``bams``). 0.30.0 (2022-02-05) ------------------- diff --git a/docs/cli.rst b/docs/cli.rst index 5abf537..d004d73 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -107,8 +107,8 @@ bam-depth .. code-block:: text $ fuc bam-depth -h - usage: fuc bam-depth [-h] [--bam PATH [PATH ...]] [--fn PATH] [--bed PATH] - [--region TEXT] [--zero] + usage: fuc bam-depth [-h] [--bed PATH] [--region TEXT] [--zero] + bams [bams ...] Compute read depth from SAM/BAM/CRAM files. @@ -117,16 +117,17 @@ bam-depth close attention to the 'chr' string in contig names (e.g. 
'chr1' vs. '1'). Note also that --region requires the input files be indexed. + Positional arguments: + bams One or more input BAM files. Alternatively, you can provide + a text file (.txt, .tsv, .csv, or .list) containing one BAM + file per line. + Optional arguments: - -h, --help Show this help message and exit. - --bam PATH [PATH ...] - One or more alignment files. Cannot be used with --fn. - --fn PATH File containing one alignment file per line. Cannot - be used with --bam. - --bed PATH BED file. Cannot be used with --region. - --region TEXT Target region ('chrom:start-end'). Cannot be used - with --bed. - --zero Output all positions including those with zero depth. + -h, --help Show this help message and exit. + --bed PATH BED file. Cannot be used with --region. + --region TEXT Target region ('chrom:start-end'). Cannot be used + with --bed. + --zero Output all positions including those with zero depth. [Example] To specify regions with a BED file: $ fuc bam-depth \ diff --git a/fuc/api/pycov.py b/fuc/api/pycov.py index 7cc2b67..ba76940 100644 --- a/fuc/api/pycov.py +++ b/fuc/api/pycov.py @@ -165,15 +165,12 @@ def shape(self): @classmethod def from_bam( - cls, bam=None, fn=None, bed=None, region=None, zero=False, - map_qual=None, names=None + cls, bams, bed=None, region=None, zero=False, map_qual=None, + names=None ): """ Construct CovFrame from one or more SAM/BAM/CRAM files. - Alignment files must be specified with either ``bam`` or ``fn``, but - it's an error to use both. - By default, the method will count all reads within the alignment files. You can specify target regions with either ``bed`` or ``region``, but not both. When you do this, pay close attention to @@ -188,10 +185,10 @@ def from_bam( Parameters ---------- - bam : str or list, optional - One or more alignment files. - fn : str, optional - File containing one alignment file per line. + bams : str or list + One or more input BAM files. 
Alternatively, you can provide a + text file (.txt, .tsv, .csv, or .list) containing one BAM file + per line. bed : str, optional BED file. region : str, optional @@ -226,21 +223,7 @@ def from_bam( >>> cf = pycov.CovFrame.from_bam([bam1, bam2]) >>> cf = pycov.CovFrame.from_bam(bam, region='19:41497204-41524301') """ - bam_files = [] - - if bam is None and fn is None: - raise ValueError( - "Either the 'bam' or 'fn' parameter must be provided.") - elif bam is not None and fn is not None: - raise ValueError( - "The 'bam' and 'fn' parameters cannot be used together.") - elif bam is not None and fn is None: - if isinstance(bam, str): - bam_files.append(bam) - else: - bam_files += bam - else: - bam_files += common.convert_file2list(fn) + bams = common.parse_list_or_file(bams) args = [] @@ -253,25 +236,25 @@ def from_bam( if bed is not None: args += ['-b', bed] - args += bam_files + args += bams s = pysam.depth(*args) headers = ['Chromosome', 'Position'] dtype = {'Chromosome': str,'Position': int} - for i, bam_file in enumerate(bam_files): + for i, bam in enumerate(bams): if names: name = names[i] else: - samples = pybam.tag_sm(bam_file) + samples = pybam.tag_sm(bam) if not samples: - basename = Path(bam_file).stem + basename = Path(bam).stem message = ( - f'SM tags were not found for {bam_file}, will use ' + f'SM tags were not found for {bam}, will use ' f'file name as sample name ({basename})' ) samples = [basename] warnings.warn(message) if len(samples) > 1: - m = f'multiple sample names detected: {bam_file}' + m = f'multiple sample names detected: {bam}' raise ValueError(m) name = samples[0] headers.append(name) diff --git a/fuc/cli/bam_depth.py b/fuc/cli/bam_depth.py index fdb2f45..6e3afab 100644 --- a/fuc/cli/bam_depth.py +++ b/fuc/cli/bam_depth.py @@ -34,16 +34,11 @@ def create_parser(subparsers): help='Compute read depth from SAM/BAM/CRAM files.', ) parser.add_argument( - '--bam', - metavar='PATH', + 'bams', nargs='+', - help='One or more alignment files. 
Cannot be used with --fn.' - ) - parser.add_argument( - '--fn', - metavar='PATH', - help='File containing one alignment file per line. Cannot \n' - 'be used with --bam.' + help="One or more input BAM files. Alternatively, you can provide \n" + "a text file (.txt, .tsv, .csv, or .list) containing one BAM \n" + "file per line." ) parser.add_argument( '--bed', @@ -64,7 +59,6 @@ def create_parser(subparsers): def main(args): cf = api.pycov.CovFrame.from_bam( - bam=args.bam, fn=args.fn, bed=args.bed, region=args.region, - zero=args.zero + args.bams, bed=args.bed, region=args.region, zero=args.zero ) sys.stdout.write(cf.to_string()) From 6fa67285ff8988e93763b56b0a7982f5fb9c210c Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Sat, 12 Feb 2022 10:10:20 +0900 Subject: [PATCH 06/22] Update docs --- fuc/api/pycov.py | 2 +- fuc/api/pyvcf.py | 2 +- fuc/cli/vcf_call.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fuc/api/pycov.py b/fuc/api/pycov.py index ba76940..5616f4b 100644 --- a/fuc/api/pycov.py +++ b/fuc/api/pycov.py @@ -169,7 +169,7 @@ def from_bam( names=None ): """ - Construct CovFrame from one or more SAM/BAM/CRAM files. + Construct CovFrame from BAM files. By default, the method will count all reads within the alignment files. You can specify target regions with either ``bed`` or diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index 959fa5e..70648a8 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -141,7 +141,7 @@ def call( fasta, bams, path=None, regions=None, min_mq=1, max_depth=250 ): """ - Perform variant calling and filtering for BAM files. + Call variants (SNVs/indels) from BAM files. This method will run a fully customizable, bcftools-based pipeline for calling and filtering variants. diff --git a/fuc/cli/vcf_call.py b/fuc/cli/vcf_call.py index 68d7f9f..a055695 100644 --- a/fuc/cli/vcf_call.py +++ b/fuc/cli/vcf_call.py @@ -3,7 +3,7 @@ from .. 
import api description = f""" -Perform variant calling and filtering for BAM files. +Call variants (SNVs/indels) from BAM files. This command will run a fully customizable, bcftools-based pipeline for calling and filtering variants. @@ -20,7 +20,7 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Perform variant calling and filtering for BAM files.', + help='Call variants (SNVs/indels) from BAM files.', ) parser.add_argument( 'fasta', From e567b3a098671538c5e650e91879f79dbc540d92 Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Sat, 12 Feb 2022 13:29:26 +0900 Subject: [PATCH 07/22] Update docs --- fuc/api/pybed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fuc/api/pybed.py b/fuc/api/pybed.py index 495fe42..6383dab 100644 --- a/fuc/api/pybed.py +++ b/fuc/api/pybed.py @@ -493,9 +493,9 @@ def to_regions(self, merge=True): ... 'End': [40, 50, 25, 35, 60, 80] ... } >>> bf = pybed.BedFrame.from_dict([], data) - >>> bf.regions() + >>> bf.to_regions() ['chr1:10-50', 'chr2:15-35', 'chr3:50-60', 'chr3:61-80'] - >>> bf.regions(merge=False) + >>> bf.to_regions(merge=False) ['chr1:10-40', 'chr1:30-50', 'chr2:15-25', 'chr2:25-35', 'chr3:50-60', 'chr3:61-80'] """ if merge: From 600aaf67e9f0c30b5198d62a65c9e9dae92dc05b Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Sat, 12 Feb 2022 13:29:40 +0900 Subject: [PATCH 08/22] Update docs --- fuc/api/pyvcf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index 70648a8..f74692e 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -156,7 +156,7 @@ def call( path : str, optional Output VCF file. Writes to stdout when ``path='-'``. If None is provided the result is returned as a string. - regions : str, list, or pybed.BedFrame + regions : str, list, or pybed.BedFrame, optional Only call variants in given regions. 
Each region must have the format chrom:start-end and be a half-open interval with (start, end]. This means, for example, chr1:100-103 will extract positions 101, 102, and From f6171047afa897ad1d44f8104dfa30920af928a2 Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Sat, 12 Feb 2022 13:49:45 +0900 Subject: [PATCH 09/22] Update `pycov.CovFrame.from_bam` and `bam-depth`: * Update :meth:`pycov.CovFrame.from_bam` method and :command:`bam-depth` command to automatically handle the 'chr' string. --- CHANGELOG.rst | 2 ++ README.rst | 4 +-- docs/cli.rst | 60 +++++++++++++++++-------------- fuc/api/pycov.py | 85 +++++++++++++++++++++++++++++--------------- fuc/cli/bam_depth.py | 60 +++++++++++++++++-------------- 5 files changed, 126 insertions(+), 85 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 3ecd7bf..97c1149 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,8 @@ Changelog * Fix bug in :meth:`pykallisto.KallistoFrame.compute_fold_change` method. * Add new method :meth:`pyvcf.call` and new command :command:`vcf-call`. * Combine optional arguments ``bam`` and ``fn`` into single positional argument ``bams`` for :meth:`pycov.CovFrame.from_bam` method. The same goes for :command:`bam-depth` command (combine ``--bam`` and ``--fn`` into ``bams``). +* Combine optional arguments ``bed`` and ``region`` into single optional argument ``regions`` for :meth:`pycov.CovFrame.from_bam` method. The same goes for :command:`bam-depth` command (combine ``--bed`` and ``--region`` into ``--regions``). +* Update :meth:`pycov.CovFrame.from_bam` method and :command:`bam-depth` command to automatically handle the 'chr' string. 0.30.0 (2022-02-05) ------------------- diff --git a/README.rst b/README.rst index b2246a7..7f66236 100644 --- a/README.rst +++ b/README.rst @@ -113,7 +113,7 @@ For getting help on the fuc CLI: positional arguments: COMMAND bam-aldepth Compute allelic depth from a SAM/BAM/CRAM file. - bam-depth Compute read depth from SAM/BAM/CRAM files. 
+ bam-depth Compute per-base read depth from BAM files. bam-head Print the header of a SAM/BAM/CRAM file. bam-index Index a SAM/BAM/CRAM file. bam-rename Rename the sample in a SAM/BAM/CRAM file. @@ -150,7 +150,7 @@ For getting help on the fuc CLI: tabix-slice Slice a GFF/BED/SAM/VCF file with Tabix. tbl-merge Merge two table files. tbl-sum Summarize a table file. - vcf-call Perform variant calling and filtering for BAM files. + vcf-call Call variants (SNVs/indels) from BAM files. vcf-filter Filter a VCF file. vcf-index Index a VCF file. vcf-merge Merge two or more VCF files. diff --git a/docs/cli.rst b/docs/cli.rst index d004d73..173a384 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -19,7 +19,7 @@ For getting help on the fuc CLI: positional arguments: COMMAND bam-aldepth Compute allelic depth from a SAM/BAM/CRAM file. - bam-depth Compute read depth from SAM/BAM/CRAM files. + bam-depth Compute per-base read depth from BAM files. bam-head Print the header of a SAM/BAM/CRAM file. bam-index Index a SAM/BAM/CRAM file. bam-rename Rename the sample in a SAM/BAM/CRAM file. @@ -56,7 +56,7 @@ For getting help on the fuc CLI: tabix-slice Slice a GFF/BED/SAM/VCF file with Tabix. tbl-merge Merge two table files. tbl-sum Summarize a table file. - vcf-call Perform variant calling and filtering for BAM files. + vcf-call Call variants (SNVs/indels) from BAM files. vcf-filter Filter a VCF file. vcf-index Index a VCF file. vcf-merge Merge two or more VCF files. @@ -107,37 +107,43 @@ bam-depth .. code-block:: text $ fuc bam-depth -h - usage: fuc bam-depth [-h] [--bed PATH] [--region TEXT] [--zero] - bams [bams ...] + usage: fuc bam-depth [-h] [--regions TEXT [TEXT ...]] [--zero] bams [bams ...] - Compute read depth from SAM/BAM/CRAM files. + Compute per-base read depth from BAM files. - By default, the command will count all reads within the alignment files. You - can specify regions of interest with --bed or --region. 
When you do this, pay - close attention to the 'chr' string in contig names (e.g. 'chr1' vs. '1'). - Note also that --region requires the input files be indexed. + Under the hood, the command computes read depth using the 'samtools depth' + command. Positional arguments: - bams One or more input BAM files. Alternatively, you can provide - a text file (.txt, .tsv, .csv, or .list) containing one BAM - file per line. + bams One or more input BAM files. Alternatively, you can + provide a text file (.txt, .tsv, .csv, or .list) + containing one BAM file per line. Optional arguments: - -h, --help Show this help message and exit. - --bed PATH BED file. Cannot be used with --region. - --region TEXT Target region ('chrom:start-end'). Cannot be used - with --bed. - --zero Output all positions including those with zero depth. - - [Example] To specify regions with a BED file: - $ fuc bam-depth \ - --bam 1.bam 2.bam \ - --bed in.bed > out.tsv + -h, --help Show this help message and exit. + --regions TEXT [TEXT ...] + By default, the command counts all reads in BAM + files, which can be excruciatingly slow for large + files (e.g. whole genome sequencing). Therefore, use + this argument to only output positions in given + regions. Each region must have the format + chrom:start-end and be a half-open interval with + (start, end]. This means, for example, chr1:100-103 + will extract positions 101, 102, and 103. + Alternatively, you can provide a BED file (compressed + or uncompressed) to specify regions. Note that the + 'chr' prefix in contig names (e.g. 'chr1' vs. '1') + will be automatically added or removed as necessary + to match the input BAM's contig names. + --zero Output all positions including those with zero depth. 
+ + [Example] Specify regions with a BED file: + $ fuc bam-depth 1.bam 2.bam \ + --regions in.bed > out.tsv - [Example] To specify regions manually: - $ fuc bam-depth \ - --fn bam.list \ - --region chr1:100-200 > out.tsv + [Example] Specify regions manually: + $ fuc bam-depth bam.list \ + --regions chr1:100-200 chr2:400-500 > out.tsv bam-head ======== @@ -1224,7 +1230,7 @@ vcf-call [--max-depth INT] fasta bams [bams ...] - Perform variant calling and filtering for BAM files. + Call variants (SNVs/indels) from BAM files. This command will run a fully customizable, bcftools-based pipeline for calling and filtering variants. diff --git a/fuc/api/pycov.py b/fuc/api/pycov.py index 5616f4b..f32fb96 100644 --- a/fuc/api/pycov.py +++ b/fuc/api/pycov.py @@ -95,7 +95,7 @@ class CovFrame: See Also -------- CovFrame.from_bam - Construct CovFrame from one or more SAM/BAM/CRAM files. + Construct CovFrame from BAM files. CovFrame.from_dict Construct CovFrame from dict of array-like or dicts. CovFrame.from_file @@ -165,41 +165,42 @@ def shape(self): @classmethod def from_bam( - cls, bams, bed=None, region=None, zero=False, map_qual=None, - names=None + cls, bams, regions=None, zero=False, map_qual=None, names=None ): """ Construct CovFrame from BAM files. - By default, the method will count all reads within the alignment - files. You can specify target regions with either ``bed`` or - ``region``, but not both. When you do this, pay close attention to - the 'chr' string in contig names (e.g. 'chr1' vs. '1'). Note also - that ``region`` requires the input files be indexed. - Under the hood, the method computes read depth using the :command:`samtools depth` command. - Sample name is extracted from the SM tag. If the tag is missing, the - method will set the filename as sample name. - Parameters ---------- bams : str or list One or more input BAM files. Alternatively, you can provide a text file (.txt, .tsv, .csv, or .list) containing one BAM file per line. 
- bed : str, optional - BED file. - region : str, optional - Target region ('chrom:start-end'). + regions : str, list, or pybed.BedFrame, optional + By default (``regions=None``), the method counts all reads in BAM + files, which can be excruciatingly slow for large files (e.g. + whole genome sequencing). Therefore, use this argument to only + output positions in given regions. Each region must have the + format chrom:start-end and be a half-open interval with (start, + end]. This means, for example, chr1:100-103 will extract + positions 101, 102, and 103. Alternatively, you can provide a BED + file (compressed or uncompressed) or a :class:`pybed.BedFrame` + object to specify regions. Note that the 'chr' prefix in contig + names (e.g. 'chr1' vs. '1') will be automatically added or + removed as necessary to match the input BAM's contig names. zero : bool, default: False - If True, output all positions (including those with zero depth). + If True, output all positions including those with zero depth. map_qual: int, optional Only count reads with mapping quality greater than or equal to this number. names : list, optional - Use these as sample names instead of the SM tags. + By default (``names=None``), sample name is extracted using SM + tag in BAM files. If the tag is missing, the method will set the + filename as sample name. Use this argument to manually provide + sample names. Returns ------- @@ -223,23 +224,45 @@ def from_bam( >>> cf = pycov.CovFrame.from_bam([bam1, bam2]) >>> cf = pycov.CovFrame.from_bam(bam, region='19:41497204-41524301') """ + # Parse input BAM files. bams = common.parse_list_or_file(bams) - args = [] + # Check the 'chr' prefix. + if all([pybam.has_chr_prefix(x) for x in bams]): + chr_prefix = 'chr' + else: + chr_prefix = '' + # Run the 'samtools depth' command. 
+ args = [] if zero: args += ['-a'] - if region is not None: - args += ['-r', region] if map_qual is not None: args += ['-Q', str(map_qual)] - if bed is not None: - args += ['-b', bed] + results = '' + if regions is None: + results += pysam.depth(*(bams + args)) + else: + if isinstance(regions, str): + regions = [regions] + elif isinstance(regions, list): + pass + elif isinstance(regions, pybed.BedFrame): + regions = bf.to_regions() + else: + raise TypeError("Incorrect type of argument 'regions'") + if '.bed' in regions[0]: + bf = pybed.BedFrame.from_file(regions[0]) + regions = bf.to_regions() + else: + regions = common.sort_regions(regions) + for region in regions: + region = chr_prefix + region.replace('chr', '') + results += pysam.depth(*(bams + args + ['-r', region])) - args += bams - s = pysam.depth(*args) headers = ['Chromosome', 'Position'] dtype = {'Chromosome': str,'Position': int} + for i, bam in enumerate(bams): if names: name = names[i] @@ -259,8 +282,12 @@ def from_bam( name = samples[0] headers.append(name) dtype[name] = int - df = pd.read_csv(StringIO(s), sep='\t', header=None, - names=headers, dtype=dtype) + + df = pd.read_csv( + StringIO(results), sep='\t', header=None, + names=headers, dtype=dtype + ) + return cls(df) @classmethod @@ -283,7 +310,7 @@ def from_dict(cls, data): CovFrame CovFrame object creation using constructor. CovFrame.from_bam - Construct CovFrame from one or more SAM/BAM/CRAM files. + Construct CovFrame from BAM files. CovFrame.from_file Construct CovFrame from a text file containing read depth data. @@ -333,7 +360,7 @@ def from_file(cls, fn, compression=False): CovFrame CovFrame object creation using constructor. CovFrame.from_bam - Construct CovFrame from one or more SAM/BAM/CRAM files. + Construct CovFrame from BAM files. CovFrame.from_dict Construct CovFrame from dict of array-like or dicts. 
diff --git a/fuc/cli/bam_depth.py b/fuc/cli/bam_depth.py index 6e3afab..b000ba2 100644 --- a/fuc/cli/bam_depth.py +++ b/fuc/cli/bam_depth.py @@ -5,24 +5,20 @@ import pysam description = """ -Compute read depth from SAM/BAM/CRAM files. +Compute per-base read depth from BAM files. -By default, the command will count all reads within the alignment files. You -can specify regions of interest with --bed or --region. When you do this, pay -close attention to the 'chr' string in contig names (e.g. 'chr1' vs. '1'). -Note also that --region requires the input files be indexed. +Under the hood, the command computes read depth using the 'samtools depth' +command. """ epilog = f""" -[Example] To specify regions with a BED file: - $ fuc {api.common._script_name()} \\ - --bam 1.bam 2.bam \\ - --bed in.bed > out.tsv +[Example] Specify regions with a BED file: + $ fuc {api.common._script_name()} 1.bam 2.bam \\ + --regions in.bed > out.tsv -[Example] To specify regions manually: - $ fuc {api.common._script_name()} \\ - --fn bam.list \\ - --region chr1:100-200 > out.tsv +[Example] Specify regions manually: + $ fuc {api.common._script_name()} bam.list \\ + --regions chr1:100-200 chr2:400-500 > out.tsv """ def create_parser(subparsers): @@ -31,34 +27,44 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Compute read depth from SAM/BAM/CRAM files.', + help="Compute per-base read depth from BAM files." ) parser.add_argument( 'bams', nargs='+', - help="One or more input BAM files. Alternatively, you can provide \n" - "a text file (.txt, .tsv, .csv, or .list) containing one BAM \n" - "file per line." + help= +"""One or more input BAM files. Alternatively, you can +provide a text file (.txt, .tsv, .csv, or .list) +containing one BAM file per line.""" ) parser.add_argument( - '--bed', - metavar='PATH', - help='BED file. Cannot be used with --region.' 
- ) - parser.add_argument( - '--region', + '--regions', + nargs='+', metavar='TEXT', - help="Target region ('chrom:start-end'). Cannot be used \n" - "with --bed." + help= +"""By default, the command counts all reads in BAM +files, which can be excruciatingly slow for large +files (e.g. whole genome sequencing). Therefore, use +this argument to only output positions in given +regions. Each region must have the format +chrom:start-end and be a half-open interval with +(start, end]. This means, for example, chr1:100-103 +will extract positions 101, 102, and 103. +Alternatively, you can provide a BED file (compressed +or uncompressed) to specify regions. Note that the +'chr' prefix in contig names (e.g. 'chr1' vs. '1') +will be automatically added or removed as necessary +to match the input BAM's contig names.""" ) parser.add_argument( '--zero', action='store_true', - help='Output all positions including those with zero depth.' + help= +"""Output all positions including those with zero depth.""" ) def main(args): cf = api.pycov.CovFrame.from_bam( - args.bams, bed=args.bed, region=args.region, zero=args.zero + args.bams, regions=args.regions, zero=args.zero ) sys.stdout.write(cf.to_string()) From fccf85e6fbc5c55fb1de02190cc18fd54b130980 Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Sat, 12 Feb 2022 14:40:05 +0900 Subject: [PATCH 10/22] Update docs --- README.rst | 2 +- docs/cli.rst | 61 ++++++++++++++++++++++++------------------- fuc/api/pyvcf.py | 24 ++++++++++------- fuc/cli/bam_depth.py | 12 +++++---- fuc/cli/bam_rename.py | 2 +- fuc/cli/fuc_undetm.py | 2 +- fuc/cli/ngs_bam2fq.py | 2 +- fuc/cli/ngs_fq2bam.py | 2 +- fuc/cli/ngs_pon.py | 2 +- fuc/cli/ngs_quant.py | 2 +- fuc/cli/ngs_trim.py | 2 +- fuc/cli/vcf_call.py | 61 +++++++++++++++++++++++++++---------------- fuc/cli/vcf_merge.py | 2 +- fuc/cli/vcf_slice.py | 2 +- fuc/cli/vcf_split.py | 2 +- fuc/cli/vcf_vep.py | 2 +- 16 files changed, 105 insertions(+), 77 deletions(-) diff --git a/README.rst b/README.rst 
index 7f66236..12f95e2 100644 --- a/README.rst +++ b/README.rst @@ -150,7 +150,7 @@ For getting help on the fuc CLI: tabix-slice Slice a GFF/BED/SAM/VCF file with Tabix. tbl-merge Merge two table files. tbl-sum Summarize a table file. - vcf-call Call variants (SNVs/indels) from BAM files. + vcf-call Call SNVs and indels from BAM files. vcf-filter Filter a VCF file. vcf-index Index a VCF file. vcf-merge Merge two or more VCF files. diff --git a/docs/cli.rst b/docs/cli.rst index 173a384..dcc4aaa 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -56,7 +56,7 @@ For getting help on the fuc CLI: tabix-slice Slice a GFF/BED/SAM/VCF file with Tabix. tbl-merge Merge two table files. tbl-sum Summarize a table file. - vcf-call Call variants (SNVs/indels) from BAM files. + vcf-call Call SNVs and indels from BAM files. vcf-filter Filter a VCF file. vcf-index Index a VCF file. vcf-merge Merge two or more VCF files. @@ -107,7 +107,7 @@ bam-depth .. code-block:: text $ fuc bam-depth -h - usage: fuc bam-depth [-h] [--regions TEXT [TEXT ...]] [--zero] bams [bams ...] + usage: fuc bam-depth [-h] [-r TEXT [TEXT ...]] [--zero] bams [bams ...] Compute per-base read depth from BAM files. @@ -121,7 +121,7 @@ bam-depth Optional arguments: -h, --help Show this help message and exit. - --regions TEXT [TEXT ...] + -r TEXT [TEXT ...], --regions TEXT [TEXT ...] By default, the command counts all reads in BAM files, which can be excruciatingly slow for large files (e.g. whole genome sequencing). Therefore, use @@ -137,13 +137,13 @@ bam-depth to match the input BAM's contig names. --zero Output all positions including those with zero depth. 
- [Example] Specify regions with a BED file: + [Example] Specify regions manually: $ fuc bam-depth 1.bam 2.bam \ - --regions in.bed > out.tsv + -r chr1:100-200 chr2:400-500 > out.tsv - [Example] Specify regions manually: + [Example] Specify regions with a BED file: $ fuc bam-depth bam.list \ - --regions chr1:100-200 chr2:400-500 > out.tsv + -r in.bed > out.tsv bam-head ======== @@ -1226,40 +1226,47 @@ vcf-call .. code-block:: text $ fuc vcf-call -h - usage: fuc vcf-call [-h] [--regions TEXT [TEXT ...]] [--min-mq INT] - [--max-depth INT] + usage: fuc vcf-call [-h] [-r TEXT [TEXT ...]] [--min-mq INT] [--max-depth INT] fasta bams [bams ...] - Call variants (SNVs/indels) from BAM files. + Call SNVs and indels from BAM files. - This command will run a fully customizable, bcftools-based pipeline for - calling and filtering variants. + Under the hood, the command utilizes the bcftool program to call variants. Positional arguments: fasta Reference FASTA file. - bams One or more input BAM files. Alternatively, you can - provide a text file (.txt, .tsv, .csv, or .list) + bams One or more input BAM files. Alternatively, you can + provide a text file (.txt, .tsv, .csv, or .list) containing one BAM file per line. Optional arguments: -h, --help Show this help message and exit. - --regions TEXT [TEXT ...] - Only call variants in given regions. Each region must - have the format chrom:start-end and be a half-open - interval with (start, end]. This means, for example, - chr1:100-103 will extract positions 101, 102, and - 103. Alternatively, you can provide a BED file - (compressed or uncompressed) to specify regions. Note - that the 'chr' prefix in contig names (e.g. 'chr1' - vs. '1') will be automatically added or removed as - necessary to match the input VCF's contig names. - --min-mq INT Minimum mapping quality for an alignment to be used + -r TEXT [TEXT ...], --regions TEXT [TEXT ...] 
+ By default, the command looks at each genomic + position with coverage in BAM files, which can be + excruciatingly slow for large files (e.g. whole + genome sequencing). Therefore, use this argument to + only call variants in given regions. Each region must + have the format chrom:start-end and be a half-open + interval with (start, end]. This means, for example, + chr1:100-103 will extract positions 101, 102, and + 103. Alternatively, you can provide a BED file + (compressed or uncompressed) to specify regions. Note + that the 'chr' prefix in contig names (e.g. 'chr1' + vs. '1') will be automatically added or removed as + necessary to match the input BAM's contig names. + --min-mq INT Minimum mapping quality for an alignment to be used (default: 1). - --max-depth INT At a position, read maximally this number of reads + --max-depth INT At a position, read maximally this number of reads per input file (default: 250). [Example] Specify regions manually: - $ fuc vcf-call ref.fa in1.bam in2.bam -r chr1:100-200 chr2:300-400 > out.vcf + $ fuc vcf-call ref.fa 1.bam 2.bam \ + -r chr1:100-200 chr2:400-500 > out.vcf + + [Example] Specify regions with a BED file: + $ fuc vcf-call ref.fa bam.list \ + -r in.bed > out.vcf vcf-filter ========== diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index f74692e..1fe7431 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -141,10 +141,9 @@ def call( fasta, bams, path=None, regions=None, min_mq=1, max_depth=250 ): """ - Call variants (SNVs/indels) from BAM files. + Call SNVs and indels from BAM files. - This method will run a fully customizable, bcftools-based pipeline for - calling and filtering variants. + Under the hood, the method utilizes the bcftool program to call variants. Parameters ---------- @@ -157,13 +156,17 @@ def call( Output VCF file. Writes to stdout when ``path='-'``. If None is provided the result is returned as a string. regions : str, list, or pybed.BedFrame, optional - Only call variants in given regions. 
Each region must have the format - chrom:start-end and be a half-open interval with (start, end]. This - means, for example, chr1:100-103 will extract positions 101, 102, and - 103. Alternatively, you can provide a BED file (compressed or - uncompressed) to specify regions. Note that the 'chr' prefix in - contig names (e.g. 'chr1' vs. '1') will be automatically added or - removed as necessary to match the input VCF's contig names. + By default (``regions=None``), the method looks at each genomic + position with coverage in BAM files, which can be excruciatingly slow + for large files (e.g. whole genome sequencing). Therefore, use this + argument to only call variants in given regions. Each region must + have the format chrom:start-end and be a half-open interval with + (start, end]. This means, for example, chr1:100-103 will extract + positions 101, 102, and 103. Alternatively, you can provide a BED + file (compressed or uncompressed) or a :class:`pybed.BedFrame` object + to specify regions. Note that the 'chr' prefix in contig names (e.g. + 'chr1' vs. '1') will be automatically added or removed as necessary + to match the input VCF's contig names. min_mq : int, default: 1 Minimum mapping quality for an alignment to be used. max_depth : int, default: 250 @@ -174,6 +177,7 @@ def call( str VcfFrame object. """ + # Parse input BAM files. bams = common.parse_list_or_file(bams) # Parse target regions, if provided. 
diff --git a/fuc/cli/bam_depth.py b/fuc/cli/bam_depth.py index b000ba2..77f90e4 100644 --- a/fuc/cli/bam_depth.py +++ b/fuc/cli/bam_depth.py @@ -12,13 +12,13 @@ """ epilog = f""" -[Example] Specify regions with a BED file: +[Example] Specify regions manually: $ fuc {api.common._script_name()} 1.bam 2.bam \\ - --regions in.bed > out.tsv + -r chr1:100-200 chr2:400-500 > out.tsv -[Example] Specify regions manually: +[Example] Specify regions with a BED file: $ fuc {api.common._script_name()} bam.list \\ - --regions chr1:100-200 chr2:400-500 > out.tsv + -r in.bed > out.tsv """ def create_parser(subparsers): @@ -27,7 +27,8 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help="Compute per-base read depth from BAM files." + help= +"""Compute per-base read depth from BAM files.""" ) parser.add_argument( 'bams', @@ -38,6 +39,7 @@ def create_parser(subparsers): containing one BAM file per line.""" ) parser.add_argument( + '-r', '--regions', nargs='+', metavar='TEXT', diff --git a/fuc/cli/bam_rename.py b/fuc/cli/bam_rename.py index 714c4c4..0ed6d79 100644 --- a/fuc/cli/bam_rename.py +++ b/fuc/cli/bam_rename.py @@ -5,7 +5,7 @@ import pysam -description = f""" +description = """ Rename the sample in a SAM/BAM/CRAM file. """ diff --git a/fuc/cli/fuc_undetm.py b/fuc/cli/fuc_undetm.py index a257419..30d10bf 100644 --- a/fuc/cli/fuc_undetm.py +++ b/fuc/cli/fuc_undetm.py @@ -2,7 +2,7 @@ from .. import api -description = f""" +description = """ Compute top unknown barcodes using undertermined FASTQ from bcl2fastq. This command will compute top unknown barcodes using undertermined FASTQ from diff --git a/fuc/cli/ngs_bam2fq.py b/fuc/cli/ngs_bam2fq.py index 7561c20..3e2bcb8 100644 --- a/fuc/cli/ngs_bam2fq.py +++ b/fuc/cli/ngs_bam2fq.py @@ -6,7 +6,7 @@ import pandas as pd -description = f""" +description = """ Pipeline for converting BAM files to FASTQ files. 
This pipeline will assume input BAM files consist of paired-end reads diff --git a/fuc/cli/ngs_fq2bam.py b/fuc/cli/ngs_fq2bam.py index 8ae3588..ffdd5f4 100644 --- a/fuc/cli/ngs_fq2bam.py +++ b/fuc/cli/ngs_fq2bam.py @@ -6,7 +6,7 @@ import pandas as pd -description = f""" +description = """ Pipeline for converting FASTQ files to analysis-ready BAM files. Here, "analysis-ready" means that the final BAM files are: 1) aligned to a diff --git a/fuc/cli/ngs_pon.py b/fuc/cli/ngs_pon.py index 459942b..2cb0ed1 100644 --- a/fuc/cli/ngs_pon.py +++ b/fuc/cli/ngs_pon.py @@ -6,7 +6,7 @@ import pandas as pd -description = f""" +description = """ Pipeline for constructing a panel of normals (PoN). Dependencies: diff --git a/fuc/cli/ngs_quant.py b/fuc/cli/ngs_quant.py index 28d41de..a0f9196 100644 --- a/fuc/cli/ngs_quant.py +++ b/fuc/cli/ngs_quant.py @@ -6,7 +6,7 @@ import pandas as pd -description = f""" +description = """ Pipeline for running RNAseq quantification from FASTQ files with Kallisto. External dependencies: diff --git a/fuc/cli/ngs_trim.py b/fuc/cli/ngs_trim.py index e44a5b5..4384fc3 100644 --- a/fuc/cli/ngs_trim.py +++ b/fuc/cli/ngs_trim.py @@ -6,7 +6,7 @@ import pandas as pd -description = f""" +description = """ Pipeline for trimming adapters from FASTQ files. External dependencies: diff --git a/fuc/cli/vcf_call.py b/fuc/cli/vcf_call.py index a055695..d0d27c5 100644 --- a/fuc/cli/vcf_call.py +++ b/fuc/cli/vcf_call.py @@ -2,16 +2,20 @@ from .. import api -description = f""" -Call variants (SNVs/indels) from BAM files. +description = """ +Call SNVs and indels from BAM files. -This command will run a fully customizable, bcftools-based pipeline for -calling and filtering variants. +Under the hood, the command utilizes the bcftool program to call variants. 
""" epilog = f""" [Example] Specify regions manually: - $ fuc {api.common._script_name()} ref.fa in1.bam in2.bam -r chr1:100-200 chr2:300-400 > out.vcf + $ fuc {api.common._script_name()} ref.fa 1.bam 2.bam \\ + -r chr1:100-200 chr2:400-500 > out.vcf + +[Example] Specify regions with a BED file: + $ fuc {api.common._script_name()} ref.fa bam.list \\ + -r in.bed > out.vcf """ def create_parser(subparsers): @@ -20,48 +24,59 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Call variants (SNVs/indels) from BAM files.', + help= +"""Call SNVs and indels from BAM files.""" ) parser.add_argument( 'fasta', - help="Reference FASTA file." + help= +"""Reference FASTA file.""" ) parser.add_argument( 'bams', nargs='+', - help="One or more input BAM files. Alternatively, you can \n" - "provide a text file (.txt, .tsv, .csv, or .list) \n" - "containing one BAM file per line." + help= +"""One or more input BAM files. Alternatively, you can +provide a text file (.txt, .tsv, .csv, or .list) +containing one BAM file per line.""" ) parser.add_argument( + '-r', '--regions', nargs='+', metavar='TEXT', - help="Only call variants in given regions. Each region must \n" - "have the format chrom:start-end and be a half-open \n" - "interval with (start, end]. This means, for example, \n" - "chr1:100-103 will extract positions 101, 102, and \n" - "103. Alternatively, you can provide a BED file \n" - "(compressed or uncompressed) to specify regions. Note \n" - "that the 'chr' prefix in contig names (e.g. 'chr1' \n" - "vs. '1') will be automatically added or removed as \n" - "necessary to match the input VCF's contig names." + help= +"""By default, the command looks at each genomic +position with coverage in BAM files, which can be +excruciatingly slow for large files (e.g. whole +genome sequencing). Therefore, use this argument to +only call variants in given regions. 
Each region must +have the format chrom:start-end and be a half-open +interval with (start, end]. This means, for example, +chr1:100-103 will extract positions 101, 102, and +103. Alternatively, you can provide a BED file +(compressed or uncompressed) to specify regions. Note +that the 'chr' prefix in contig names (e.g. 'chr1' +vs. '1') will be automatically added or removed as +necessary to match the input BAM's contig names.""" ) parser.add_argument( '--min-mq', metavar='INT', type=int, default=1, - help="Minimum mapping quality for an alignment to be used \n" - "(default: 1)." + help= +"""Minimum mapping quality for an alignment to be used +(default: 1).""" ) parser.add_argument( '--max-depth', metavar='INT', type=int, default=250, - help="At a position, read maximally this number of reads \n" - "per input file (default: 250)." + help= +"""At a position, read maximally this number of reads +per input file (default: 250).""" ) def main(args): diff --git a/fuc/cli/vcf_merge.py b/fuc/cli/vcf_merge.py index 0f7a05b..60d5353 100644 --- a/fuc/cli/vcf_merge.py +++ b/fuc/cli/vcf_merge.py @@ -2,7 +2,7 @@ from .. import api -description = f""" +description = """ Merge two or more VCF files. """ diff --git a/fuc/cli/vcf_slice.py b/fuc/cli/vcf_slice.py index 3acfba2..388fcea 100644 --- a/fuc/cli/vcf_slice.py +++ b/fuc/cli/vcf_slice.py @@ -2,7 +2,7 @@ from .. import api -description = f""" +description = """ Slice a VCF file for specified regions. """ diff --git a/fuc/cli/vcf_split.py b/fuc/cli/vcf_split.py index 8761a09..8d14dd8 100644 --- a/fuc/cli/vcf_split.py +++ b/fuc/cli/vcf_split.py @@ -4,7 +4,7 @@ from .. import api -description = f""" +description = """ Split a VCF file by individual. """ diff --git a/fuc/cli/vcf_vep.py b/fuc/cli/vcf_vep.py index eb1774a..969b21f 100644 --- a/fuc/cli/vcf_vep.py +++ b/fuc/cli/vcf_vep.py @@ -2,7 +2,7 @@ from .. import api -description = f""" +description = """ Filter a VCF file by annotations from Ensembl VEP. 
""" From 2e596b84fc7ff8b987726c6c1e9efc37bad1e7bc Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Sat, 12 Feb 2022 15:26:11 +0900 Subject: [PATCH 11/22] Update `pyvcf.call` --- fuc/api/pyvcf.py | 35 ++++++++++++++++++++++++----------- fuc/cli/vcf_call.py | 2 +- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index 1fe7431..4b643e4 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -64,7 +64,7 @@ import warnings import tempfile -from . import pybed, common, pymaf +from . import pybed, common, pymaf, pybam import numpy as np import pandas as pd @@ -138,7 +138,7 @@ } def call( - fasta, bams, path=None, regions=None, min_mq=1, max_depth=250 + fasta, bams, regions=None, path=None, min_mq=1, max_depth=250 ): """ Call SNVs and indels from BAM files. @@ -152,9 +152,6 @@ def call( bams : str or list One or more input BAM files. Alternatively, you can provide a text file (.txt, .tsv, .csv, or .list) containing one BAM file per line. - path : str, optional - Output VCF file. Writes to stdout when ``path='-'``. If None is - provided the result is returned as a string. regions : str, list, or pybed.BedFrame, optional By default (``regions=None``), the method looks at each genomic position with coverage in BAM files, which can be excruciatingly slow @@ -167,6 +164,9 @@ def call( to specify regions. Note that the 'chr' prefix in contig names (e.g. 'chr1' vs. '1') will be automatically added or removed as necessary to match the input VCF's contig names. + path : str, optional + Output VCF file. Writes to stdout when ``path='-'``. If None is + provided the result is returned as a string. min_mq : int, default: 1 Minimum mapping quality for an alignment to be used. max_depth : int, default: 250 @@ -180,14 +180,28 @@ def call( # Parse input BAM files. bams = common.parse_list_or_file(bams) + # Check the 'chr' prefix. 
+ if all([pybam.has_chr_prefix(x) for x in bams]): + chr_prefix = 'chr' + else: + chr_prefix = '' + # Parse target regions, if provided. if regions is not None: if isinstance(regions, str): regions = [regions] + elif isinstance(regions, list): + pass + elif isinstance(regions, pybed.BedFrame): + regions = bf.to_regions() + else: + raise TypeError("Incorrect type of argument 'regions'") if '.bed' in regions[0]: - parsed_regions = ['-R', regions[0]] + bf = pybed.BedFrame.from_file(regions[0]) + regions = bf.to_regions() else: - parsed_regions = ['-r'] + regions + regions = common.sort_regions(regions) + regions = [chr_prefix + x.replace('chr', '') for x in regions] with tempfile.TemporaryDirectory() as t: # Step 1: Get genotype likelihoods. @@ -195,10 +209,9 @@ def call( args += ['-q', str(min_mq)] args += ['--max-depth', str(max_depth)] args += ['-f', fasta] - if parsed_regions is not None: - args += parsed_regions - args += bams - results = bcftools.mpileup(*args) + if regions is not None: + args += ['-r', ','.join(regions)] + results = bcftools.mpileup(*(args + bams)) with open(f'{t}/likelihoods.bcf', 'wb') as f: f.write(results) diff --git a/fuc/cli/vcf_call.py b/fuc/cli/vcf_call.py index d0d27c5..b200783 100644 --- a/fuc/cli/vcf_call.py +++ b/fuc/cli/vcf_call.py @@ -81,6 +81,6 @@ def create_parser(subparsers): def main(args): api.pyvcf.call( - args.fasta, args.bams, path='-', regions=args.regions, + args.fasta, args.bams, regions=args.regions, path='-', min_mq=args.min_mq, max_depth=args.max_depth ) From 66fcfb859fd31fd0d1b944eb21ee16ad19595d82 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Tue, 15 Feb 2022 10:29:08 +0900 Subject: [PATCH 12/22] Rename `pyvcf.VcfFrame.variants` to `pyvcf.VcfFrame.to_variants` --- CHANGELOG.rst | 1 + fuc/api/pyvcf.py | 204 +++++++++++++++++++++++------------------------ 2 files changed, 103 insertions(+), 102 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 97c1149..77d36ea 100644 --- a/CHANGELOG.rst 
+++ b/CHANGELOG.rst @@ -9,6 +9,7 @@ Changelog * Combine optional arguments ``bam`` and ``fn`` into single positional argument ``bams`` for :meth:`pycov.CovFrame.from_bam` method. The same goes for :command:`bam-depth` command (combine ``--bam`` and ``--fn`` into ``bams``). * Combine optional arguments ``bed`` and ``region`` into single optional argument ``regions`` for :meth:`pycov.CovFrame.from_bam` method. The same goes for :command:`bam-depth` command (combine ``--bed`` and ``--region`` into ``--regions``). * Update :meth:`pycov.CovFrame.from_bam` method and :command:`bam-depth` command to automatically handle the 'chr' string. +* Rename :meth:`pyvcf.VcfFrame.variants` method to :meth:`pyvcf.VcfFrame.to_variants`. 0.30.0 (2022-02-05) ------------------- diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index 4b643e4..18ab30f 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -2344,6 +2344,70 @@ def copy(self): """Return a copy of the VcfFrame.""" return self.__class__(self.copy_meta(), self.copy_df()) + def to_bed(self): + """Write BedFrame from the VcfFrame. + + Returns + ------- + BedFrame + BedFrame. + + Examples + -------- + Assume we have the following data: + + >>> from fuc import pyvcf + >>> data = { + ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'], + ... 'POS': [100, 101, 102, 103, 104], + ... 'ID': ['.', '.', '.', '.', '.'], + ... 'REF': ['A', 'A', 'C', 'C', 'ACGT'], + ... 'ALT': ['C', 'T,G', 'G', 'A,G,CT', 'A'], + ... 'QUAL': ['.', '.', '.', '.', '.'], + ... 'FILTER': ['.', '.', '.', '.', '.'], + ... 'INFO': ['.', '.', '.', '.', '.'], + ... 'FORMAT': ['GT:DP', 'GT:DP', 'GT:DP', 'GT:DP', 'GT:DP'], + ... 'Steven': ['0/1:32', './.:.', '0/1:27', '0/2:34', '0/0:31'], + ... 'Sara': ['0/0:28', '1/2:30', '1/1:29', '1/2:38', '0/1:27'], + ... } + >>> vf = pyvcf.VcfFrame.from_dict([], data) + >>> vf.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven Sara + 0 chr1 100 . A C . . . GT:DP 0/1:32 0/0:28 + 1 chr1 101 . A T,G . . . GT:DP ./.:. 
1/2:30 + 2 chr1 102 . C G . . . GT:DP 0/1:27 1/1:29 + 3 chr1 103 . C A,G,CT . . . GT:DP 0/2:34 1/2:38 + 4 chr1 104 . ACGT A . . . GT:DP 0/0:31 0/1:27 + + We can construct BedFrame from the VcfFrame: + + >>> bf = vf.to_bed() + >>> bf.gr.df + Chromosome Start End + 0 chr1 100 100 + 1 chr1 101 101 + 2 chr1 102 102 + 3 chr1 103 103 + 4 chr1 103 104 + 5 chr1 105 107 + """ + def one_row(r): + if len(r.REF) == len(r.ALT) == 1: + start = r.POS + end = r.POS + elif len(r.REF) > len(r.ALT): + start = r.POS + 1 + end = r.POS + len(r.REF) - len(r.ALT) + else: + start = r.POS + end = r.POS + 1 + return pd.Series([r.CHROM, start, end]) + df = self.expand().df.apply(one_row, axis=1) + df.columns = ['Chromosome', 'Start', 'End'] + df = df.drop_duplicates() + bf = pybed.BedFrame.from_frame([], df) + return bf + def to_file(self, fn, compression=False): """ Write VcfFrame to a VCF file. @@ -2425,6 +2489,44 @@ def to_string(self): ).to_csv(index=False, sep='\t') return s + def to_variants(self): + """ + List unique variants in VcfFrame. + + Returns + ------- + list + List of unique variants. + + Examples + -------- + + >>> from fuc import pyvcf + >>> data = { + ... 'CHROM': ['chr1', 'chr2'], + ... 'POS': [100, 101], + ... 'ID': ['.', '.'], + ... 'REF': ['G', 'T'], + ... 'ALT': ['A', 'A,C'], + ... 'QUAL': ['.', '.'], + ... 'FILTER': ['.', '.'], + ... 'INFO': ['.', '.'], + ... 'FORMAT': ['GT', 'GT'], + ... 'A': ['0/1', '1/2'] + ... } + >>> vf = pyvcf.VcfFrame.from_dict([], data) + >>> vf.variants() + ['chr1-100-G-A', 'chr2-101-T-A', 'chr2-101-T-C'] + """ + def one_row(r): + result = [] + for alt in r.ALT.split(','): + result.append(f'{r.CHROM}-{r.POS}-{r.REF}-{alt}') + return ','.join(result) + s = self.df.apply(one_row, axis=1) + s = ','.join(s) + return s.split(',') + def strip(self, format='GT', metadata=False): """ Remove any unnecessary data. 
@@ -4970,70 +5072,6 @@ def slice(self, region): df = df[df.POS <= end] return self.__class__(self.copy_meta(), df) - def to_bed(self): - """Write BedFrame from the VcfFrame. - - Returns - ------- - BedFrame - BedFrame. - - Examples - -------- - Assume we have the following data: - - >>> from fuc import pyvcf - >>> data = { - ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'], - ... 'POS': [100, 101, 102, 103, 104], - ... 'ID': ['.', '.', '.', '.', '.'], - ... 'REF': ['A', 'A', 'C', 'C', 'ACGT'], - ... 'ALT': ['C', 'T,G', 'G', 'A,G,CT', 'A'], - ... 'QUAL': ['.', '.', '.', '.', '.'], - ... 'FILTER': ['.', '.', '.', '.', '.'], - ... 'INFO': ['.', '.', '.', '.', '.'], - ... 'FORMAT': ['GT:DP', 'GT:DP', 'GT:DP', 'GT:DP', 'GT:DP'], - ... 'Steven': ['0/1:32', './.:.', '0/1:27', '0/2:34', '0/0:31'], - ... 'Sara': ['0/0:28', '1/2:30', '1/1:29', '1/2:38', '0/1:27'], - ... } - >>> vf = pyvcf.VcfFrame.from_dict([], data) - >>> vf.df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven Sara - 0 chr1 100 . A C . . . GT:DP 0/1:32 0/0:28 - 1 chr1 101 . A T,G . . . GT:DP ./.:. 1/2:30 - 2 chr1 102 . C G . . . GT:DP 0/1:27 1/1:29 - 3 chr1 103 . C A,G,CT . . . GT:DP 0/2:34 1/2:38 - 4 chr1 104 . ACGT A . . . GT:DP 0/0:31 0/1:27 - - We can construct BedFrame from the VcfFrame: - - >>> bf = vf.to_bed() - >>> bf.gr.df - Chromosome Start End - 0 chr1 100 100 - 1 chr1 101 101 - 2 chr1 102 102 - 3 chr1 103 103 - 4 chr1 103 104 - 5 chr1 105 107 - """ - def one_row(r): - if len(r.REF) == len(r.ALT) == 1: - start = r.POS - end = r.POS - elif len(r.REF) > len(r.ALT): - start = r.POS + 1 - end = r.POS + len(r.REF) - len(r.ALT) - else: - start = r.POS - end = r.POS + 1 - return pd.Series([r.CHROM, start, end]) - df = self.expand().df.apply(one_row, axis=1) - df.columns = ['Chromosome', 'Start', 'End'] - df = df.drop_duplicates() - bf = pybed.BedFrame.from_frame([], df) - return bf - def extract_format(self, k, func=None, as_nan=False): """ Extract data for the specified FORMAT key. 
@@ -6070,41 +6108,3 @@ def get_af(self, sample, variant): af = float(field.split(',')[j+1]) return af - - def variants(self): - """ - List unique variants in VcfFrame. - - Returns - ------- - list - List of unique variants. - - Examples - -------- - - >>> from fuc import pyvcf - >>> data = { - ... 'CHROM': ['chr1', 'chr2'], - ... 'POS': [100, 101], - ... 'ID': ['.', '.'], - ... 'REF': ['G', 'T'], - ... 'ALT': ['A', 'A,C'], - ... 'QUAL': ['.', '.'], - ... 'FILTER': ['.', '.'], - ... 'INFO': ['.', '.'], - ... 'FORMAT': ['GT', 'GT'], - ... 'A': ['0/1', '1/2'] - ... } - >>> vf = pyvcf.VcfFrame.from_dict([], data) - >>> vf.variants() - ['chr1-100-G-A', 'chr2-101-T-A', 'chr2-101-T-C'] - """ - def one_row(r): - result = [] - for alt in r.ALT.split(','): - result.append(f'{r.CHROM}-{r.POS}-{r.REF}-{alt}') - return ','.join(result) - s = self.df.apply(one_row, axis=1) - s = ','.join(s) - return s.split(',') From 5b98ffa2a558b862401f4de1c506ddc6a8926e5e Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Tue, 15 Feb 2022 10:58:52 +0900 Subject: [PATCH 13/22] Update docs --- fuc/api/pybed.py | 3 ++- fuc/api/pyfq.py | 3 ++- fuc/api/pymaf.py | 17 +++++++++++------ fuc/api/pysnpeff.py | 9 ++++++--- fuc/api/pyvcf.py | 41 +++++++++++++++++++++++++++-------------- fuc/api/pyvep.py | 15 ++++++++++----- 6 files changed, 58 insertions(+), 30 deletions(-) diff --git a/fuc/api/pybed.py b/fuc/api/pybed.py index 6383dab..d31b942 100644 --- a/fuc/api/pybed.py +++ b/fuc/api/pybed.py @@ -48,7 +48,8 @@ ] class BedFrame: - """Class for storing BED data. + """ + Class for storing BED data. Parameters ---------- diff --git a/fuc/api/pyfq.py b/fuc/api/pyfq.py index f88262b..a3e1d8b 100644 --- a/fuc/api/pyfq.py +++ b/fuc/api/pyfq.py @@ -30,7 +30,8 @@ def readlen(self): @classmethod def from_file(cls, fn): - """Construct FqFrame from a FASTQ file. + """ + Construct FqFrame from a FASTQ file. 
Parameters ---------- diff --git a/fuc/api/pymaf.py b/fuc/api/pymaf.py index 4485bab..dfbe434 100644 --- a/fuc/api/pymaf.py +++ b/fuc/api/pymaf.py @@ -198,7 +198,8 @@ SNV_CLASS_ORDER = ['C>A', 'C>G', 'C>T', 'T>A', 'T>C', 'T>G'] class MafFrame: - """Class for storing MAF data. + """ + Class for storing MAF data. Parameters ---------- @@ -2367,7 +2368,8 @@ def plot_summary( legend_fontsize=12 ): - """Create a summary figure for MafFrame. + """ + Create a summary figure for MafFrame. Parameters ---------- @@ -2709,7 +2711,8 @@ def plot_vaf( return ax def plot_varcls(self, ax=None, figsize=None, **kwargs): - """Create a bar plot for the nonsynonymous variant classes. + """ + Create a bar plot for the nonsynonymous variant classes. Parameters ---------- @@ -3350,7 +3353,7 @@ def to_vcf( Returns ------- VcfFrame - The VcfFrame object. + VcfFrame object. Examples -------- @@ -3454,7 +3457,8 @@ def one_row(r): return vf def to_file(self, fn): - """Write MafFrame to a MAF file. + """ + Write MafFrame to a MAF file. Parameters ---------- @@ -3465,7 +3469,8 @@ def to_file(self, fn): f.write(self.to_string()) def to_string(self): - """Render MafFrame to a console-friendly tabular output. + """ + Render MafFrame to a console-friendly tabular output. Returns ------- diff --git a/fuc/api/pysnpeff.py b/fuc/api/pysnpeff.py index f37ae2b..1177d54 100644 --- a/fuc/api/pysnpeff.py +++ b/fuc/api/pysnpeff.py @@ -31,7 +31,8 @@ """ def row_firstann(r): - """Return the first SnpEff annotation for the row.""" + """ + Return the first SnpEff annotation for the row.""" ann = [x for x in r.INFO.split(';') if 'ANN=' in x] if not ann: return '' @@ -39,7 +40,8 @@ def row_firstann(r): return ann def filter_ann(vf, targets, include=True): - """Filter out rows based on the SnpEff annotations. + """ + Filter out rows based on the SnpEff annotations. Parameters ---------- @@ -71,7 +73,8 @@ def func(r): return vf def parseann(vf, idx, sep=' | '): - """Parse SnpEff annotations. 
+ """ + Parse SnpEff annotations. Parameters ---------- diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index 18ab30f..67282c7 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -381,7 +381,8 @@ def gt_miss(g): return '.' in g.split(':')[0] def gt_polyp(g): - """Return True if sample genotype has a polyploid call. + """ + Return True if sample genotype has a polyploid call. Parameters ---------- @@ -1534,7 +1535,8 @@ def one_gt(g): return self.__class__(self.copy_meta(), df) def add_dp(self): - """Compute DP using AD and add it to the FORMAT field. + """ + Compute DP using AD and add it to the FORMAT field. Returns ------- @@ -1594,7 +1596,8 @@ def infunc(x): return vf def add_flag(self, flag, order='last', index=None): - """Add the given flag to the INFO field. + """ + Add the given flag to the INFO field. The default behavior is to add the flag to all rows in the VcfFrame. @@ -1697,7 +1700,8 @@ def f(r): return vf def cfilter_empty(self, opposite=False, as_list=False): - """Remove samples whose genotype calls are all missing. + """ + Remove samples whose genotype calls are all missing. Parameters ---------- @@ -1773,7 +1777,8 @@ def cfilter_empty(self, opposite=False, as_list=False): return self.subset(l) def collapse(self): - """Collapse duplicate records in the VcfFrame. + """ + Collapse duplicate records in the VcfFrame. Duplicate records have the identical values for CHROM, POS, and REF. They can result from merging two VCF files. @@ -2345,12 +2350,13 @@ def copy(self): return self.__class__(self.copy_meta(), self.copy_df()) def to_bed(self): - """Write BedFrame from the VcfFrame. + """ + Convert VcfFrame to BedFrame. Returns ------- BedFrame - BedFrame. + BedFrame obejct. Examples -------- @@ -3469,7 +3475,8 @@ def one_gt(g): return vf def expand(self): - """Expand each multiallelic locus to multiple rows. + """ + Expand each multiallelic locus to multiple rows. Only the GT subfield of FORMAT will be retained. 
@@ -3806,7 +3813,8 @@ def filter_indel(self, opposite=False, as_index=False): return self.__class__(self.copy_meta(), self.df[i]) def filter_flagall(self, flags, opposite=False, as_index=False): - """Select rows if all of the given INFO flags are present. + """ + Select rows if all of the given INFO flags are present. Parameters ---------- @@ -3889,7 +3897,8 @@ def f(r): return self.__class__(self.copy_meta(), self.df[i]) def filter_flagany(self, flags, opposite=False, as_index=False): - """Select rows if any one of the given INFO flags is present. + """ + Select rows if any one of the given INFO flags is present. Parameters ---------- @@ -4047,7 +4056,8 @@ def filter_multialt(self, opposite=False, as_index=False): return self.__class__(self.copy_meta(), self.df[i]) def filter_pass(self, opposite=False, as_index=False): - """Select rows with PASS in the FILTER field. + """ + Select rows with PASS in the FILTER field. Parameters ---------- @@ -4268,7 +4278,8 @@ def filter_polyp(self, opposite=False, as_index=False): return self.__class__(self.copy_meta(), self.df[i]) def filter_qual(self, threshold, opposite=False, as_index=False): - """Select rows with minimum QUAL value. + """ + Select rows with minimum QUAL value. Parameters ---------- @@ -4348,7 +4359,8 @@ def one_row(r): return self.__class__(self.copy_meta(), self.df[i]) def filter_sampall(self, samples=None, opposite=False, as_index=False): - """Select rows if all of the given samples have the variant. + """ + Select rows if all of the given samples have the variant. The default behavior is to use all samples in the VcfFrame. @@ -4544,7 +4556,8 @@ def filter_sampany(self, samples=None, opposite=False, as_index=False): return self.__class__(self.copy_meta(), self.df[i]) def filter_sampnum(self, threshold, opposite=False, as_index=False): - """Select rows if the variant is prevalent enough. + """ + Select rows if the variant is prevalent enough. 
Parameters ---------- diff --git a/fuc/api/pyvep.py b/fuc/api/pyvep.py index c29404b..69906c4 100644 --- a/fuc/api/pyvep.py +++ b/fuc/api/pyvep.py @@ -156,7 +156,8 @@ ] def row_firstann(r): - """Return the first result in the row. + """ + Return the first result in the row. Parameters ---------- @@ -208,7 +209,8 @@ def row_firstann(r): return results.split(',')[0] def row_mostsevere(r): - """Return result with the most severe consequence in the row. + """ + Return result with the most severe consequence in the row. Parameters ---------- @@ -260,7 +262,8 @@ def row_mostsevere(r): return sorted(results.split(','), key=f)[0] def parseann(vf, targets, sep=' | ', as_series=False): - """Parse variant annotations in VcfFrame. + """ + Parse variant annotations in VcfFrame. If there are multiple results per row, the method will use the first one for extracting requested data. @@ -391,7 +394,8 @@ def to_frame(vf, as_zero=False): return df def pick_result(vf, mode='mostsevere'): - """Return a new VcfFrame after picking one result per row. + """ + Return a new VcfFrame after picking one result per row. Parameters ---------- @@ -456,7 +460,8 @@ def pick_result(vf, mode='mostsevere'): return new_vf def annot_names(vf): - """Return the list of avaialble consequence annotations in the VcfFrame. + """ + Return the list of avaialble consequence annotations in the VcfFrame. 
Parameters ---------- From 72c5e40adb6b2fca7aaff0c3be66c7805768938d Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Tue, 15 Feb 2022 15:25:28 +0900 Subject: [PATCH 14/22] Update `pyvcf.row_updateinfo` --- CHANGELOG.rst | 1 + fuc/api/pyvcf.py | 260 +++++++++++++++++++++++++++++++++++------------ 2 files changed, 198 insertions(+), 63 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 77d36ea..152cbe1 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -10,6 +10,7 @@ Changelog * Combine optional arguments ``bed`` and ``region`` into single optional argument ``regions`` for :meth:`pycov.CovFrame.from_bam` method. The same goes for :command:`bam-depth` command (combine ``--bed`` and ``--region`` into ``--regions``). * Update :meth:`pycov.CovFrame.from_bam` method and :command:`bam-depth` command to automatically handle the 'chr' string. * Rename :meth:`pyvcf.VcfFrame.variants` method to :meth:`pyvcf.VcfFrame.to_variants`. +* Add new optional arguments ``force`` and ``missing`` to :meth:`pyvcf.row_updateinfo` method. 0.30.0 (2022-02-05) ------------------- diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index 67282c7..9bbb980 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -12,45 +12,117 @@ identifier (e.g. 'chr1'). See the VCF specification above for an example VCF file. -Genotype lines have nine required fields for storing variant information -and variable-length fields for storing sample genotype data. For some -fields, missing values are tolerated and can be specified with a dot ('.'). -The nine required fields are: - -+-----+--------+------------------------------------+------------------------+ -| No. 
| Name | Description | Examples | -+=====+========+====================================+========================+ -| 1 | CHROM | Chromosome or contig identifier | 'chr2', '2', 'chrM' | -+-----+--------+------------------------------------+------------------------+ -| 2 | POS | 1-based reference position | 10041, 23042 | -+-----+--------+------------------------------------+------------------------+ -| 3 | ID | ';'-separated variant identifiers | '.', 'rs35', 'rs9;rs53'| -+-----+--------+------------------------------------+------------------------+ -| 4 | REF | Reference allele | 'A', 'GT' | -+-----+--------+------------------------------------+------------------------+ -| 5 | ALT | ','-separated alternate alleles | 'T', 'ACT', 'C,T' | -+-----+--------+------------------------------------+------------------------+ -| 6 | QUAL | Phred-scaled quality score for ALT | '.', 67, 12 | -+-----+--------+------------------------------------+------------------------+ -| 7 | FILTER | ';'-separated filters that failed | '.', 'PASS', 'q10;s50' | -+-----+--------+------------------------------------+------------------------+ -| 8 | INFO | ';'-separated information fields | '.', 'DP=14;AF=0.5;DB' | -+-----+--------+------------------------------------+------------------------+ -| 9 | FORMAT | ':'-separated genotype fields | 'GT', 'GT:AD:DP' | -+-----+--------+------------------------------------+------------------------+ - -You will sometimes come across VCFs that have only eight columns, and contain -no FORMAT or sample-specific information. These are called "sites-only" VCFs, -and represent variation that has been observed in a population. Generally, -information about the population of origin should be included in the header. - -There are several common, reserved genotype keywords that are standards -across the community. 
Currently, the pyvcf submodule is aware of the -following: - -* AD - Total read depth for each allele (R, Integer) -* AF - Allele fraction of the event in the tumor (1, Float) -* DP - Read depth (1, Integer) +Genotype lines usually consist of nine columns for storing variant +information (all fixed and mandatory except FORMAT) plus additional columns +for storing sample genotype data. For some columns, missing values are +allowed and can be specified with a dot ('.'). The first nine columns are: + +.. list-table:: + :header-rows: 1 + + * - No. + - Column + - Description + - Required + - Missing + - Examples + * - 1 + - CHROM + - Chromosome or contig identifier + - ✅ + - ❌ + - 'chr2', '2', 'chrM' + * - 2 + - POS + - 1-based reference position + - ✅ + - ❌ + - 10041, 23042 + * - 3 + - ID + - ';'-separated variant identifiers + - ✅ + - ✅ + - '.', 'rs35', 'rs9;rs53' + * - 4 + - REF + - Reference allele + - ✅ + - ❌ + - 'A', 'GT' + * - 5 + - ALT + - ','-separated alternate alleles + - ✅ + - ❌ + - 'T', 'ACT', 'C,T' + * - 6 + - QUAL + - Phred-scaled quality score for ALT + - ✅ + - ✅ + - '.', 67, 12 + * - 7 + - FILTER + - ';'-separated filters that failed + - ✅ + - ✅ + - '.', 'PASS', 'q10;s50' + * - 8 + - INFO + - ';'-separated information fields + - ✅ + - ✅ + - '.', 'DP=14;AF=0.5;DB' + * - 9 + - FORMAT + - ':'-separated genotype fields + - ❌ + - ❌ + - 'GT', 'GT:AD:DP' + +You will sometimes come across VCF files that have only eight columns, and +do not contain the FORMAT column or sample-specific information. These are +called "sites-only" VCF files, and normally represent genetic variation that +has been observed in a large population. Generally, information about the +population of origin should be included in the header. + +There are several reserved keywords in the INFO and FORMAT columns that are +standards across the community. Popular keywords are listed below: + +.. 
list-table:: + :header-rows: 1 + + * - Column + - Key + - Number + - Type + - Description + * - INFO + - AC + - A + - Integer + - Allele count in genotypes, for each ALT allele, in the same order as listed + * - INFO + - AF + - A + - Float + - Allele frequency for each ALT allele in the same order as listed (estimated from primary data, not called genotypes) + * - FORMAT + - AD + - R + - Integer + - Total read depth for each allele + * - FORMAT + - AF + - 1 + - Float + - Allele fraction of the event in the tumor + * - FORMAT + - DP + - 1 + - Integer + - Read depth If sample annotation data are available for a given VCF file, use the :class:`common.AnnFrame` class to import the data. @@ -845,9 +917,9 @@ def one_gt(g): return '|' in g.split(':')[0] return r[9:].apply(one_gt).all() -def row_updateinfo(r, key, value): +def row_updateinfo(r, key, value, force=True, missing=False): """ - Update INFO data in the row that match the given key. + Update INFO field matching specified key. Parameters ---------- @@ -857,6 +929,12 @@ def row_updateinfo(r, key, value): INFO key. value : str New value to be assigned. + force : bool, default: True + If True, overwrite any existing data. + missing : bool, default: False + By default (``missing=False``), the method will not update INFO field + if there is missing value ('.'). Setting ``missing=True`` will stop + this behavior. Returns ------- @@ -868,36 +946,62 @@ def row_updateinfo(r, key, value): >>> from fuc import pyvcf >>> data = { - ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1'], - ... 'POS': [100, 101, 102, 103], - ... 'ID': ['.', '.', '.', '.'], - ... 'REF': ['G', 'T', 'A', 'C'], - ... 'ALT': ['A', 'C', 'T', 'A'], - ... 'QUAL': ['.', '.', '.', '.'], - ... 'FILTER': ['.', '.', '.', '.'], - ... 'INFO': ['DB;AC=0', 'DB;H2;AC=1', 'DB;H2;AC=1', '.'], - ... 'FORMAT': ['GT', 'GT', 'GT', 'GT'], - ... 'Steven': ['0/0', '0/1', '0/1', '0/0'], + ... 'CHROM': ['chr1', 'chr1', 'chr1'], + ... 'POS': [100, 101, 102], + ... 
'ID': ['.', '.', '.'], + ... 'REF': ['G', 'T', 'A'], + ... 'ALT': ['A', 'C', 'T'], + ... 'QUAL': ['.', '.', '.'], + ... 'FILTER': ['.', '.', '.'], + ... 'INFO': ['DB;AC=1', 'DB', '.'], + ... 'FORMAT': ['GT', 'GT', 'GT'], + ... 'A': ['0/1', '1/1', '0/0'], + ... 'B': ['0/0', '0/1', '0/1'], ... } >>> vf = pyvcf.VcfFrame.from_dict([], data) >>> vf.df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven - 0 chr1 100 . G A . . DB;AC=0 GT 0/0 - 1 chr1 101 . T C . . DB;H2;AC=1 GT 0/1 - 2 chr1 102 . A T . . DB;H2;AC=1 GT 0/1 - 3 chr1 103 . C A . . . GT 0/0 - >>> vf.df.apply(pyvcf.row_updateinfo, args=('AC', '4'), axis=1) - 0 DB;AC=4 - 1 DB;H2;AC=4 - 2 DB;H2;AC=4 - 3 . - dtype: object + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B + 0 chr1 100 . G A . . DB;AC=1 GT 0/1 0/0 + 1 chr1 101 . T C . . DB GT 1/1 0/1 + 2 chr1 102 . A T . . . GT 0/0 0/1 + >>> pyvcf.row_updateinfo(vf.df.iloc[0, :], 'AC', '100') + 'DB;AC=100' + >>> pyvcf.row_updateinfo(vf.df.iloc[0, :], 'AC', '100', force=False) + 'DB;AC=1' + >>> pyvcf.row_updateinfo(vf.df.iloc[1, :], 'AC', '100') + 'DB;AC=100' + >>> pyvcf.row_updateinfo(vf.df.iloc[1, :], 'AC', '100', force=False) + 'DB;AC=100' + >>> pyvcf.row_updateinfo(vf.df.iloc[2, :], 'AC', '100') + '.' + >>> pyvcf.row_updateinfo(vf.df.iloc[2, :], 'AC', '100', force=False) + '.' + >>> pyvcf.row_updateinfo(vf.df.iloc[2, :], 'AC', '100', missing=True) + 'AC=100' """ + data = f'{key}={value}' + + if not missing and r.INFO == '.': + return r.INFO + fields = r.INFO.split(';') + found = False + for i, field in enumerate(fields): if field.startswith(f'{key}='): - fields[i] = field[:len(key)+1] + value + found = True + if force: + fields[i] = data + else: + pass break + + if not found: + if r.INFO == '.': + fields = [data] + else: + fields.append(data) + return ';'.join(fields) def row_missval(r): @@ -1595,6 +1699,36 @@ def infunc(x): vf = self.__class__(self.copy_meta(), df) return vf + def add_info_ac(self, force=True): + """ + Add the AC field to the INFO column. 
+ + Parameters + ---------- + force : bool, default: True + If True, overwrite any existing data. + + Returns + ------- + VcfFrame + Updated VcfFrame. + """ + def one_row(r): + data = [] + for i, allele in enumerate(r.ALT.split(',')): + count = r[9:].apply(lambda x: x.split('/').count(str(i))) + data.append(str(sum(count))) + data = 'AC=' + ','.join(data) + if r.INFO == '.': + r.INFO = data + else: + r.INFO += ';' + data + return r + + df = self.df.apply(one_row, axis=1) + + return self.__class__(self.copy_meta(), df) + def add_flag(self, flag, order='last', index=None): """ Add the given flag to the INFO field. From 6f1e149a391a40a557d58c99b7c43191400aea37 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Tue, 15 Feb 2022 15:38:27 +0900 Subject: [PATCH 15/22] Update docs --- fuc/api/pyvep.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fuc/api/pyvep.py b/fuc/api/pyvep.py index 69906c4..9006495 100644 --- a/fuc/api/pyvep.py +++ b/fuc/api/pyvep.py @@ -415,6 +415,7 @@ def pick_result(vf, mode='mostsevere'): Examples -------- + >>> from fuc import pyvep, pyvcf >>> meta = [ ... '##fileformat=VCFv4.1', ... '##VEP="v104" time="2021-05-20 10:50:12" cache="/net/isilonP/public/ro/ensweb-data/latest/tools/grch37/e104/vep/cache/homo_sapiens/104_GRCh37" db="homo_sapiens_core_104_37@hh-mysql-ens-grch37-web" 1000genomes="phase3" COSMIC="92" ClinVar="202012" HGMD-PUBLIC="20204" assembly="GRCh37.p13" dbSNP="154" gencode="GENCODE 19" genebuild="2011-04" gnomAD="r2.1" polyphen="2.2.2" regbuild="1.0" sift="sift5.2.2"', @@ -441,16 +442,16 @@ def pick_result(vf, mode='mostsevere'): >>> vf1 = pyvep.pick_result(vf, mode='mostsevere') >>> vf2 = pyvep.pick_result(vf, mode='firstann') >>> vf1.df.INFO - 0 CSQ=C|missense_variant|MODERATE|SPEN|ENSG00000... - 1 CSQ=T|missense_variant|MODERATE|NRAS|ENSG00000... - 2 CSQ=A|stop_gained&NMD_transcript_variant|HIGH|... - 3 CSQ=-|inframe_deletion|MODERATE|HRAS|ENSG00000... 
+ 0 CSQ=C|missense_variant|MODERATE|SPEN|ENSG00000065526|Transcript|ENST00000375759.3|protein_coding|12/15||||10322|10118|3373|Q/P|cAg/cCg|||1||HGNC|17575|||||tolerated_low_confidence(0.08)|possibly_damaging(0.718)|||||||||| + 1 CSQ=T|missense_variant|MODERATE|NRAS|ENSG00000213281|Transcript|ENST00000369535.4|protein_coding|3/7||||502|248|83|A/D|gCc/gAc|COSV99795232||-1||HGNC|7989|||||deleterious(0.02)|probably_damaging(0.946)|||1|1|||||| + 2 CSQ=A|stop_gained&NMD_transcript_variant|HIGH|NTRK1|ENSG00000198400|Transcript|ENST00000497019.2|nonsense_mediated_decay|10/16||||1183|1032|344|W/*|tgG/tgA|COSV62328771||1||HGNC|8031|||||||||1|1|||||| + 3 CSQ=-|inframe_deletion|MODERATE|HRAS|ENSG00000174775|Transcript|ENST00000311189.7|protein_coding|2/6||||198-200|26-28|9-10|VG/G|gTGGgc/ggc|rs1164486792||-1||HGNC|5173||||||||||1|||||| Name: INFO, dtype: object >>> vf2.df.INFO - 0 CSQ=C|downstream_gene_variant|MODIFIER|ZBTB17|... - 1 CSQ=T|downstream_gene_variant|MODIFIER|CSDE1|E... - 2 CSQ=A|missense_variant|MODERATE|NTRK1|ENSG0000... - 3 CSQ=-|upstream_gene_variant|MODIFIER|LRRC56|EN... 
+ 0 CSQ=C|downstream_gene_variant|MODIFIER|ZBTB17|ENSG00000116809|Transcript|ENST00000375733.2|protein_coding|||||||||||4617|-1||HGNC|12936|||||||||||||||| + 1 CSQ=T|downstream_gene_variant|MODIFIER|CSDE1|ENSG00000009307|Transcript|ENST00000261443.5|protein_coding||||||||||COSV99795232|3453|-1||HGNC|29905|||||||||1|1|||||| + 2 CSQ=A|missense_variant|MODERATE|NTRK1|ENSG00000198400|Transcript|ENST00000358660.3|protein_coding|10/16||||1296|1255|419|A/T|Gcc/Acc|COSV62328771||1||HGNC|8031|||||deleterious(0.01)|probably_damaging(0.999)|||1|1|||||| + 3 CSQ=-|upstream_gene_variant|MODIFIER|LRRC56|ENSG00000161328|Transcript|ENST00000270115.7|protein_coding||||||||||rs1164486792|3230|1||HGNC|25430||||||||||1|||||| Name: INFO, dtype: object """ funcs = {'mostsevere': row_mostsevere, 'firstann': row_firstann} From 3cb733ffea798ada17732d82d25f0573dd4ca8d1 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Tue, 15 Feb 2022 17:26:39 +0900 Subject: [PATCH 16/22] Update `pyvcf`: * Add new method :meth:`pyvcf.gt_ploidy`. * Update :meth:`pyvcf.gt_polyp` method to use :meth:`pyvcf.gt_ploidy` method internally. * :issue:`53`: Add new methods :meth:`pyvcf.row_compute_info` and :meth:`pyvcf.VcfFrame.compute_info`. --- CHANGELOG.rst | 3 + fuc/api/pyvcf.py | 240 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 206 insertions(+), 37 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 152cbe1..78075ec 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -11,6 +11,9 @@ Changelog * Update :meth:`pycov.CovFrame.from_bam` method and :command:`bam-depth` command to automatically handle the 'chr' string. * Rename :meth:`pyvcf.VcfFrame.variants` method to :meth:`pyvcf.VcfFrame.to_variants`. * Add new optional arguments ``force`` and ``missing`` to :meth:`pyvcf.row_updateinfo` method. +* Add new method :meth:`pyvcf.gt_ploidy`. +* Update :meth:`pyvcf.gt_polyp` method to use :meth:`pyvcf.gt_ploidy` method internally. 
+* :issue:`53`: Add new methods :meth:`pyvcf.row_compute_info` and :meth:`pyvcf.VcfFrame.compute_info`. 0.30.0 (2022-02-05) ------------------- diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index 9bbb980..a80fa59 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -103,6 +103,11 @@ - A - Integer - Allele count in genotypes, for each ALT allele, in the same order as listed + * - INFO + - AN + - 1 + - Integer + - Total number of alleles in called genotypes * - INFO - AF - A @@ -452,9 +457,50 @@ def gt_miss(g): """ return '.' in g.split(':')[0] +def gt_ploidy(g): + """ + For given genotype, determine its ploidy. + + Parameters + ---------- + g : str + Sample genotype. + + Returns + ------- + int + Ploidy number. + + Examples + -------- + + >>> from fuc import pyvcf + >>> pyvcf.gt_ploidy('1') + 1 + >>> pyvcf.gt_ploidy('.') + 1 + >>> pyvcf.gt_ploidy('0/1') + 2 + >>> pyvcf.gt_ploidy('./.') + 2 + >>> pyvcf.gt_ploidy('0|1') + 2 + >>> pyvcf.gt_ploidy('1|0|1') + 3 + >>> pyvcf.gt_ploidy('0/./1/1') + 4 + """ + gt = g.split(':')[0] + if '/' in gt: + return gt.count('/') + 1 + elif '|' in gt: + return gt.count('|') + 1 + else: + return 1 + def gt_polyp(g): """ - Return True if sample genotype has a polyploid call. + For given genotype, return True if it's polyploid. Parameters ---------- @@ -464,7 +510,7 @@ def gt_polyp(g): Returns ------- bool - True if sample genotype has a polyploid call. + True if genotype is polyploid. Examples -------- @@ -481,11 +527,7 @@ def gt_polyp(g): >>> pyvcf.gt_polyp('0/./1/1') True """ - gt = g.split(':')[0] - if '/' in gt: - return gt.count('/') > 1 - else: - return gt.count('|') > 1 + return gt_ploidy(g) > 2 def gt_hasvar(g): """ @@ -819,6 +861,106 @@ def row_hasindel(r): alt_has = max([len(x) for x in r['ALT'].split(',')]) > 1 return ref_has or alt_has + +def row_compute_info(r, key, decimals=3): + """ + For given row, compute AC/AN/AF in INFO column. + + Parameters + ---------- + r : pandas.Series + VCF row. 
+ key : {'AC', 'AN', 'AF'} + INFO key. + decimals : int, default: 3 + Number of decimals to display for AF. + + Returns + ------- + str + Requested INFO data. + + Example + ------- + + >>> from fuc import pyvcf + >>> data = { + ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chrX'], + ... 'POS': [100, 101, 102, 100], + ... 'ID': ['.', '.', '.', '.'], + ... 'REF': ['G', 'T', 'A', 'A'], + ... 'ALT': ['A', 'C', 'T,G', 'G'], + ... 'QUAL': ['.', '.', '.', '.'], + ... 'FILTER': ['.', '.', '.', '.'], + ... 'INFO': ['.', '.', '.', '.'], + ... 'FORMAT': ['GT', 'GT', 'GT', 'GT'], + ... 'A': ['0/1', '0/0', '0/1', '0'], + ... 'B': ['1/1', './.', '0/0', '0/1'], + ... 'C': ['0/0', '0/0', '1/2', '1'], + ... } + >>> vf = pyvcf.VcfFrame.from_dict([], data) + >>> vf.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C + 0 chr1 100 . G A . . . GT 0/1 1/1 0/0 + 1 chr1 101 . T C . . . GT 0/0 ./. 0/0 + 2 chr1 102 . A T,G . . . GT 0/1 0/0 1/2 + 3 chrX 100 . A G . . . GT 0 0/1 1 + >>> pyvcf.row_compute_info(vf.df.iloc[0, :], 'AC') + '3' + >>> pyvcf.row_compute_info(vf.df.iloc[0, :], 'AN') + '6' + >>> pyvcf.row_compute_info(vf.df.iloc[0, :], 'AF') + '0.500' + >>> pyvcf.row_compute_info(vf.df.iloc[1, :], 'AC') + '0' + >>> pyvcf.row_compute_info(vf.df.iloc[1, :], 'AN') + '4' + >>> pyvcf.row_compute_info(vf.df.iloc[1, :], 'AF') + '0.000' + >>> pyvcf.row_compute_info(vf.df.iloc[2, :], 'AC') + '2,1' + >>> pyvcf.row_compute_info(vf.df.iloc[2, :], 'AN') + '6' + >>> pyvcf.row_compute_info(vf.df.iloc[2, :], 'AF') + '0.333,0.167' + >>> pyvcf.row_compute_info(vf.df.iloc[3, :], 'AC') + '2' + >>> pyvcf.row_compute_info(vf.df.iloc[3, :], 'AN') + '4' + >>> pyvcf.row_compute_info(vf.df.iloc[3, :], 'AF') + '0.500' + """ + def get_ac(r): + counts = [] + for i, allele in enumerate(r.ALT.split(',')): + count = r[9:].apply(lambda x: x.split('/').count(str(i+1))) + counts.append(sum(count)) + return counts + + def get_an(r): + def one_gt(g): + if '.' 
in g: + return 0 + else: + return gt_ploidy(g) + return r[9:].apply(one_gt).sum() + + def get_af(r): + return [x / get_an(r) for x in get_ac(r)] + + methods = {'AC': get_ac, 'AN': get_an, 'AF': get_af} + + results = methods[key](r) + + if key == 'AC': + results = ','.join([str(x) for x in results]) + elif key == 'AN': + results = str(results) + else: + results = ','.join([f'{x:.{decimals}f}' for x in results]) + + return results + def row_parseinfo(r, key): """ Return INFO data in the row that match the given key. @@ -1699,36 +1841,6 @@ def infunc(x): vf = self.__class__(self.copy_meta(), df) return vf - def add_info_ac(self, force=True): - """ - Add the AC field to the INFO column. - - Parameters - ---------- - force : bool, default: True - If True, overwrite any existing data. - - Returns - ------- - VcfFrame - Updated VcfFrame. - """ - def one_row(r): - data = [] - for i, allele in enumerate(r.ALT.split(',')): - count = r[9:].apply(lambda x: x.split('/').count(str(i))) - data.append(str(sum(count))) - data = 'AC=' + ','.join(data) - if r.INFO == '.': - r.INFO = data - else: - r.INFO += ';' + data - return r - - df = self.df.apply(one_row, axis=1) - - return self.__class__(self.copy_meta(), df) - def add_flag(self, flag, order='last', index=None): """ Add the given flag to the INFO field. @@ -2036,6 +2148,60 @@ def raise_error(c): vf = self.__class__(self.copy_meta(), df) return vf + def compute_info(self, key): + """ + Compute AC/AN/AF in INFO column. + + The method will ignore and overwrite any existing data for selected + key. + + Returns + ------- + VcfFrame + Updated VcfFrame. + key : {'AC', 'AN', 'AF'} + INFO key. + + Example + ------- + + >>> from fuc import pyvcf + >>> data = { + ... 'CHROM': ['chr1', 'chr1', 'chr1'], + ... 'POS': [100, 101, 102], + ... 'ID': ['.', '.', '.'], + ... 'REF': ['G', 'T', 'A'], + ... 'ALT': ['A', 'C', 'T,G'], + ... 'QUAL': ['.', '.', '.'], + ... 'FILTER': ['.', '.', '.'], + ... 'INFO': ['AC=100', 'MQ=59', '.'], + ... 
'FORMAT': ['GT', 'GT', 'GT'], + ... 'A': ['0/1', '0/0', '0/1'], + ... 'B': ['1/1', '0/1', '0/0'], + ... 'C': ['0/0', '0/0', '1/2'], + ... } + >>> vf = pyvcf.VcfFrame.from_dict([], data) + >>> vf.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C + 0 chr1 100 . G A . . AC=100 GT 0/1 1/1 0/0 + 1 chr1 101 . T C . . MQ=59 GT 0/0 0/1 0/0 + 2 chr1 102 . A T,G . . . GT 0/1 0/0 1/2 + >>> vf = vf.add_info_ac() + >>> vf.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C + 0 chr1 100 . G A . . AC=3 GT 0/1 1/1 0/0 + 1 chr1 101 . T C . . MQ=59;AC=5 GT 0/0 0/1 0/0 + 2 chr1 102 . A T,G . . AC=3,2 GT 0/1 0/0 1/2 + """ + def one_row(r, key): + data = row_compute_info(r, key) + r.INFO = row_updateinfo(r, key, data, missing=True) + return r + + df = self.df.apply(one_row, args=(key,), axis=1) + + return self.__class__(self.copy_meta(), df) + @classmethod def from_dict(cls, meta, data): """ From b2239ad7ef0977636a244a86bd7f0c5763ab7e0f Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Tue, 15 Feb 2022 20:53:38 +0900 Subject: [PATCH 17/22] Update API --- CHANGELOG.rst | 2 +- fuc/api/pyvcf.py | 152 +++++++++++++++++++++++++++-------------------- 2 files changed, 89 insertions(+), 65 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 78075ec..a3c7bb0 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -13,7 +13,7 @@ Changelog * Add new optional arguments ``force`` and ``missing`` to :meth:`pyvcf.row_updateinfo` method. * Add new method :meth:`pyvcf.gt_ploidy`. * Update :meth:`pyvcf.gt_polyp` method to use :meth:`pyvcf.gt_ploidy` method internally. -* :issue:`53`: Add new methods :meth:`pyvcf.row_compute_info` and :meth:`pyvcf.VcfFrame.compute_info`. +* :issue:`53`: Add new methods to compute AC/AN/AF in the INFO column: :meth:`pyvcf.row_computeinfo` and :meth:`pyvcf.VcfFrame.compute_info`. 
0.30.0 (2022-02-05) ------------------- diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index a80fa59..a48cbac 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -13,9 +13,10 @@ VCF file. Genotype lines usually consist of nine columns for storing variant -information (all fixed and mandatory except FORMAT) plus additional columns -for storing sample genotype data. For some columns, missing values are -allowed and can be specified with a dot ('.'). The first nine columns are: +information (all fixed and mandatory except for the FORMAT column) plus +additional sample-specific columns for expressing individual genotype calls +(e.g. '0/1'). Missing values are allowed in some cases and can be specified +with a dot ('.'). The first nine columns are: .. list-table:: :header-rows: 1 @@ -422,7 +423,7 @@ def rescue_filtered_variants(vfs, format='GT'): def gt_miss(g): """ - Return True if sample genotype is missing. + For given genotype, return True if it has missing value. Parameters ---------- @@ -432,7 +433,7 @@ def gt_miss(g): Returns ------- bool - True if sample genotype is missing. + True if genotype is missing. Examples -------- @@ -459,7 +460,7 @@ def gt_miss(g): def gt_ploidy(g): """ - For given genotype, determine its ploidy. + For given genotype, return its ploidy number. Parameters ---------- @@ -500,7 +501,7 @@ def gt_ploidy(g): def gt_polyp(g): """ - For given genotype, return True if it's polyploid. + For given genotype, return True if it is polyploid. Parameters ---------- @@ -531,7 +532,7 @@ def gt_polyp(g): def gt_hasvar(g): """ - Return True if sample genotype has at least one variant call. + For given genotype, return True if it has variation. Parameters ---------- @@ -541,7 +542,7 @@ def gt_hasvar(g): Returns ------- bool - True if sample genotype has a variant call. + True if genotype has variation. Examples -------- @@ -572,7 +573,7 @@ def gt_hasvar(g): def gt_unphase(g): """ - Return unphased sample genotype. 
+ For given genotype, return its unphased form. Parameters ---------- @@ -620,7 +621,7 @@ def gt_unphase(g): def gt_het(g): """ - Return True if genotype call is heterozygous. + For given genotype, return True if it is heterozygous. Parameters ---------- @@ -630,7 +631,7 @@ def gt_het(g): Returns ------- bool - True if genotype call is heterozygous. + True if genotype is heterozygous. Examples -------- @@ -657,7 +658,7 @@ def gt_het(g): def gt_pseudophase(g): """ - Return pseudophased genotype call. + For given genotype, return its pseudophased form. Parameters ---------- @@ -815,7 +816,7 @@ def merge( def row_hasindel(r): """ - Return True if the row has an indel. + For given row, return True if it has indel. Parameters ---------- @@ -862,9 +863,9 @@ def row_hasindel(r): return ref_has or alt_has -def row_compute_info(r, key, decimals=3): +def row_computeinfo(r, key, decimals=3): """ - For given row, compute AC/AN/AF in INFO column. + For given row, return AC/AN/AF calculation in INFO column. Parameters ---------- @@ -893,47 +894,54 @@ def row_compute_info(r, key, decimals=3): ... 'QUAL': ['.', '.', '.', '.'], ... 'FILTER': ['.', '.', '.', '.'], ... 'INFO': ['.', '.', '.', '.'], - ... 'FORMAT': ['GT', 'GT', 'GT', 'GT'], - ... 'A': ['0/1', '0/0', '0/1', '0'], - ... 'B': ['1/1', './.', '0/0', '0/1'], - ... 'C': ['0/0', '0/0', '1/2', '1'], + ... 'FORMAT': ['GT:DP', 'GT', 'GT', 'GT'], + ... 'A': ['1|0:29', '0|0', '1|0', '0'], + ... 'B': ['1/1:34', './.', '0/0', '0/1'], + ... 'C': ['0/0:23', '0/0', '1/2', '1'], ... } >>> vf = pyvcf.VcfFrame.from_dict([], data) >>> vf.df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C - 0 chr1 100 . G A . . . GT 0/1 1/1 0/0 - 1 chr1 101 . T C . . . GT 0/0 ./. 0/0 - 2 chr1 102 . A T,G . . . GT 0/1 0/0 1/2 - 3 chrX 100 . A G . . . GT 0 0/1 1 - >>> pyvcf.row_compute_info(vf.df.iloc[0, :], 'AC') + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C + 0 chr1 100 . G A . . . GT:DP 1|0:29 1/1:34 0/0:23 + 1 chr1 101 . T C . . . GT 0|0 ./. 
0/0 + 2 chr1 102 . A T,G . . . GT 1|0 0/0 1/2 + 3 chrX 100 . A G . . . GT 0 0/1 1 + >>> pyvcf.row_computeinfo(vf.df.iloc[0, :], 'AC') '3' - >>> pyvcf.row_compute_info(vf.df.iloc[0, :], 'AN') + >>> pyvcf.row_computeinfo(vf.df.iloc[0, :], 'AN') '6' - >>> pyvcf.row_compute_info(vf.df.iloc[0, :], 'AF') + >>> pyvcf.row_computeinfo(vf.df.iloc[0, :], 'AF') '0.500' - >>> pyvcf.row_compute_info(vf.df.iloc[1, :], 'AC') + >>> pyvcf.row_computeinfo(vf.df.iloc[1, :], 'AC') '0' - >>> pyvcf.row_compute_info(vf.df.iloc[1, :], 'AN') + >>> pyvcf.row_computeinfo(vf.df.iloc[1, :], 'AN') '4' - >>> pyvcf.row_compute_info(vf.df.iloc[1, :], 'AF') + >>> pyvcf.row_computeinfo(vf.df.iloc[1, :], 'AF') '0.000' - >>> pyvcf.row_compute_info(vf.df.iloc[2, :], 'AC') + >>> pyvcf.row_computeinfo(vf.df.iloc[2, :], 'AC') '2,1' - >>> pyvcf.row_compute_info(vf.df.iloc[2, :], 'AN') + >>> pyvcf.row_computeinfo(vf.df.iloc[2, :], 'AN') '6' - >>> pyvcf.row_compute_info(vf.df.iloc[2, :], 'AF') + >>> pyvcf.row_computeinfo(vf.df.iloc[2, :], 'AF') '0.333,0.167' - >>> pyvcf.row_compute_info(vf.df.iloc[3, :], 'AC') + >>> pyvcf.row_computeinfo(vf.df.iloc[3, :], 'AC') '2' - >>> pyvcf.row_compute_info(vf.df.iloc[3, :], 'AN') + >>> pyvcf.row_computeinfo(vf.df.iloc[3, :], 'AN') '4' - >>> pyvcf.row_compute_info(vf.df.iloc[3, :], 'AF') + >>> pyvcf.row_computeinfo(vf.df.iloc[3, :], 'AF') '0.500' """ def get_ac(r): + def one_gt(g, i): + gt = g.split(':')[0] + if '/' in gt: + l = gt.split('/') + else: + l = gt.split('|') + return l.count(str(i + 1)) counts = [] for i, allele in enumerate(r.ALT.split(',')): - count = r[9:].apply(lambda x: x.split('/').count(str(i+1))) + count = r[9:].apply(one_gt, args=(i, )) counts.append(sum(count)) return counts @@ -963,7 +971,7 @@ def get_af(r): def row_parseinfo(r, key): """ - Return INFO data in the row that match the given key. + For given row, return requested data in INFO column. 
Parameters ---------- @@ -1015,7 +1023,7 @@ def row_parseinfo(r, key): def row_phased(r): """ - Return True if every genotype in the row is haplotype phased. + For given row, return True if all genotypes are phased. Parameters ---------- @@ -1061,7 +1069,7 @@ def one_gt(g): def row_updateinfo(r, key, value, force=True, missing=False): """ - Update INFO field matching specified key. + For given row, return updated data in INFO column. Parameters ---------- @@ -1148,7 +1156,7 @@ def row_updateinfo(r, key, value, force=True, missing=False): def row_missval(r): """ - Return the correctly formatted missing value for the row. + For given row, return formatted missing genotype. Parameters ---------- @@ -2167,34 +2175,50 @@ def compute_info(self, key): >>> from fuc import pyvcf >>> data = { - ... 'CHROM': ['chr1', 'chr1', 'chr1'], - ... 'POS': [100, 101, 102], - ... 'ID': ['.', '.', '.'], - ... 'REF': ['G', 'T', 'A'], - ... 'ALT': ['A', 'C', 'T,G'], - ... 'QUAL': ['.', '.', '.'], - ... 'FILTER': ['.', '.', '.'], - ... 'INFO': ['AC=100', 'MQ=59', '.'], - ... 'FORMAT': ['GT', 'GT', 'GT'], - ... 'A': ['0/1', '0/0', '0/1'], - ... 'B': ['1/1', '0/1', '0/0'], - ... 'C': ['0/0', '0/0', '1/2'], + ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chrX'], + ... 'POS': [100, 101, 102, 100], + ... 'ID': ['.', '.', '.', '.'], + ... 'REF': ['G', 'T', 'A', 'C'], + ... 'ALT': ['A', 'C', 'T,G', 'A'], + ... 'QUAL': ['.', '.', '.', '.'], + ... 'FILTER': ['.', '.', '.', '.'], + ... 'INFO': ['AC=100', 'MQ=59', '.', '.'], + ... 'FORMAT': ['GT:DP', 'GT', 'GT', 'GT'], + ... 'A': ['1|0:34', '0|0', '1|0', '0'], + ... 'B': ['1/1:23', '0/1', '0/0', '0/0'], + ... 'C': ['0/0:28', './.', '1/2', '1'], ... } >>> vf = pyvcf.VcfFrame.from_dict([], data) >>> vf.df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C - 0 chr1 100 . G A . . AC=100 GT 0/1 1/1 0/0 - 1 chr1 101 . T C . . MQ=59 GT 0/0 0/1 0/0 - 2 chr1 102 . A T,G . . . 
GT 0/1 0/0 1/2 - >>> vf = vf.add_info_ac() + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C + 0 chr1 100 . G A . . AC=100 GT:DP 1|0:34 1/1:23 0/0:28 + 1 chr1 101 . T C . . MQ=59 GT 0|0 0/1 ./. + 2 chr1 102 . A T,G . . . GT 1|0 0/0 1/2 + 3 chrX 100 . C A . . . GT 0 0/0 1 + >>> vf = vf.compute_info('AC') + >>> vf.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C + 0 chr1 100 . G A . . AC=1 GT:DP 1|0:34 1/1:23 0/0:28 + 1 chr1 101 . T C . . MQ=59;AC=1 GT 0|0 0/1 ./. + 2 chr1 102 . A T,G . . AC=1,1 GT 1|0 0/0 1/2 + 3 chrX 100 . C A . . AC=1 GT 0 0/0 1 + >>> vf = vf.compute_info('AN') + >>> vf.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C + 0 chr1 100 . G A . . AC=1;AN=6 GT:DP 1|0:34 1/1:23 0/0:28 + 1 chr1 101 . T C . . MQ=59;AC=1;AN=4 GT 0|0 0/1 ./. + 2 chr1 102 . A T,G . . AC=1,1;AN=6 GT 1|0 0/0 1/2 + 3 chrX 100 . C A . . AC=1;AN=4 GT 0 0/0 1 + >>> vf = vf.compute_info('AF') >>> vf.df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C - 0 chr1 100 . G A . . AC=3 GT 0/1 1/1 0/0 - 1 chr1 101 . T C . . MQ=59;AC=5 GT 0/0 0/1 0/0 - 2 chr1 102 . A T,G . . AC=3,2 GT 0/1 0/0 1/2 + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C + 0 chr1 100 . G A . . AC=1;AN=6;AF=0.167 GT:DP 1|0:34 1/1:23 0/0:28 + 1 chr1 101 . T C . . MQ=59;AC=1;AN=4;AF=0.250 GT 0|0 0/1 ./. + 2 chr1 102 . A T,G . . AC=1,1;AN=6;AF=0.167,0.167 GT 1|0 0/0 1/2 + 3 chrX 100 . C A . . AC=1;AN=4;AF=0.250 GT 0 0/0 1 """ def one_row(r, key): - data = row_compute_info(r, key) + data = row_computeinfo(r, key) r.INFO = row_updateinfo(r, key, data, missing=True) return r From e957537e77eabb1fb96828c6a1be735461563cef Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Thu, 17 Feb 2022 17:26:39 +0900 Subject: [PATCH 18/22] Update `pyvcf.VcfFrame.cfilter_empty` (#54): * :issue:`54: Update :meth:`pyvcf.VcfFrame.cfilter_empty` method so that users can control missingness threshold for filtering samples. 
* Rename :meth:`pyvcf.VcfFrame.cfilter_empty` method to :meth:`pyvcf.VcfFrame.empty_samples`. --- CHANGELOG.rst | 2 + fuc/api/pyvcf.py | 374 ++++++++++++++++++++++++++--------------------- 2 files changed, 211 insertions(+), 165 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index a3c7bb0..5da424e 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -14,6 +14,8 @@ Changelog * Add new method :meth:`pyvcf.gt_ploidy`. * Update :meth:`pyvcf.gt_polyp` method to use :meth:`pyvcf.gt_ploidy` method internally. * :issue:`53`: Add new methods to compute AC/AN/AF in the INFO column: :meth:`pyvcf.row_computeinfo` and :meth:`pyvcf.VcfFrame.compute_info`. +* :issue:`54: Update :meth:`pyvcf.VcfFrame.cfilter_empty` method so that users can control missingness threshold for filtering samples. +* Rename :meth:`pyvcf.VcfFrame.cfilter_empty` method to :meth:`pyvcf.VcfFrame.empty_samples`. 0.30.0 (2022-02-05) ------------------- diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index a48cbac..f3f87d7 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -1953,83 +1953,6 @@ def f(r): vf = self.__class__(self.copy_meta(), df) return vf - def cfilter_empty(self, opposite=False, as_list=False): - """ - Remove samples whose genotype calls are all missing. - - Parameters - ---------- - opposite : bool, default: False - If True, return samples that don't meet the said criteria. - as_list : bool, default: False - If True, return a list of sample names instead of a VcfFrame. - - Returns - ------- - VcfFrame - Filtered VcfFrame. - - Examples - -------- - Assume we have the following data: - - >>> data = { - ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1'], - ... 'POS': [100, 101, 102, 103], - ... 'ID': ['.', '.', '.', '.'], - ... 'REF': ['G', 'T', 'G', 'T'], - ... 'ALT': ['A', 'C', 'C', 'C'], - ... 'QUAL': ['.', '.', '.', '.'], - ... 'FILTER': ['.', '.', '.', '.'], - ... 'INFO': ['.', '.', '.', '.'], - ... 'FORMAT': ['GT', 'GT', 'GT', 'GT'], - ... 
'Steven': ['0/1', '1/1', '1/1', '1/1'], - ... 'Rachel': ['./.', './.', './.', './.'], - ... 'John': ['0/0', './.', '0/0', '0/0'], - ... 'Sara': ['./.', './.', './.', './.'], - ... } - >>> vf = pyvcf.VcfFrame.from_dict([], data) - >>> vf.df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven Rachel John Sara - 0 chr1 100 . G A . . . GT 0/1 ./. 0/0 ./. - 1 chr1 101 . T C . . . GT 1/1 ./. ./. ./. - 2 chr1 102 . G C . . . GT 1/1 ./. 0/0 ./. - 3 chr1 103 . T C . . . GT 1/1 ./. 0/0 ./. - - We can remove samples whose genotypes are all missing: - - >>> vf.cfilter_empty().df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven John - 0 chr1 100 . G A . . . GT 0/1 0/0 - 1 chr1 101 . T C . . . GT 1/1 ./. - 2 chr1 102 . G C . . . GT 1/1 0/0 - 3 chr1 103 . T C . . . GT 1/1 0/0 - - We can also select those samples: - - >>> vf.cfilter_empty(opposite=True).df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Rachel Sara - 0 chr1 100 . G A . . . GT ./. ./. - 1 chr1 101 . T C . . . GT ./. ./. - 2 chr1 102 . G C . . . GT ./. ./. - 3 chr1 103 . T C . . . GT ./. ./. - - Finally, we can return a list of sample names from the filtering: - - >>> vf.cfilter_empty(as_list=True) - ['Steven', 'John'] - """ - f = lambda r: r[9:].apply(gt_miss) - s = self.df.apply(f, axis=1).all() - if opposite: - s = s[s == True] - else: - s = s[s == False] - l = s.index.to_list() - if as_list: - return l - return self.subset(l) - def collapse(self): """ Collapse duplicate records in the VcfFrame. @@ -3798,6 +3721,101 @@ def one_gt(g): vf = self.__class__(self.copy_meta(), df) return vf + def empty_samples(self, threshold=0, opposite=False, as_list=False): + """ + Remove samples with high missingness. + + Samples with missingness >= threshold will be removed. + + Parameters + ---------- + threshold : int or float, default: 0 + Number or fraction of missing variants. By default + (``threshold=0``), only samples with 100% missingness will be + removed. 
+ opposite : bool, default: False + If True, return samples that don't meet the said criteria. + as_list : bool, default: False + If True, return a list of sample names instead of a VcfFrame. + + Returns + ------- + VcfFrame + Filtered VcfFrame. + + Examples + -------- + Assume we have the following data: + + >>> from fuc import pyvcf + >>> data = { + ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1'], + ... 'POS': [100, 101, 102, 103], + ... 'ID': ['.', '.', '.', '.'], + ... 'REF': ['G', 'T', 'G', 'T'], + ... 'ALT': ['A', 'C', 'C', 'C'], + ... 'QUAL': ['.', '.', '.', '.'], + ... 'FILTER': ['.', '.', '.', '.'], + ... 'INFO': ['.', '.', '.', '.'], + ... 'FORMAT': ['GT', 'GT', 'GT', 'GT'], + ... 'Steven': ['0/1', '1/1', '1/1', '1/1'], + ... 'Rachel': ['./.', './.', './.', './.'], + ... 'John': ['0/0', './.', '0/0', '0/0'], + ... 'Sara': ['./.', './.', './.', './.'], + ... } + >>> vf = pyvcf.VcfFrame.from_dict([], data) + >>> vf.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven Rachel John Sara + 0 chr1 100 . G A . . . GT 0/1 ./. 0/0 ./. + 1 chr1 101 . T C . . . GT 1/1 ./. ./. ./. + 2 chr1 102 . G C . . . GT 1/1 ./. 0/0 ./. + 3 chr1 103 . T C . . . GT 1/1 ./. 0/0 ./. + + We can remove samples whose genotypes are all missing: + + >>> vf.cfilter_empty().df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven John + 0 chr1 100 . G A . . . GT 0/1 0/0 + 1 chr1 101 . T C . . . GT 1/1 ./. + 2 chr1 102 . G C . . . GT 1/1 0/0 + 3 chr1 103 . T C . . . GT 1/1 0/0 + + We can also select those samples: + + >>> vf.cfilter_empty(opposite=True).df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Rachel Sara + 0 chr1 100 . G A . . . GT ./. ./. + 1 chr1 101 . T C . . . GT ./. ./. + 2 chr1 102 . G C . . . GT ./. ./. + 3 chr1 103 . T C . . . GT ./. ./. 
+ + Finally, we can return a list of sample names from the filtering: + + >>> vf.cfilter_empty(as_list=True) + ['Steven', 'John'] + """ + total = self.shape[0] + + if not threshold: + threshold = total + + if isinstance(threshold, int): + use_fraction = False + elif isinstance(threshold, float): + use_fraction = True + else: + raise TypeError("Incorrect argument type 'threshold'") + + def one_col(c): + data = sum(c.apply(gt_miss)) + if use_fraction: + data /= total + return data + + s = self.df.iloc[:, 9:].apply(one_col, axis=0) < threshold + + return self.subset(s[s].index.to_list()) + def expand(self): """ Expand each multiallelic locus to multiple rows. @@ -3876,7 +3894,9 @@ def one_gt(g, i): def filter_bed(self, bed, opposite=False, as_index=False): """ - Select rows that overlap with the given BED data. + Filter rows intersecting with given BED. + + Only variants intersecting with given BED data will remain. Parameters ---------- @@ -3968,7 +3988,9 @@ def filter_bed(self, bed, opposite=False, as_index=False): def filter_empty(self, threshold=0, opposite=False, as_index=False): """ - Remove rows with missing genotype calls. + Filter rows with high missingness. + + Variants with missingness >= threshold will be removed. Parameters ---------- @@ -4063,82 +4085,11 @@ def one_row(r): return self.__class__(self.copy_meta(), self.copy_df()) return self.__class__(self.copy_meta(), self.df[i]) - def filter_indel(self, opposite=False, as_index=False): - """ - Remove rows with an indel. - - Parameters - ---------- - opposite : bool, default: False - If True, return rows that don't meet the said criteria. - as_index : bool, default: False - If True, return boolean index array instead of VcfFrame. - - Returns - ------- - VcfFrame or pandas.Series - Filtered VcfFrame or boolean index array. - - Examples - -------- - Assume we have the following data: - - >>> from fuc import pyvcf - >>> data = { - ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1'], - ... 
'POS': [100, 101, 102, 103], - ... 'ID': ['.', '.', '.', '.'], - ... 'REF': ['G', 'CT', 'A', 'C'], - ... 'ALT': ['A', 'C', 'C,AT', 'A'], - ... 'QUAL': ['.', '.', '.', '.'], - ... 'FILTER': ['.', '.', '.', '.'], - ... 'INFO': ['.', '.', '.', '.'], - ... 'FORMAT': ['GT', 'GT', 'GT', 'GT'], - ... 'Steven': ['0/1', '0/1', '1/2', '0/1'], - ... } - >>> vf = pyvcf.VcfFrame.from_dict([], data) - >>> vf.df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven - 0 chr1 100 . G A . . . GT 0/1 - 1 chr1 101 . CT C . . . GT 0/1 - 2 chr1 102 . A C,AT . . . GT 1/2 - 3 chr1 103 . C A . . . GT 0/1 - - We can remove rows with an indel: - - >>> vf.filter_indel().df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven - 0 chr1 100 . G A . . . GT 0/1 - 1 chr1 103 . C A . . . GT 0/1 - - We can also select those rows: - - >>> vf.filter_indel(opposite=True).df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven - 0 chr1 101 . CT C . . . GT 0/1 - 1 chr1 102 . A C,AT . . . GT 1/2 - - Finally, we can return boolean index array from the filtering: - - >>> vf.filter_indel(as_index=True) - 0 True - 1 False - 2 False - 3 True - dtype: bool - """ - i = ~self.df.apply(row_hasindel, axis=1) - if opposite: - i = ~i - if as_index: - return i - if i.empty: - return self.__class__(self.copy_meta(), self.copy_df()) - return self.__class__(self.copy_meta(), self.df[i]) - def filter_flagall(self, flags, opposite=False, as_index=False): """ - Select rows if all of the given INFO flags are present. + Filter rows with all given INFO flags. + + Only variants with all given INFO flags will remain. Parameters ---------- @@ -4222,7 +4173,9 @@ def f(r): def filter_flagany(self, flags, opposite=False, as_index=False): """ - Select rows if any one of the given INFO flags is present. + Filter rows with any given INFO flags. + + Only variants with any given INFO flags will remain. 
Parameters ---------- @@ -4304,9 +4257,86 @@ def f(r): return self.__class__(self.copy_meta(), self.copy_df()) return self.__class__(self.copy_meta(), self.df[i]) + def filter_indel(self, opposite=False, as_index=False): + """ + Filter rows with indel. + + Variants with indel will be removed. + + Parameters + ---------- + opposite : bool, default: False + If True, return rows that don't meet the said criteria. + as_index : bool, default: False + If True, return boolean index array instead of VcfFrame. + + Returns + ------- + VcfFrame or pandas.Series + Filtered VcfFrame or boolean index array. + + Examples + -------- + Assume we have the following data: + + >>> from fuc import pyvcf + >>> data = { + ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1'], + ... 'POS': [100, 101, 102, 103], + ... 'ID': ['.', '.', '.', '.'], + ... 'REF': ['G', 'CT', 'A', 'C'], + ... 'ALT': ['A', 'C', 'C,AT', 'A'], + ... 'QUAL': ['.', '.', '.', '.'], + ... 'FILTER': ['.', '.', '.', '.'], + ... 'INFO': ['.', '.', '.', '.'], + ... 'FORMAT': ['GT', 'GT', 'GT', 'GT'], + ... 'Steven': ['0/1', '0/1', '1/2', '0/1'], + ... } + >>> vf = pyvcf.VcfFrame.from_dict([], data) + >>> vf.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven + 0 chr1 100 . G A . . . GT 0/1 + 1 chr1 101 . CT C . . . GT 0/1 + 2 chr1 102 . A C,AT . . . GT 1/2 + 3 chr1 103 . C A . . . GT 0/1 + + We can remove rows with an indel: + + >>> vf.filter_indel().df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven + 0 chr1 100 . G A . . . GT 0/1 + 1 chr1 103 . C A . . . GT 0/1 + + We can also select those rows: + + >>> vf.filter_indel(opposite=True).df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven + 0 chr1 101 . CT C . . . GT 0/1 + 1 chr1 102 . A C,AT . . . 
GT 1/2 + + Finally, we can return boolean index array from the filtering: + + >>> vf.filter_indel(as_index=True) + 0 True + 1 False + 2 False + 3 True + dtype: bool + """ + i = ~self.df.apply(row_hasindel, axis=1) + if opposite: + i = ~i + if as_index: + return i + if i.empty: + return self.__class__(self.copy_meta(), self.copy_df()) + return self.__class__(self.copy_meta(), self.df[i]) + def filter_multialt(self, opposite=False, as_index=False): """ - Remove rows with multiple ALT alleles. + Filter rows with multiple ALT alleles. + + Variants with multiple ALT alleles will be removed. Parameters ---------- @@ -4381,7 +4411,9 @@ def filter_multialt(self, opposite=False, as_index=False): def filter_pass(self, opposite=False, as_index=False): """ - Select rows with PASS in the FILTER field. + Filter rows with PASS in FILTER column. + + Only variants with PASS in the FILTER column will remain. Parameters ---------- @@ -4455,7 +4487,9 @@ def filter_pass(self, opposite=False, as_index=False): def filter_phased(self, opposite=False, as_index=False): """ - Remove rows with phased genotypes. + Filter rows with phased genotypes. + + Variants with phased genotypes will be removed. Parameters ---------- @@ -4529,7 +4563,9 @@ def filter_phased(self, opposite=False, as_index=False): def filter_polyp(self, opposite=False, as_index=False): """ - Remove rows with a polyploid genotype call. + Filter rows with polyploid genotypes. + + Variants with polyploid genotypes will be removed. Parameters ---------- @@ -4603,7 +4639,9 @@ def filter_polyp(self, opposite=False, as_index=False): def filter_qual(self, threshold, opposite=False, as_index=False): """ - Select rows with minimum QUAL value. + Filter rows with low QUAL values. + + Only variants with QUAL >= threshold will remain. Parameters ---------- @@ -4684,9 +4722,10 @@ def one_row(r): def filter_sampall(self, samples=None, opposite=False, as_index=False): """ - Select rows if all of the given samples have the variant. 
+ Filter rows where all given samples have variant. - The default behavior is to use all samples in the VcfFrame. + Only variants where all given samples have variant will remain. The + default behavior is to use all samples in the VcfFrame. Parameters ---------- @@ -4783,9 +4822,10 @@ def filter_sampall(self, samples=None, opposite=False, as_index=False): def filter_sampany(self, samples=None, opposite=False, as_index=False): """ - Select rows if any one of the given samples has the variant. + Filter rows where any given samples have variant. - The default behavior is to use all samples in the VcfFrame. + Only variants where any given samples have variant will remain. The + default behavior is to use all samples in the VcfFrame. Parameters ---------- @@ -4881,7 +4921,9 @@ def filter_sampany(self, samples=None, opposite=False, as_index=False): def filter_sampnum(self, threshold, opposite=False, as_index=False): """ - Select rows if the variant is prevalent enough. + Filter rows with high variant prevalence. + + Only variants with variant prevalence >= threshold will remain. Parameters ---------- @@ -4969,7 +5011,9 @@ def f(r): def filter_vcf(self, vcf, opposite=False, as_index=False): """ - Select rows that overlap with the other VCF. + Filter rows intersecting with given VCF. + + Only variants intersecting with given VCF data will remain. Parameters ---------- From fa1c705dae26279acdbf2b123ba6da01f12c819c Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Thu, 17 Feb 2022 20:04:40 +0900 Subject: [PATCH 19/22] Update docs --- CHANGELOG.rst | 2 +- fuc/api/pyvcf.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 5da424e..4651da0 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -14,7 +14,7 @@ Changelog * Add new method :meth:`pyvcf.gt_ploidy`. * Update :meth:`pyvcf.gt_polyp` method to use :meth:`pyvcf.gt_ploidy` method internally. 
* :issue:`53`: Add new methods to compute AC/AN/AF in the INFO column: :meth:`pyvcf.row_computeinfo` and :meth:`pyvcf.VcfFrame.compute_info`. -* :issue:`54: Update :meth:`pyvcf.VcfFrame.cfilter_empty` method so that users can control missingness threshold for filtering samples. +* :issue:`54`: Update :meth:`pyvcf.VcfFrame.cfilter_empty` method so that users can control missingness threshold for filtering samples. * Rename :meth:`pyvcf.VcfFrame.cfilter_empty` method to :meth:`pyvcf.VcfFrame.empty_samples`. 0.30.0 (2022-02-05) diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index f3f87d7..d311353 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -865,7 +865,7 @@ def row_hasindel(r): def row_computeinfo(r, key, decimals=3): """ - For given row, return AC/AN/AF calculation in INFO column. + For given row, return AC/AN/AF calculation for INFO column. Parameters ---------- @@ -971,7 +971,7 @@ def get_af(r): def row_parseinfo(r, key): """ - For given row, return requested data in INFO column. + For given row, return requested data from INFO column. Parameters ---------- @@ -1069,7 +1069,7 @@ def one_gt(g): def row_updateinfo(r, key, value, force=True, missing=False): """ - For given row, return updated data in INFO column. + For given row, return updated data from INFO column. Parameters ---------- @@ -2081,7 +2081,7 @@ def raise_error(c): def compute_info(self, key): """ - Compute AC/AN/AF in INFO column. + Compute AC/AN/AF for INFO column. The method will ignore and overwrite any existing data for selected key. 
From 8e29646583545aac56e0788f81a9f0bc7334532f Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Thu, 17 Feb 2022 20:30:16 +0900 Subject: [PATCH 20/22] Update docs --- fuc/api/pyvcf.py | 83 ++++++++++++++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 35 deletions(-) diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index d311353..1d4f1a3 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -3741,11 +3741,10 @@ def empty_samples(self, threshold=0, opposite=False, as_list=False): Returns ------- VcfFrame - Filtered VcfFrame. + Subsetted VcfFrame. Examples -------- - Assume we have the following data: >>> from fuc import pyvcf >>> data = { @@ -3758,41 +3757,45 @@ def empty_samples(self, threshold=0, opposite=False, as_list=False): ... 'FILTER': ['.', '.', '.', '.'], ... 'INFO': ['.', '.', '.', '.'], ... 'FORMAT': ['GT', 'GT', 'GT', 'GT'], - ... 'Steven': ['0/1', '1/1', '1/1', '1/1'], - ... 'Rachel': ['./.', './.', './.', './.'], - ... 'John': ['0/0', './.', '0/0', '0/0'], - ... 'Sara': ['./.', './.', './.', './.'], + ... 'A': ['0/0', '0/0', '0/0', '0/0'], + ... 'B': ['./.', '0/0', '0/0', '0/0'], + ... 'C': ['./.', './.', '0/0', '0/0'], + ... 'D': ['./.', './.', './.', '0/0'], + ... 'E': ['./.', './.', './.', './.'], ... } >>> vf = pyvcf.VcfFrame.from_dict([], data) >>> vf.df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven Rachel John Sara - 0 chr1 100 . G A . . . GT 0/1 ./. 0/0 ./. - 1 chr1 101 . T C . . . GT 1/1 ./. ./. ./. - 2 chr1 102 . G C . . . GT 1/1 ./. 0/0 ./. - 3 chr1 103 . T C . . . GT 1/1 ./. 0/0 ./. - - We can remove samples whose genotypes are all missing: - - >>> vf.cfilter_empty().df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven John - 0 chr1 100 . G A . . . GT 0/1 0/0 - 1 chr1 101 . T C . . . GT 1/1 ./. - 2 chr1 102 . G C . . . GT 1/1 0/0 - 3 chr1 103 . T C . . . 
GT 1/1 0/0 - - We can also select those samples: - - >>> vf.cfilter_empty(opposite=True).df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Rachel Sara - 0 chr1 100 . G A . . . GT ./. ./. - 1 chr1 101 . T C . . . GT ./. ./. - 2 chr1 102 . G C . . . GT ./. ./. - 3 chr1 103 . T C . . . GT ./. ./. - - Finally, we can return a list of sample names from the filtering: - - >>> vf.cfilter_empty(as_list=True) - ['Steven', 'John'] + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C D E + 0 chr1 100 . G A . . . GT 0/0 ./. ./. ./. ./. + 1 chr1 101 . T C . . . GT 0/0 0/0 ./. ./. ./. + 2 chr1 102 . G C . . . GT 0/0 0/0 0/0 ./. ./. + 3 chr1 103 . T C . . . GT 0/0 0/0 0/0 0/0 ./. + >>> vf.empty_samples().df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C D + 0 chr1 100 . G A . . . GT 0/0 ./. ./. ./. + 1 chr1 101 . T C . . . GT 0/0 0/0 ./. ./. + 2 chr1 102 . G C . . . GT 0/0 0/0 0/0 ./. + 3 chr1 103 . T C . . . GT 0/0 0/0 0/0 0/0 + >>> vf.empty_samples(threshold=2).df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B + 0 chr1 100 . G A . . . GT 0/0 ./. + 1 chr1 101 . T C . . . GT 0/0 0/0 + 2 chr1 102 . G C . . . GT 0/0 0/0 + 3 chr1 103 . T C . . . GT 0/0 0/0 + >>> vf.empty_samples(threshold=0.5).df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B + 0 chr1 100 . G A . . . GT 0/0 ./. + 1 chr1 101 . T C . . . GT 0/0 0/0 + 2 chr1 102 . G C . . . GT 0/0 0/0 + 3 chr1 103 . T C . . . GT 0/0 0/0 + >>> vf.empty_samples(threshold=0.5, opposite=True).df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT C D E + 0 chr1 100 . G A . . . GT ./. ./. ./. + 1 chr1 101 . T C . . . GT ./. ./. ./. + 2 chr1 102 . G C . . . GT 0/0 ./. ./. + 3 chr1 103 . T C . . . GT 0/0 0/0 ./. 
+ >>> vf.empty_samples(threshold=0.5, opposite=True, as_list=True) + ['C', 'D', 'E'] """ total = self.shape[0] @@ -3814,7 +3817,17 @@ def one_col(c): s = self.df.iloc[:, 9:].apply(one_col, axis=0) < threshold - return self.subset(s[s].index.to_list()) + if opposite: + s = s[s == False] + else: + s = s[s == True] + + l = s.index.to_list() + + if as_list: + return l + + return self.subset(l) def expand(self): """ From 8f603d1f36940e427b31beaa7e6aa40add62246e Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Mon, 21 Feb 2022 17:11:31 +0900 Subject: [PATCH 21/22] Update `common.sort_regions`: * Update :meth:`common.sort_regions` method to support regions containing an ALT contig (e.g. chr16_KI270854v1_alt). --- CHANGELOG.rst | 1 + fuc/api/common.py | 12 ++++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 4651da0..37899b7 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -16,6 +16,7 @@ Changelog * :issue:`53`: Add new methods to compute AC/AN/AF in the INFO column: :meth:`pyvcf.row_computeinfo` and :meth:`pyvcf.VcfFrame.compute_info`. * :issue:`54`: Update :meth:`pyvcf.VcfFrame.cfilter_empty` method so that users can control missingness threshold for filtering samples. * Rename :meth:`pyvcf.VcfFrame.cfilter_empty` method to :meth:`pyvcf.VcfFrame.empty_samples`. +* Update :meth:`common.sort_regions` method to support regions containing an ALT contig (e.g. chr16_KI270854v1_alt). 
0.30.0 (2022-02-05) ------------------- diff --git a/fuc/api/common.py b/fuc/api/common.py index b352265..02cc5d0 100644 --- a/fuc/api/common.py +++ b/fuc/api/common.py @@ -1316,17 +1316,21 @@ def sort_regions(regions): -------- >>> from fuc import common - >>> regions = ['22:1000-1500', '16:100-200', '22:200-300'] + >>> regions = ['chr22:1000-1500', 'chr16:100-200', 'chr22:200-300', 'chr16_KI270854v1_alt', 'chr3_GL000221v1_random', 'HLA-A*02:10'] >>> sorted(regions) # Lexicographic sorting (not what we want) - ['16:100-200', '22:1000-1500', '22:200-300'] + ['HLA-A*02:10', 'chr16:100-200', 'chr16_KI270854v1_alt', 'chr22:1000-1500', 'chr22:200-300', 'chr3_GL000221v1_random'] >>> common.sort_regions(regions) - ['16:100-200', '22:200-300', '22:1000-1500'] + ['chr16:100-200', 'chr22:200-300', 'chr22:1000-1500', 'chr16_KI270854v1_alt', 'chr3_GL000221v1_random', 'HLA-A*02:10'] """ def func(x): chrom, start, end = parse_region(x) if chrom in pyvcf.CONTIGS: chrom = pyvcf.CONTIGS.index(chrom) - return (chrom, start, end) + alt = '' + else: + chrom = len(pyvcf.CONTIGS) + alt = chrom + return (chrom, alt, start, end) return sorted(regions, key=func) def update_chr_prefix(regions, mode='remove'): From c6b5e8f5807e6965d07e0d2a3a79e6a308885f28 Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Tue, 1 Mar 2022 10:35:03 +0900 Subject: [PATCH 22/22] Update docs --- CHANGELOG.rst | 4 +- README.rst | 28 ++--- docs/cli.rst | 234 ++++++++++++++++++++--------------------- docs/create.py | 8 +- fuc/cli/bam_aldepth.py | 15 +-- fuc/cli/bam_head.py | 8 +- fuc/cli/bam_index.py | 8 +- fuc/cli/bam_rename.py | 17 ++- fuc/cli/bam_slice.py | 37 ++++--- fuc/cli/bed_intxn.py | 10 +- fuc/cli/bed_sum.py | 12 ++- fuc/cli/cov_concat.py | 12 ++- fuc/cli/cov_rename.py | 19 ++-- fuc/cli/fa_filter.py | 14 ++- fuc/cli/fq_count.py | 6 +- fuc/cli/fq_sum.py | 6 +- fuc/cli/fuc_bgzip.py | 6 +- fuc/cli/fuc_compf.py | 15 ++- fuc/cli/fuc_demux.py | 14 ++- fuc/cli/fuc_exist.py | 11 +- fuc/cli/fuc_find.py | 14 ++- 
fuc/cli/fuc_undetm.py | 11 +- fuc/cli/maf_maf2vcf.py | 18 ++-- fuc/cli/maf_oncoplt.py | 24 +++-- fuc/cli/maf_sumplt.py | 21 ++-- fuc/cli/maf_vcf2maf.py | 9 +- fuc/cli/ngs_bam2fq.py | 24 +++-- fuc/cli/ngs_fq2bam.py | 43 +++++--- fuc/cli/ngs_hc.py | 65 +++++++----- fuc/cli/ngs_m2.py | 30 ++++-- fuc/cli/ngs_pon.py | 27 +++-- fuc/cli/ngs_quant.py | 36 ++++--- fuc/cli/ngs_trim.py | 21 ++-- fuc/cli/tabix_index.py | 9 +- fuc/cli/tabix_slice.py | 9 +- fuc/cli/tbl_merge.py | 26 +++-- fuc/cli/tbl_sum.py | 55 ++++++---- fuc/cli/vcf_filter.py | 36 ++++--- fuc/cli/vcf_index.py | 10 +- fuc/cli/vcf_merge.py | 29 ++--- fuc/cli/vcf_rename.py | 24 +++-- fuc/cli/vcf_slice.py | 29 ++--- fuc/cli/vcf_split.py | 19 ++-- fuc/cli/vcf_vcf2bed.py | 6 +- fuc/cli/vcf_vep.py | 17 +-- 45 files changed, 662 insertions(+), 434 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 37899b7..45de339 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,8 +1,8 @@ Changelog ********* -0.31.0 (in development) ------------------------ +0.31.0 (2022-03-01) +------------------- * Fix bug in :meth:`pykallisto.KallistoFrame.compute_fold_change` method. * Add new method :meth:`pyvcf.call` and new command :command:`vcf-call`. diff --git a/README.rst b/README.rst index 12f95e2..d7f087e 100644 --- a/README.rst +++ b/README.rst @@ -46,10 +46,10 @@ Currently, fuc can be used to analyze, summarize, visualize, and manipulate the Additionally, fuc can be used to parse output data from the following programs: -- Ensembl Variant Effect Predictor (VEP) -- SnpEff -- bcl2fastq and bcl2fastq2 -- Kallisto +- `Ensembl Variant Effect Predictor (VEP) `__ +- `SnpEff `__ +- `bcl2fastq and bcl2fastq2 `__ +- `Kallisto `__ Your contributions (e.g. feature ideas, pull requests) are most welcome. @@ -112,38 +112,38 @@ For getting help on the fuc CLI: positional arguments: COMMAND - bam-aldepth Compute allelic depth from a SAM/BAM/CRAM file. + bam-aldepth Compute allelic depth from a BAM file. 
bam-depth Compute per-base read depth from BAM files. - bam-head Print the header of a SAM/BAM/CRAM file. - bam-index Index a SAM/BAM/CRAM file. - bam-rename Rename the sample in a SAM/BAM/CRAM file. - bam-slice Slice a SAM/BAM/CRAM file. + bam-head Print the header of a BAM file. + bam-index Index a BAM file. + bam-rename Rename the sample in a BAM file. + bam-slice Slice a BAM file. bed-intxn Find the intersection of BED files. bed-sum Summarize a BED file. cov-concat Concatenate depth of coverage files. cov-rename Rename the samples in a depth of coverage file. - fa-filter Filter sequence records in a FASTA file + fa-filter Filter sequence records in a FASTA file. fq-count Count sequence reads in FASTQ files. fq-sum Summarize a FASTQ file. fuc-bgzip Write a BGZF compressed file. fuc-compf Compare the contents of two files. fuc-demux Parse the Reports directory from bcl2fastq. fuc-exist Check whether certain files exist. - fuc-find Retrieve absolute paths of files whose name matches a + fuc-find Retrieve absolute paths of files whose name matches a specified pattern, optionally recursively. - fuc-undetm Compute top unknown barcodes using undertermined FASTQ from + fuc-undetm Compute top unknown barcodes using undertermined FASTQ from bcl2fastq. maf-maf2vcf Convert a MAF file to a VCF file. maf-oncoplt Create an oncoplot with a MAF file. maf-sumplt Create a summary plot with a MAF file. maf-vcf2maf Convert a VCF file to a MAF file. ngs-bam2fq Pipeline for converting BAM files to FASTQ files. - ngs-fq2bam Pipeline for converting FASTQ files to analysis-ready BAM + ngs-fq2bam Pipeline for converting FASTQ files to analysis-ready BAM files. ngs-hc Pipeline for germline short variant discovery. ngs-m2 Pipeline for somatic short variant discovery. ngs-pon Pipeline for constructing a panel of normals (PoN). - ngs-quant Pipeline for running RNAseq quantification from FASTQ files + ngs-quant Pipeline for running RNAseq quantification from FASTQ files with Kallisto. 
ngs-trim Pipeline for trimming adapters from FASTQ files. tabix-index Index a GFF/BED/SAM/VCF file with Tabix. diff --git a/docs/cli.rst b/docs/cli.rst index dcc4aaa..e815334 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -18,38 +18,38 @@ For getting help on the fuc CLI: positional arguments: COMMAND - bam-aldepth Compute allelic depth from a SAM/BAM/CRAM file. + bam-aldepth Compute allelic depth from a BAM file. bam-depth Compute per-base read depth from BAM files. - bam-head Print the header of a SAM/BAM/CRAM file. - bam-index Index a SAM/BAM/CRAM file. - bam-rename Rename the sample in a SAM/BAM/CRAM file. - bam-slice Slice a SAM/BAM/CRAM file. + bam-head Print the header of a BAM file. + bam-index Index a BAM file. + bam-rename Rename the sample in a BAM file. + bam-slice Slice a BAM file. bed-intxn Find the intersection of BED files. bed-sum Summarize a BED file. cov-concat Concatenate depth of coverage files. cov-rename Rename the samples in a depth of coverage file. - fa-filter Filter sequence records in a FASTA file + fa-filter Filter sequence records in a FASTA file. fq-count Count sequence reads in FASTQ files. fq-sum Summarize a FASTQ file. fuc-bgzip Write a BGZF compressed file. fuc-compf Compare the contents of two files. fuc-demux Parse the Reports directory from bcl2fastq. fuc-exist Check whether certain files exist. - fuc-find Retrieve absolute paths of files whose name matches a + fuc-find Retrieve absolute paths of files whose name matches a specified pattern, optionally recursively. - fuc-undetm Compute top unknown barcodes using undertermined FASTQ from + fuc-undetm Compute top unknown barcodes using undertermined FASTQ from bcl2fastq. maf-maf2vcf Convert a MAF file to a VCF file. maf-oncoplt Create an oncoplot with a MAF file. maf-sumplt Create a summary plot with a MAF file. maf-vcf2maf Convert a VCF file to a MAF file. ngs-bam2fq Pipeline for converting BAM files to FASTQ files. 
- ngs-fq2bam Pipeline for converting FASTQ files to analysis-ready BAM + ngs-fq2bam Pipeline for converting FASTQ files to analysis-ready BAM files. ngs-hc Pipeline for germline short variant discovery. ngs-m2 Pipeline for somatic short variant discovery. ngs-pon Pipeline for constructing a panel of normals (PoN). - ngs-quant Pipeline for running RNAseq quantification from FASTQ files + ngs-quant Pipeline for running RNAseq quantification from FASTQ files with Kallisto. ngs-trim Pipeline for trimming adapters from FASTQ files. tabix-index Index a GFF/BED/SAM/VCF file with Tabix. @@ -84,13 +84,13 @@ bam-aldepth $ fuc bam-aldepth -h usage: fuc bam-aldepth [-h] bam sites - Count allelic depth from a SAM/BAM/CRAM file. + Count allelic depth from a BAM file. Positional arguments: - bam Alignment file. - sites TSV file containing two columns, chromosome and position. - This can also be a BED or VCF file (compressed or - uncompressed) Input type will be detected automatically. + bam Input alignment file. + sites TSV file containing two columns, chromosome and position. This + can also be a BED or VCF file (compressed or uncompressed). + Input type will be detected automatically. Optional arguments: -h, --help Show this help message and exit. @@ -153,10 +153,10 @@ bam-head $ fuc bam-head -h usage: fuc bam-head [-h] bam - Print the header of a SAM/BAM/CRAM file. + Print the header of a BAM file. Positional arguments: - bam Alignment file. + bam Input alignment file. Optional arguments: -h, --help Show this help message and exit. @@ -172,10 +172,10 @@ bam-index $ fuc bam-index -h usage: fuc bam-index [-h] bam - Index a SAM/BAM/CRAM file. + Index a BAM file. Positional arguments: - bam Alignment file. + bam Input alignment file. Optional arguments: -h, --help Show this help message and exit. @@ -191,10 +191,10 @@ bam-rename $ fuc bam-rename -h usage: fuc bam-rename [-h] bam name - Rename the sample in a SAM/BAM/CRAM file. + Rename the sample in a BAM file. 
Positional arguments: - bam Alignment file. + bam Input alignment file. name New sample name. Optional arguments: @@ -212,24 +212,24 @@ bam-slice usage: fuc bam-slice [-h] [--format TEXT] [--fasta PATH] bam regions [regions ...] - Slice an alignment file (SAM/BAM/CRAM). + Slice a BAM file. Positional arguments: - bam Input alignment file must be already indexed (.bai) to allow - random access. You can index an alignment file with the + bam Input alignment file must be already indexed (.bai) to allow + random access. You can index an alignment file with the bam-index command. - regions One or more regions to be sliced. Each region must have the - format chrom:start-end and be a half-open interval with - (start, end]. This means, for example, chr1:100-103 will - extract positions 101, 102, and 103. Alternatively, you can - provide a BED file (compressed or uncompressed) to specify - regions. Note that the 'chr' prefix in contig names (e.g. - 'chr1' vs. '1') will be automatically added or removed as + regions One or more regions to be sliced. Each region must have the + format chrom:start-end and be a half-open interval with + (start, end]. This means, for example, chr1:100-103 will + extract positions 101, 102, and 103. Alternatively, you can + provide a BED file (compressed or uncompressed) to specify + regions. Note that the 'chr' prefix in contig names (e.g. + 'chr1' vs. '1') will be automatically added or removed as necessary to match the input BED's contig names. Optional arguments: -h, --help Show this help message and exit. - --format TEXT Output format (default: 'BAM') (choices: 'SAM', 'BAM', + --format TEXT Output format (default: 'BAM') (choices: 'SAM', 'BAM', 'CRAM'). --fasta PATH FASTA file. Required when --format is 'CRAM'. @@ -253,7 +253,7 @@ bed-intxn Find the intersection of BED files. Positional arguments: - bed BED files. + bed Input BED files. Optional arguments: -h, --help Show this help message and exit. 
@@ -279,7 +279,7 @@ bed-sum can, for example, use '--bases 1000' to display in kb. Positional arguments: - bed BED file. + bed Input BED file. Optional arguments: -h, --help Show this help message and exit. @@ -292,16 +292,16 @@ cov-concat .. code-block:: text $ fuc cov-concat -h - usage: fuc cov-concat [-h] [--axis INT] PATH [PATH ...] + usage: fuc cov-concat [-h] [--axis INT] tsv [tsv ...] Concatenate depth of coverage files. Positional arguments: - PATH One or more TSV files. + tsv Input TSV files. Optional arguments: -h, --help Show this help message and exit. - --axis INT The axis to concatenate along (default: 0) (choices: + --axis INT The axis to concatenate along (default: 0) (choices: 0, 1 where 0 is index and 1 is columns). [Example] Concatenate vertically: @@ -337,9 +337,9 @@ cov-rename -h, --help Show this help message and exit. --mode TEXT Renaming mode (default: 'MAP') (choices: 'MAP', 'INDEX', 'RANGE'). - --range INT INT Index range to use when renaming the samples. + --range INT INT Index range to use when renaming the samples. Applicable only with the 'RANGE' mode. - --sep TEXT Delimiter to use when reading the names file + --sep TEXT Delimiter to use when reading the names file (default: '\t'). [Example] Using the default 'MAP' mode: @@ -362,13 +362,13 @@ fa-filter Filter sequence records in a FASTA file. Positional arguments: - fasta FASTA file (compressed or uncompressed). + fasta Input FASTA file (compressed or uncompressed). Optional arguments: -h, --help Show this help message and exit. --contigs TEXT [TEXT ...] - One or more contigs to be selected. Alternatively, you can - provide a file containing one contig per line. + One or more contigs to be selected. Alternatively, you can + provide a file containing one contig per line. --exclude Exclude specified contigs. [Example] Select certain contigs: @@ -388,7 +388,7 @@ fq-count Count sequence reads in FASTQ files. 
Positional arguments: - fastq FASTQ files (compressed or uncompressed) (default: stdin). + fastq Input FASTQ files (compressed or uncompressed) (default: stdin). Optional arguments: -h, --help Show this help message and exit. @@ -414,7 +414,7 @@ fq-sum lengths, and the numbers of unique and duplicate sequences. Positional arguments: - fastq FASTQ file (zipped or unqzipped). + fastq Input FASTQ file (compressed or uncompressed). Optional arguments: -h, --help Show this help message and exit. @@ -439,7 +439,7 @@ fuc-bgzip formats (e.g. FASTA, FASTQ, GenBank, VCF, MAF). Positional arguments: - file File to be compressed (default: stdin). + file Input file to be compressed (default: stdin). Optional arguments: -h, --help Show this help message and exit. @@ -464,8 +464,8 @@ fuc-compf are identical and 'False' otherwise. Positional arguments: - left Left file. - right Right file. + left Input left file. + right Input right file. Optional arguments: -h, --help Show this help message and exit. @@ -497,12 +497,12 @@ fuc-demux reports.pdf files. Positional arguments: - reports Reports directory. + reports Input Reports directory. output Output directory (will be created). Optional arguments: -h, --help Show this help message and exit. - --sheet PATH SampleSheet.csv file. Used for sorting and/or subsetting + --sheet PATH SampleSheet.csv file. Used for sorting and/or subsetting samples. fuc-exist @@ -656,7 +656,7 @@ maf-oncoplt determined by the output file's extension. Positional arguments: - maf MAF file. + maf Input MAF file. out Output image file. Optional arguments: @@ -693,19 +693,19 @@ maf-sumplt determined by the output file's extension. Positional arguments: - maf MAF file. + maf Input MAF file. out Output image file. Optional arguments: -h, --help Show this help message and exit. --figsize FLOAT FLOAT - width, height in inches (default: [15, 10]) + Width, height in inches (default: [15, 10]). 
--title_fontsize FLOAT - font size of subplot titles (default: 16) + Font size of subplot titles (default: 16). --ticklabels_fontsize FLOAT - font size of tick labels (default: 12) + Font size of tick labels (default: 12). --legend_fontsize FLOAT - font size of legend texts (default: 12) + Font size of legend texts (default: 12). [Example] Output a PNG file: $ fuc maf-sumplt in.maf out.png @@ -757,9 +757,9 @@ ngs-bam2fq Positional arguments: manifest Sample manifest CSV file. output Output directory. - qsub SGE resoruce to request with qsub for BAM to FASTQ - conversion. Since this oppoeration supports multithreading, - it is recommended to speicfy a parallel environment (PE) + qsub SGE resoruce to request with qsub for BAM to FASTQ + conversion. Since this oppoeration supports multithreading, + it is recommended to speicfy a parallel environment (PE) to speed up the process (also see --thread). Optional arguments: @@ -815,7 +815,7 @@ ngs-fq2bam output Output directory. qsub SGE resoruce to request for qsub. java Java resoruce to request for GATK. - vcf One or more reference VCF files containing known variant + vcf One or more reference VCF files containing known variant sites (e.g. 1000 Genomes Project). Optional arguments: @@ -879,20 +879,20 @@ ngs-hc --bed PATH BED file. --dbsnp PATH VCF file from dbSNP. --thread INT Number of threads to use (default: 1). - --batch INT Batch size used for GenomicsDBImport (default: 0). This - controls the number of samples for which readers are - open at once and therefore provides a way to minimize - memory consumption. The size of 0 means no batching (i.e. + --batch INT Batch size used for GenomicsDBImport (default: 0). This + controls the number of samples for which readers are + open at once and therefore provides a way to minimize + memory consumption. The size of 0 means no batching (i.e. readers for all samples will be opened at once). --job TEXT Job submission ID for SGE. 
--force Overwrite the output directory if it already exists. --keep Keep temporary files. - --posix Set GenomicsDBImport to allow for optimizations to improve - the usability and performance for shared Posix Filesystems - (e.g. NFS, Lustre). If set, file level locking is disabled - and file system writes are minimized by keeping a higher - number of file descriptors open for longer periods of time. - Use with --batch if keeping a large number of file + --posix Set GenomicsDBImport to allow for optimizations to improve + the usability and performance for shared Posix Filesystems + (e.g. NFS, Lustre). If set, file level locking is disabled + and file system writes are minimized by keeping a higher + number of file descriptors open for longer periods of time. + Use with --batch if keeping a large number of file descriptors open is an issue. [Example] Specify queue: @@ -1028,8 +1028,8 @@ ngs-quant --bootstrap INT Number of bootstrap samples (default: 50). --job TEXT Job submission ID for SGE. --force Overwrite the output directory if it already exists. - --posix Set the environment variable HDF5_USE_FILE_LOCKING=FALSE - before running Kallisto. This is required for shared Posix + --posix Set the environment variable HDF5_USE_FILE_LOCKING=FALSE + before running Kallisto. This is required for shared Posix Filesystems (e.g. NFS, Lustre). [Example] Specify queue: @@ -1154,12 +1154,12 @@ tbl-merge method's documentation page. Positional arguments: - left Left file. - right Right file. + left Input left file. + right Input right file. Optional arguments: -h, --help Show this help message and exit. - --how TEXT Type of merge to be performed (default: 'inner') + --how TEXT Type of merge to be performed (default: 'inner') (choices: 'left', 'right', 'outer', 'inner', 'cross'). --on TEXT [TEXT ...] Column names to join on. --lsep TEXT Delimiter to use for the left file (default: '\t'). @@ -1194,27 +1194,27 @@ tbl-sum Optional arguments: -h, --help Show this help message and exit. 
--sep TEXT Delimiter to use (default: '\t'). - --skiprows TEXT Comma-separated line numbers to skip (0-indexed) or - number of lines to skip at the start of the file - (e.g. `--skiprows 1,` will skip the second line, - `--skiprows 2,4` will skip the third and fifth lines, + --skiprows TEXT Comma-separated line numbers to skip (0-indexed) or + number of lines to skip at the start of the file + (e.g. `--skiprows 1,` will skip the second line, + `--skiprows 2,4` will skip the third and fifth lines, and `--skiprows 10` will skip the first 10 lines). --na_values TEXT [TEXT ...] - Additional strings to recognize as NA/NaN (by - default, the following values are interpreted - as NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', - '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', - '', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan', + Additional strings to recognize as NA/NaN (by + default, the following values are interpreted + as NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', + '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', + '', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan', 'null'). - --keep_default_na Whether or not to include the default NaN values when + --keep_default_na Whether or not to include the default NaN values when parsing the data (see 'pandas.read_table' for details). - --expr TEXT Query the columns of a pandas.DataFrame with a + --expr TEXT Query the columns of a pandas.DataFrame with a boolean expression (e.g. `--query "A == 'yes'"`). --columns TEXT [TEXT ...] - Columns to be summarized (by default, all columns + Columns to be summarized (by default, all columns will be included). - --dtypes PATH File of column names and their data types (either - 'categorical' or 'numeric'); one tab-delimited pair of + --dtypes PATH File of column names and their data types (either + 'categorical' or 'numeric'); one tab-delimited pair of column name and data type per line. 
[Example] Summarize a table: @@ -1287,17 +1287,17 @@ vcf-filter Optional arguments: -h, --help Show this help message and exit. --expr TEXT Expression to evaluate. - --samples PATH File of sample names to apply the marking (one + --samples PATH File of sample names to apply the marking (one sample per line). --drop_duplicates [TEXT ...] - Only consider certain columns for identifying + Only consider certain columns for identifying duplicates, by default use all of the columns. - --greedy Use this flag to mark even ambiguous genotypes + --greedy Use this flag to mark even ambiguous genotypes as missing. - --opposite Use this flag to mark all genotypes that do not - satisfy the query expression as missing and leave + --opposite Use this flag to mark all genotypes that do not + satisfy the query expression as missing and leave those that do intact. - --filter_empty Use this flag to remove rows with no genotype + --filter_empty Use this flag to remove rows with no genotype calls at all. [Example] Mark genotypes with 0/0 as missing: @@ -1338,8 +1338,8 @@ vcf-index This command will create an index file (.tbi) for the input VCF. Positional arguments: - vcf Input VCF file to be indexed. When an uncompressed file is - given, the command will automatically create a BGZF + vcf Input VCF file to be indexed. When an uncompressed file is + given, the command will automatically create a BGZF compressed copy of the file (.gz) before indexing. Optional arguments: @@ -1365,19 +1365,19 @@ vcf-merge Positional arguments: vcf_files VCF files (compressed or uncompressed). Note that the 'chr' - prefix in contig names (e.g. 'chr1' vs. '1') will be - automatically added or removed as necessary to match the + prefix in contig names (e.g. 'chr1' vs. '1') will be + automatically added or removed as necessary to match the contig names of the first VCF. Optional arguments: -h, --help Show this help message and exit. 
- --how TEXT Type of merge as defined in pandas.DataFrame.merge + --how TEXT Type of merge as defined in pandas.DataFrame.merge (default: 'inner'). - --format TEXT FORMAT subfields to be retained (e.g. 'GT:AD:DP') + --format TEXT FORMAT subfields to be retained (e.g. 'GT:AD:DP') (default: 'GT'). - --sort Use this flag to turn off sorting of records + --sort Use this flag to turn off sorting of records (default: True). - --collapse Use this flag to collapse duplicate records + --collapse Use this flag to collapse duplicate records (default: False). [Example] Merge multiple VCF files: @@ -1411,9 +1411,9 @@ vcf-rename Optional arguments: -h, --help Show this help message and exit. - --mode TEXT Renaming mode (default: 'MAP') (choices: 'MAP', + --mode TEXT Renaming mode (default: 'MAP') (choices: 'MAP', 'INDEX', 'RANGE'). - --range INT INT Index range to use when renaming the samples. + --range INT INT Index range to use when renaming the samples. Applicable only with the 'RANGE' mode. --sep TEXT Delimiter to use for reading the 'names' file (default: '\t'). @@ -1438,17 +1438,17 @@ vcf-slice Slice a VCF file for specified regions. Positional arguments: - vcf Input VCF file must be already BGZF compressed (.gz) and - indexed (.tbi) to allow random access. A VCF file can be - compressed with the fuc-bgzip command and indexed with the + vcf Input VCF file must be already BGZF compressed (.gz) and + indexed (.tbi) to allow random access. A VCF file can be + compressed with the fuc-bgzip command and indexed with the vcf-index command. - regions One or more regions to be sliced. Each region must have the - format chrom:start-end and be a half-open interval with - (start, end]. This means, for example, chr1:100-103 will - extract positions 101, 102, and 103. Alternatively, you can - provide a BED file (compressed or uncompressed) to specify - regions. Note that the 'chr' prefix in contig names (e.g. - 'chr1' vs. 
'1') will be automatically added or removed as + regions One or more regions to be sliced. Each region must have the + format chrom:start-end and be a half-open interval with + (start, end]. This means, for example, chr1:100-103 will + extract positions 101, 102, and 103. Alternatively, you can + provide a BED file (compressed or uncompressed) to specify + regions. Note that the 'chr' prefix in contig names (e.g. + 'chr1' vs. '1') will be automatically added or removed as necessary to match the input VCF's contig names. Optional arguments: @@ -1479,8 +1479,8 @@ vcf-split Optional arguments: -h, --help Show this help message and exit. - --clean By default, the command will only return variants present in - each individual. Use the tag to stop this behavior and make + --clean By default, the command will only return variants present in + each individual. Use the tag to stop this behavior and make sure that all individuals have the same number of variants. --force Overwrite the output directory if it already exists. @@ -1522,7 +1522,7 @@ vcf-vep Optional arguments: -h, --help Show this help message and exit. - --opposite Use this flag to return only records that don't + --opposite Use this flag to return only records that don't meet the said criteria. --as_zero Use this flag to treat missing values as zero instead of NaN. diff --git a/docs/create.py b/docs/create.py index a036d7a..27d9cf9 100644 --- a/docs/create.py +++ b/docs/create.py @@ -74,10 +74,10 @@ Additionally, fuc can be used to parse output data from the following programs: -- Ensembl Variant Effect Predictor (VEP) -- SnpEff -- bcl2fastq and bcl2fastq2 -- Kallisto +- `Ensembl Variant Effect Predictor (VEP) `__ +- `SnpEff `__ +- `bcl2fastq and bcl2fastq2 `__ +- `Kallisto `__ Your contributions (e.g. feature ideas, pull requests) are most welcome. 
diff --git a/fuc/cli/bam_aldepth.py b/fuc/cli/bam_aldepth.py index 36c09a2..e715d62 100644 --- a/fuc/cli/bam_aldepth.py +++ b/fuc/cli/bam_aldepth.py @@ -3,7 +3,7 @@ from .. import api description = """ -Count allelic depth from a SAM/BAM/CRAM file. +Count allelic depth from a BAM file. """ epilog = f""" @@ -20,17 +20,20 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Compute allelic depth from a SAM/BAM/CRAM file.', + help= +"""Compute allelic depth from a BAM file.""" ) parser.add_argument( 'bam', - help='Alignment file.' + help= +"""Input alignment file.""" ) parser.add_argument( 'sites', - help='TSV file containing two columns, chromosome and position. \n' - 'This can also be a BED or VCF file (compressed or \n' - 'uncompressed) Input type will be detected automatically.' + help= +"""TSV file containing two columns, chromosome and position. This +can also be a BED or VCF file (compressed or uncompressed). +Input type will be detected automatically.""" ) def main(args): diff --git a/fuc/cli/bam_head.py b/fuc/cli/bam_head.py index ca3572b..385a4da 100644 --- a/fuc/cli/bam_head.py +++ b/fuc/cli/bam_head.py @@ -5,7 +5,7 @@ import pysam description = """ -Print the header of a SAM/BAM/CRAM file. +Print the header of a BAM file. """ epilog = f""" @@ -19,11 +19,13 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Print the header of a SAM/BAM/CRAM file.', + help= +"""Print the header of a BAM file.""" ) parser.add_argument( 'bam', - help='Alignment file.' + help= +"""Input alignment file.""" ) def main(args): diff --git a/fuc/cli/bam_index.py b/fuc/cli/bam_index.py index cdb443d..2c88dc1 100644 --- a/fuc/cli/bam_index.py +++ b/fuc/cli/bam_index.py @@ -3,7 +3,7 @@ import pysam description = """ -Index a SAM/BAM/CRAM file. +Index a BAM file. 
""" epilog = f""" @@ -17,11 +17,13 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Index a SAM/BAM/CRAM file.', + help= +"""Index a BAM file.""" ) parser.add_argument( 'bam', - help='Alignment file.' + help= +"""Input alignment file.""" ) def main(args): diff --git a/fuc/cli/bam_rename.py b/fuc/cli/bam_rename.py index 0ed6d79..e0aa141 100644 --- a/fuc/cli/bam_rename.py +++ b/fuc/cli/bam_rename.py @@ -6,7 +6,7 @@ import pysam description = """ -Rename the sample in a SAM/BAM/CRAM file. +Rename the sample in a BAM file. """ epilog = f""" @@ -20,10 +20,19 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Rename the sample in a SAM/BAM/CRAM file.', + help= +"""Rename the sample in a BAM file.""" + ) + parser.add_argument( + 'bam', + help= +"""Input alignment file.""" + ) + parser.add_argument( + 'name', + help= +"""New sample name.""" ) - parser.add_argument('bam', help='Alignment file.') - parser.add_argument('name', help='New sample name.') def main(args): # Detect the input format. diff --git a/fuc/cli/bam_slice.py b/fuc/cli/bam_slice.py index 91324df..3f2816f 100644 --- a/fuc/cli/bam_slice.py +++ b/fuc/cli/bam_slice.py @@ -5,7 +5,7 @@ import pysam description = """ -Slice an alignment file (SAM/BAM/CRAM). +Slice a BAM file. """ epilog = f""" @@ -25,38 +25,43 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Slice a SAM/BAM/CRAM file.', + help= +"""Slice a BAM file.""" ) parser.add_argument( 'bam', - help="Input alignment file must be already indexed (.bai) to allow \n" - "random access. You can index an alignment file with the \n" - "bam-index command." + help= +"""Input alignment file must be already indexed (.bai) to allow +random access. You can index an alignment file with the +bam-index command.""" ) parser.add_argument( 'regions', nargs='+', - help="One or more regions to be sliced. 
Each region must have the \n" - "format chrom:start-end and be a half-open interval with \n" - "(start, end]. This means, for example, chr1:100-103 will \n" - "extract positions 101, 102, and 103. Alternatively, you can \n" - "provide a BED file (compressed or uncompressed) to specify \n" - "regions. Note that the 'chr' prefix in contig names (e.g. \n" - "'chr1' vs. '1') will be automatically added or removed as \n" - "necessary to match the input BED's contig names." + help= +"""One or more regions to be sliced. Each region must have the +format chrom:start-end and be a half-open interval with +(start, end]. This means, for example, chr1:100-103 will +extract positions 101, 102, and 103. Alternatively, you can +provide a BED file (compressed or uncompressed) to specify +regions. Note that the 'chr' prefix in contig names (e.g. +'chr1' vs. '1') will be automatically added or removed as +necessary to match the input BED's contig names.""" ) parser.add_argument( '--format', metavar='TEXT', default='BAM', choices=['SAM', 'BAM', 'CRAM'], - help="Output format (default: 'BAM') (choices: 'SAM', 'BAM', \n" - "'CRAM')." + help= +"""Output format (default: 'BAM') (choices: 'SAM', 'BAM', +'CRAM').""" ) parser.add_argument( '--fasta', metavar='PATH', - help="FASTA file. Required when --format is 'CRAM'." + help= +"""FASTA file. 
Required when --format is 'CRAM'.""" ) def main(args): diff --git a/fuc/cli/bed_intxn.py b/fuc/cli/bed_intxn.py index 6c78804..d673eb4 100644 --- a/fuc/cli/bed_intxn.py +++ b/fuc/cli/bed_intxn.py @@ -17,9 +17,15 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Find the intersection of BED files.', + help= +"""Find the intersection of BED files.""" + ) + parser.add_argument( + 'bed', + nargs='+', + help= +"""Input BED files.""" ) - parser.add_argument('bed', help='BED files.', nargs='+') def main(args): bfs = [api.pybed.BedFrame.from_file(x) for x in args.bed] diff --git a/fuc/cli/bed_sum.py b/fuc/cli/bed_sum.py index b72844d..d429baf 100644 --- a/fuc/cli/bed_sum.py +++ b/fuc/cli/bed_sum.py @@ -16,25 +16,29 @@ def create_parser(subparsers): subparsers, api.common._script_name(), description=description, - help='Summarize a BED file.', + help= +"""Summarize a BED file.""" ) parser.add_argument( 'bed', - help='BED file.' + help= +"""Input BED file.""" ) parser.add_argument( '--bases', metavar='INT', type=int, default=1, - help='Number to divide covered base pairs (default: 1).' + help= +"""Number to divide covered base pairs (default: 1).""" ) parser.add_argument( '--decimals', metavar='INT', type=int, default=0, - help='Number of decimals (default: 0).' + help= +"""Number of decimals (default: 0).""" ) def main(args): diff --git a/fuc/cli/cov_concat.py b/fuc/cli/cov_concat.py index fdae26c..69a4bea 100644 --- a/fuc/cli/cov_concat.py +++ b/fuc/cli/cov_concat.py @@ -22,21 +22,23 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Concatenate depth of coverage files.', + help= +"""Concatenate depth of coverage files.""" ) parser.add_argument( 'tsv', - metavar='PATH', nargs='+', - help='One or more TSV files.' 
+ help= +"""Input TSV files.""" ) parser.add_argument( '--axis', metavar='INT', default=0, type=int, - help='The axis to concatenate along (default: 0) (choices: \n' - '0, 1 where 0 is index and 1 is columns).' + help= +"""The axis to concatenate along (default: 0) (choices: +0, 1 where 0 is index and 1 is columns).""" ) def main(args): diff --git a/fuc/cli/cov_rename.py b/fuc/cli/cov_rename.py index 9498ec5..56ed8e2 100644 --- a/fuc/cli/cov_rename.py +++ b/fuc/cli/cov_rename.py @@ -33,15 +33,18 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Rename the samples in a depth of coverage file.', + help= +"""Rename the samples in a depth of coverage file.""" ) parser.add_argument( 'tsv', - help='TSV file (compressed or uncompressed).' + help= +"""TSV file (compressed or uncompressed).""" ) parser.add_argument( 'names', - help='Text file containing information for renaming the samples.' + help= +"""Text file containing information for renaming the samples.""" ) parser.add_argument( '--mode', @@ -56,15 +59,17 @@ def create_parser(subparsers): metavar='INT', type=int, nargs=2, - help="Index range to use when renaming the samples. \n" - "Applicable only with the 'RANGE' mode." + help= +"""Index range to use when renaming the samples. +Applicable only with the 'RANGE' mode.""" ) parser.add_argument( '--sep', metavar='TEXT', default='\t', - help="Delimiter to use when reading the names file \n" - "(default: '\\t')." 
+ help= +"""Delimiter to use when reading the names file +(default: '\\t').""" ) def main(args): diff --git a/fuc/cli/fa_filter.py b/fuc/cli/fa_filter.py index 83fcae0..dbe7316 100644 --- a/fuc/cli/fa_filter.py +++ b/fuc/cli/fa_filter.py @@ -24,23 +24,27 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Filter sequence records in a FASTA file', + help= +"""Filter sequence records in a FASTA file.""" ) parser.add_argument( 'fasta', - help='FASTA file (compressed or uncompressed).' + help= +"""Input FASTA file (compressed or uncompressed).""" ) parser.add_argument( '--contigs', metavar='TEXT', nargs='+', - help='One or more contigs to be selected. Alternatively, you can \n' - 'provide a file containing one contig per line. ' + help= +"""One or more contigs to be selected. Alternatively, you can +provide a file containing one contig per line.""" ) parser.add_argument( '--exclude', action='store_true', - help='Exclude specified contigs.' + help= +"""Exclude specified contigs.""" ) def main(args): diff --git a/fuc/cli/fq_count.py b/fuc/cli/fq_count.py index df4155f..d22856f 100644 --- a/fuc/cli/fq_count.py +++ b/fuc/cli/fq_count.py @@ -21,12 +21,14 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Count sequence reads in FASTQ files.', + help= +"""Count sequence reads in FASTQ files.""" ) parser.add_argument( 'fastq', nargs='*', - help='FASTQ files (compressed or uncompressed) (default: stdin).' 
+ help= +"""Input FASTQ files (compressed or uncompressed) (default: stdin).""" ) def main(args): diff --git a/fuc/cli/fq_sum.py b/fuc/cli/fq_sum.py index 70c519c..50bdb33 100644 --- a/fuc/cli/fq_sum.py +++ b/fuc/cli/fq_sum.py @@ -19,11 +19,13 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Summarize a FASTQ file.', + help= +"""Summarize a FASTQ file.""" ) parser.add_argument( 'fastq', - help='FASTQ file (zipped or unqzipped).' + help= +"""Input FASTQ file (compressed or uncompressed).""" ) def main(args): diff --git a/fuc/cli/fuc_bgzip.py b/fuc/cli/fuc_bgzip.py index b3a681b..6ef96ac 100644 --- a/fuc/cli/fuc_bgzip.py +++ b/fuc/cli/fuc_bgzip.py @@ -28,12 +28,14 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Write a BGZF compressed file.', + help= +"""Write a BGZF compressed file.""" ) parser.add_argument( 'file', nargs='*', - help='File to be compressed (default: stdin).' 
+ help= +"""Input file to be compressed (default: stdin).""" ) def main(args): diff --git a/fuc/cli/fuc_compf.py b/fuc/cli/fuc_compf.py index 8762b77..31c266a 100644 --- a/fuc/cli/fuc_compf.py +++ b/fuc/cli/fuc_compf.py @@ -20,10 +20,19 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Compare the contents of two files.', + help= +"""Compare the contents of two files.""" + ) + parser.add_argument( + 'left', + help= +"""Input left file.""" + ) + parser.add_argument( + 'right', + help= +"""Input right file.""" ) - parser.add_argument('left', help='Left file.') - parser.add_argument('right', help='Right file.') def main(args): print(filecmp.cmp(args.left, args.right)) diff --git a/fuc/cli/fuc_demux.py b/fuc/cli/fuc_demux.py index 0b92c6d..aed7dd8 100644 --- a/fuc/cli/fuc_demux.py +++ b/fuc/cli/fuc_demux.py @@ -36,22 +36,26 @@ def create_parser(subparsers): parser = api.common._add_parser( subparsers, api.common._script_name(), - help='Parse the Reports directory from bcl2fastq.', description=description, + help= +"""Parse the Reports directory from bcl2fastq.""" ) parser.add_argument( 'reports', - help='Reports directory.' + help= +"""Input Reports directory.""" ) parser.add_argument( 'output', - help='Output directory (will be created).' + help= +"""Output directory (will be created).""" ) parser.add_argument( '--sheet', metavar='PATH', - help='SampleSheet.csv file. Used for sorting and/or subsetting \n' - 'samples.' + help= +"""SampleSheet.csv file. 
Used for sorting and/or subsetting +samples.""" ) def main(args): diff --git a/fuc/cli/fuc_exist.py b/fuc/cli/fuc_exist.py index 4e6a1af..b54d4f5 100644 --- a/fuc/cli/fuc_exist.py +++ b/fuc/cli/fuc_exist.py @@ -27,10 +27,15 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Check whether certain files exist.', + help= +"""Check whether certain files exist.""" + ) + parser.add_argument( + 'files', + nargs='*', + help= +"""Files and directories to be tested (default: stdin).""" ) - parser.add_argument('files', nargs='*', - help='Files and directories to be tested (default: stdin).') def main(args): if args.files: diff --git a/fuc/cli/fuc_find.py b/fuc/cli/fuc_find.py index 39e2164..9659b92 100644 --- a/fuc/cli/fuc_find.py +++ b/fuc/cli/fuc_find.py @@ -25,25 +25,29 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Retrieve absolute paths of files whose name matches a \n' - 'specified pattern, optionally recursively.', + help= +"""Retrieve absolute paths of files whose name matches a +specified pattern, optionally recursively.""" ) parser.add_argument( 'pattern', - help='Filename pattern.' + help= +"""Filename pattern.""" ) parser.add_argument( '-r', '--recursive', action='store_true', - help='Turn on recursive retrieving.' + help= +"""Turn on recursive retrieving.""" ) parser.add_argument( '-d', '--directory', metavar='PATH', default=os.getcwd(), - help='Directory to search in (default: current directory).' + help= +"""Directory to search in (default: current directory).""" ) def main(args): diff --git a/fuc/cli/fuc_undetm.py b/fuc/cli/fuc_undetm.py index 30d10bf..4e71440 100644 --- a/fuc/cli/fuc_undetm.py +++ b/fuc/cli/fuc_undetm.py @@ -20,19 +20,22 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Compute top unknown barcodes using undertermined FASTQ from \n' - 'bcl2fastq.' 
+ help= +"""Compute top unknown barcodes using undertermined FASTQ from +bcl2fastq.""" ) parser.add_argument( 'fastq', - help='Undertermined FASTQ (compressed or uncompressed).' + help= +"""Undertermined FASTQ (compressed or uncompressed).""" ) parser.add_argument( '--count', metavar='INT', type=int, default=30, - help='Number of top unknown barcodes to return (default: 30).' + help= +"""Number of top unknown barcodes to return (default: 30).""" ) def main(args): diff --git a/fuc/cli/maf_maf2vcf.py b/fuc/cli/maf_maf2vcf.py index 7286135..a413720 100644 --- a/fuc/cli/maf_maf2vcf.py +++ b/fuc/cli/maf_maf2vcf.py @@ -44,33 +44,39 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Convert a MAF file to a VCF file.', + help= +"""Convert a MAF file to a VCF file.""" ) parser.add_argument( 'maf', - help='MAF file (compressed or uncompressed).' + help= +"""MAF file (compressed or uncompressed).""" ) parser.add_argument( '--fasta', metavar='PATH', - help='FASTA file (required to include INDELs in the output).' + help= +"""FASTA file (required to include INDELs in the output).""" ) parser.add_argument( '--ignore_indels', action='store_true', - help='Use this flag to exclude INDELs from the output.' + help= +"""Use this flag to exclude INDELs from the output.""" ) parser.add_argument( '--cols', metavar='TEXT', nargs='+', - help='Column(s) in the MAF file.' + help= +"""Column(s) in the MAF file.""" ) parser.add_argument( '--names', metavar='TEXT', nargs='+', - help='Name(s) to be displayed in the FORMAT field.' 
+ help= +"""Name(s) to be displayed in the FORMAT field.""" ) def main(args): diff --git a/fuc/cli/maf_oncoplt.py b/fuc/cli/maf_oncoplt.py index 15b786c..458e420 100644 --- a/fuc/cli/maf_oncoplt.py +++ b/fuc/cli/maf_oncoplt.py @@ -23,22 +23,26 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Create an oncoplot with a MAF file.', + help= +"""Create an oncoplot with a MAF file.""" ) parser.add_argument( 'maf', - help='MAF file.' + help= +"""Input MAF file.""" ) parser.add_argument( 'out', - help='Output image file.' + help= +"""Output image file.""" ) parser.add_argument( '--count', metavar='INT', type=int, default=10, - help='Number of top mutated genes to display (default: 10).' + help= +"""Number of top mutated genes to display (default: 10).""" ) parser.add_argument( '--figsize', @@ -46,28 +50,32 @@ def create_parser(subparsers): type=float, default=[15, 10], nargs=2, - help='Width, height in inches (default: [15, 10]).' + help= +"""Width, height in inches (default: [15, 10]).""" ) parser.add_argument( '--label_fontsize', metavar='FLOAT', type=float, default=15, - help='Font size of labels (default: 15).' + help= +"""Font size of labels (default: 15).""" ) parser.add_argument( '--ticklabels_fontsize', metavar='FLOAT', type=float, default=15, - help='Font size of tick labels (default: 15).' + help= +"""Font size of tick labels (default: 15).""" ) parser.add_argument( '--legend_fontsize', metavar='FLOAT', type=float, default=15, - help='Font size of legend texts (default: 15).' 
+ help= +"""Font size of legend texts (default: 15).""" ) def main(args): diff --git a/fuc/cli/maf_sumplt.py b/fuc/cli/maf_sumplt.py index 87ecad1..03f6e85 100644 --- a/fuc/cli/maf_sumplt.py +++ b/fuc/cli/maf_sumplt.py @@ -23,15 +23,18 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Create a summary plot with a MAF file.', + help= +"""Create a summary plot with a MAF file.""" ) parser.add_argument( 'maf', - help='MAF file.' + help= +"""Input MAF file.""" ) parser.add_argument( 'out', - help='Output image file.' + help= +"""Output image file.""" ) parser.add_argument( '--figsize', @@ -39,28 +42,32 @@ def create_parser(subparsers): type=float, default=[15, 10], nargs=2, - help='width, height in inches (default: [15, 10])' + help= +"""Width, height in inches (default: [15, 10]).""" ) parser.add_argument( '--title_fontsize', metavar='FLOAT', type=float, default=16, - help='font size of subplot titles (default: 16)' + help= +"""Font size of subplot titles (default: 16).""" ) parser.add_argument( '--ticklabels_fontsize', metavar='FLOAT', type=float, default=12, - help='font size of tick labels (default: 12)' + help= +"""Font size of tick labels (default: 12).""" ) parser.add_argument( '--legend_fontsize', metavar='FLOAT', type=float, default=12, - help='font size of legend texts (default: 12)' + help= +"""Font size of legend texts (default: 12).""" ) def main(args): diff --git a/fuc/cli/maf_vcf2maf.py b/fuc/cli/maf_vcf2maf.py index 6fc653a..5846652 100644 --- a/fuc/cli/maf_vcf2maf.py +++ b/fuc/cli/maf_vcf2maf.py @@ -17,9 +17,14 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Convert a VCF file to a MAF file.', + help= +"""Convert a VCF file to a MAF file.""" + ) + parser.add_argument( + 'vcf', + help= +"""Annotated VCF file.""" ) - parser.add_argument('vcf', help='Annotated VCF file.') def main(args): mf = api.pymaf.MafFrame.from_vcf(args.vcf) diff 
--git a/fuc/cli/ngs_bam2fq.py b/fuc/cli/ngs_bam2fq.py index 3e2bcb8..6dd92d1 100644 --- a/fuc/cli/ngs_bam2fq.py +++ b/fuc/cli/ngs_bam2fq.py @@ -44,36 +44,42 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Pipeline for converting BAM files to FASTQ files.', + help= +"""Pipeline for converting BAM files to FASTQ files.""" ) parser.add_argument( 'manifest', - help='Sample manifest CSV file.' + help= +"""Sample manifest CSV file.""" ) parser.add_argument( 'output', type=os.path.abspath, - help='Output directory.' + help= +"""Output directory.""" ) parser.add_argument( 'qsub', type=str, - help="SGE resoruce to request with qsub for BAM to FASTQ \n" - "conversion. Since this oppoeration supports multithreading, \n" - "it is recommended to speicfy a parallel environment (PE) \n" - "to speed up the process (also see --thread)." + help= +"""SGE resource to request with qsub for BAM to FASTQ +conversion. Since this operation supports multithreading, +it is recommended to specify a parallel environment (PE) +to speed up the process (also see --thread).""" ) parser.add_argument( '--thread', metavar='INT', type=int, default=1, - help='Number of threads to use (default: 1).' + help= +"""Number of threads to use (default: 1).""" ) parser.add_argument( '--force', action='store_true', - help='Overwrite the output directory if it already exists.' + help= +"""Overwrite the output directory if it already exists.""" ) def main(args): diff --git a/fuc/cli/ngs_fq2bam.py b/fuc/cli/ngs_fq2bam.py index ffdd5f4..0955113 100644 --- a/fuc/cli/ngs_fq2bam.py +++ b/fuc/cli/ngs_fq2bam.py @@ -54,73 +54,86 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Pipeline for converting FASTQ files to analysis-ready BAM \n' - 'files.' + help= +"""Pipeline for converting FASTQ files to analysis-ready BAM +files.""" ) parser.add_argument( 'manifest', - help='Sample manifest CSV file.'
+ help= +"""Sample manifest CSV file.""" ) parser.add_argument( 'fasta', - help='Reference FASTA file.' + help= +"""Reference FASTA file.""" ) parser.add_argument( 'output', type=os.path.abspath, - help='Output directory.' + help= +"""Output directory.""" ) parser.add_argument( 'qsub', type=str, - help='SGE resoruce to request for qsub.' + help= +"""SGE resoruce to request for qsub.""" ) parser.add_argument( 'java', - help='Java resoruce to request for GATK.' + help= +"""Java resoruce to request for GATK.""" ) parser.add_argument( 'vcf', type=str, nargs='+', - help='One or more reference VCF files containing known variant \n' - 'sites (e.g. 1000 Genomes Project).' + help= +"""One or more reference VCF files containing known variant +sites (e.g. 1000 Genomes Project).""" ) parser.add_argument( '--bed', metavar='PATH', type=str, - help='BED file.' + help= +"""BED file.""" ) parser.add_argument( '--thread', metavar='INT', type=int, default=1, - help='Number of threads to use (default: 1).' + help= +"""Number of threads to use (default: 1).""" ) parser.add_argument( '--platform', metavar='TEXT', type=str, default='Illumina', - help="Sequencing platform (default: 'Illumina')." + help= +"""Sequencing platform (default: 'Illumina').""" ) parser.add_argument( '--job', metavar='TEXT', type=str, - help='Job submission ID for SGE.' + help= +"""Job submission ID for SGE.""" ) parser.add_argument( '--force', action='store_true', - help='Overwrite the output directory if it already exists.' + help= +"""Overwrite the output directory if it already exists.""" ) parser.add_argument( '--keep', action='store_true', - help='Keep temporary files.' 
+ help= +"""Keep temporary files.""" ) def main(args): diff --git a/fuc/cli/ngs_hc.py b/fuc/cli/ngs_hc.py index b39e29d..3d437e1 100644 --- a/fuc/cli/ngs_hc.py +++ b/fuc/cli/ngs_hc.py @@ -45,92 +45,107 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Pipeline for germline short variant discovery.', + help= +"""Pipeline for germline short variant discovery.""" ) parser.add_argument( 'manifest', - help='Sample manifest CSV file.' + help= +"""Sample manifest CSV file.""" ) parser.add_argument( 'fasta', - help='Reference FASTA file.' + help= +"""Reference FASTA file.""" ) parser.add_argument( 'output', type=os.path.abspath, - help='Output directory.' + help= +"""Output directory.""" ) parser.add_argument( 'qsub', type=str, - help='SGE resoruce to request for qsub.' + help= +"""SGE resoruce to request for qsub.""" ) parser.add_argument( 'java1', type=str, - help='Java resoruce to request for single-sample variant calling.' + help= +"""Java resoruce to request for single-sample variant calling.""" ) parser.add_argument( 'java2', type=str, - help='Java resoruce to request for joint variant calling.' + help= +"""Java resoruce to request for joint variant calling.""" ) parser.add_argument( '--bed', metavar='PATH', type=str, - help='BED file.' + help= +"""BED file.""" ) parser.add_argument( '--dbsnp', metavar='PATH', type=str, - help='VCF file from dbSNP.' + help= +"""VCF file from dbSNP.""" ) parser.add_argument( '--thread', metavar='INT', type=int, default=1, - help='Number of threads to use (default: 1).' + help= +"""Number of threads to use (default: 1).""" ) parser.add_argument( '--batch', metavar='INT', type=int, default=0, - help='Batch size used for GenomicsDBImport (default: 0). This \n' - 'controls the number of samples for which readers are \n' - 'open at once and therefore provides a way to minimize \n' - 'memory consumption. The size of 0 means no batching (i.e. 
\n' - 'readers for all samples will be opened at once).' + help= +"""Batch size used for GenomicsDBImport (default: 0). This +controls the number of samples for which readers are +open at once and therefore provides a way to minimize +memory consumption. The size of 0 means no batching (i.e. +readers for all samples will be opened at once).""" ) parser.add_argument( '--job', metavar='TEXT', type=str, - help='Job submission ID for SGE.' + help= +"""Job submission ID for SGE.""" ) parser.add_argument( '--force', action='store_true', - help='Overwrite the output directory if it already exists.' + help= +"""Overwrite the output directory if it already exists.""" ) parser.add_argument( '--keep', action='store_true', - help='Keep temporary files.' + help= +"""Keep temporary files.""" ) parser.add_argument( '--posix', action='store_true', - help='Set GenomicsDBImport to allow for optimizations to improve \n' - 'the usability and performance for shared Posix Filesystems \n' - '(e.g. NFS, Lustre). If set, file level locking is disabled \n' - 'and file system writes are minimized by keeping a higher \n' - 'number of file descriptors open for longer periods of time. \n' - 'Use with --batch if keeping a large number of file \n' - 'descriptors open is an issue.' + help= +"""Set GenomicsDBImport to allow for optimizations to improve +the usability and performance for shared Posix Filesystems +(e.g. NFS, Lustre). If set, file level locking is disabled +and file system writes are minimized by keeping a higher +number of file descriptors open for longer periods of time. +Use with --batch if keeping a large number of file +descriptors open is an issue.""" ) def main(args): diff --git a/fuc/cli/ngs_m2.py b/fuc/cli/ngs_m2.py index 867cb31..618a90d 100644 --- a/fuc/cli/ngs_m2.py +++ b/fuc/cli/ngs_m2.py @@ -50,50 +50,60 @@ def create_parser(subparsers): ) parser.add_argument( 'manifest', - help='Sample manifest CSV file.' 
+ help= +"""Sample manifest CSV file.""" ) parser.add_argument( 'fasta', - help='Reference FASTA file.' + help= +"""Reference FASTA file.""" ) parser.add_argument( 'output', type=os.path.abspath, - help='Output directory.' + help= +"""Output directory.""" ) parser.add_argument( 'pon', - help='PoN VCF file.' + help= +"""PoN VCF file.""" ) parser.add_argument( 'germline', - help='Germline VCF file.' + help= +"""Germline VCF file.""" ) parser.add_argument( 'qsub', type=str, - help='SGE resoruce to request for qsub.' + help= +"""SGE resoruce to request for qsub.""" ) parser.add_argument( 'java', type=str, - help='Java resoruce to request for GATK.' + help= +"""Java resoruce to request for GATK.""" ) parser.add_argument( '--bed', metavar='PATH', type=str, - help='BED file.' + help= +"""BED file.""" ) parser.add_argument( '--force', action='store_true', - help='Overwrite the output directory if it already exists.' + help= +"""Overwrite the output directory if it already exists.""" ) parser.add_argument( '--keep', action='store_true', - help='Keep temporary files.' + help= +"""Keep temporary files.""" ) def main(args): diff --git a/fuc/cli/ngs_pon.py b/fuc/cli/ngs_pon.py index 2cb0ed1..82809ae 100644 --- a/fuc/cli/ngs_pon.py +++ b/fuc/cli/ngs_pon.py @@ -40,45 +40,54 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Pipeline for constructing a panel of normals (PoN).', + help= +"""Pipeline for constructing a panel of normals (PoN).""" ) parser.add_argument( 'manifest', - help='Sample manifest CSV file.' + help= +"""Sample manifest CSV file.""" ) parser.add_argument( 'fasta', - help='Reference FASTA file.' + help= +"""Reference FASTA file.""" ) parser.add_argument( 'output', type=os.path.abspath, - help='Output directory.' + help= +"""Output directory.""" ) parser.add_argument( 'qsub', type=str, - help='SGE resoruce to request for qsub.' 
+ help= +"""SGE resoruce to request for qsub.""" ) parser.add_argument( 'java', - help='Java resoruce to request for GATK.' + help= +"""Java resoruce to request for GATK.""" ) parser.add_argument( '--bed', metavar='PATH', type=str, - help='BED file.' + help= +"""BED file.""" ) parser.add_argument( '--force', action='store_true', - help='Overwrite the output directory if it already exists.' + help= +"""Overwrite the output directory if it already exists.""" ) parser.add_argument( '--keep', action='store_true', - help='Keep temporary files.' + help= +"""Keep temporary files.""" ) def main(args): diff --git a/fuc/cli/ngs_quant.py b/fuc/cli/ngs_quant.py index a0f9196..e731995 100644 --- a/fuc/cli/ngs_quant.py +++ b/fuc/cli/ngs_quant.py @@ -35,58 +35,68 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Pipeline for running RNAseq quantification from FASTQ files \n' - 'with Kallisto.', + help= +"""Pipeline for running RNAseq quantification from FASTQ files +with Kallisto.""" ) parser.add_argument( 'manifest', - help='Sample manifest CSV file.' + help= +"""Sample manifest CSV file.""" ) parser.add_argument( 'index', - help='Kallisto index file.' + help= +"""Kallisto index file.""" ) parser.add_argument( 'output', type=os.path.abspath, - help='Output directory.' + help= +"""Output directory.""" ) parser.add_argument( 'qsub', type=str, - help='SGE resoruce to request for qsub.' + help= +"""SGE resoruce to request for qsub.""" ) parser.add_argument( '--thread', metavar='INT', type=int, default=1, - help='Number of threads to use (default: 1).' + help= +"""Number of threads to use (default: 1).""" ) parser.add_argument( '--bootstrap', metavar='INT', type=int, default=50, - help='Number of bootstrap samples (default: 50).' + help= +"""Number of bootstrap samples (default: 50).""" ) parser.add_argument( '--job', metavar='TEXT', type=str, - help='Job submission ID for SGE.' 
+ help= +"""Job submission ID for SGE.""" ) parser.add_argument( '--force', action='store_true', - help='Overwrite the output directory if it already exists.' + help= +"""Overwrite the output directory if it already exists.""" ) parser.add_argument( '--posix', action='store_true', - help='Set the environment variable HDF5_USE_FILE_LOCKING=FALSE \n' - 'before running Kallisto. This is required for shared Posix \n' - 'Filesystems (e.g. NFS, Lustre).' + help= +"""Set the environment variable HDF5_USE_FILE_LOCKING=FALSE +before running Kallisto. This is required for shared Posix +Filesystems (e.g. NFS, Lustre).""" ) def main(args): diff --git a/fuc/cli/ngs_trim.py b/fuc/cli/ngs_trim.py index 4384fc3..89f33bf 100644 --- a/fuc/cli/ngs_trim.py +++ b/fuc/cli/ngs_trim.py @@ -34,39 +34,46 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Pipeline for trimming adapters from FASTQ files.', + help= +"""Pipeline for trimming adapters from FASTQ files.""" ) parser.add_argument( 'manifest', - help='Sample manifest CSV file.' + help= +"""Sample manifest CSV file.""" ) parser.add_argument( 'output', type=os.path.abspath, - help='Output directory.' + help= +"""Output directory.""" ) parser.add_argument( 'qsub', type=str, - help='SGE resoruce to request for qsub.' + help= +"""SGE resoruce to request for qsub.""" ) parser.add_argument( '--thread', metavar='INT', type=int, default=1, - help='Number of threads to use (default: 1).' + help= +"""Number of threads to use (default: 1).""" ) parser.add_argument( '--job', metavar='TEXT', type=str, - help='Job submission ID for SGE.' + help= +"""Job submission ID for SGE.""" ) parser.add_argument( '--force', action='store_true', - help='Overwrite the output directory if it already exists.' 
+ help= +"""Overwrite the output directory if it already exists.""" ) def main(args): diff --git a/fuc/cli/tabix_index.py b/fuc/cli/tabix_index.py index 993ee29..bc3e759 100644 --- a/fuc/cli/tabix_index.py +++ b/fuc/cli/tabix_index.py @@ -32,16 +32,19 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Index a GFF/BED/SAM/VCF file with Tabix.', + help= +"""Index a GFF/BED/SAM/VCF file with Tabix.""" ) parser.add_argument( 'file', - help='File to be indexed.' + help= +"""File to be indexed.""" ) parser.add_argument( '--force', action='store_true', - help='Force to overwrite the index file if it is present.' + help= +"""Force to overwrite the index file if it is present.""" ) def main(args): diff --git a/fuc/cli/tabix_slice.py b/fuc/cli/tabix_slice.py index d8f3bd7..dfc08e6 100644 --- a/fuc/cli/tabix_slice.py +++ b/fuc/cli/tabix_slice.py @@ -25,16 +25,19 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Slice a GFF/BED/SAM/VCF file with Tabix.', + help= +"""Slice a GFF/BED/SAM/VCF file with Tabix.""" ) parser.add_argument( 'file', - help='File to be sliced.' + help= +"""File to be sliced.""" ) parser.add_argument( 'regions', nargs='+', - help='One or more regions.' + help= +"""One or more regions.""" ) def main(args): diff --git a/fuc/cli/tbl_merge.py b/fuc/cli/tbl_merge.py index b98d6c4..9765883 100644 --- a/fuc/cli/tbl_merge.py +++ b/fuc/cli/tbl_merge.py @@ -32,47 +32,55 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Merge two table files.', + help= +"""Merge two table files.""" ) parser.add_argument( 'left', - help='Left file.' + help= +"""Input left file.""" ) parser.add_argument( 'right', - help='Right file.' 
+ help= +"""Input right file.""" ) parser.add_argument( '--how', metavar='TEXT', choices=CHOICES, default='inner', - help="Type of merge to be performed (default: 'inner') \n" - "(choices: 'left', 'right', 'outer', 'inner', 'cross')." + help= +"""Type of merge to be performed (default: 'inner') +(choices: 'left', 'right', 'outer', 'inner', 'cross').""" ) parser.add_argument( '--on', metavar='TEXT', nargs='+', - help='Column names to join on.' + help= +"""Column names to join on.""" ) parser.add_argument( '--lsep', metavar='TEXT', default='\t', - help="Delimiter to use for the left file (default: '\\t')." + help= +"""Delimiter to use for the left file (default: '\\t').""" ) parser.add_argument( '--rsep', metavar='TEXT', default='\t', - help="Delimiter to use for the right file (default: '\\t')." + help= +"""Delimiter to use for the right file (default: '\\t').""" ) parser.add_argument( '--osep', metavar='TEXT', default='\t', - help="Delimiter to use for the output file (default: '\\t')." + help= +"""Delimiter to use for the output file (default: '\\t').""" ) return parser diff --git a/fuc/cli/tbl_sum.py b/fuc/cli/tbl_sum.py index 1b46794..6cdebd0 100644 --- a/fuc/cli/tbl_sum.py +++ b/fuc/cli/tbl_sum.py @@ -18,63 +18,72 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Summarize a table file.', + help= +"""Summarize a table file.""" ) parser.add_argument( 'table_file', - help='Table file.' + help= +"""Table file.""" ) parser.add_argument( '--sep', metavar='TEXT', default='\t', - help="Delimiter to use (default: '\\t')." + help= +"""Delimiter to use (default: '\\t').""" ) parser.add_argument( '--skiprows', metavar='TEXT', - help='Comma-separated line numbers to skip (0-indexed) or \n' - 'number of lines to skip at the start of the file \n' - '(e.g. `--skiprows 1,` will skip the second line, \n' - '`--skiprows 2,4` will skip the third and fifth lines, \n' - 'and `--skiprows 10` will skip the first 10 lines).' 
+ help= +"""Comma-separated line numbers to skip (0-indexed) or +number of lines to skip at the start of the file +(e.g. `--skiprows 1,` will skip the second line, +`--skiprows 2,4` will skip the third and fifth lines, +and `--skiprows 10` will skip the first 10 lines).""" ) parser.add_argument( '--na_values', metavar='TEXT', nargs='+', - help='Additional strings to recognize as NA/NaN (by \n' - "default, the following values are interpreted \n" - "as NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', \n" - "'-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', \n" - "'', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan', \n" - "'null')." + help= +"""Additional strings to recognize as NA/NaN (by +default, the following values are interpreted +as NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', +'-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', +'', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan', +'null').""" ) parser.add_argument( '--keep_default_na', action='store_false', - help='Whether or not to include the default NaN values when \n' - "parsing the data (see 'pandas.read_table' for details)." + help= +"""Whether or not to include the default NaN values when +parsing the data (see 'pandas.read_table' for details).""" ) parser.add_argument( '--expr', metavar='TEXT', - help='Query the columns of a pandas.DataFrame with a \n' - """boolean expression (e.g. `--query "A == 'yes'"`).""" + help= +"""Query the columns of a pandas.DataFrame with a +boolean expression (e.g. `--query "A == 'yes'"`).""" ) parser.add_argument( '--columns', metavar='TEXT', nargs='+', - help='Columns to be summarized (by default, all columns \n' - 'will be included).' + help= +"""Columns to be summarized (by default, all columns +will be included).""" ) parser.add_argument( '--dtypes', metavar='PATH', - help="File of column names and their data types (either \n" - "'categorical' or 'numeric'); one tab-delimited pair of \n" - "column name and data type per line." 
+ help= +"""File of column names and their data types (either +'categorical' or 'numeric'); one tab-delimited pair of +column name and data type per line.""" ) def main(args): diff --git a/fuc/cli/vcf_filter.py b/fuc/cli/vcf_filter.py index c80be07..5de1933 100644 --- a/fuc/cli/vcf_filter.py +++ b/fuc/cli/vcf_filter.py @@ -41,48 +41,56 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Filter a VCF file.', + help= +"""Filter a VCF file.""" ) parser.add_argument( 'vcf', - help='VCF file (compressed or uncompressed).' + help= +"""VCF file (compressed or uncompressed).""" ) parser.add_argument( '--expr', metavar='TEXT', - help='Expression to evaluate.' + help= +"""Expression to evaluate.""" ) parser.add_argument( '--samples', metavar='PATH', - help='File of sample names to apply the marking (one \n' - 'sample per line).' + help= +"""File of sample names to apply the marking (one +sample per line).""" ) parser.add_argument( '--drop_duplicates', metavar='TEXT', nargs='*', - help='Only consider certain columns for identifying \n' - 'duplicates, by default use all of the columns.' + help= +"""Only consider certain columns for identifying +duplicates, by default use all of the columns.""" ) parser.add_argument( '--greedy', action='store_true', - help='Use this flag to mark even ambiguous genotypes \n' - 'as missing.' + help= +"""Use this flag to mark even ambiguous genotypes +as missing.""" ) parser.add_argument( '--opposite', action='store_true', - help='Use this flag to mark all genotypes that do not \n' - 'satisfy the query expression as missing and leave \n' - 'those that do intact.' + help= +"""Use this flag to mark all genotypes that do not +satisfy the query expression as missing and leave +those that do intact.""" ) parser.add_argument( '--filter_empty', action='store_true', - help='Use this flag to remove rows with no genotype \n' - 'calls at all.' 
+ help= +"""Use this flag to remove rows with no genotype +calls at all.""" ) def main(args): diff --git a/fuc/cli/vcf_index.py b/fuc/cli/vcf_index.py index 4ecd621..6f77ce0 100644 --- a/fuc/cli/vcf_index.py +++ b/fuc/cli/vcf_index.py @@ -28,14 +28,16 @@ def create_parser(subparsers): ) parser.add_argument( 'vcf', - help='Input VCF file to be indexed. When an uncompressed file is \n' - 'given, the command will automatically create a BGZF \n' - 'compressed copy of the file (.gz) before indexing.' + help= +"""Input VCF file to be indexed. When an uncompressed file is +given, the command will automatically create a BGZF +compressed copy of the file (.gz) before indexing.""" ) parser.add_argument( '--force', action='store_true', - help='Force to overwrite the index file if it is already present.' + help= +"""Force to overwrite the index file if it is already present.""" ) def main(args): diff --git a/fuc/cli/vcf_merge.py b/fuc/cli/vcf_merge.py index 60d5353..710cd25 100644 --- a/fuc/cli/vcf_merge.py +++ b/fuc/cli/vcf_merge.py @@ -25,36 +25,41 @@ def create_parser(subparsers): parser.add_argument( 'vcf_files', nargs='+', - help="VCF files (compressed or uncompressed). Note that the 'chr'\n" - "prefix in contig names (e.g. 'chr1' vs. '1') will be \n" - "automatically added or removed as necessary to match the \n" - "contig names of the first VCF." + help= +"""VCF files (compressed or uncompressed). Note that the 'chr' +prefix in contig names (e.g. 'chr1' vs. '1') will be +automatically added or removed as necessary to match the +contig names of the first VCF.""" ) parser.add_argument( '--how', metavar='TEXT', default='inner', - help="Type of merge as defined in pandas.DataFrame.merge \n" - "(default: 'inner')." + help= +"""Type of merge as defined in pandas.DataFrame.merge +(default: 'inner').""" ) parser.add_argument( '--format', metavar='TEXT', default='GT', - help="FORMAT subfields to be retained (e.g. 'GT:AD:DP') \n" - "(default: 'GT')." 
+ help= +"""FORMAT subfields to be retained (e.g. 'GT:AD:DP') +(default: 'GT').""" ) parser.add_argument( '--sort', action='store_false', - help='Use this flag to turn off sorting of records \n' - '(default: True).' + help= +"""Use this flag to turn off sorting of records +(default: True).""" ) parser.add_argument( '--collapse', action='store_true', - help='Use this flag to collapse duplicate records \n' - '(default: False).' + help= +"""Use this flag to collapse duplicate records +(default: False).""" ) def main(args): diff --git a/fuc/cli/vcf_rename.py b/fuc/cli/vcf_rename.py index d03db01..3d0f7a0 100644 --- a/fuc/cli/vcf_rename.py +++ b/fuc/cli/vcf_rename.py @@ -33,38 +33,44 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Rename the samples in a VCF file.', + help= +"""Rename the samples in a VCF file.""" ) parser.add_argument( 'vcf', - help='VCF file (compressed or uncompressed).' + help= +"""VCF file (compressed or uncompressed).""" ) parser.add_argument( 'names', - help='Text file containing information for renaming the samples.' + help= +"""Text file containing information for renaming the samples.""" ) parser.add_argument( '--mode', metavar='TEXT', default='MAP', choices=['MAP', 'INDEX', 'RANGE'], - help="Renaming mode (default: 'MAP') (choices: 'MAP', \n" - "'INDEX', 'RANGE')." + help= +"""Renaming mode (default: 'MAP') (choices: 'MAP', +'INDEX', 'RANGE').""" ) parser.add_argument( '--range', metavar='INT', type=int, nargs=2, - help="Index range to use when renaming the samples. \n" - "Applicable only with the 'RANGE' mode." + help= +"""Index range to use when renaming the samples. +Applicable only with the 'RANGE' mode.""" ) parser.add_argument( '--sep', metavar='TEXT', default='\t', - help="Delimiter to use for reading the 'names' file \n" - "(default: '\\t')." 
+ help= +"""Delimiter to use for reading the 'names' file +(default: '\\t').""" ) def main(args): diff --git a/fuc/cli/vcf_slice.py b/fuc/cli/vcf_slice.py index 388fcea..68d154c 100644 --- a/fuc/cli/vcf_slice.py +++ b/fuc/cli/vcf_slice.py @@ -23,26 +23,29 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Slice a VCF file for specified regions.', + help= +"""Slice a VCF file for specified regions.""" ) parser.add_argument( 'vcf', - help="Input VCF file must be already BGZF compressed (.gz) and \n" - "indexed (.tbi) to allow random access. A VCF file can be \n" - "compressed with the fuc-bgzip command and indexed with the \n" - "vcf-index command." + help= +"""Input VCF file must be already BGZF compressed (.gz) and +indexed (.tbi) to allow random access. A VCF file can be +compressed with the fuc-bgzip command and indexed with the +vcf-index command.""" ) parser.add_argument( 'regions', nargs='+', - help="One or more regions to be sliced. Each region must have the \n" - "format chrom:start-end and be a half-open interval with \n" - "(start, end]. This means, for example, chr1:100-103 will \n" - "extract positions 101, 102, and 103. Alternatively, you can \n" - "provide a BED file (compressed or uncompressed) to specify \n" - "regions. Note that the 'chr' prefix in contig names (e.g. \n" - "'chr1' vs. '1') will be automatically added or removed as \n" - "necessary to match the input VCF's contig names." + help= +"""One or more regions to be sliced. Each region must have the +format chrom:start-end and be a half-open interval with +(start, end]. This means, for example, chr1:100-103 will +extract positions 101, 102, and 103. Alternatively, you can +provide a BED file (compressed or uncompressed) to specify +regions. Note that the 'chr' prefix in contig names (e.g. +'chr1' vs. 
'1') will be automatically added or removed as +necessary to match the input VCF's contig names.""" ) def main(args): diff --git a/fuc/cli/vcf_split.py b/fuc/cli/vcf_split.py index 8d14dd8..23529f5 100644 --- a/fuc/cli/vcf_split.py +++ b/fuc/cli/vcf_split.py @@ -19,28 +19,33 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Split a VCF file by individual.', + help= +"""Split a VCF file by individual.""" ) parser.add_argument( 'vcf', - help="VCF file to be split." + help= +"""VCF file to be split.""" ) parser.add_argument( 'output', type=os.path.abspath, - help='Output directory.' + help= +"""Output directory.""" ) parser.add_argument( '--clean', action='store_false', - help="By default, the command will only return variants present in \n" - "each individual. Use the tag to stop this behavior and make \n" - "sure that all individuals have the same number of variants." + help= +"""By default, the command will only return variants present in +each individual. Use the tag to stop this behavior and make +sure that all individuals have the same number of variants.""" ) parser.add_argument( '--force', action='store_true', - help='Overwrite the output directory if it already exists.' + help= +"""Overwrite the output directory if it already exists.""" ) def main(args): diff --git a/fuc/cli/vcf_vcf2bed.py b/fuc/cli/vcf_vcf2bed.py index bda3991..2fb07f9 100644 --- a/fuc/cli/vcf_vcf2bed.py +++ b/fuc/cli/vcf_vcf2bed.py @@ -17,11 +17,13 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Convert a VCF file to a BED file.', + help= +"""Convert a VCF file to a BED file.""" ) parser.add_argument( 'vcf', - help='VCF file (compressed or uncompressed).' 
+ help= +"""VCF file (compressed or uncompressed).""" ) def main(args): diff --git a/fuc/cli/vcf_vep.py b/fuc/cli/vcf_vep.py index 969b21f..d0605c5 100644 --- a/fuc/cli/vcf_vep.py +++ b/fuc/cli/vcf_vep.py @@ -37,26 +37,31 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Filter a VCF file by annotations from Ensembl VEP.', + help= +"""Filter a VCF file by annotations from Ensembl VEP.""" ) parser.add_argument( 'vcf', - help='VCF file annotated by Ensembl VEP (compressed or uncompressed).' + help= +"""VCF file annotated by Ensembl VEP (compressed or uncompressed).""" ) parser.add_argument( 'expr', - help='Query expression to evaluate.' + help= +"""Query expression to evaluate.""" ) parser.add_argument( '--opposite', action='store_true', - help="Use this flag to return only records that don't \n" - "meet the said criteria." + help= +"""Use this flag to return only records that don't +meet the said criteria.""" ) parser.add_argument( '--as_zero', action='store_true', - help='Use this flag to treat missing values as zero instead of NaN.' + help= +"""Use this flag to treat missing values as zero instead of NaN.""" ) def main(args):