diff --git a/CHANGELOG.rst b/CHANGELOG.rst index a1187a3..45de339 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,23 @@ Changelog ********* +0.31.0 (2022-03-01) +------------------- + +* Fix bug in :meth:`pykallisto.KallistoFrame.compute_fold_change` method. +* Add new method :meth:`pyvcf.call` and new command :command:`vcf-call`. +* Combine optional arguments ``bam`` and ``fn`` into single positional argument ``bams`` for :meth:`pycov.CovFrame.from_bam` method. The same goes for :command:`bam-depth` command (combine ``--bam`` and ``--fn`` into ``bams``). +* Combine optional arguments ``bed`` and ``region`` into single optional argument ``regions`` for :meth:`pycov.CovFrame.from_bam` method. The same goes for :command:`bam-depth` command (combine ``--bed`` and ``--region`` into ``--regions``). +* Update :meth:`pycov.CovFrame.from_bam` method and :command:`bam-depth` command to automatically handle the 'chr' string. +* Rename :meth:`pyvcf.VcfFrame.variants` method to :meth:`pyvcf.VcfFrame.to_variants`. +* Add new optional arguments ``force`` and ``missing`` to :meth:`pyvcf.row_updateinfo` method. +* Add new method :meth:`pyvcf.gt_ploidy`. +* Update :meth:`pyvcf.gt_polyp` method to use :meth:`pyvcf.gt_ploidy` method internally. +* :issue:`53`: Add new methods to compute AC/AN/AF in the INFO column: :meth:`pyvcf.row_computeinfo` and :meth:`pyvcf.VcfFrame.compute_info`. +* :issue:`54`: Update :meth:`pyvcf.VcfFrame.cfilter_empty` method so that users can control missingness threshold for filtering samples. +* Rename :meth:`pyvcf.VcfFrame.cfilter_empty` method to :meth:`pyvcf.VcfFrame.empty_samples`. +* Update :meth:`common.sort_regions` method to support regions containing an ALT contig (e.g. chr16_KI270854v1_alt). 
+ 0.30.0 (2022-02-05) ------------------- diff --git a/README.rst b/README.rst index 4c7621e..d7f087e 100644 --- a/README.rst +++ b/README.rst @@ -46,10 +46,10 @@ Currently, fuc can be used to analyze, summarize, visualize, and manipulate the Additionally, fuc can be used to parse output data from the following programs: -- Ensembl Variant Effect Predictor (VEP) -- SnpEff -- bcl2fastq and bcl2fastq2 -- Kallisto +- `Ensembl Variant Effect Predictor (VEP) `__ +- `SnpEff `__ +- `bcl2fastq and bcl2fastq2 `__ +- `Kallisto `__ Your contributions (e.g. feature ideas, pull requests) are most welcome. @@ -112,42 +112,45 @@ For getting help on the fuc CLI: positional arguments: COMMAND - bam-aldepth Compute allelic depth from a SAM/BAM/CRAM file. - bam-depth Compute read depth from SAM/BAM/CRAM files. - bam-head Print the header of a SAM/BAM/CRAM file. - bam-index Index a SAM/BAM/CRAM file. - bam-rename Rename the sample in a SAM/BAM/CRAM file. - bam-slice Slice a SAM/BAM/CRAM file. + bam-aldepth Compute allelic depth from a BAM file. + bam-depth Compute per-base read depth from BAM files. + bam-head Print the header of a BAM file. + bam-index Index a BAM file. + bam-rename Rename the sample in a BAM file. + bam-slice Slice a BAM file. bed-intxn Find the intersection of BED files. bed-sum Summarize a BED file. cov-concat Concatenate depth of coverage files. cov-rename Rename the samples in a depth of coverage file. - fa-filter Filter sequence records in a FASTA file + fa-filter Filter sequence records in a FASTA file. fq-count Count sequence reads in FASTQ files. fq-sum Summarize a FASTQ file. fuc-bgzip Write a BGZF compressed file. fuc-compf Compare the contents of two files. fuc-demux Parse the Reports directory from bcl2fastq. fuc-exist Check whether certain files exist. - fuc-find Retrieve absolute paths of files whose name matches a + fuc-find Retrieve absolute paths of files whose name matches a specified pattern, optionally recursively. 
- fuc-undetm Compute top unknown barcodes using undertermined FASTQ from bcl2fastq. + fuc-undetm Compute top unknown barcodes using undertermined FASTQ from + bcl2fastq. maf-maf2vcf Convert a MAF file to a VCF file. maf-oncoplt Create an oncoplot with a MAF file. maf-sumplt Create a summary plot with a MAF file. maf-vcf2maf Convert a VCF file to a MAF file. ngs-bam2fq Pipeline for converting BAM files to FASTQ files. - ngs-fq2bam Pipeline for converting FASTQ files to analysis-ready BAM files. + ngs-fq2bam Pipeline for converting FASTQ files to analysis-ready BAM + files. ngs-hc Pipeline for germline short variant discovery. ngs-m2 Pipeline for somatic short variant discovery. ngs-pon Pipeline for constructing a panel of normals (PoN). - ngs-quant Pipeline for running RNAseq quantification from FASTQ files + ngs-quant Pipeline for running RNAseq quantification from FASTQ files with Kallisto. ngs-trim Pipeline for trimming adapters from FASTQ files. tabix-index Index a GFF/BED/SAM/VCF file with Tabix. tabix-slice Slice a GFF/BED/SAM/VCF file with Tabix. tbl-merge Merge two table files. tbl-sum Summarize a table file. + vcf-call Call SNVs and indels from BAM files. vcf-filter Filter a VCF file. vcf-index Index a VCF file. vcf-merge Merge two or more VCF files. diff --git a/docs/cli.rst b/docs/cli.rst index ae904c5..e815334 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -18,42 +18,45 @@ For getting help on the fuc CLI: positional arguments: COMMAND - bam-aldepth Compute allelic depth from a SAM/BAM/CRAM file. - bam-depth Compute read depth from SAM/BAM/CRAM files. - bam-head Print the header of a SAM/BAM/CRAM file. - bam-index Index a SAM/BAM/CRAM file. - bam-rename Rename the sample in a SAM/BAM/CRAM file. - bam-slice Slice a SAM/BAM/CRAM file. + bam-aldepth Compute allelic depth from a BAM file. + bam-depth Compute per-base read depth from BAM files. + bam-head Print the header of a BAM file. + bam-index Index a BAM file. 
+ bam-rename Rename the sample in a BAM file. + bam-slice Slice a BAM file. bed-intxn Find the intersection of BED files. bed-sum Summarize a BED file. cov-concat Concatenate depth of coverage files. cov-rename Rename the samples in a depth of coverage file. - fa-filter Filter sequence records in a FASTA file + fa-filter Filter sequence records in a FASTA file. fq-count Count sequence reads in FASTQ files. fq-sum Summarize a FASTQ file. fuc-bgzip Write a BGZF compressed file. fuc-compf Compare the contents of two files. fuc-demux Parse the Reports directory from bcl2fastq. fuc-exist Check whether certain files exist. - fuc-find Retrieve absolute paths of files whose name matches a + fuc-find Retrieve absolute paths of files whose name matches a specified pattern, optionally recursively. - fuc-undetm Compute top unknown barcodes using undertermined FASTQ from bcl2fastq. + fuc-undetm Compute top unknown barcodes using undertermined FASTQ from + bcl2fastq. maf-maf2vcf Convert a MAF file to a VCF file. maf-oncoplt Create an oncoplot with a MAF file. maf-sumplt Create a summary plot with a MAF file. maf-vcf2maf Convert a VCF file to a MAF file. ngs-bam2fq Pipeline for converting BAM files to FASTQ files. - ngs-fq2bam Pipeline for converting FASTQ files to analysis-ready BAM files. + ngs-fq2bam Pipeline for converting FASTQ files to analysis-ready BAM + files. ngs-hc Pipeline for germline short variant discovery. ngs-m2 Pipeline for somatic short variant discovery. ngs-pon Pipeline for constructing a panel of normals (PoN). - ngs-quant Pipeline for running RNAseq quantification from FASTQ files + ngs-quant Pipeline for running RNAseq quantification from FASTQ files with Kallisto. ngs-trim Pipeline for trimming adapters from FASTQ files. tabix-index Index a GFF/BED/SAM/VCF file with Tabix. tabix-slice Slice a GFF/BED/SAM/VCF file with Tabix. tbl-merge Merge two table files. tbl-sum Summarize a table file. + vcf-call Call SNVs and indels from BAM files. 
vcf-filter Filter a VCF file. vcf-index Index a VCF file. vcf-merge Merge two or more VCF files. @@ -81,13 +84,13 @@ bam-aldepth $ fuc bam-aldepth -h usage: fuc bam-aldepth [-h] bam sites - Count allelic depth from a SAM/BAM/CRAM file. + Count allelic depth from a BAM file. Positional arguments: - bam Alignment file. - sites TSV file containing two columns, chromosome and position. - This can also be a BED or VCF file (compressed or - uncompressed) Input type will be detected automatically. + bam Input alignment file. + sites TSV file containing two columns, chromosome and position. This + can also be a BED or VCF file (compressed or uncompressed). + Input type will be detected automatically. Optional arguments: -h, --help Show this help message and exit. @@ -104,36 +107,43 @@ bam-depth .. code-block:: text $ fuc bam-depth -h - usage: fuc bam-depth [-h] [--bam PATH [PATH ...]] [--fn PATH] [--bed PATH] - [--region TEXT] [--zero] + usage: fuc bam-depth [-h] [-r TEXT [TEXT ...]] [--zero] bams [bams ...] - Compute read depth from SAM/BAM/CRAM files. + Compute per-base read depth from BAM files. - By default, the command will count all reads within the alignment files. You - can specify regions of interest with --bed or --region. When you do this, pay - close attention to the 'chr' string in contig names (e.g. 'chr1' vs. '1'). - Note also that --region requires the input files be indexed. + Under the hood, the command computes read depth using the 'samtools depth' + command. + + Positional arguments: + bams One or more input BAM files. Alternatively, you can + provide a text file (.txt, .tsv, .csv, or .list) + containing one BAM file per line. Optional arguments: -h, --help Show this help message and exit. - --bam PATH [PATH ...] - One or more alignment files. Cannot be used with --fn. - --fn PATH File containing one alignment file per line. Cannot - be used with --bam. - --bed PATH BED file. Cannot be used with --region. 
- --region TEXT Target region ('chrom:start-end'). Cannot be used - with --bed. + -r TEXT [TEXT ...], --regions TEXT [TEXT ...] + By default, the command counts all reads in BAM + files, which can be excruciatingly slow for large + files (e.g. whole genome sequencing). Therefore, use + this argument to only output positions in given + regions. Each region must have the format + chrom:start-end and be a half-open interval with + (start, end]. This means, for example, chr1:100-103 + will extract positions 101, 102, and 103. + Alternatively, you can provide a BED file (compressed + or uncompressed) to specify regions. Note that the + 'chr' prefix in contig names (e.g. 'chr1' vs. '1') + will be automatically added or removed as necessary + to match the input BAM's contig names. --zero Output all positions including those with zero depth. - [Example] To specify regions with a BED file: - $ fuc bam-depth \ - --bam 1.bam 2.bam \ - --bed in.bed > out.tsv + [Example] Specify regions manually: + $ fuc bam-depth 1.bam 2.bam \ + -r chr1:100-200 chr2:400-500 > out.tsv - [Example] To specify regions manually: - $ fuc bam-depth \ - --fn bam.list \ - --region chr1:100-200 > out.tsv + [Example] Specify regions with a BED file: + $ fuc bam-depth bam.list \ + -r in.bed > out.tsv bam-head ======== @@ -143,10 +153,10 @@ bam-head $ fuc bam-head -h usage: fuc bam-head [-h] bam - Print the header of a SAM/BAM/CRAM file. + Print the header of a BAM file. Positional arguments: - bam Alignment file. + bam Input alignment file. Optional arguments: -h, --help Show this help message and exit. @@ -162,10 +172,10 @@ bam-index $ fuc bam-index -h usage: fuc bam-index [-h] bam - Index a SAM/BAM/CRAM file. + Index a BAM file. Positional arguments: - bam Alignment file. + bam Input alignment file. Optional arguments: -h, --help Show this help message and exit. @@ -181,10 +191,10 @@ bam-rename $ fuc bam-rename -h usage: fuc bam-rename [-h] bam name - Rename the sample in a SAM/BAM/CRAM file. 
+ Rename the sample in a BAM file. Positional arguments: - bam Alignment file. + bam Input alignment file. name New sample name. Optional arguments: @@ -202,24 +212,24 @@ bam-slice usage: fuc bam-slice [-h] [--format TEXT] [--fasta PATH] bam regions [regions ...] - Slice an alignment file (SAM/BAM/CRAM). + Slice a BAM file. Positional arguments: - bam Input alignment file must be already indexed (.bai) to allow - random access. You can index an alignment file with the + bam Input alignment file must be already indexed (.bai) to allow + random access. You can index an alignment file with the bam-index command. - regions One or more regions to be sliced. Each region must have the - format chrom:start-end and be a half-open interval with - (start, end]. This means, for example, chr1:100-103 will - extract positions 101, 102, and 103. Alternatively, you can - provide a BED file (compressed or uncompressed) to specify - regions. Note that the 'chr' prefix in contig names (e.g. - 'chr1' vs. '1') will be automatically added or removed as + regions One or more regions to be sliced. Each region must have the + format chrom:start-end and be a half-open interval with + (start, end]. This means, for example, chr1:100-103 will + extract positions 101, 102, and 103. Alternatively, you can + provide a BED file (compressed or uncompressed) to specify + regions. Note that the 'chr' prefix in contig names (e.g. + 'chr1' vs. '1') will be automatically added or removed as necessary to match the input BED's contig names. Optional arguments: -h, --help Show this help message and exit. - --format TEXT Output format (default: 'BAM') (choices: 'SAM', 'BAM', + --format TEXT Output format (default: 'BAM') (choices: 'SAM', 'BAM', 'CRAM'). --fasta PATH FASTA file. Required when --format is 'CRAM'. @@ -243,7 +253,7 @@ bed-intxn Find the intersection of BED files. Positional arguments: - bed BED files. + bed Input BED files. Optional arguments: -h, --help Show this help message and exit. 
@@ -269,7 +279,7 @@ bed-sum can, for example, use '--bases 1000' to display in kb. Positional arguments: - bed BED file. + bed Input BED file. Optional arguments: -h, --help Show this help message and exit. @@ -282,16 +292,16 @@ cov-concat .. code-block:: text $ fuc cov-concat -h - usage: fuc cov-concat [-h] [--axis INT] PATH [PATH ...] + usage: fuc cov-concat [-h] [--axis INT] tsv [tsv ...] Concatenate depth of coverage files. Positional arguments: - PATH One or more TSV files. + tsv Input TSV files. Optional arguments: -h, --help Show this help message and exit. - --axis INT The axis to concatenate along (default: 0) (choices: + --axis INT The axis to concatenate along (default: 0) (choices: 0, 1 where 0 is index and 1 is columns). [Example] Concatenate vertically: @@ -327,9 +337,9 @@ cov-rename -h, --help Show this help message and exit. --mode TEXT Renaming mode (default: 'MAP') (choices: 'MAP', 'INDEX', 'RANGE'). - --range INT INT Index range to use when renaming the samples. + --range INT INT Index range to use when renaming the samples. Applicable only with the 'RANGE' mode. - --sep TEXT Delimiter to use when reading the names file + --sep TEXT Delimiter to use when reading the names file (default: '\t'). [Example] Using the default 'MAP' mode: @@ -352,13 +362,13 @@ fa-filter Filter sequence records in a FASTA file. Positional arguments: - fasta FASTA file (compressed or uncompressed). + fasta Input FASTA file (compressed or uncompressed). Optional arguments: -h, --help Show this help message and exit. --contigs TEXT [TEXT ...] - One or more contigs to be selected. Alternatively, you can - provide a file containing one contig per line. + One or more contigs to be selected. Alternatively, you can + provide a file containing one contig per line. --exclude Exclude specified contigs. [Example] Select certain contigs: @@ -378,7 +388,7 @@ fq-count Count sequence reads in FASTQ files. 
Positional arguments: - fastq FASTQ files (compressed or uncompressed) (default: stdin). + fastq Input FASTQ files (compressed or uncompressed) (default: stdin). Optional arguments: -h, --help Show this help message and exit. @@ -404,7 +414,7 @@ fq-sum lengths, and the numbers of unique and duplicate sequences. Positional arguments: - fastq FASTQ file (zipped or unqzipped). + fastq Input FASTQ file (compressed or uncompressed). Optional arguments: -h, --help Show this help message and exit. @@ -429,7 +439,7 @@ fuc-bgzip formats (e.g. FASTA, FASTQ, GenBank, VCF, MAF). Positional arguments: - file File to be compressed (default: stdin). + file Input file to be compressed (default: stdin). Optional arguments: -h, --help Show this help message and exit. @@ -454,8 +464,8 @@ fuc-compf are identical and 'False' otherwise. Positional arguments: - left Left file. - right Right file. + left Input left file. + right Input right file. Optional arguments: -h, --help Show this help message and exit. @@ -487,12 +497,12 @@ fuc-demux reports.pdf files. Positional arguments: - reports Reports directory. + reports Input Reports directory. output Output directory (will be created). Optional arguments: -h, --help Show this help message and exit. - --sheet PATH SampleSheet.csv file. Used for sorting and/or subsetting + --sheet PATH SampleSheet.csv file. Used for sorting and/or subsetting samples. fuc-exist @@ -646,7 +656,7 @@ maf-oncoplt determined by the output file's extension. Positional arguments: - maf MAF file. + maf Input MAF file. out Output image file. Optional arguments: @@ -683,19 +693,19 @@ maf-sumplt determined by the output file's extension. Positional arguments: - maf MAF file. + maf Input MAF file. out Output image file. Optional arguments: -h, --help Show this help message and exit. --figsize FLOAT FLOAT - width, height in inches (default: [15, 10]) + Width, height in inches (default: [15, 10]). 
--title_fontsize FLOAT - font size of subplot titles (default: 16) + Font size of subplot titles (default: 16). --ticklabels_fontsize FLOAT - font size of tick labels (default: 12) + Font size of tick labels (default: 12). --legend_fontsize FLOAT - font size of legend texts (default: 12) + Font size of legend texts (default: 12). [Example] Output a PNG file: $ fuc maf-sumplt in.maf out.png @@ -747,9 +757,9 @@ ngs-bam2fq Positional arguments: manifest Sample manifest CSV file. output Output directory. - qsub SGE resoruce to request with qsub for BAM to FASTQ - conversion. Since this oppoeration supports multithreading, - it is recommended to speicfy a parallel environment (PE) + qsub SGE resoruce to request with qsub for BAM to FASTQ + conversion. Since this oppoeration supports multithreading, + it is recommended to speicfy a parallel environment (PE) to speed up the process (also see --thread). Optional arguments: @@ -805,7 +815,7 @@ ngs-fq2bam output Output directory. qsub SGE resoruce to request for qsub. java Java resoruce to request for GATK. - vcf One or more reference VCF files containing known variant + vcf One or more reference VCF files containing known variant sites (e.g. 1000 Genomes Project). Optional arguments: @@ -869,20 +879,20 @@ ngs-hc --bed PATH BED file. --dbsnp PATH VCF file from dbSNP. --thread INT Number of threads to use (default: 1). - --batch INT Batch size used for GenomicsDBImport (default: 0). This - controls the number of samples for which readers are - open at once and therefore provides a way to minimize - memory consumption. The size of 0 means no batching (i.e. + --batch INT Batch size used for GenomicsDBImport (default: 0). This + controls the number of samples for which readers are + open at once and therefore provides a way to minimize + memory consumption. The size of 0 means no batching (i.e. readers for all samples will be opened at once). --job TEXT Job submission ID for SGE. 
--force Overwrite the output directory if it already exists. --keep Keep temporary files. - --posix Set GenomicsDBImport to allow for optimizations to improve - the usability and performance for shared Posix Filesystems - (e.g. NFS, Lustre). If set, file level locking is disabled - and file system writes are minimized by keeping a higher - number of file descriptors open for longer periods of time. - Use with --batch if keeping a large number of file + --posix Set GenomicsDBImport to allow for optimizations to improve + the usability and performance for shared Posix Filesystems + (e.g. NFS, Lustre). If set, file level locking is disabled + and file system writes are minimized by keeping a higher + number of file descriptors open for longer periods of time. + Use with --batch if keeping a large number of file descriptors open is an issue. [Example] Specify queue: @@ -1018,8 +1028,8 @@ ngs-quant --bootstrap INT Number of bootstrap samples (default: 50). --job TEXT Job submission ID for SGE. --force Overwrite the output directory if it already exists. - --posix Set the environment variable HDF5_USE_FILE_LOCKING=FALSE - before running Kallisto. This is required for shared Posix + --posix Set the environment variable HDF5_USE_FILE_LOCKING=FALSE + before running Kallisto. This is required for shared Posix Filesystems (e.g. NFS, Lustre). [Example] Specify queue: @@ -1144,12 +1154,12 @@ tbl-merge method's documentation page. Positional arguments: - left Left file. - right Right file. + left Input left file. + right Input right file. Optional arguments: -h, --help Show this help message and exit. - --how TEXT Type of merge to be performed (default: 'inner') + --how TEXT Type of merge to be performed (default: 'inner') (choices: 'left', 'right', 'outer', 'inner', 'cross'). --on TEXT [TEXT ...] Column names to join on. --lsep TEXT Delimiter to use for the left file (default: '\t'). @@ -1184,32 +1194,80 @@ tbl-sum Optional arguments: -h, --help Show this help message and exit. 
--sep TEXT Delimiter to use (default: '\t'). - --skiprows TEXT Comma-separated line numbers to skip (0-indexed) or - number of lines to skip at the start of the file - (e.g. `--skiprows 1,` will skip the second line, - `--skiprows 2,4` will skip the third and fifth lines, + --skiprows TEXT Comma-separated line numbers to skip (0-indexed) or + number of lines to skip at the start of the file + (e.g. `--skiprows 1,` will skip the second line, + `--skiprows 2,4` will skip the third and fifth lines, and `--skiprows 10` will skip the first 10 lines). --na_values TEXT [TEXT ...] - Additional strings to recognize as NA/NaN (by - default, the following values are interpreted - as NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', - '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', - '', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan', + Additional strings to recognize as NA/NaN (by + default, the following values are interpreted + as NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', + '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', + '', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan', 'null'). - --keep_default_na Whether or not to include the default NaN values when + --keep_default_na Whether or not to include the default NaN values when parsing the data (see 'pandas.read_table' for details). - --expr TEXT Query the columns of a pandas.DataFrame with a + --expr TEXT Query the columns of a pandas.DataFrame with a boolean expression (e.g. `--query "A == 'yes'"`). --columns TEXT [TEXT ...] - Columns to be summarized (by default, all columns + Columns to be summarized (by default, all columns will be included). - --dtypes PATH File of column names and their data types (either - 'categorical' or 'numeric'); one tab-delimited pair of + --dtypes PATH File of column names and their data types (either + 'categorical' or 'numeric'); one tab-delimited pair of column name and data type per line. [Example] Summarize a table: $ fuc tbl-sum table.tsv +vcf-call +======== + +.. 
code-block:: text + + $ fuc vcf-call -h + usage: fuc vcf-call [-h] [-r TEXT [TEXT ...]] [--min-mq INT] [--max-depth INT] + fasta bams [bams ...] + + Call SNVs and indels from BAM files. + + Under the hood, the command utilizes the bcftool program to call variants. + + Positional arguments: + fasta Reference FASTA file. + bams One or more input BAM files. Alternatively, you can + provide a text file (.txt, .tsv, .csv, or .list) + containing one BAM file per line. + + Optional arguments: + -h, --help Show this help message and exit. + -r TEXT [TEXT ...], --regions TEXT [TEXT ...] + By default, the command looks at each genomic + position with coverage in BAM files, which can be + excruciatingly slow for large files (e.g. whole + genome sequencing). Therefore, use this argument to + only call variants in given regions. Each region must + have the format chrom:start-end and be a half-open + interval with (start, end]. This means, for example, + chr1:100-103 will extract positions 101, 102, and + 103. Alternatively, you can provide a BED file + (compressed or uncompressed) to specify regions. Note + that the 'chr' prefix in contig names (e.g. 'chr1' + vs. '1') will be automatically added or removed as + necessary to match the input BAM's contig names. + --min-mq INT Minimum mapping quality for an alignment to be used + (default: 1). + --max-depth INT At a position, read maximally this number of reads + per input file (default: 250). + + [Example] Specify regions manually: + $ fuc vcf-call ref.fa 1.bam 2.bam \ + -r chr1:100-200 chr2:400-500 > out.vcf + + [Example] Specify regions with a BED file: + $ fuc vcf-call ref.fa bam.list \ + -r in.bed > out.vcf + vcf-filter ========== @@ -1229,17 +1287,17 @@ vcf-filter Optional arguments: -h, --help Show this help message and exit. --expr TEXT Expression to evaluate. - --samples PATH File of sample names to apply the marking (one + --samples PATH File of sample names to apply the marking (one sample per line). 
--drop_duplicates [TEXT ...] - Only consider certain columns for identifying + Only consider certain columns for identifying duplicates, by default use all of the columns. - --greedy Use this flag to mark even ambiguous genotypes + --greedy Use this flag to mark even ambiguous genotypes as missing. - --opposite Use this flag to mark all genotypes that do not - satisfy the query expression as missing and leave + --opposite Use this flag to mark all genotypes that do not + satisfy the query expression as missing and leave those that do intact. - --filter_empty Use this flag to remove rows with no genotype + --filter_empty Use this flag to remove rows with no genotype calls at all. [Example] Mark genotypes with 0/0 as missing: @@ -1280,8 +1338,8 @@ vcf-index This command will create an index file (.tbi) for the input VCF. Positional arguments: - vcf Input VCF file to be indexed. When an uncompressed file is - given, the command will automatically create a BGZF + vcf Input VCF file to be indexed. When an uncompressed file is + given, the command will automatically create a BGZF compressed copy of the file (.gz) before indexing. Optional arguments: @@ -1307,19 +1365,19 @@ vcf-merge Positional arguments: vcf_files VCF files (compressed or uncompressed). Note that the 'chr' - prefix in contig names (e.g. 'chr1' vs. '1') will be - automatically added or removed as necessary to match the + prefix in contig names (e.g. 'chr1' vs. '1') will be + automatically added or removed as necessary to match the contig names of the first VCF. Optional arguments: -h, --help Show this help message and exit. - --how TEXT Type of merge as defined in pandas.DataFrame.merge + --how TEXT Type of merge as defined in pandas.DataFrame.merge (default: 'inner'). - --format TEXT FORMAT subfields to be retained (e.g. 'GT:AD:DP') + --format TEXT FORMAT subfields to be retained (e.g. 'GT:AD:DP') (default: 'GT'). 
- --sort Use this flag to turn off sorting of records + --sort Use this flag to turn off sorting of records (default: True). - --collapse Use this flag to collapse duplicate records + --collapse Use this flag to collapse duplicate records (default: False). [Example] Merge multiple VCF files: @@ -1353,9 +1411,9 @@ vcf-rename Optional arguments: -h, --help Show this help message and exit. - --mode TEXT Renaming mode (default: 'MAP') (choices: 'MAP', + --mode TEXT Renaming mode (default: 'MAP') (choices: 'MAP', 'INDEX', 'RANGE'). - --range INT INT Index range to use when renaming the samples. + --range INT INT Index range to use when renaming the samples. Applicable only with the 'RANGE' mode. --sep TEXT Delimiter to use for reading the 'names' file (default: '\t'). @@ -1380,17 +1438,17 @@ vcf-slice Slice a VCF file for specified regions. Positional arguments: - vcf Input VCF file must be already BGZF compressed (.gz) and - indexed (.tbi) to allow random access. A VCF file can be - compressed with the fuc-bgzip command and indexed with the + vcf Input VCF file must be already BGZF compressed (.gz) and + indexed (.tbi) to allow random access. A VCF file can be + compressed with the fuc-bgzip command and indexed with the vcf-index command. - regions One or more regions to be sliced. Each region must have the - format chrom:start-end and be a half-open interval with - (start, end]. This means, for example, chr1:100-103 will - extract positions 101, 102, and 103. Alternatively, you can - provide a BED file (compressed or uncompressed) to specify - regions. Note that the 'chr' prefix in contig names (e.g. - 'chr1' vs. '1') will be automatically added or removed as + regions One or more regions to be sliced. Each region must have the + format chrom:start-end and be a half-open interval with + (start, end]. This means, for example, chr1:100-103 will + extract positions 101, 102, and 103. 
Alternatively, you can + provide a BED file (compressed or uncompressed) to specify + regions. Note that the 'chr' prefix in contig names (e.g. + 'chr1' vs. '1') will be automatically added or removed as necessary to match the input VCF's contig names. Optional arguments: @@ -1421,8 +1479,8 @@ vcf-split Optional arguments: -h, --help Show this help message and exit. - --clean By default, the command will only return variants present in - each individual. Use the tag to stop this behavior and make + --clean By default, the command will only return variants present in + each individual. Use the tag to stop this behavior and make sure that all individuals have the same number of variants. --force Overwrite the output directory if it already exists. @@ -1464,7 +1522,7 @@ vcf-vep Optional arguments: -h, --help Show this help message and exit. - --opposite Use this flag to return only records that don't + --opposite Use this flag to return only records that don't meet the said criteria. --as_zero Use this flag to treat missing values as zero instead of NaN. diff --git a/docs/create.py b/docs/create.py index a036d7a..27d9cf9 100644 --- a/docs/create.py +++ b/docs/create.py @@ -74,10 +74,10 @@ Additionally, fuc can be used to parse output data from the following programs: -- Ensembl Variant Effect Predictor (VEP) -- SnpEff -- bcl2fastq and bcl2fastq2 -- Kallisto +- `Ensembl Variant Effect Predictor (VEP) `__ +- `SnpEff `__ +- `bcl2fastq and bcl2fastq2 `__ +- `Kallisto `__ Your contributions (e.g. feature ideas, pull requests) are most welcome. 
diff --git a/fuc/api/common.py b/fuc/api/common.py index b352265..02cc5d0 100644 --- a/fuc/api/common.py +++ b/fuc/api/common.py @@ -1316,17 +1316,21 @@ def sort_regions(regions): -------- >>> from fuc import common - >>> regions = ['22:1000-1500', '16:100-200', '22:200-300'] + >>> regions = ['chr22:1000-1500', 'chr16:100-200', 'chr22:200-300', 'chr16_KI270854v1_alt', 'chr3_GL000221v1_random', 'HLA-A*02:10'] >>> sorted(regions) # Lexicographic sorting (not what we want) - ['16:100-200', '22:1000-1500', '22:200-300'] + ['HLA-A*02:10', 'chr16:100-200', 'chr16_KI270854v1_alt', 'chr22:1000-1500', 'chr22:200-300', 'chr3_GL000221v1_random'] >>> common.sort_regions(regions) - ['16:100-200', '22:200-300', '22:1000-1500'] + ['chr16:100-200', 'chr22:200-300', 'chr22:1000-1500', 'chr16_KI270854v1_alt', 'chr3_GL000221v1_random', 'HLA-A*02:10'] """ def func(x): chrom, start, end = parse_region(x) if chrom in pyvcf.CONTIGS: chrom = pyvcf.CONTIGS.index(chrom) - return (chrom, start, end) + alt = '' + else: + alt = chrom + chrom = len(pyvcf.CONTIGS) + return (chrom, alt, start, end) return sorted(regions, key=func) def update_chr_prefix(regions, mode='remove'): diff --git a/fuc/api/pybed.py b/fuc/api/pybed.py index 495fe42..d31b942 100644 --- a/fuc/api/pybed.py +++ b/fuc/api/pybed.py @@ -48,7 +48,8 @@ ] class BedFrame: - """Class for storing BED data. + """ + Class for storing BED data. Parameters ---------- @@ -493,9 +494,9 @@ def to_regions(self, merge=True): ... 'End': [40, 50, 25, 35, 60, 80] ... 
} >>> bf = pybed.BedFrame.from_dict([], data) - >>> bf.regions() + >>> bf.to_regions() ['chr1:10-50', 'chr2:15-35', 'chr3:50-60', 'chr3:61-80'] - >>> bf.regions(merge=False) + >>> bf.to_regions(merge=False) ['chr1:10-40', 'chr1:30-50', 'chr2:15-25', 'chr2:25-35', 'chr3:50-60', 'chr3:61-80'] """ if merge: diff --git a/fuc/api/pycov.py b/fuc/api/pycov.py index 7cc2b67..f32fb96 100644 --- a/fuc/api/pycov.py +++ b/fuc/api/pycov.py @@ -95,7 +95,7 @@ class CovFrame: See Also -------- CovFrame.from_bam - Construct CovFrame from one or more SAM/BAM/CRAM files. + Construct CovFrame from BAM files. CovFrame.from_dict Construct CovFrame from dict of array-like or dicts. CovFrame.from_file @@ -165,44 +165,42 @@ def shape(self): @classmethod def from_bam( - cls, bam=None, fn=None, bed=None, region=None, zero=False, - map_qual=None, names=None + cls, bams, regions=None, zero=False, map_qual=None, names=None ): """ - Construct CovFrame from one or more SAM/BAM/CRAM files. - - Alignment files must be specified with either ``bam`` or ``fn``, but - it's an error to use both. - - By default, the method will count all reads within the alignment - files. You can specify target regions with either ``bed`` or - ``region``, but not both. When you do this, pay close attention to - the 'chr' string in contig names (e.g. 'chr1' vs. '1'). Note also - that ``region`` requires the input files be indexed. + Construct CovFrame from BAM files. Under the hood, the method computes read depth using the :command:`samtools depth` command. - Sample name is extracted from the SM tag. If the tag is missing, the - method will set the filename as sample name. - Parameters ---------- - bam : str or list, optional - One or more alignment files. - fn : str, optional - File containing one alignment file per line. - bed : str, optional - BED file. - region : str, optional - Target region ('chrom:start-end'). + bams : str or list + One or more input BAM files. 
Alternatively, you can provide a + text file (.txt, .tsv, .csv, or .list) containing one BAM file + per line. + regions : str, list, or pybed.BedFrame, optional + By default (``regions=None``), the method counts all reads in BAM + files, which can be excruciatingly slow for large files (e.g. + whole genome sequencing). Therefore, use this argument to only + output positions in given regions. Each region must have the + format chrom:start-end and be a half-open interval with (start, + end]. This means, for example, chr1:100-103 will extract + positions 101, 102, and 103. Alternatively, you can provide a BED + file (compressed or uncompressed) or a :class:`pybed.BedFrame` + object to specify regions. Note that the 'chr' prefix in contig + names (e.g. 'chr1' vs. '1') will be automatically added or + removed as necessary to match the input BAM's contig names. zero : bool, default: False - If True, output all positions (including those with zero depth). + If True, output all positions including those with zero depth. map_qual: int, optional Only count reads with mapping quality greater than or equal to this number. names : list, optional - Use these as sample names instead of the SM tags. + By default (``names=None``), sample name is extracted using SM + tag in BAM files. If the tag is missing, the method will set the + filename as sample name. Use this argument to manually provide + sample names. Returns ------- @@ -226,58 +224,70 @@ def from_bam( >>> cf = pycov.CovFrame.from_bam([bam1, bam2]) >>> cf = pycov.CovFrame.from_bam(bam, region='19:41497204-41524301') """ - bam_files = [] - - if bam is None and fn is None: - raise ValueError( - "Either the 'bam' or 'fn' parameter must be provided.") - elif bam is not None and fn is not None: - raise ValueError( - "The 'bam' and 'fn' parameters cannot be used together.") - elif bam is not None and fn is None: - if isinstance(bam, str): - bam_files.append(bam) - else: - bam_files += bam + # Parse input BAM files. 
+ bams = common.parse_list_or_file(bams) + + # Check the 'chr' prefix. + if all([pybam.has_chr_prefix(x) for x in bams]): + chr_prefix = 'chr' else: - bam_files += common.convert_file2list(fn) + chr_prefix = '' + # Run the 'samtools depth' command. args = [] - if zero: args += ['-a'] - if region is not None: - args += ['-r', region] if map_qual is not None: args += ['-Q', str(map_qual)] - if bed is not None: - args += ['-b', bed] + results = '' + if regions is None: + results += pysam.depth(*(bams + args)) + else: + if isinstance(regions, str): + regions = [regions] + elif isinstance(regions, list): + pass + elif isinstance(regions, pybed.BedFrame): + regions = regions.to_regions() + else: + raise TypeError("Incorrect type of argument 'regions'") + if '.bed' in regions[0]: + bf = pybed.BedFrame.from_file(regions[0]) + regions = bf.to_regions() + else: + regions = common.sort_regions(regions) + for region in regions: + region = chr_prefix + region.replace('chr', '') + results += pysam.depth(*(bams + args + ['-r', region])) - args += bam_files - s = pysam.depth(*args) headers = ['Chromosome', 'Position'] dtype = {'Chromosome': str,'Position': int} - for i, bam_file in enumerate(bam_files): + + for i, bam in enumerate(bams): if names: name = names[i] else: - samples = pybam.tag_sm(bam_file) + samples = pybam.tag_sm(bam) if not samples: - basename = Path(bam_file).stem + basename = Path(bam).stem message = ( - f'SM tags were not found for {bam_file}, will use ' + f'SM tags were not found for {bam}, will use ' f'file name as sample name ({basename})' ) samples = [basename] warnings.warn(message) if len(samples) > 1: - m = f'multiple sample names detected: {bam_file}' + m = f'multiple sample names detected: {bam}' raise ValueError(m) name = samples[0] headers.append(name) dtype[name] = int - df = pd.read_csv(StringIO(s), sep='\t', header=None, - names=headers, dtype=dtype) + + df = pd.read_csv( + StringIO(results), sep='\t', header=None, + names=headers, dtype=dtype + ) + 
return cls(df) @classmethod @@ -300,7 +310,7 @@ def from_dict(cls, data): CovFrame CovFrame object creation using constructor. CovFrame.from_bam - Construct CovFrame from one or more SAM/BAM/CRAM files. + Construct CovFrame from BAM files. CovFrame.from_file Construct CovFrame from a text file containing read depth data. @@ -350,7 +360,7 @@ def from_file(cls, fn, compression=False): CovFrame CovFrame object creation using constructor. CovFrame.from_bam - Construct CovFrame from one or more SAM/BAM/CRAM files. + Construct CovFrame from BAM files. CovFrame.from_dict Construct CovFrame from dict of array-like or dicts. diff --git a/fuc/api/pyfq.py b/fuc/api/pyfq.py index f88262b..a3e1d8b 100644 --- a/fuc/api/pyfq.py +++ b/fuc/api/pyfq.py @@ -30,7 +30,8 @@ def readlen(self): @classmethod def from_file(cls, fn): - """Construct FqFrame from a FASTQ file. + """ + Construct FqFrame from a FASTQ file. Parameters ---------- diff --git a/fuc/api/pykallisto.py b/fuc/api/pykallisto.py index fac4f17..1e8ed63 100644 --- a/fuc/api/pykallisto.py +++ b/fuc/api/pykallisto.py @@ -163,7 +163,7 @@ def compute_fold_change(self, group, genes, unit='tpm', flip=False): df = getattr(self, f'df_gene_{unit}') df = df[df.index.isin(genes)].T df = df.merge(self.metadata[group], left_index=True, right_index=True) - df = df.groupby('lauren').mean().T + df = df.groupby(group).mean().T a, b = df.columns if flip: a, b = b, a diff --git a/fuc/api/pymaf.py b/fuc/api/pymaf.py index 4485bab..dfbe434 100644 --- a/fuc/api/pymaf.py +++ b/fuc/api/pymaf.py @@ -198,7 +198,8 @@ SNV_CLASS_ORDER = ['C>A', 'C>G', 'C>T', 'T>A', 'T>C', 'T>G'] class MafFrame: - """Class for storing MAF data. + """ + Class for storing MAF data. Parameters ---------- @@ -2367,7 +2368,8 @@ def plot_summary( legend_fontsize=12 ): - """Create a summary figure for MafFrame. + """ + Create a summary figure for MafFrame. 
Parameters ---------- @@ -2709,7 +2711,8 @@ def plot_vaf( return ax def plot_varcls(self, ax=None, figsize=None, **kwargs): - """Create a bar plot for the nonsynonymous variant classes. + """ + Create a bar plot for the nonsynonymous variant classes. Parameters ---------- @@ -3350,7 +3353,7 @@ def to_vcf( Returns ------- VcfFrame - The VcfFrame object. + VcfFrame object. Examples -------- @@ -3454,7 +3457,8 @@ def one_row(r): return vf def to_file(self, fn): - """Write MafFrame to a MAF file. + """ + Write MafFrame to a MAF file. Parameters ---------- @@ -3465,7 +3469,8 @@ def to_file(self, fn): f.write(self.to_string()) def to_string(self): - """Render MafFrame to a console-friendly tabular output. + """ + Render MafFrame to a console-friendly tabular output. Returns ------- diff --git a/fuc/api/pysnpeff.py b/fuc/api/pysnpeff.py index f37ae2b..1177d54 100644 --- a/fuc/api/pysnpeff.py +++ b/fuc/api/pysnpeff.py @@ -31,7 +31,8 @@ """ def row_firstann(r): - """Return the first SnpEff annotation for the row.""" + """ + Return the first SnpEff annotation for the row.""" ann = [x for x in r.INFO.split(';') if 'ANN=' in x] if not ann: return '' @@ -39,7 +40,8 @@ def row_firstann(r): return ann def filter_ann(vf, targets, include=True): - """Filter out rows based on the SnpEff annotations. + """ + Filter out rows based on the SnpEff annotations. Parameters ---------- @@ -71,7 +73,8 @@ def func(r): return vf def parseann(vf, idx, sep=' | '): - """Parse SnpEff annotations. + """ + Parse SnpEff annotations. Parameters ---------- diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index 418e658..1d4f1a3 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -12,45 +12,123 @@ identifier (e.g. 'chr1'). See the VCF specification above for an example VCF file. -Genotype lines have nine required fields for storing variant information -and variable-length fields for storing sample genotype data. For some -fields, missing values are tolerated and can be specified with a dot ('.'). 
-The nine required fields are: - -+-----+--------+------------------------------------+------------------------+ -| No. | Name | Description | Examples | -+=====+========+====================================+========================+ -| 1 | CHROM | Chromosome or contig identifier | 'chr2', '2', 'chrM' | -+-----+--------+------------------------------------+------------------------+ -| 2 | POS | 1-based reference position | 10041, 23042 | -+-----+--------+------------------------------------+------------------------+ -| 3 | ID | ';'-separated variant identifiers | '.', 'rs35', 'rs9;rs53'| -+-----+--------+------------------------------------+------------------------+ -| 4 | REF | Reference allele | 'A', 'GT' | -+-----+--------+------------------------------------+------------------------+ -| 5 | ALT | ','-separated alternate alleles | 'T', 'ACT', 'C,T' | -+-----+--------+------------------------------------+------------------------+ -| 6 | QUAL | Phred-scaled quality score for ALT | '.', 67, 12 | -+-----+--------+------------------------------------+------------------------+ -| 7 | FILTER | ';'-separated filters that failed | '.', 'PASS', 'q10;s50' | -+-----+--------+------------------------------------+------------------------+ -| 8 | INFO | ';'-separated information fields | '.', 'DP=14;AF=0.5;DB' | -+-----+--------+------------------------------------+------------------------+ -| 9 | FORMAT | ':'-separated genotype fields | 'GT', 'GT:AD:DP' | -+-----+--------+------------------------------------+------------------------+ - -You will sometimes come across VCFs that have only eight columns, and contain -no FORMAT or sample-specific information. These are called "sites-only" VCFs, -and represent variation that has been observed in a population. Generally, -information about the population of origin should be included in the header. - -There are several common, reserved genotype keywords that are standards -across the community. 
Currently, the pyvcf submodule is aware of the -following: - -* AD - Total read depth for each allele (R, Integer) -* AF - Allele fraction of the event in the tumor (1, Float) -* DP - Read depth (1, Integer) +Genotype lines usually consist of nine columns for storing variant +information (all fixed and mandatory except for the FORMAT column) plus +additional sample-specific columns for expressing individual genotype calls +(e.g. '0/1'). Missing values are allowed in some cases and can be specified +with a dot ('.'). The first nine columns are: + +.. list-table:: + :header-rows: 1 + + * - No. + - Column + - Description + - Required + - Missing + - Examples + * - 1 + - CHROM + - Chromosome or contig identifier + - ✅ + - ❌ + - 'chr2', '2', 'chrM' + * - 2 + - POS + - 1-based reference position + - ✅ + - ❌ + - 10041, 23042 + * - 3 + - ID + - ';'-separated variant identifiers + - ✅ + - ✅ + - '.', 'rs35', 'rs9;rs53' + * - 4 + - REF + - Reference allele + - ✅ + - ❌ + - 'A', 'GT' + * - 5 + - ALT + - ','-separated alternate alleles + - ✅ + - ❌ + - 'T', 'ACT', 'C,T' + * - 6 + - QUAL + - Phred-scaled quality score for ALT + - ✅ + - ✅ + - '.', 67, 12 + * - 7 + - FILTER + - ';'-separated filters that failed + - ✅ + - ✅ + - '.', 'PASS', 'q10;s50' + * - 8 + - INFO + - ';'-separated information fields + - ✅ + - ✅ + - '.', 'DP=14;AF=0.5;DB' + * - 9 + - FORMAT + - ':'-separated genotype fields + - ❌ + - ❌ + - 'GT', 'GT:AD:DP' + +You will sometimes come across VCF files that have only eight columns, and +do not contain the FORMAT column or sample-specific information. These are +called "sites-only" VCF files, and normally represent genetic variation that +has been observed in a large population. Generally, information about the +population of origin should be included in the header. + +There are several reserved keywords in the INFO and FORMAT columns that are +standards across the community. Popular keywords are listed below: + +.. 
list-table:: + :header-rows: 1 + + * - Column + - Key + - Number + - Type + - Description + * - INFO + - AC + - A + - Integer + - Allele count in genotypes, for each ALT allele, in the same order as listed + * - INFO + - AN + - 1 + - Integer + - Total number of alleles in called genotypes + * - INFO + - AF + - A + - Float + - Allele frequency for each ALT allele in the same order as listed (estimated from primary data, not called genotypes) + * - FORMAT + - AD + - R + - Integer + - Total read depth for each allele + * - FORMAT + - AF + - 1 + - Float + - Allele fraction of the event in the tumor + * - FORMAT + - DP + - 1 + - Integer + - Read depth If sample annotation data are available for a given VCF file, use the :class:`common.AnnFrame` class to import the data. @@ -58,11 +136,13 @@ import os import re +import sys import gzip from copy import deepcopy import warnings +import tempfile -from . import pybed, common, pymaf +from . import pybed, common, pymaf, pybam import numpy as np import pandas as pd @@ -72,7 +152,7 @@ import matplotlib.pyplot as plt from matplotlib_venn import venn2, venn3 import seaborn as sns -from pysam import VariantFile +from pysam import VariantFile, bcftools from io import StringIO HEADERS = { @@ -135,6 +215,108 @@ '#AD_FRAC_ALT': ['AD', lambda x: np.nan if sum([int(y) for y in x.split(',')]) == 0 else sum([int(y) for y in x.split(',')[1:]]) / sum([int(y) for y in x.split(',')]), True], } +def call( + fasta, bams, regions=None, path=None, min_mq=1, max_depth=250 +): + """ + Call SNVs and indels from BAM files. + + Under the hood, the method utilizes the bcftool program to call variants. + + Parameters + ---------- + fasta : str + Reference FASTA file. + bams : str or list + One or more input BAM files. Alternatively, you can provide a text + file (.txt, .tsv, .csv, or .list) containing one BAM file per line. 
+ regions : str, list, or pybed.BedFrame, optional + By default (``regions=None``), the method looks at each genomic + position with coverage in BAM files, which can be excruciatingly slow + for large files (e.g. whole genome sequencing). Therefore, use this + argument to only call variants in given regions. Each region must + have the format chrom:start-end and be a half-open interval with + (start, end]. This means, for example, chr1:100-103 will extract + positions 101, 102, and 103. Alternatively, you can provide a BED + file (compressed or uncompressed) or a :class:`pybed.BedFrame` object + to specify regions. Note that the 'chr' prefix in contig names (e.g. + 'chr1' vs. '1') will be automatically added or removed as necessary + to match the input BAMs' contig names. + path : str, optional + Output VCF file. Writes to stdout when ``path='-'``. If None is + provided the result is returned as a string. + min_mq : int, default: 1 + Minimum mapping quality for an alignment to be used. + max_depth : int, default: 250 + At a position, read maximally this number of reads per input file. + + Returns + ------- + str + String representation of VCF file (only returned when ``path=None``). + """ + # Parse input BAM files. + bams = common.parse_list_or_file(bams) + + # Check the 'chr' prefix. + if all([pybam.has_chr_prefix(x) for x in bams]): + chr_prefix = 'chr' + else: + chr_prefix = '' + + # Parse target regions, if provided. + if regions is not None: + if isinstance(regions, str): + regions = [regions] + elif isinstance(regions, list): + pass + elif isinstance(regions, pybed.BedFrame): + regions = regions.to_regions() + else: + raise TypeError("Incorrect type of argument 'regions'") + if '.bed' in regions[0]: + bf = pybed.BedFrame.from_file(regions[0]) + regions = bf.to_regions() + else: + regions = common.sort_regions(regions) + regions = [chr_prefix + x.replace('chr', '') for x in regions] + + with tempfile.TemporaryDirectory() as t: + # Step 1: Get genotype likelihoods.
+ args = ['-Ou', '-a', 'AD'] + args += ['-q', str(min_mq)] + args += ['--max-depth', str(max_depth)] + args += ['-f', fasta] + if regions is not None: + args += ['-r', ','.join(regions)] + results = bcftools.mpileup(*(args + bams)) + with open(f'{t}/likelihoods.bcf', 'wb') as f: + f.write(results) + + # Step 2: Call variants. + args = [f'{t}/likelihoods.bcf', '-Oz', '-mv'] + results = bcftools.call(*args) + with open(f'{t}/calls.bcf', 'wb') as f: + f.write(results) + + # Step 3: Normalize indels. + args = [f'{t}/calls.bcf', '-Ob', '-f', fasta] + results = bcftools.norm(*args) + with open(f'{t}/calls.normalized.bcf', 'wb') as f: + f.write(results) + + # Step 4: Filter variant. + args = [f'{t}/calls.normalized.bcf', '-Ov', '--IndelGap', '5'] + results = bcftools.filter(*args) + + if path is None: + return results + elif path == '-': + sys.stdout.write(results) + else: + with open(path, 'w') as f: + f.write(results) + def rescue_filtered_variants(vfs, format='GT'): """ Rescue filtered variants if they are PASS in at least one of the input @@ -241,7 +423,7 @@ def rescue_filtered_variants(vfs, format='GT'): def gt_miss(g): """ - Return True if sample genotype is missing. + For given genotype, return True if it has missing value. Parameters ---------- @@ -251,7 +433,7 @@ def gt_miss(g): Returns ------- bool - True if sample genotype is missing. + True if genotype is missing. Examples -------- @@ -276,8 +458,50 @@ def gt_miss(g): """ return '.' in g.split(':')[0] +def gt_ploidy(g): + """ + For given genotype, return its ploidy number. + + Parameters + ---------- + g : str + Sample genotype. + + Returns + ------- + int + Ploidy number. 
+ + Examples + -------- + + >>> from fuc import pyvcf + >>> pyvcf.gt_ploidy('1') + 1 + >>> pyvcf.gt_ploidy('.') + 1 + >>> pyvcf.gt_ploidy('0/1') + 2 + >>> pyvcf.gt_ploidy('./.') + 2 + >>> pyvcf.gt_ploidy('0|1') + 2 + >>> pyvcf.gt_ploidy('1|0|1') + 3 + >>> pyvcf.gt_ploidy('0/./1/1') + 4 + """ + gt = g.split(':')[0] + if '/' in gt: + return gt.count('/') + 1 + elif '|' in gt: + return gt.count('|') + 1 + else: + return 1 + def gt_polyp(g): - """Return True if sample genotype has a polyploid call. + """ + For given genotype, return True if it is polyploid. Parameters ---------- @@ -287,7 +511,7 @@ def gt_polyp(g): Returns ------- bool - True if sample genotype has a polyploid call. + True if genotype is polyploid. Examples -------- @@ -304,15 +528,11 @@ def gt_polyp(g): >>> pyvcf.gt_polyp('0/./1/1') True """ - gt = g.split(':')[0] - if '/' in gt: - return gt.count('/') > 1 - else: - return gt.count('|') > 1 + return gt_ploidy(g) > 2 def gt_hasvar(g): """ - Return True if sample genotype has at least one variant call. + For given genotype, return True if it has variation. Parameters ---------- @@ -322,7 +542,7 @@ def gt_hasvar(g): Returns ------- bool - True if sample genotype has a variant call. + True if genotype has variation. Examples -------- @@ -353,7 +573,7 @@ def gt_hasvar(g): def gt_unphase(g): """ - Return unphased sample genotype. + For given genotype, return its unphased form. Parameters ---------- @@ -401,7 +621,7 @@ def gt_unphase(g): def gt_het(g): """ - Return True if genotype call is heterozygous. + For given genotype, return True if it is heterozygous. Parameters ---------- @@ -411,7 +631,7 @@ def gt_het(g): Returns ------- bool - True if genotype call is heterozygous. + True if genotype is heterozygous. Examples -------- @@ -438,7 +658,7 @@ def gt_het(g): def gt_pseudophase(g): """ - Return pseudophased genotype call. + For given genotype, return its pseudophased form. 
Parameters ---------- @@ -596,7 +816,7 @@ def merge( def row_hasindel(r): """ - Return True if the row has an indel. + For given row, return True if it has indel. Parameters ---------- @@ -642,9 +862,116 @@ def row_hasindel(r): alt_has = max([len(x) for x in r['ALT'].split(',')]) > 1 return ref_has or alt_has + +def row_computeinfo(r, key, decimals=3): + """ + For given row, return AC/AN/AF calculation for INFO column. + + Parameters + ---------- + r : pandas.Series + VCF row. + key : {'AC', 'AN', 'AF'} + INFO key. + decimals : int, default: 3 + Number of decimals to display for AF. + + Returns + ------- + str + Requested INFO data. + + Example + ------- + + >>> from fuc import pyvcf + >>> data = { + ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chrX'], + ... 'POS': [100, 101, 102, 100], + ... 'ID': ['.', '.', '.', '.'], + ... 'REF': ['G', 'T', 'A', 'A'], + ... 'ALT': ['A', 'C', 'T,G', 'G'], + ... 'QUAL': ['.', '.', '.', '.'], + ... 'FILTER': ['.', '.', '.', '.'], + ... 'INFO': ['.', '.', '.', '.'], + ... 'FORMAT': ['GT:DP', 'GT', 'GT', 'GT'], + ... 'A': ['1|0:29', '0|0', '1|0', '0'], + ... 'B': ['1/1:34', './.', '0/0', '0/1'], + ... 'C': ['0/0:23', '0/0', '1/2', '1'], + ... } + >>> vf = pyvcf.VcfFrame.from_dict([], data) + >>> vf.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C + 0 chr1 100 . G A . . . GT:DP 1|0:29 1/1:34 0/0:23 + 1 chr1 101 . T C . . . GT 0|0 ./. 0/0 + 2 chr1 102 . A T,G . . . GT 1|0 0/0 1/2 + 3 chrX 100 . A G . . . 
GT 0 0/1 1 + >>> pyvcf.row_computeinfo(vf.df.iloc[0, :], 'AC') + '3' + >>> pyvcf.row_computeinfo(vf.df.iloc[0, :], 'AN') + '6' + >>> pyvcf.row_computeinfo(vf.df.iloc[0, :], 'AF') + '0.500' + >>> pyvcf.row_computeinfo(vf.df.iloc[1, :], 'AC') + '0' + >>> pyvcf.row_computeinfo(vf.df.iloc[1, :], 'AN') + '4' + >>> pyvcf.row_computeinfo(vf.df.iloc[1, :], 'AF') + '0.000' + >>> pyvcf.row_computeinfo(vf.df.iloc[2, :], 'AC') + '2,1' + >>> pyvcf.row_computeinfo(vf.df.iloc[2, :], 'AN') + '6' + >>> pyvcf.row_computeinfo(vf.df.iloc[2, :], 'AF') + '0.333,0.167' + >>> pyvcf.row_computeinfo(vf.df.iloc[3, :], 'AC') + '2' + >>> pyvcf.row_computeinfo(vf.df.iloc[3, :], 'AN') + '4' + >>> pyvcf.row_computeinfo(vf.df.iloc[3, :], 'AF') + '0.500' + """ + def get_ac(r): + def one_gt(g, i): + gt = g.split(':')[0] + if '/' in gt: + l = gt.split('/') + else: + l = gt.split('|') + return l.count(str(i + 1)) + counts = [] + for i, allele in enumerate(r.ALT.split(',')): + count = r[9:].apply(one_gt, args=(i, )) + counts.append(sum(count)) + return counts + + def get_an(r): + def one_gt(g): + if '.' in g: + return 0 + else: + return gt_ploidy(g) + return r[9:].apply(one_gt).sum() + + def get_af(r): + return [x / get_an(r) for x in get_ac(r)] + + methods = {'AC': get_ac, 'AN': get_an, 'AF': get_af} + + results = methods[key](r) + + if key == 'AC': + results = ','.join([str(x) for x in results]) + elif key == 'AN': + results = str(results) + else: + results = ','.join([f'{x:.{decimals}f}' for x in results]) + + return results + def row_parseinfo(r, key): """ - Return INFO data in the row that match the given key. + For given row, return requested data from INFO column. Parameters ---------- @@ -696,7 +1023,7 @@ def row_parseinfo(r, key): def row_phased(r): """ - Return True if every genotype in the row is haplotype phased. + For given row, return True if all genotypes are phased. 
Parameters ---------- @@ -740,9 +1067,9 @@ def one_gt(g): return '|' in g.split(':')[0] return r[9:].apply(one_gt).all() -def row_updateinfo(r, key, value): +def row_updateinfo(r, key, value, force=True, missing=False): """ - Update INFO data in the row that match the given key. + For given row, return updated data from INFO column. Parameters ---------- @@ -752,6 +1079,12 @@ def row_updateinfo(r, key, value): INFO key. value : str New value to be assigned. + force : bool, default: True + If True, overwrite any existing data. + missing : bool, default: False + By default (``missing=False``), the method will not update INFO field + if there is missing value ('.'). Setting ``missing=True`` will stop + this behavior. Returns ------- @@ -763,41 +1096,67 @@ def row_updateinfo(r, key, value): >>> from fuc import pyvcf >>> data = { - ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1'], - ... 'POS': [100, 101, 102, 103], - ... 'ID': ['.', '.', '.', '.'], - ... 'REF': ['G', 'T', 'A', 'C'], - ... 'ALT': ['A', 'C', 'T', 'A'], - ... 'QUAL': ['.', '.', '.', '.'], - ... 'FILTER': ['.', '.', '.', '.'], - ... 'INFO': ['DB;AC=0', 'DB;H2;AC=1', 'DB;H2;AC=1', '.'], - ... 'FORMAT': ['GT', 'GT', 'GT', 'GT'], - ... 'Steven': ['0/0', '0/1', '0/1', '0/0'], + ... 'CHROM': ['chr1', 'chr1', 'chr1'], + ... 'POS': [100, 101, 102], + ... 'ID': ['.', '.', '.'], + ... 'REF': ['G', 'T', 'A'], + ... 'ALT': ['A', 'C', 'T'], + ... 'QUAL': ['.', '.', '.'], + ... 'FILTER': ['.', '.', '.'], + ... 'INFO': ['DB;AC=1', 'DB', '.'], + ... 'FORMAT': ['GT', 'GT', 'GT'], + ... 'A': ['0/1', '1/1', '0/0'], + ... 'B': ['0/0', '0/1', '0/1'], ... } >>> vf = pyvcf.VcfFrame.from_dict([], data) >>> vf.df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven - 0 chr1 100 . G A . . DB;AC=0 GT 0/0 - 1 chr1 101 . T C . . DB;H2;AC=1 GT 0/1 - 2 chr1 102 . A T . . DB;H2;AC=1 GT 0/1 - 3 chr1 103 . C A . . . GT 0/0 - >>> vf.df.apply(pyvcf.row_updateinfo, args=('AC', '4'), axis=1) - 0 DB;AC=4 - 1 DB;H2;AC=4 - 2 DB;H2;AC=4 - 3 . 
- dtype: object + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B + 0 chr1 100 . G A . . DB;AC=1 GT 0/1 0/0 + 1 chr1 101 . T C . . DB GT 1/1 0/1 + 2 chr1 102 . A T . . . GT 0/0 0/1 + >>> pyvcf.row_updateinfo(vf.df.iloc[0, :], 'AC', '100') + 'DB;AC=100' + >>> pyvcf.row_updateinfo(vf.df.iloc[0, :], 'AC', '100', force=False) + 'DB;AC=1' + >>> pyvcf.row_updateinfo(vf.df.iloc[1, :], 'AC', '100') + 'DB;AC=100' + >>> pyvcf.row_updateinfo(vf.df.iloc[1, :], 'AC', '100', force=False) + 'DB;AC=100' + >>> pyvcf.row_updateinfo(vf.df.iloc[2, :], 'AC', '100') + '.' + >>> pyvcf.row_updateinfo(vf.df.iloc[2, :], 'AC', '100', force=False) + '.' + >>> pyvcf.row_updateinfo(vf.df.iloc[2, :], 'AC', '100', missing=True) + 'AC=100' """ + data = f'{key}={value}' + + if not missing and r.INFO == '.': + return r.INFO + fields = r.INFO.split(';') + found = False + for i, field in enumerate(fields): if field.startswith(f'{key}='): - fields[i] = field[:len(key)+1] + value + found = True + if force: + fields[i] = data + else: + pass break + + if not found: + if r.INFO == '.': + fields = [data] + else: + fields.append(data) + return ';'.join(fields) def row_missval(r): """ - Return the correctly formatted missing value for the row. + For given row, return formatted missing genotype. Parameters ---------- @@ -1430,7 +1789,8 @@ def one_gt(g): return self.__class__(self.copy_meta(), df) def add_dp(self): - """Compute DP using AD and add it to the FORMAT field. + """ + Compute DP using AD and add it to the FORMAT field. Returns ------- @@ -1490,7 +1850,8 @@ def infunc(x): return vf def add_flag(self, flag, order='last', index=None): - """Add the given flag to the INFO field. + """ + Add the given flag to the INFO field. The default behavior is to add the flag to all rows in the VcfFrame. @@ -1592,84 +1953,9 @@ def f(r): vf = self.__class__(self.copy_meta(), df) return vf - def cfilter_empty(self, opposite=False, as_list=False): - """Remove samples whose genotype calls are all missing. 
- - Parameters - ---------- - opposite : bool, default: False - If True, return samples that don't meet the said criteria. - as_list : bool, default: False - If True, return a list of sample names instead of a VcfFrame. - - Returns - ------- - VcfFrame - Filtered VcfFrame. - - Examples - -------- - Assume we have the following data: - - >>> data = { - ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1'], - ... 'POS': [100, 101, 102, 103], - ... 'ID': ['.', '.', '.', '.'], - ... 'REF': ['G', 'T', 'G', 'T'], - ... 'ALT': ['A', 'C', 'C', 'C'], - ... 'QUAL': ['.', '.', '.', '.'], - ... 'FILTER': ['.', '.', '.', '.'], - ... 'INFO': ['.', '.', '.', '.'], - ... 'FORMAT': ['GT', 'GT', 'GT', 'GT'], - ... 'Steven': ['0/1', '1/1', '1/1', '1/1'], - ... 'Rachel': ['./.', './.', './.', './.'], - ... 'John': ['0/0', './.', '0/0', '0/0'], - ... 'Sara': ['./.', './.', './.', './.'], - ... } - >>> vf = pyvcf.VcfFrame.from_dict([], data) - >>> vf.df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven Rachel John Sara - 0 chr1 100 . G A . . . GT 0/1 ./. 0/0 ./. - 1 chr1 101 . T C . . . GT 1/1 ./. ./. ./. - 2 chr1 102 . G C . . . GT 1/1 ./. 0/0 ./. - 3 chr1 103 . T C . . . GT 1/1 ./. 0/0 ./. - - We can remove samples whose genotypes are all missing: - - >>> vf.cfilter_empty().df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven John - 0 chr1 100 . G A . . . GT 0/1 0/0 - 1 chr1 101 . T C . . . GT 1/1 ./. - 2 chr1 102 . G C . . . GT 1/1 0/0 - 3 chr1 103 . T C . . . GT 1/1 0/0 - - We can also select those samples: - - >>> vf.cfilter_empty(opposite=True).df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Rachel Sara - 0 chr1 100 . G A . . . GT ./. ./. - 1 chr1 101 . T C . . . GT ./. ./. - 2 chr1 102 . G C . . . GT ./. ./. - 3 chr1 103 . T C . . . GT ./. ./. 
- - Finally, we can return a list of sample names from the filtering: - - >>> vf.cfilter_empty(as_list=True) - ['Steven', 'John'] - """ - f = lambda r: r[9:].apply(gt_miss) - s = self.df.apply(f, axis=1).all() - if opposite: - s = s[s == True] - else: - s = s[s == False] - l = s.index.to_list() - if as_list: - return l - return self.subset(l) - def collapse(self): - """Collapse duplicate records in the VcfFrame. + """ + Collapse duplicate records in the VcfFrame. Duplicate records have the identical values for CHROM, POS, and REF. They can result from merging two VCF files. @@ -1793,6 +2079,76 @@ def raise_error(c): vf = self.__class__(self.copy_meta(), df) return vf + def compute_info(self, key): + """ + Compute AC/AN/AF for INFO column. + + The method will ignore and overwrite any existing data for selected + key. + + Returns + ------- + VcfFrame + Updated VcfFrame. + key : {'AC', 'AN', 'AF'} + INFO key. + + Example + ------- + + >>> from fuc import pyvcf + >>> data = { + ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chrX'], + ... 'POS': [100, 101, 102, 100], + ... 'ID': ['.', '.', '.', '.'], + ... 'REF': ['G', 'T', 'A', 'C'], + ... 'ALT': ['A', 'C', 'T,G', 'A'], + ... 'QUAL': ['.', '.', '.', '.'], + ... 'FILTER': ['.', '.', '.', '.'], + ... 'INFO': ['AC=100', 'MQ=59', '.', '.'], + ... 'FORMAT': ['GT:DP', 'GT', 'GT', 'GT'], + ... 'A': ['1|0:34', '0|0', '1|0', '0'], + ... 'B': ['1/1:23', '0/1', '0/0', '0/0'], + ... 'C': ['0/0:28', './.', '1/2', '1'], + ... } + >>> vf = pyvcf.VcfFrame.from_dict([], data) + >>> vf.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C + 0 chr1 100 . G A . . AC=100 GT:DP 1|0:34 1/1:23 0/0:28 + 1 chr1 101 . T C . . MQ=59 GT 0|0 0/1 ./. + 2 chr1 102 . A T,G . . . GT 1|0 0/0 1/2 + 3 chrX 100 . C A . . . GT 0 0/0 1 + >>> vf = vf.compute_info('AC') + >>> vf.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C + 0 chr1 100 . G A . . AC=1 GT:DP 1|0:34 1/1:23 0/0:28 + 1 chr1 101 . T C . . MQ=59;AC=1 GT 0|0 0/1 ./. + 2 chr1 102 . A T,G . . 
AC=1,1 GT 1|0 0/0 1/2 + 3 chrX 100 . C A . . AC=1 GT 0 0/0 1 + >>> vf = vf.compute_info('AN') + >>> vf.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C + 0 chr1 100 . G A . . AC=1;AN=6 GT:DP 1|0:34 1/1:23 0/0:28 + 1 chr1 101 . T C . . MQ=59;AC=1;AN=4 GT 0|0 0/1 ./. + 2 chr1 102 . A T,G . . AC=1,1;AN=6 GT 1|0 0/0 1/2 + 3 chrX 100 . C A . . AC=1;AN=4 GT 0 0/0 1 + >>> vf = vf.compute_info('AF') + >>> vf.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C + 0 chr1 100 . G A . . AC=1;AN=6;AF=0.167 GT:DP 1|0:34 1/1:23 0/0:28 + 1 chr1 101 . T C . . MQ=59;AC=1;AN=4;AF=0.250 GT 0|0 0/1 ./. + 2 chr1 102 . A T,G . . AC=1,1;AN=6;AF=0.167,0.167 GT 1|0 0/0 1/2 + 3 chrX 100 . C A . . AC=1;AN=4;AF=0.250 GT 0 0/0 1 + """ + def one_row(r, key): + data = row_computeinfo(r, key) + r.INFO = row_updateinfo(r, key, data, missing=True) + return r + + df = self.df.apply(one_row, args=(key,), axis=1) + + return self.__class__(self.copy_meta(), df) + @classmethod def from_dict(cls, meta, data): """ @@ -2240,6 +2596,71 @@ def copy(self): """Return a copy of the VcfFrame.""" return self.__class__(self.copy_meta(), self.copy_df()) + def to_bed(self): + """ + Convert VcfFrame to BedFrame. + + Returns + ------- + BedFrame + BedFrame obejct. + + Examples + -------- + Assume we have the following data: + + >>> from fuc import pyvcf + >>> data = { + ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'], + ... 'POS': [100, 101, 102, 103, 104], + ... 'ID': ['.', '.', '.', '.', '.'], + ... 'REF': ['A', 'A', 'C', 'C', 'ACGT'], + ... 'ALT': ['C', 'T,G', 'G', 'A,G,CT', 'A'], + ... 'QUAL': ['.', '.', '.', '.', '.'], + ... 'FILTER': ['.', '.', '.', '.', '.'], + ... 'INFO': ['.', '.', '.', '.', '.'], + ... 'FORMAT': ['GT:DP', 'GT:DP', 'GT:DP', 'GT:DP', 'GT:DP'], + ... 'Steven': ['0/1:32', './.:.', '0/1:27', '0/2:34', '0/0:31'], + ... 'Sara': ['0/0:28', '1/2:30', '1/1:29', '1/2:38', '0/1:27'], + ... 
} + >>> vf = pyvcf.VcfFrame.from_dict([], data) + >>> vf.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven Sara + 0 chr1 100 . A C . . . GT:DP 0/1:32 0/0:28 + 1 chr1 101 . A T,G . . . GT:DP ./.:. 1/2:30 + 2 chr1 102 . C G . . . GT:DP 0/1:27 1/1:29 + 3 chr1 103 . C A,G,CT . . . GT:DP 0/2:34 1/2:38 + 4 chr1 104 . ACGT A . . . GT:DP 0/0:31 0/1:27 + + We can construct BedFrame from the VcfFrame: + + >>> bf = vf.to_bed() + >>> bf.gr.df + Chromosome Start End + 0 chr1 100 100 + 1 chr1 101 101 + 2 chr1 102 102 + 3 chr1 103 103 + 4 chr1 103 104 + 5 chr1 105 107 + """ + def one_row(r): + if len(r.REF) == len(r.ALT) == 1: + start = r.POS + end = r.POS + elif len(r.REF) > len(r.ALT): + start = r.POS + 1 + end = r.POS + len(r.REF) - len(r.ALT) + else: + start = r.POS + end = r.POS + 1 + return pd.Series([r.CHROM, start, end]) + df = self.expand().df.apply(one_row, axis=1) + df.columns = ['Chromosome', 'Start', 'End'] + df = df.drop_duplicates() + bf = pybed.BedFrame.from_frame([], df) + return bf + def to_file(self, fn, compression=False): """ Write VcfFrame to a VCF file. @@ -2321,6 +2742,44 @@ def to_string(self): ).to_csv(index=False, sep='\t') return s + def to_variants(self): + """ + List unique variants in VcfFrame. + + Returns + ------- + list + List of unique variants. + + Examples + -------- + + >>> from fuc import pyvcf + >>> data = { + ... 'CHROM': ['chr1', 'chr2'], + ... 'POS': [100, 101], + ... 'ID': ['.', '.'], + ... 'REF': ['G', 'T'], + ... 'ALT': ['A', 'A,C'], + ... 'QUAL': ['.', '.'], + ... 'FILTER': ['.', '.'], + ... 'INFO': ['.', '.'], + ... 'FORMAT': ['GT', 'GT'], + ... 'A': ['0/1', '1/2'] + ... 
} + >>> vf = pyvcf.VcfFrame.from_dict([], data) + >>> vf.variants() + ['chr1-100-G-A', 'chr2-101-T-A', 'chr2-101-T-C'] + """ + def one_row(r): + result = [] + for alt in r.ALT.split(','): + result.append(f'{r.CHROM}-{r.POS}-{r.REF}-{alt}') + return ','.join(result) + s = self.df.apply(one_row, axis=1) + s = ','.join(s) + return s.split(',') + def strip(self, format='GT', metadata=False): """ Remove any unnecessary data. @@ -3262,8 +3721,117 @@ def one_gt(g): vf = self.__class__(self.copy_meta(), df) return vf + def empty_samples(self, threshold=0, opposite=False, as_list=False): + """ + Remove samples with high missingness. + + Samples with missingness >= threshold will be removed. + + Parameters + ---------- + threshold : int or float, default: 0 + Number or fraction of missing variants. By default + (``threshold=0``), only samples with 100% missingness will be + removed. + opposite : bool, default: False + If True, return samples that don't meet the said criteria. + as_list : bool, default: False + If True, return a list of sample names instead of a VcfFrame. + + Returns + ------- + VcfFrame + Subsetted VcfFrame. + + Examples + -------- + + >>> from fuc import pyvcf + >>> data = { + ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1'], + ... 'POS': [100, 101, 102, 103], + ... 'ID': ['.', '.', '.', '.'], + ... 'REF': ['G', 'T', 'G', 'T'], + ... 'ALT': ['A', 'C', 'C', 'C'], + ... 'QUAL': ['.', '.', '.', '.'], + ... 'FILTER': ['.', '.', '.', '.'], + ... 'INFO': ['.', '.', '.', '.'], + ... 'FORMAT': ['GT', 'GT', 'GT', 'GT'], + ... 'A': ['0/0', '0/0', '0/0', '0/0'], + ... 'B': ['./.', '0/0', '0/0', '0/0'], + ... 'C': ['./.', './.', '0/0', '0/0'], + ... 'D': ['./.', './.', './.', '0/0'], + ... 'E': ['./.', './.', './.', './.'], + ... } + >>> vf = pyvcf.VcfFrame.from_dict([], data) + >>> vf.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C D E + 0 chr1 100 . G A . . . GT 0/0 ./. ./. ./. ./. + 1 chr1 101 . T C . . . GT 0/0 0/0 ./. ./. ./. + 2 chr1 102 . G C . . . 
GT 0/0 0/0 0/0 ./. ./. + 3 chr1 103 . T C . . . GT 0/0 0/0 0/0 0/0 ./. + >>> vf.empty_samples().df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C D + 0 chr1 100 . G A . . . GT 0/0 ./. ./. ./. + 1 chr1 101 . T C . . . GT 0/0 0/0 ./. ./. + 2 chr1 102 . G C . . . GT 0/0 0/0 0/0 ./. + 3 chr1 103 . T C . . . GT 0/0 0/0 0/0 0/0 + >>> vf.empty_samples(threshold=2).df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B + 0 chr1 100 . G A . . . GT 0/0 ./. + 1 chr1 101 . T C . . . GT 0/0 0/0 + 2 chr1 102 . G C . . . GT 0/0 0/0 + 3 chr1 103 . T C . . . GT 0/0 0/0 + >>> vf.empty_samples(threshold=0.5).df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B + 0 chr1 100 . G A . . . GT 0/0 ./. + 1 chr1 101 . T C . . . GT 0/0 0/0 + 2 chr1 102 . G C . . . GT 0/0 0/0 + 3 chr1 103 . T C . . . GT 0/0 0/0 + >>> vf.empty_samples(threshold=0.5, opposite=True).df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT C D E + 0 chr1 100 . G A . . . GT ./. ./. ./. + 1 chr1 101 . T C . . . GT ./. ./. ./. + 2 chr1 102 . G C . . . GT 0/0 ./. ./. + 3 chr1 103 . T C . . . GT 0/0 0/0 ./. + >>> vf.empty_samples(threshold=0.5, opposite=True, as_list=True) + ['C', 'D', 'E'] + """ + total = self.shape[0] + + if not threshold: + threshold = total + + if isinstance(threshold, int): + use_fraction = False + elif isinstance(threshold, float): + use_fraction = True + else: + raise TypeError("Incorrect argument type 'threshold'") + + def one_col(c): + data = sum(c.apply(gt_miss)) + if use_fraction: + data /= total + return data + + s = self.df.iloc[:, 9:].apply(one_col, axis=0) < threshold + + if opposite: + s = s[s == False] + else: + s = s[s == True] + + l = s.index.to_list() + + if as_list: + return l + + return self.subset(l) + def expand(self): - """Expand each multiallelic locus to multiple rows. + """ + Expand each multiallelic locus to multiple rows. Only the GT subfield of FORMAT will be retained. 
@@ -3339,7 +3907,9 @@ def one_gt(g, i): def filter_bed(self, bed, opposite=False, as_index=False): """ - Select rows that overlap with the given BED data. + Filter rows intersecting with given BED. + + Only variants intersecting with given BED data will remain. Parameters ---------- @@ -3431,7 +4001,9 @@ def filter_bed(self, bed, opposite=False, as_index=False): def filter_empty(self, threshold=0, opposite=False, as_index=False): """ - Remove rows with missing genotype calls. + Filter rows with high missingness. + + Variants with missingness >= threshold will be removed. Parameters ---------- @@ -3456,141 +4028,68 @@ def filter_empty(self, threshold=0, opposite=False, as_index=False): >>> from fuc import pyvcf >>> data = { ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'], - ... 'POS': [100, 101, 102, 103, 104], - ... 'ID': ['.', '.', '.', '.', '.'], - ... 'REF': ['G', 'T', 'A', 'C', 'C'], - ... 'ALT': ['A', 'C', 'T', 'A', 'T'], - ... 'QUAL': ['.', '.', '.', '.', '.'], - ... 'FILTER': ['.', '.', '.', '.', '.'], - ... 'INFO': ['.', '.', '.', '.', '.'], - ... 'FORMAT': ['GT', 'GT', 'GT', 'GT', 'GT'], - ... 'A': ['0/1', './.', './.', './.', './.'], - ... 'B': ['0/0', '0/1', './.', './.', './.'], - ... 'C': ['0/0', '0/0', '0/1', './.', './.'], - ... } - >>> vf = pyvcf.VcfFrame.from_dict([], data) - >>> vf.df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C - 0 chr1 100 . G A . . . GT 0/1 0/0 0/0 - 1 chr1 101 . T C . . . GT ./. 0/1 0/0 - 2 chr1 102 . A T . . . GT ./. ./. 0/1 - 3 chr1 103 . C A . . . GT ./. ./. ./. - 4 chr1 104 . C T . . . GT ./. ./. ./. - - We can remove rows that are completely empty: - - >>> vf.filter_empty().df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C - 0 chr1 100 . G A . . . GT 0/1 0/0 0/0 - 1 chr1 101 . T C . . . GT ./. 0/1 0/0 - 2 chr1 102 . A T . . . GT ./. ./. 
0/1 - - We can remove rows where at least two samples have missing genotype: - - >>> vf.filter_empty(threshold=2).df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C - 0 chr1 100 . G A . . . GT 0/1 0/0 0/0 - 1 chr1 101 . T C . . . GT ./. 0/1 0/0 - - We can show rows that are completely empty: - - >>> vf.filter_empty(opposite=True).df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C - 0 chr1 103 . C A . . . GT ./. ./. ./. - 1 chr1 104 . C T . . . GT ./. ./. ./. - - Finally, we can return boolean index array from the filtering: - - >>> vf.filter_empty(as_index=True) - 0 True - 1 True - 2 True - 3 False - 4 False - dtype: bool - """ - if not threshold: - threshold = self.shape[1] - - def one_row(r): - s = r[9:].apply(gt_miss) - return s.sum() < threshold - - i = self.df.apply(one_row, axis=1) - - if opposite: - i = ~i - if as_index: - return i - if i.empty: - return self.__class__(self.copy_meta(), self.copy_df()) - return self.__class__(self.copy_meta(), self.df[i]) - - def filter_indel(self, opposite=False, as_index=False): - """ - Remove rows with an indel. - - Parameters - ---------- - opposite : bool, default: False - If True, return rows that don't meet the said criteria. - as_index : bool, default: False - If True, return boolean index array instead of VcfFrame. - - Returns - ------- - VcfFrame or pandas.Series - Filtered VcfFrame or boolean index array. - - Examples - -------- - Assume we have the following data: - - >>> from fuc import pyvcf - >>> data = { - ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1'], - ... 'POS': [100, 101, 102, 103], - ... 'ID': ['.', '.', '.', '.'], - ... 'REF': ['G', 'CT', 'A', 'C'], - ... 'ALT': ['A', 'C', 'C,AT', 'A'], - ... 'QUAL': ['.', '.', '.', '.'], - ... 'FILTER': ['.', '.', '.', '.'], - ... 'INFO': ['.', '.', '.', '.'], - ... 'FORMAT': ['GT', 'GT', 'GT', 'GT'], - ... 'Steven': ['0/1', '0/1', '1/2', '0/1'], + ... 'POS': [100, 101, 102, 103, 104], + ... 'ID': ['.', '.', '.', '.', '.'], + ... 
'REF': ['G', 'T', 'A', 'C', 'C'], + ... 'ALT': ['A', 'C', 'T', 'A', 'T'], + ... 'QUAL': ['.', '.', '.', '.', '.'], + ... 'FILTER': ['.', '.', '.', '.', '.'], + ... 'INFO': ['.', '.', '.', '.', '.'], + ... 'FORMAT': ['GT', 'GT', 'GT', 'GT', 'GT'], + ... 'A': ['0/1', './.', './.', './.', './.'], + ... 'B': ['0/0', '0/1', './.', './.', './.'], + ... 'C': ['0/0', '0/0', '0/1', './.', './.'], ... } >>> vf = pyvcf.VcfFrame.from_dict([], data) >>> vf.df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven - 0 chr1 100 . G A . . . GT 0/1 - 1 chr1 101 . CT C . . . GT 0/1 - 2 chr1 102 . A C,AT . . . GT 1/2 - 3 chr1 103 . C A . . . GT 0/1 + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C + 0 chr1 100 . G A . . . GT 0/1 0/0 0/0 + 1 chr1 101 . T C . . . GT ./. 0/1 0/0 + 2 chr1 102 . A T . . . GT ./. ./. 0/1 + 3 chr1 103 . C A . . . GT ./. ./. ./. + 4 chr1 104 . C T . . . GT ./. ./. ./. - We can remove rows with an indel: + We can remove rows that are completely empty: - >>> vf.filter_indel().df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven - 0 chr1 100 . G A . . . GT 0/1 - 1 chr1 103 . C A . . . GT 0/1 + >>> vf.filter_empty().df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C + 0 chr1 100 . G A . . . GT 0/1 0/0 0/0 + 1 chr1 101 . T C . . . GT ./. 0/1 0/0 + 2 chr1 102 . A T . . . GT ./. ./. 0/1 - We can also select those rows: + We can remove rows where at least two samples have missing genotype: - >>> vf.filter_indel(opposite=True).df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven - 0 chr1 101 . CT C . . . GT 0/1 - 1 chr1 102 . A C,AT . . . GT 1/2 + >>> vf.filter_empty(threshold=2).df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C + 0 chr1 100 . G A . . . GT 0/1 0/0 0/0 + 1 chr1 101 . T C . . . GT ./. 0/1 0/0 + + We can show rows that are completely empty: + + >>> vf.filter_empty(opposite=True).df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C + 0 chr1 103 . C A . . . GT ./. ./. ./. + 1 chr1 104 . C T . . . GT ./. ./. ./. 
Finally, we can return boolean index array from the filtering: - >>> vf.filter_indel(as_index=True) + >>> vf.filter_empty(as_index=True) 0 True - 1 False - 2 False - 3 True + 1 True + 2 True + 3 False + 4 False dtype: bool """ - i = ~self.df.apply(row_hasindel, axis=1) + if not threshold: + threshold = self.shape[1] + + def one_row(r): + s = r[9:].apply(gt_miss) + return s.sum() < threshold + + i = self.df.apply(one_row, axis=1) + if opposite: i = ~i if as_index: @@ -3600,7 +4099,10 @@ def filter_indel(self, opposite=False, as_index=False): return self.__class__(self.copy_meta(), self.df[i]) def filter_flagall(self, flags, opposite=False, as_index=False): - """Select rows if all of the given INFO flags are present. + """ + Filter rows with all given INFO flags. + + Only variants with all given INFO flags will remain. Parameters ---------- @@ -3683,7 +4185,10 @@ def f(r): return self.__class__(self.copy_meta(), self.df[i]) def filter_flagany(self, flags, opposite=False, as_index=False): - """Select rows if any one of the given INFO flags is present. + """ + Filter rows with any given INFO flags. + + Only variants with any given INFO flags will remain. Parameters ---------- @@ -3765,9 +4270,86 @@ def f(r): return self.__class__(self.copy_meta(), self.copy_df()) return self.__class__(self.copy_meta(), self.df[i]) + def filter_indel(self, opposite=False, as_index=False): + """ + Filter rows with indel. + + Variants with indel will be removed. + + Parameters + ---------- + opposite : bool, default: False + If True, return rows that don't meet the said criteria. + as_index : bool, default: False + If True, return boolean index array instead of VcfFrame. + + Returns + ------- + VcfFrame or pandas.Series + Filtered VcfFrame or boolean index array. + + Examples + -------- + Assume we have the following data: + + >>> from fuc import pyvcf + >>> data = { + ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1'], + ... 'POS': [100, 101, 102, 103], + ... 'ID': ['.', '.', '.', '.'], + ... 
'REF': ['G', 'CT', 'A', 'C'], + ... 'ALT': ['A', 'C', 'C,AT', 'A'], + ... 'QUAL': ['.', '.', '.', '.'], + ... 'FILTER': ['.', '.', '.', '.'], + ... 'INFO': ['.', '.', '.', '.'], + ... 'FORMAT': ['GT', 'GT', 'GT', 'GT'], + ... 'Steven': ['0/1', '0/1', '1/2', '0/1'], + ... } + >>> vf = pyvcf.VcfFrame.from_dict([], data) + >>> vf.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven + 0 chr1 100 . G A . . . GT 0/1 + 1 chr1 101 . CT C . . . GT 0/1 + 2 chr1 102 . A C,AT . . . GT 1/2 + 3 chr1 103 . C A . . . GT 0/1 + + We can remove rows with an indel: + + >>> vf.filter_indel().df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven + 0 chr1 100 . G A . . . GT 0/1 + 1 chr1 103 . C A . . . GT 0/1 + + We can also select those rows: + + >>> vf.filter_indel(opposite=True).df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven + 0 chr1 101 . CT C . . . GT 0/1 + 1 chr1 102 . A C,AT . . . GT 1/2 + + Finally, we can return boolean index array from the filtering: + + >>> vf.filter_indel(as_index=True) + 0 True + 1 False + 2 False + 3 True + dtype: bool + """ + i = ~self.df.apply(row_hasindel, axis=1) + if opposite: + i = ~i + if as_index: + return i + if i.empty: + return self.__class__(self.copy_meta(), self.copy_df()) + return self.__class__(self.copy_meta(), self.df[i]) + def filter_multialt(self, opposite=False, as_index=False): """ - Remove rows with multiple ALT alleles. + Filter rows with multiple ALT alleles. + + Variants with multiple ALT alleles will be removed. Parameters ---------- @@ -3841,7 +4423,10 @@ def filter_multialt(self, opposite=False, as_index=False): return self.__class__(self.copy_meta(), self.df[i]) def filter_pass(self, opposite=False, as_index=False): - """Select rows with PASS in the FILTER field. + """ + Filter rows with PASS in FILTER column. + + Only variants with PASS in the FILTER column will remain. 
Parameters ---------- @@ -3915,7 +4500,9 @@ def filter_pass(self, opposite=False, as_index=False): def filter_phased(self, opposite=False, as_index=False): """ - Remove rows with phased genotypes. + Filter rows with phased genotypes. + + Variants with phased genotypes will be removed. Parameters ---------- @@ -3989,7 +4576,9 @@ def filter_phased(self, opposite=False, as_index=False): def filter_polyp(self, opposite=False, as_index=False): """ - Remove rows with a polyploid genotype call. + Filter rows with polyploid genotypes. + + Variants with polyploid genotypes will be removed. Parameters ---------- @@ -4062,7 +4651,10 @@ def filter_polyp(self, opposite=False, as_index=False): return self.__class__(self.copy_meta(), self.df[i]) def filter_qual(self, threshold, opposite=False, as_index=False): - """Select rows with minimum QUAL value. + """ + Filter rows with low QUAL values. + + Only variants with QUAL >= threashold will remain. Parameters ---------- @@ -4142,9 +4734,11 @@ def one_row(r): return self.__class__(self.copy_meta(), self.df[i]) def filter_sampall(self, samples=None, opposite=False, as_index=False): - """Select rows if all of the given samples have the variant. + """ + Filter rows where all given samples have variant. - The default behavior is to use all samples in the VcfFrame. + Only variants where all given samples have variant. The default + behavior is to use all samples in the VcfFrame. Parameters ---------- @@ -4241,9 +4835,10 @@ def filter_sampall(self, samples=None, opposite=False, as_index=False): def filter_sampany(self, samples=None, opposite=False, as_index=False): """ - Select rows if any one of the given samples has the variant. + Filter rows where any given samples have variant. - The default behavior is to use all samples in the VcfFrame. + Only variants where any given samples have variant will remain. The + default behavior is to use all samples in the VcfFrame. 
Parameters ---------- @@ -4338,7 +4933,10 @@ def filter_sampany(self, samples=None, opposite=False, as_index=False): return self.__class__(self.copy_meta(), self.df[i]) def filter_sampnum(self, threshold, opposite=False, as_index=False): - """Select rows if the variant is prevalent enough. + """ + Filter rows with high variant prevalence. + + Only variants with variant prevalence >= threshold will remian. Parameters ---------- @@ -4426,7 +5024,9 @@ def f(r): def filter_vcf(self, vcf, opposite=False, as_index=False): """ - Select rows that overlap with the other VCF. + Filter rows intersecting with given VCF. + + Only variants intersecting with given VCF data will remain. Parameters ---------- @@ -4866,70 +5466,6 @@ def slice(self, region): df = df[df.POS <= end] return self.__class__(self.copy_meta(), df) - def to_bed(self): - """Write BedFrame from the VcfFrame. - - Returns - ------- - BedFrame - BedFrame. - - Examples - -------- - Assume we have the following data: - - >>> from fuc import pyvcf - >>> data = { - ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'], - ... 'POS': [100, 101, 102, 103, 104], - ... 'ID': ['.', '.', '.', '.', '.'], - ... 'REF': ['A', 'A', 'C', 'C', 'ACGT'], - ... 'ALT': ['C', 'T,G', 'G', 'A,G,CT', 'A'], - ... 'QUAL': ['.', '.', '.', '.', '.'], - ... 'FILTER': ['.', '.', '.', '.', '.'], - ... 'INFO': ['.', '.', '.', '.', '.'], - ... 'FORMAT': ['GT:DP', 'GT:DP', 'GT:DP', 'GT:DP', 'GT:DP'], - ... 'Steven': ['0/1:32', './.:.', '0/1:27', '0/2:34', '0/0:31'], - ... 'Sara': ['0/0:28', '1/2:30', '1/1:29', '1/2:38', '0/1:27'], - ... } - >>> vf = pyvcf.VcfFrame.from_dict([], data) - >>> vf.df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven Sara - 0 chr1 100 . A C . . . GT:DP 0/1:32 0/0:28 - 1 chr1 101 . A T,G . . . GT:DP ./.:. 1/2:30 - 2 chr1 102 . C G . . . GT:DP 0/1:27 1/1:29 - 3 chr1 103 . C A,G,CT . . . GT:DP 0/2:34 1/2:38 - 4 chr1 104 . ACGT A . . . 
GT:DP 0/0:31 0/1:27 - - We can construct BedFrame from the VcfFrame: - - >>> bf = vf.to_bed() - >>> bf.gr.df - Chromosome Start End - 0 chr1 100 100 - 1 chr1 101 101 - 2 chr1 102 102 - 3 chr1 103 103 - 4 chr1 103 104 - 5 chr1 105 107 - """ - def one_row(r): - if len(r.REF) == len(r.ALT) == 1: - start = r.POS - end = r.POS - elif len(r.REF) > len(r.ALT): - start = r.POS + 1 - end = r.POS + len(r.REF) - len(r.ALT) - else: - start = r.POS - end = r.POS + 1 - return pd.Series([r.CHROM, start, end]) - df = self.expand().df.apply(one_row, axis=1) - df.columns = ['Chromosome', 'Start', 'End'] - df = df.drop_duplicates() - bf = pybed.BedFrame.from_frame([], df) - return bf - def extract_format(self, k, func=None, as_nan=False): """ Extract data for the specified FORMAT key. @@ -5966,41 +6502,3 @@ def get_af(self, sample, variant): af = float(field.split(',')[j+1]) return af - - def variants(self): - """ - List unique variants in VcfFrame. - - Returns - ------- - list - List of unique variants. - - Examples - -------- - - >>> from fuc import pyvcf - >>> data = { - ... 'CHROM': ['chr1', 'chr2'], - ... 'POS': [100, 101], - ... 'ID': ['.', '.'], - ... 'REF': ['G', 'T'], - ... 'ALT': ['A', 'A,C'], - ... 'QUAL': ['.', '.'], - ... 'FILTER': ['.', '.'], - ... 'INFO': ['.', '.'], - ... 'FORMAT': ['GT', 'GT'], - ... 'A': ['0/1', '1/2'] - ... } - >>> vf = pyvcf.VcfFrame.from_dict([], data) - >>> vf.variants() - ['chr1-100-G-A', 'chr2-101-T-A', 'chr2-101-T-C'] - """ - def one_row(r): - result = [] - for alt in r.ALT.split(','): - result.append(f'{r.CHROM}-{r.POS}-{r.REF}-{alt}') - return ','.join(result) - s = self.df.apply(one_row, axis=1) - s = ','.join(s) - return s.split(',') diff --git a/fuc/api/pyvep.py b/fuc/api/pyvep.py index c29404b..9006495 100644 --- a/fuc/api/pyvep.py +++ b/fuc/api/pyvep.py @@ -156,7 +156,8 @@ ] def row_firstann(r): - """Return the first result in the row. + """ + Return the first result in the row. 
Parameters ---------- @@ -208,7 +209,8 @@ def row_firstann(r): return results.split(',')[0] def row_mostsevere(r): - """Return result with the most severe consequence in the row. + """ + Return result with the most severe consequence in the row. Parameters ---------- @@ -260,7 +262,8 @@ def row_mostsevere(r): return sorted(results.split(','), key=f)[0] def parseann(vf, targets, sep=' | ', as_series=False): - """Parse variant annotations in VcfFrame. + """ + Parse variant annotations in VcfFrame. If there are multiple results per row, the method will use the first one for extracting requested data. @@ -391,7 +394,8 @@ def to_frame(vf, as_zero=False): return df def pick_result(vf, mode='mostsevere'): - """Return a new VcfFrame after picking one result per row. + """ + Return a new VcfFrame after picking one result per row. Parameters ---------- @@ -411,6 +415,7 @@ def pick_result(vf, mode='mostsevere'): Examples -------- + >>> from fuc import pyvep, pyvcf >>> meta = [ ... '##fileformat=VCFv4.1', ... '##VEP="v104" time="2021-05-20 10:50:12" cache="/net/isilonP/public/ro/ensweb-data/latest/tools/grch37/e104/vep/cache/homo_sapiens/104_GRCh37" db="homo_sapiens_core_104_37@hh-mysql-ens-grch37-web" 1000genomes="phase3" COSMIC="92" ClinVar="202012" HGMD-PUBLIC="20204" assembly="GRCh37.p13" dbSNP="154" gencode="GENCODE 19" genebuild="2011-04" gnomAD="r2.1" polyphen="2.2.2" regbuild="1.0" sift="sift5.2.2"', @@ -437,16 +442,16 @@ def pick_result(vf, mode='mostsevere'): >>> vf1 = pyvep.pick_result(vf, mode='mostsevere') >>> vf2 = pyvep.pick_result(vf, mode='firstann') >>> vf1.df.INFO - 0 CSQ=C|missense_variant|MODERATE|SPEN|ENSG00000... - 1 CSQ=T|missense_variant|MODERATE|NRAS|ENSG00000... - 2 CSQ=A|stop_gained&NMD_transcript_variant|HIGH|... - 3 CSQ=-|inframe_deletion|MODERATE|HRAS|ENSG00000... 
+ 0 CSQ=C|missense_variant|MODERATE|SPEN|ENSG00000065526|Transcript|ENST00000375759.3|protein_coding|12/15||||10322|10118|3373|Q/P|cAg/cCg|||1||HGNC|17575|||||tolerated_low_confidence(0.08)|possibly_damaging(0.718)|||||||||| + 1 CSQ=T|missense_variant|MODERATE|NRAS|ENSG00000213281|Transcript|ENST00000369535.4|protein_coding|3/7||||502|248|83|A/D|gCc/gAc|COSV99795232||-1||HGNC|7989|||||deleterious(0.02)|probably_damaging(0.946)|||1|1|||||| + 2 CSQ=A|stop_gained&NMD_transcript_variant|HIGH|NTRK1|ENSG00000198400|Transcript|ENST00000497019.2|nonsense_mediated_decay|10/16||||1183|1032|344|W/*|tgG/tgA|COSV62328771||1||HGNC|8031|||||||||1|1|||||| + 3 CSQ=-|inframe_deletion|MODERATE|HRAS|ENSG00000174775|Transcript|ENST00000311189.7|protein_coding|2/6||||198-200|26-28|9-10|VG/G|gTGGgc/ggc|rs1164486792||-1||HGNC|5173||||||||||1|||||| Name: INFO, dtype: object >>> vf2.df.INFO - 0 CSQ=C|downstream_gene_variant|MODIFIER|ZBTB17|... - 1 CSQ=T|downstream_gene_variant|MODIFIER|CSDE1|E... - 2 CSQ=A|missense_variant|MODERATE|NTRK1|ENSG0000... - 3 CSQ=-|upstream_gene_variant|MODIFIER|LRRC56|EN... 
+ 0 CSQ=C|downstream_gene_variant|MODIFIER|ZBTB17|ENSG00000116809|Transcript|ENST00000375733.2|protein_coding|||||||||||4617|-1||HGNC|12936|||||||||||||||| + 1 CSQ=T|downstream_gene_variant|MODIFIER|CSDE1|ENSG00000009307|Transcript|ENST00000261443.5|protein_coding||||||||||COSV99795232|3453|-1||HGNC|29905|||||||||1|1|||||| + 2 CSQ=A|missense_variant|MODERATE|NTRK1|ENSG00000198400|Transcript|ENST00000358660.3|protein_coding|10/16||||1296|1255|419|A/T|Gcc/Acc|COSV62328771||1||HGNC|8031|||||deleterious(0.01)|probably_damaging(0.999)|||1|1|||||| + 3 CSQ=-|upstream_gene_variant|MODIFIER|LRRC56|ENSG00000161328|Transcript|ENST00000270115.7|protein_coding||||||||||rs1164486792|3230|1||HGNC|25430||||||||||1|||||| Name: INFO, dtype: object """ funcs = {'mostsevere': row_mostsevere, 'firstann': row_firstann} @@ -456,7 +461,8 @@ def pick_result(vf, mode='mostsevere'): return new_vf def annot_names(vf): - """Return the list of avaialble consequence annotations in the VcfFrame. + """ + Return the list of avaialble consequence annotations in the VcfFrame. Parameters ---------- diff --git a/fuc/cli/bam_aldepth.py b/fuc/cli/bam_aldepth.py index 36c09a2..e715d62 100644 --- a/fuc/cli/bam_aldepth.py +++ b/fuc/cli/bam_aldepth.py @@ -3,7 +3,7 @@ from .. import api description = """ -Count allelic depth from a SAM/BAM/CRAM file. +Count allelic depth from a BAM file. """ epilog = f""" @@ -20,17 +20,20 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Compute allelic depth from a SAM/BAM/CRAM file.', + help= +"""Compute allelic depth from a BAM file.""" ) parser.add_argument( 'bam', - help='Alignment file.' + help= +"""Input alignment file.""" ) parser.add_argument( 'sites', - help='TSV file containing two columns, chromosome and position. \n' - 'This can also be a BED or VCF file (compressed or \n' - 'uncompressed) Input type will be detected automatically.' + help= +"""TSV file containing two columns, chromosome and position. 
This +can also be a BED or VCF file (compressed or uncompressed). +Input type will be detected automatically.""" ) def main(args): diff --git a/fuc/cli/bam_depth.py b/fuc/cli/bam_depth.py index fdb2f45..77f90e4 100644 --- a/fuc/cli/bam_depth.py +++ b/fuc/cli/bam_depth.py @@ -5,24 +5,20 @@ import pysam description = """ -Compute read depth from SAM/BAM/CRAM files. +Compute per-base read depth from BAM files. -By default, the command will count all reads within the alignment files. You -can specify regions of interest with --bed or --region. When you do this, pay -close attention to the 'chr' string in contig names (e.g. 'chr1' vs. '1'). -Note also that --region requires the input files be indexed. +Under the hood, the command computes read depth using the 'samtools depth' +command. """ epilog = f""" -[Example] To specify regions with a BED file: - $ fuc {api.common._script_name()} \\ - --bam 1.bam 2.bam \\ - --bed in.bed > out.tsv +[Example] Specify regions manually: + $ fuc {api.common._script_name()} 1.bam 2.bam \\ + -r chr1:100-200 chr2:400-500 > out.tsv -[Example] To specify regions manually: - $ fuc {api.common._script_name()} \\ - --fn bam.list \\ - --region chr1:100-200 > out.tsv +[Example] Specify regions with a BED file: + $ fuc {api.common._script_name()} bam.list \\ + -r in.bed > out.tsv """ def create_parser(subparsers): @@ -31,40 +27,46 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Compute read depth from SAM/BAM/CRAM files.', + help= +"""Compute per-base read depth from BAM files.""" ) parser.add_argument( - '--bam', - metavar='PATH', + 'bams', nargs='+', - help='One or more alignment files. Cannot be used with --fn.' + help= +"""One or more input BAM files. Alternatively, you can +provide a text file (.txt, .tsv, .csv, or .list) +containing one BAM file per line.""" ) parser.add_argument( - '--fn', - metavar='PATH', - help='File containing one alignment file per line. 
Cannot \n' - 'be used with --bam.' - ) - parser.add_argument( - '--bed', - metavar='PATH', - help='BED file. Cannot be used with --region.' - ) - parser.add_argument( - '--region', + '-r', + '--regions', + nargs='+', metavar='TEXT', - help="Target region ('chrom:start-end'). Cannot be used \n" - "with --bed." + help= +"""By default, the command counts all reads in BAM +files, which can be excruciatingly slow for large +files (e.g. whole genome sequencing). Therefore, use +this argument to only output positions in given +regions. Each region must have the format +chrom:start-end and be a half-open interval with +(start, end]. This means, for example, chr1:100-103 +will extract positions 101, 102, and 103. +Alternatively, you can provide a BED file (compressed +or uncompressed) to specify regions. Note that the +'chr' prefix in contig names (e.g. 'chr1' vs. '1') +will be automatically added or removed as necessary +to match the input BAM's contig names.""" ) parser.add_argument( '--zero', action='store_true', - help='Output all positions including those with zero depth.' + help= +"""Output all positions including those with zero depth.""" ) def main(args): cf = api.pycov.CovFrame.from_bam( - bam=args.bam, fn=args.fn, bed=args.bed, region=args.region, - zero=args.zero + args.bams, regions=args.regions, zero=args.zero ) sys.stdout.write(cf.to_string()) diff --git a/fuc/cli/bam_head.py b/fuc/cli/bam_head.py index ca3572b..385a4da 100644 --- a/fuc/cli/bam_head.py +++ b/fuc/cli/bam_head.py @@ -5,7 +5,7 @@ import pysam description = """ -Print the header of a SAM/BAM/CRAM file. +Print the header of a BAM file. """ epilog = f""" @@ -19,11 +19,13 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Print the header of a SAM/BAM/CRAM file.', + help= +"""Print the header of a BAM file.""" ) parser.add_argument( 'bam', - help='Alignment file.' 
+ help= +"""Input alignment file.""" ) def main(args): diff --git a/fuc/cli/bam_index.py b/fuc/cli/bam_index.py index cdb443d..2c88dc1 100644 --- a/fuc/cli/bam_index.py +++ b/fuc/cli/bam_index.py @@ -3,7 +3,7 @@ import pysam description = """ -Index a SAM/BAM/CRAM file. +Index a BAM file. """ epilog = f""" @@ -17,11 +17,13 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Index a SAM/BAM/CRAM file.', + help= +"""Index a BAM file.""" ) parser.add_argument( 'bam', - help='Alignment file.' + help= +"""Input alignment file.""" ) def main(args): diff --git a/fuc/cli/bam_rename.py b/fuc/cli/bam_rename.py index 714c4c4..e0aa141 100644 --- a/fuc/cli/bam_rename.py +++ b/fuc/cli/bam_rename.py @@ -5,8 +5,8 @@ import pysam -description = f""" -Rename the sample in a SAM/BAM/CRAM file. +description = """ +Rename the sample in a BAM file. """ epilog = f""" @@ -20,10 +20,19 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Rename the sample in a SAM/BAM/CRAM file.', + help= +"""Rename the sample in a BAM file.""" + ) + parser.add_argument( + 'bam', + help= +"""Input alignment file.""" + ) + parser.add_argument( + 'name', + help= +"""New sample name.""" ) - parser.add_argument('bam', help='Alignment file.') - parser.add_argument('name', help='New sample name.') def main(args): # Detect the input format. diff --git a/fuc/cli/bam_slice.py b/fuc/cli/bam_slice.py index 91324df..3f2816f 100644 --- a/fuc/cli/bam_slice.py +++ b/fuc/cli/bam_slice.py @@ -5,7 +5,7 @@ import pysam description = """ -Slice an alignment file (SAM/BAM/CRAM). +Slice a BAM file. 
""" epilog = f""" @@ -25,38 +25,43 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Slice a SAM/BAM/CRAM file.', + help= +"""Slice a BAM file.""" ) parser.add_argument( 'bam', - help="Input alignment file must be already indexed (.bai) to allow \n" - "random access. You can index an alignment file with the \n" - "bam-index command." + help= +"""Input alignment file must be already indexed (.bai) to allow +random access. You can index an alignment file with the +bam-index command.""" ) parser.add_argument( 'regions', nargs='+', - help="One or more regions to be sliced. Each region must have the \n" - "format chrom:start-end and be a half-open interval with \n" - "(start, end]. This means, for example, chr1:100-103 will \n" - "extract positions 101, 102, and 103. Alternatively, you can \n" - "provide a BED file (compressed or uncompressed) to specify \n" - "regions. Note that the 'chr' prefix in contig names (e.g. \n" - "'chr1' vs. '1') will be automatically added or removed as \n" - "necessary to match the input BED's contig names." + help= +"""One or more regions to be sliced. Each region must have the +format chrom:start-end and be a half-open interval with +(start, end]. This means, for example, chr1:100-103 will +extract positions 101, 102, and 103. Alternatively, you can +provide a BED file (compressed or uncompressed) to specify +regions. Note that the 'chr' prefix in contig names (e.g. +'chr1' vs. '1') will be automatically added or removed as +necessary to match the input BED's contig names.""" ) parser.add_argument( '--format', metavar='TEXT', default='BAM', choices=['SAM', 'BAM', 'CRAM'], - help="Output format (default: 'BAM') (choices: 'SAM', 'BAM', \n" - "'CRAM')." + help= +"""Output format (default: 'BAM') (choices: 'SAM', 'BAM', +'CRAM').""" ) parser.add_argument( '--fasta', metavar='PATH', - help="FASTA file. Required when --format is 'CRAM'." + help= +"""FASTA file. 
Required when --format is 'CRAM'.""" ) def main(args): diff --git a/fuc/cli/bed_intxn.py b/fuc/cli/bed_intxn.py index 6c78804..d673eb4 100644 --- a/fuc/cli/bed_intxn.py +++ b/fuc/cli/bed_intxn.py @@ -17,9 +17,15 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Find the intersection of BED files.', + help= +"""Find the intersection of BED files.""" + ) + parser.add_argument( + 'bed', + nargs='+', + help= +"""Input BED files.""" ) - parser.add_argument('bed', help='BED files.', nargs='+') def main(args): bfs = [api.pybed.BedFrame.from_file(x) for x in args.bed] diff --git a/fuc/cli/bed_sum.py b/fuc/cli/bed_sum.py index b72844d..d429baf 100644 --- a/fuc/cli/bed_sum.py +++ b/fuc/cli/bed_sum.py @@ -16,25 +16,29 @@ def create_parser(subparsers): subparsers, api.common._script_name(), description=description, - help='Summarize a BED file.', + help= +"""Summarize a BED file.""" ) parser.add_argument( 'bed', - help='BED file.' + help= +"""Input BED file.""" ) parser.add_argument( '--bases', metavar='INT', type=int, default=1, - help='Number to divide covered base pairs (default: 1).' + help= +"""Number to divide covered base pairs (default: 1).""" ) parser.add_argument( '--decimals', metavar='INT', type=int, default=0, - help='Number of decimals (default: 0).' + help= +"""Number of decimals (default: 0).""" ) def main(args): diff --git a/fuc/cli/cov_concat.py b/fuc/cli/cov_concat.py index fdae26c..69a4bea 100644 --- a/fuc/cli/cov_concat.py +++ b/fuc/cli/cov_concat.py @@ -22,21 +22,23 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Concatenate depth of coverage files.', + help= +"""Concatenate depth of coverage files.""" ) parser.add_argument( 'tsv', - metavar='PATH', nargs='+', - help='One or more TSV files.' 
+ help= +"""Input TSV files.""" ) parser.add_argument( '--axis', metavar='INT', default=0, type=int, - help='The axis to concatenate along (default: 0) (choices: \n' - '0, 1 where 0 is index and 1 is columns).' + help= +"""The axis to concatenate along (default: 0) (choices: +0, 1 where 0 is index and 1 is columns).""" ) def main(args): diff --git a/fuc/cli/cov_rename.py b/fuc/cli/cov_rename.py index 9498ec5..56ed8e2 100644 --- a/fuc/cli/cov_rename.py +++ b/fuc/cli/cov_rename.py @@ -33,15 +33,18 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Rename the samples in a depth of coverage file.', + help= +"""Rename the samples in a depth of coverage file.""" ) parser.add_argument( 'tsv', - help='TSV file (compressed or uncompressed).' + help= +"""TSV file (compressed or uncompressed).""" ) parser.add_argument( 'names', - help='Text file containing information for renaming the samples.' + help= +"""Text file containing information for renaming the samples.""" ) parser.add_argument( '--mode', @@ -56,15 +59,17 @@ def create_parser(subparsers): metavar='INT', type=int, nargs=2, - help="Index range to use when renaming the samples. \n" - "Applicable only with the 'RANGE' mode." + help= +"""Index range to use when renaming the samples. +Applicable only with the 'RANGE' mode.""" ) parser.add_argument( '--sep', metavar='TEXT', default='\t', - help="Delimiter to use when reading the names file \n" - "(default: '\\t')." 
+ help= +"""Delimiter to use when reading the names file +(default: '\\t').""" ) def main(args): diff --git a/fuc/cli/fa_filter.py b/fuc/cli/fa_filter.py index 83fcae0..dbe7316 100644 --- a/fuc/cli/fa_filter.py +++ b/fuc/cli/fa_filter.py @@ -24,23 +24,27 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Filter sequence records in a FASTA file', + help= +"""Filter sequence records in a FASTA file.""" ) parser.add_argument( 'fasta', - help='FASTA file (compressed or uncompressed).' + help= +"""Input FASTA file (compressed or uncompressed).""" ) parser.add_argument( '--contigs', metavar='TEXT', nargs='+', - help='One or more contigs to be selected. Alternatively, you can \n' - 'provide a file containing one contig per line. ' + help= +"""One or more contigs to be selected. Alternatively, you can +provide a file containing one contig per line.""" ) parser.add_argument( '--exclude', action='store_true', - help='Exclude specified contigs.' + help= +"""Exclude specified contigs.""" ) def main(args): diff --git a/fuc/cli/fq_count.py b/fuc/cli/fq_count.py index df4155f..d22856f 100644 --- a/fuc/cli/fq_count.py +++ b/fuc/cli/fq_count.py @@ -21,12 +21,14 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Count sequence reads in FASTQ files.', + help= +"""Count sequence reads in FASTQ files.""" ) parser.add_argument( 'fastq', nargs='*', - help='FASTQ files (compressed or uncompressed) (default: stdin).' 
+ help= +"""Input FASTQ files (compressed or uncompressed) (default: stdin).""" ) def main(args): diff --git a/fuc/cli/fq_sum.py b/fuc/cli/fq_sum.py index 70c519c..50bdb33 100644 --- a/fuc/cli/fq_sum.py +++ b/fuc/cli/fq_sum.py @@ -19,11 +19,13 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Summarize a FASTQ file.', + help= +"""Summarize a FASTQ file.""" ) parser.add_argument( 'fastq', - help='FASTQ file (zipped or unqzipped).' + help= +"""Input FASTQ file (compressed or uncompressed).""" ) def main(args): diff --git a/fuc/cli/fuc_bgzip.py b/fuc/cli/fuc_bgzip.py index b3a681b..6ef96ac 100644 --- a/fuc/cli/fuc_bgzip.py +++ b/fuc/cli/fuc_bgzip.py @@ -28,12 +28,14 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Write a BGZF compressed file.', + help= +"""Write a BGZF compressed file.""" ) parser.add_argument( 'file', nargs='*', - help='File to be compressed (default: stdin).' 
+ help= +"""Input file to be compressed (default: stdin).""" ) def main(args): diff --git a/fuc/cli/fuc_compf.py b/fuc/cli/fuc_compf.py index 8762b77..31c266a 100644 --- a/fuc/cli/fuc_compf.py +++ b/fuc/cli/fuc_compf.py @@ -20,10 +20,19 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Compare the contents of two files.', + help= +"""Compare the contents of two files.""" + ) + parser.add_argument( + 'left', + help= +"""Input left file.""" + ) + parser.add_argument( + 'right', + help= +"""Input right file.""" ) - parser.add_argument('left', help='Left file.') - parser.add_argument('right', help='Right file.') def main(args): print(filecmp.cmp(args.left, args.right)) diff --git a/fuc/cli/fuc_demux.py b/fuc/cli/fuc_demux.py index 0b92c6d..aed7dd8 100644 --- a/fuc/cli/fuc_demux.py +++ b/fuc/cli/fuc_demux.py @@ -36,22 +36,26 @@ def create_parser(subparsers): parser = api.common._add_parser( subparsers, api.common._script_name(), - help='Parse the Reports directory from bcl2fastq.', description=description, + help= +"""Parse the Reports directory from bcl2fastq.""" ) parser.add_argument( 'reports', - help='Reports directory.' + help= +"""Input Reports directory.""" ) parser.add_argument( 'output', - help='Output directory (will be created).' + help= +"""Output directory (will be created).""" ) parser.add_argument( '--sheet', metavar='PATH', - help='SampleSheet.csv file. Used for sorting and/or subsetting \n' - 'samples.' + help= +"""SampleSheet.csv file. 
Used for sorting and/or subsetting +samples.""" ) def main(args): diff --git a/fuc/cli/fuc_exist.py b/fuc/cli/fuc_exist.py index 4e6a1af..b54d4f5 100644 --- a/fuc/cli/fuc_exist.py +++ b/fuc/cli/fuc_exist.py @@ -27,10 +27,15 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Check whether certain files exist.', + help= +"""Check whether certain files exist.""" + ) + parser.add_argument( + 'files', + nargs='*', + help= +"""Files and directories to be tested (default: stdin).""" ) - parser.add_argument('files', nargs='*', - help='Files and directories to be tested (default: stdin).') def main(args): if args.files: diff --git a/fuc/cli/fuc_find.py b/fuc/cli/fuc_find.py index 39e2164..9659b92 100644 --- a/fuc/cli/fuc_find.py +++ b/fuc/cli/fuc_find.py @@ -25,25 +25,29 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Retrieve absolute paths of files whose name matches a \n' - 'specified pattern, optionally recursively.', + help= +"""Retrieve absolute paths of files whose name matches a +specified pattern, optionally recursively.""" ) parser.add_argument( 'pattern', - help='Filename pattern.' + help= +"""Filename pattern.""" ) parser.add_argument( '-r', '--recursive', action='store_true', - help='Turn on recursive retrieving.' + help= +"""Turn on recursive retrieving.""" ) parser.add_argument( '-d', '--directory', metavar='PATH', default=os.getcwd(), - help='Directory to search in (default: current directory).' + help= +"""Directory to search in (default: current directory).""" ) def main(args): diff --git a/fuc/cli/fuc_undetm.py b/fuc/cli/fuc_undetm.py index 3bc82d9..4e71440 100644 --- a/fuc/cli/fuc_undetm.py +++ b/fuc/cli/fuc_undetm.py @@ -2,7 +2,7 @@ from .. import api -description = f""" +description = """ Compute top unknown barcodes using undertermined FASTQ from bcl2fastq. 
This command will compute top unknown barcodes using undertermined FASTQ from @@ -20,19 +20,22 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Compute top unknown barcodes using undertermined FASTQ ' - 'from bcl2fastq.', + help= +"""Compute top unknown barcodes using undetermined FASTQ from +bcl2fastq.""" ) parser.add_argument( 'fastq', - help='Undertermined FASTQ (compressed or uncompressed).' + help= +"""Undetermined FASTQ (compressed or uncompressed).""" ) parser.add_argument( '--count', metavar='INT', type=int, default=30, - help='Number of top unknown barcodes to return (default: 30).' + help= +"""Number of top unknown barcodes to return (default: 30).""" ) def main(args): diff --git a/fuc/cli/maf_maf2vcf.py b/fuc/cli/maf_maf2vcf.py index 7286135..a413720 100644 --- a/fuc/cli/maf_maf2vcf.py +++ b/fuc/cli/maf_maf2vcf.py @@ -44,33 +44,39 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Convert a MAF file to a VCF file.', + help= +"""Convert a MAF file to a VCF file.""" ) parser.add_argument( 'maf', - help='MAF file (compressed or uncompressed).' + help= +"""MAF file (compressed or uncompressed).""" ) parser.add_argument( '--fasta', metavar='PATH', - help='FASTA file (required to include INDELs in the output).' + help= +"""FASTA file (required to include INDELs in the output).""" ) parser.add_argument( '--ignore_indels', action='store_true', - help='Use this flag to exclude INDELs from the output.' + help= +"""Use this flag to exclude INDELs from the output.""" ) parser.add_argument( '--cols', metavar='TEXT', nargs='+', - help='Column(s) in the MAF file.' + help= +"""Column(s) in the MAF file.""" ) parser.add_argument( '--names', metavar='TEXT', nargs='+', - help='Name(s) to be displayed in the FORMAT field.' 
+ help= +"""Name(s) to be displayed in the FORMAT field.""" ) def main(args): diff --git a/fuc/cli/maf_oncoplt.py b/fuc/cli/maf_oncoplt.py index 15b786c..458e420 100644 --- a/fuc/cli/maf_oncoplt.py +++ b/fuc/cli/maf_oncoplt.py @@ -23,22 +23,26 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Create an oncoplot with a MAF file.', + help= +"""Create an oncoplot with a MAF file.""" ) parser.add_argument( 'maf', - help='MAF file.' + help= +"""Input MAF file.""" ) parser.add_argument( 'out', - help='Output image file.' + help= +"""Output image file.""" ) parser.add_argument( '--count', metavar='INT', type=int, default=10, - help='Number of top mutated genes to display (default: 10).' + help= +"""Number of top mutated genes to display (default: 10).""" ) parser.add_argument( '--figsize', @@ -46,28 +50,32 @@ def create_parser(subparsers): type=float, default=[15, 10], nargs=2, - help='Width, height in inches (default: [15, 10]).' + help= +"""Width, height in inches (default: [15, 10]).""" ) parser.add_argument( '--label_fontsize', metavar='FLOAT', type=float, default=15, - help='Font size of labels (default: 15).' + help= +"""Font size of labels (default: 15).""" ) parser.add_argument( '--ticklabels_fontsize', metavar='FLOAT', type=float, default=15, - help='Font size of tick labels (default: 15).' + help= +"""Font size of tick labels (default: 15).""" ) parser.add_argument( '--legend_fontsize', metavar='FLOAT', type=float, default=15, - help='Font size of legend texts (default: 15).' 
+ help= +"""Font size of legend texts (default: 15).""" ) def main(args): diff --git a/fuc/cli/maf_sumplt.py b/fuc/cli/maf_sumplt.py index 87ecad1..03f6e85 100644 --- a/fuc/cli/maf_sumplt.py +++ b/fuc/cli/maf_sumplt.py @@ -23,15 +23,18 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Create a summary plot with a MAF file.', + help= +"""Create a summary plot with a MAF file.""" ) parser.add_argument( 'maf', - help='MAF file.' + help= +"""Input MAF file.""" ) parser.add_argument( 'out', - help='Output image file.' + help= +"""Output image file.""" ) parser.add_argument( '--figsize', @@ -39,28 +42,32 @@ def create_parser(subparsers): type=float, default=[15, 10], nargs=2, - help='width, height in inches (default: [15, 10])' + help= +"""Width, height in inches (default: [15, 10]).""" ) parser.add_argument( '--title_fontsize', metavar='FLOAT', type=float, default=16, - help='font size of subplot titles (default: 16)' + help= +"""Font size of subplot titles (default: 16).""" ) parser.add_argument( '--ticklabels_fontsize', metavar='FLOAT', type=float, default=12, - help='font size of tick labels (default: 12)' + help= +"""Font size of tick labels (default: 12).""" ) parser.add_argument( '--legend_fontsize', metavar='FLOAT', type=float, default=12, - help='font size of legend texts (default: 12)' + help= +"""Font size of legend texts (default: 12).""" ) def main(args): diff --git a/fuc/cli/maf_vcf2maf.py b/fuc/cli/maf_vcf2maf.py index 6fc653a..5846652 100644 --- a/fuc/cli/maf_vcf2maf.py +++ b/fuc/cli/maf_vcf2maf.py @@ -17,9 +17,14 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Convert a VCF file to a MAF file.', + help= +"""Convert a VCF file to a MAF file.""" + ) + parser.add_argument( + 'vcf', + help= +"""Annotated VCF file.""" ) - parser.add_argument('vcf', help='Annotated VCF file.') def main(args): mf = api.pymaf.MafFrame.from_vcf(args.vcf) diff 
--git a/fuc/cli/ngs_bam2fq.py b/fuc/cli/ngs_bam2fq.py index 7561c20..6dd92d1 100644 --- a/fuc/cli/ngs_bam2fq.py +++ b/fuc/cli/ngs_bam2fq.py @@ -6,7 +6,7 @@ import pandas as pd -description = f""" +description = """ Pipeline for converting BAM files to FASTQ files. This pipeline will assume input BAM files consist of paired-end reads @@ -44,36 +44,42 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Pipeline for converting BAM files to FASTQ files.', + help= +"""Pipeline for converting BAM files to FASTQ files.""" ) parser.add_argument( 'manifest', - help='Sample manifest CSV file.' + help= +"""Sample manifest CSV file.""" ) parser.add_argument( 'output', type=os.path.abspath, - help='Output directory.' + help= +"""Output directory.""" ) parser.add_argument( 'qsub', type=str, - help="SGE resoruce to request with qsub for BAM to FASTQ \n" - "conversion. Since this oppoeration supports multithreading, \n" - "it is recommended to speicfy a parallel environment (PE) \n" - "to speed up the process (also see --thread)." + help= +"""SGE resource to request with qsub for BAM to FASTQ +conversion. Since this operation supports multithreading, +it is recommended to specify a parallel environment (PE) +to speed up the process (also see --thread).""" ) parser.add_argument( '--thread', metavar='INT', type=int, default=1, - help='Number of threads to use (default: 1).' + help= +"""Number of threads to use (default: 1).""" ) parser.add_argument( '--force', action='store_true', - help='Overwrite the output directory if it already exists.' + help= +"""Overwrite the output directory if it already exists.""" ) def main(args): diff --git a/fuc/cli/ngs_fq2bam.py b/fuc/cli/ngs_fq2bam.py index c207b0a..0955113 100644 --- a/fuc/cli/ngs_fq2bam.py +++ b/fuc/cli/ngs_fq2bam.py @@ -6,7 +6,7 @@ import pandas as pd -description = f""" +description = """ Pipeline for converting FASTQ files to analysis-ready BAM files. 
Here, "analysis-ready" means that the final BAM files are: 1) aligned to a @@ -54,72 +54,86 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Pipeline for converting FASTQ files to analysis-ready BAM files.', + help= +"""Pipeline for converting FASTQ files to analysis-ready BAM +files.""" ) parser.add_argument( 'manifest', - help='Sample manifest CSV file.' + help= +"""Sample manifest CSV file.""" ) parser.add_argument( 'fasta', - help='Reference FASTA file.' + help= +"""Reference FASTA file.""" ) parser.add_argument( 'output', type=os.path.abspath, - help='Output directory.' + help= +"""Output directory.""" ) parser.add_argument( 'qsub', type=str, - help='SGE resoruce to request for qsub.' + help= +"""SGE resoruce to request for qsub.""" ) parser.add_argument( 'java', - help='Java resoruce to request for GATK.' + help= +"""Java resoruce to request for GATK.""" ) parser.add_argument( 'vcf', type=str, nargs='+', - help='One or more reference VCF files containing known variant \n' - 'sites (e.g. 1000 Genomes Project).' + help= +"""One or more reference VCF files containing known variant +sites (e.g. 1000 Genomes Project).""" ) parser.add_argument( '--bed', metavar='PATH', type=str, - help='BED file.' + help= +"""BED file.""" ) parser.add_argument( '--thread', metavar='INT', type=int, default=1, - help='Number of threads to use (default: 1).' + help= +"""Number of threads to use (default: 1).""" ) parser.add_argument( '--platform', metavar='TEXT', type=str, default='Illumina', - help="Sequencing platform (default: 'Illumina')." + help= +"""Sequencing platform (default: 'Illumina').""" ) parser.add_argument( '--job', metavar='TEXT', type=str, - help='Job submission ID for SGE.' + help= +"""Job submission ID for SGE.""" ) parser.add_argument( '--force', action='store_true', - help='Overwrite the output directory if it already exists.' 
+ help= +"""Overwrite the output directory if it already exists.""" ) parser.add_argument( '--keep', action='store_true', - help='Keep temporary files.' + help= +"""Keep temporary files.""" ) def main(args): diff --git a/fuc/cli/ngs_hc.py b/fuc/cli/ngs_hc.py index b39e29d..3d437e1 100644 --- a/fuc/cli/ngs_hc.py +++ b/fuc/cli/ngs_hc.py @@ -45,92 +45,107 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Pipeline for germline short variant discovery.', + help= +"""Pipeline for germline short variant discovery.""" ) parser.add_argument( 'manifest', - help='Sample manifest CSV file.' + help= +"""Sample manifest CSV file.""" ) parser.add_argument( 'fasta', - help='Reference FASTA file.' + help= +"""Reference FASTA file.""" ) parser.add_argument( 'output', type=os.path.abspath, - help='Output directory.' + help= +"""Output directory.""" ) parser.add_argument( 'qsub', type=str, - help='SGE resoruce to request for qsub.' + help= +"""SGE resource to request for qsub.""" ) parser.add_argument( 'java1', type=str, - help='Java resoruce to request for single-sample variant calling.' + help= +"""Java resource to request for single-sample variant calling.""" ) parser.add_argument( 'java2', type=str, - help='Java resoruce to request for joint variant calling.' + help= +"""Java resource to request for joint variant calling.""" ) parser.add_argument( '--bed', metavar='PATH', type=str, - help='BED file.' + help= +"""BED file.""" ) parser.add_argument( '--dbsnp', metavar='PATH', type=str, - help='VCF file from dbSNP.' + help= +"""VCF file from dbSNP.""" ) parser.add_argument( '--thread', metavar='INT', type=int, default=1, - help='Number of threads to use (default: 1).' + help= +"""Number of threads to use (default: 1).""" ) parser.add_argument( '--batch', metavar='INT', type=int, default=0, - help='Batch size used for GenomicsDBImport (default: 0). 
This \n' - 'controls the number of samples for which readers are \n' - 'open at once and therefore provides a way to minimize \n' - 'memory consumption. The size of 0 means no batching (i.e. \n' - 'readers for all samples will be opened at once).' + help= +"""Batch size used for GenomicsDBImport (default: 0). This +controls the number of samples for which readers are +open at once and therefore provides a way to minimize +memory consumption. The size of 0 means no batching (i.e. +readers for all samples will be opened at once).""" ) parser.add_argument( '--job', metavar='TEXT', type=str, - help='Job submission ID for SGE.' + help= +"""Job submission ID for SGE.""" ) parser.add_argument( '--force', action='store_true', - help='Overwrite the output directory if it already exists.' + help= +"""Overwrite the output directory if it already exists.""" ) parser.add_argument( '--keep', action='store_true', - help='Keep temporary files.' + help= +"""Keep temporary files.""" ) parser.add_argument( '--posix', action='store_true', - help='Set GenomicsDBImport to allow for optimizations to improve \n' - 'the usability and performance for shared Posix Filesystems \n' - '(e.g. NFS, Lustre). If set, file level locking is disabled \n' - 'and file system writes are minimized by keeping a higher \n' - 'number of file descriptors open for longer periods of time. \n' - 'Use with --batch if keeping a large number of file \n' - 'descriptors open is an issue.' + help= +"""Set GenomicsDBImport to allow for optimizations to improve +the usability and performance for shared Posix Filesystems +(e.g. NFS, Lustre). If set, file level locking is disabled +and file system writes are minimized by keeping a higher +number of file descriptors open for longer periods of time. 
+Use with --batch if keeping a large number of file +descriptors open is an issue.""" ) def main(args): diff --git a/fuc/cli/ngs_m2.py b/fuc/cli/ngs_m2.py index 867cb31..618a90d 100644 --- a/fuc/cli/ngs_m2.py +++ b/fuc/cli/ngs_m2.py @@ -50,50 +50,60 @@ def create_parser(subparsers): ) parser.add_argument( 'manifest', - help='Sample manifest CSV file.' + help= +"""Sample manifest CSV file.""" ) parser.add_argument( 'fasta', - help='Reference FASTA file.' + help= +"""Reference FASTA file.""" ) parser.add_argument( 'output', type=os.path.abspath, - help='Output directory.' + help= +"""Output directory.""" ) parser.add_argument( 'pon', - help='PoN VCF file.' + help= +"""PoN VCF file.""" ) parser.add_argument( 'germline', - help='Germline VCF file.' + help= +"""Germline VCF file.""" ) parser.add_argument( 'qsub', type=str, - help='SGE resoruce to request for qsub.' + help= +"""SGE resource to request for qsub.""" ) parser.add_argument( 'java', type=str, - help='Java resoruce to request for GATK.' + help= +"""Java resource to request for GATK.""" ) parser.add_argument( '--bed', metavar='PATH', type=str, - help='BED file.' + help= +"""BED file.""" ) parser.add_argument( '--force', action='store_true', - help='Overwrite the output directory if it already exists.' + help= +"""Overwrite the output directory if it already exists.""" ) parser.add_argument( '--keep', action='store_true', - help='Keep temporary files.' + help= +"""Keep temporary files.""" ) def main(args): diff --git a/fuc/cli/ngs_pon.py b/fuc/cli/ngs_pon.py index 459942b..82809ae 100644 --- a/fuc/cli/ngs_pon.py +++ b/fuc/cli/ngs_pon.py @@ -6,7 +6,7 @@ import pandas as pd -description = f""" +description = """ Pipeline for constructing a panel of normals (PoN). 
Dependencies: @@ -40,45 +40,54 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Pipeline for constructing a panel of normals (PoN).', + help= +"""Pipeline for constructing a panel of normals (PoN).""" ) parser.add_argument( 'manifest', - help='Sample manifest CSV file.' + help= +"""Sample manifest CSV file.""" ) parser.add_argument( 'fasta', - help='Reference FASTA file.' + help= +"""Reference FASTA file.""" ) parser.add_argument( 'output', type=os.path.abspath, - help='Output directory.' + help= +"""Output directory.""" ) parser.add_argument( 'qsub', type=str, - help='SGE resoruce to request for qsub.' + help= +"""SGE resource to request for qsub.""" ) parser.add_argument( 'java', - help='Java resoruce to request for GATK.' + help= +"""Java resource to request for GATK.""" ) parser.add_argument( '--bed', metavar='PATH', type=str, - help='BED file.' + help= +"""BED file.""" ) parser.add_argument( '--force', action='store_true', - help='Overwrite the output directory if it already exists.' + help= +"""Overwrite the output directory if it already exists.""" ) parser.add_argument( '--keep', action='store_true', - help='Keep temporary files.' + help= +"""Keep temporary files.""" ) def main(args): diff --git a/fuc/cli/ngs_quant.py b/fuc/cli/ngs_quant.py index 28d41de..e731995 100644 --- a/fuc/cli/ngs_quant.py +++ b/fuc/cli/ngs_quant.py @@ -6,7 +6,7 @@ import pandas as pd -description = f""" +description = """ Pipeline for running RNAseq quantification from FASTQ files with Kallisto. External dependencies: @@ -35,58 +35,68 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Pipeline for running RNAseq quantification from FASTQ files \n' - 'with Kallisto.', + help= +"""Pipeline for running RNAseq quantification from FASTQ files +with Kallisto.""" ) parser.add_argument( 'manifest', - help='Sample manifest CSV file.' 
+ help= +"""Sample manifest CSV file.""" ) parser.add_argument( 'index', - help='Kallisto index file.' + help= +"""Kallisto index file.""" ) parser.add_argument( 'output', type=os.path.abspath, - help='Output directory.' + help= +"""Output directory.""" ) parser.add_argument( 'qsub', type=str, - help='SGE resoruce to request for qsub.' + help= +"""SGE resource to request for qsub.""" ) parser.add_argument( '--thread', metavar='INT', type=int, default=1, - help='Number of threads to use (default: 1).' + help= +"""Number of threads to use (default: 1).""" ) parser.add_argument( '--bootstrap', metavar='INT', type=int, default=50, - help='Number of bootstrap samples (default: 50).' + help= +"""Number of bootstrap samples (default: 50).""" ) parser.add_argument( '--job', metavar='TEXT', type=str, - help='Job submission ID for SGE.' + help= +"""Job submission ID for SGE.""" ) parser.add_argument( '--force', action='store_true', - help='Overwrite the output directory if it already exists.' + help= +"""Overwrite the output directory if it already exists.""" ) parser.add_argument( '--posix', action='store_true', - help='Set the environment variable HDF5_USE_FILE_LOCKING=FALSE \n' - 'before running Kallisto. This is required for shared Posix \n' - 'Filesystems (e.g. NFS, Lustre).' + help= +"""Set the environment variable HDF5_USE_FILE_LOCKING=FALSE +before running Kallisto. This is required for shared Posix +Filesystems (e.g. NFS, Lustre).""" ) def main(args): diff --git a/fuc/cli/ngs_trim.py b/fuc/cli/ngs_trim.py index e44a5b5..89f33bf 100644 --- a/fuc/cli/ngs_trim.py +++ b/fuc/cli/ngs_trim.py @@ -6,7 +6,7 @@ import pandas as pd -description = f""" +description = """ Pipeline for trimming adapters from FASTQ files. 
External dependencies: @@ -34,39 +34,46 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Pipeline for trimming adapters from FASTQ files.', + help= +"""Pipeline for trimming adapters from FASTQ files.""" ) parser.add_argument( 'manifest', - help='Sample manifest CSV file.' + help= +"""Sample manifest CSV file.""" ) parser.add_argument( 'output', type=os.path.abspath, - help='Output directory.' + help= +"""Output directory.""" ) parser.add_argument( 'qsub', type=str, - help='SGE resoruce to request for qsub.' + help= +"""SGE resource to request for qsub.""" ) parser.add_argument( '--thread', metavar='INT', type=int, default=1, - help='Number of threads to use (default: 1).' + help= +"""Number of threads to use (default: 1).""" ) parser.add_argument( '--job', metavar='TEXT', type=str, - help='Job submission ID for SGE.' + help= +"""Job submission ID for SGE.""" ) parser.add_argument( '--force', action='store_true', - help='Overwrite the output directory if it already exists.' + help= +"""Overwrite the output directory if it already exists.""" ) def main(args): diff --git a/fuc/cli/tabix_index.py b/fuc/cli/tabix_index.py index 993ee29..bc3e759 100644 --- a/fuc/cli/tabix_index.py +++ b/fuc/cli/tabix_index.py @@ -32,16 +32,19 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Index a GFF/BED/SAM/VCF file with Tabix.', + help= +"""Index a GFF/BED/SAM/VCF file with Tabix.""" ) parser.add_argument( 'file', - help='File to be indexed.' + help= +"""File to be indexed.""" ) parser.add_argument( '--force', action='store_true', - help='Force to overwrite the index file if it is present.' 
+ help= +"""Force to overwrite the index file if it is present.""" ) def main(args): diff --git a/fuc/cli/tabix_slice.py b/fuc/cli/tabix_slice.py index d8f3bd7..dfc08e6 100644 --- a/fuc/cli/tabix_slice.py +++ b/fuc/cli/tabix_slice.py @@ -25,16 +25,19 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Slice a GFF/BED/SAM/VCF file with Tabix.', + help= +"""Slice a GFF/BED/SAM/VCF file with Tabix.""" ) parser.add_argument( 'file', - help='File to be sliced.' + help= +"""File to be sliced.""" ) parser.add_argument( 'regions', nargs='+', - help='One or more regions.' + help= +"""One or more regions.""" ) def main(args): diff --git a/fuc/cli/tbl_merge.py b/fuc/cli/tbl_merge.py index b98d6c4..9765883 100644 --- a/fuc/cli/tbl_merge.py +++ b/fuc/cli/tbl_merge.py @@ -32,47 +32,55 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Merge two table files.', + help= +"""Merge two table files.""" ) parser.add_argument( 'left', - help='Left file.' + help= +"""Input left file.""" ) parser.add_argument( 'right', - help='Right file.' + help= +"""Input right file.""" ) parser.add_argument( '--how', metavar='TEXT', choices=CHOICES, default='inner', - help="Type of merge to be performed (default: 'inner') \n" - "(choices: 'left', 'right', 'outer', 'inner', 'cross')." + help= +"""Type of merge to be performed (default: 'inner') +(choices: 'left', 'right', 'outer', 'inner', 'cross').""" ) parser.add_argument( '--on', metavar='TEXT', nargs='+', - help='Column names to join on.' + help= +"""Column names to join on.""" ) parser.add_argument( '--lsep', metavar='TEXT', default='\t', - help="Delimiter to use for the left file (default: '\\t')." + help= +"""Delimiter to use for the left file (default: '\\t').""" ) parser.add_argument( '--rsep', metavar='TEXT', default='\t', - help="Delimiter to use for the right file (default: '\\t')." 
+ help= +"""Delimiter to use for the right file (default: '\\t').""" ) parser.add_argument( '--osep', metavar='TEXT', default='\t', - help="Delimiter to use for the output file (default: '\\t')." + help= +"""Delimiter to use for the output file (default: '\\t').""" ) return parser diff --git a/fuc/cli/tbl_sum.py b/fuc/cli/tbl_sum.py index 1b46794..6cdebd0 100644 --- a/fuc/cli/tbl_sum.py +++ b/fuc/cli/tbl_sum.py @@ -18,63 +18,72 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Summarize a table file.', + help= +"""Summarize a table file.""" ) parser.add_argument( 'table_file', - help='Table file.' + help= +"""Table file.""" ) parser.add_argument( '--sep', metavar='TEXT', default='\t', - help="Delimiter to use (default: '\\t')." + help= +"""Delimiter to use (default: '\\t').""" ) parser.add_argument( '--skiprows', metavar='TEXT', - help='Comma-separated line numbers to skip (0-indexed) or \n' - 'number of lines to skip at the start of the file \n' - '(e.g. `--skiprows 1,` will skip the second line, \n' - '`--skiprows 2,4` will skip the third and fifth lines, \n' - 'and `--skiprows 10` will skip the first 10 lines).' + help= +"""Comma-separated line numbers to skip (0-indexed) or +number of lines to skip at the start of the file +(e.g. `--skiprows 1,` will skip the second line, +`--skiprows 2,4` will skip the third and fifth lines, +and `--skiprows 10` will skip the first 10 lines).""" ) parser.add_argument( '--na_values', metavar='TEXT', nargs='+', - help='Additional strings to recognize as NA/NaN (by \n' - "default, the following values are interpreted \n" - "as NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', \n" - "'-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', \n" - "'', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan', \n" - "'null')." 
+ help= +"""Additional strings to recognize as NA/NaN (by +default, the following values are interpreted +as NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', +'-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', +'', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan', +'null').""" ) parser.add_argument( '--keep_default_na', action='store_false', - help='Whether or not to include the default NaN values when \n' - "parsing the data (see 'pandas.read_table' for details)." + help= +"""Whether or not to include the default NaN values when +parsing the data (see 'pandas.read_table' for details).""" ) parser.add_argument( '--expr', metavar='TEXT', - help='Query the columns of a pandas.DataFrame with a \n' - """boolean expression (e.g. `--query "A == 'yes'"`).""" + help= +"""Query the columns of a pandas.DataFrame with a +boolean expression (e.g. `--query "A == 'yes'"`).""" ) parser.add_argument( '--columns', metavar='TEXT', nargs='+', - help='Columns to be summarized (by default, all columns \n' - 'will be included).' + help= +"""Columns to be summarized (by default, all columns +will be included).""" ) parser.add_argument( '--dtypes', metavar='PATH', - help="File of column names and their data types (either \n" - "'categorical' or 'numeric'); one tab-delimited pair of \n" - "column name and data type per line." + help= +"""File of column names and their data types (either +'categorical' or 'numeric'); one tab-delimited pair of +column name and data type per line.""" ) def main(args): diff --git a/fuc/cli/vcf_call.py b/fuc/cli/vcf_call.py new file mode 100644 index 0000000..b200783 --- /dev/null +++ b/fuc/cli/vcf_call.py @@ -0,0 +1,86 @@ +import sys + +from .. import api + +description = """ +Call SNVs and indels from BAM files. + +Under the hood, the command utilizes the bcftool program to call variants. 
+""" + +epilog = f""" +[Example] Specify regions manually: + $ fuc {api.common._script_name()} ref.fa 1.bam 2.bam \\ + -r chr1:100-200 chr2:400-500 > out.vcf + +[Example] Specify regions with a BED file: + $ fuc {api.common._script_name()} ref.fa bam.list \\ + -r in.bed > out.vcf +""" + +def create_parser(subparsers): + parser = api.common._add_parser( + subparsers, + api.common._script_name(), + description=description, + epilog=epilog, + help= +"""Call SNVs and indels from BAM files.""" + ) + parser.add_argument( + 'fasta', + help= +"""Reference FASTA file.""" + ) + parser.add_argument( + 'bams', + nargs='+', + help= +"""One or more input BAM files. Alternatively, you can +provide a text file (.txt, .tsv, .csv, or .list) +containing one BAM file per line.""" + ) + parser.add_argument( + '-r', + '--regions', + nargs='+', + metavar='TEXT', + help= +"""By default, the command looks at each genomic +position with coverage in BAM files, which can be +excruciatingly slow for large files (e.g. whole +genome sequencing). Therefore, use this argument to +only call variants in given regions. Each region must +have the format chrom:start-end and be a half-open +interval with (start, end]. This means, for example, +chr1:100-103 will extract positions 101, 102, and +103. Alternatively, you can provide a BED file +(compressed or uncompressed) to specify regions. Note +that the 'chr' prefix in contig names (e.g. 'chr1' +vs. 
'1') will be automatically added or removed as +necessary to match the input BAM's contig names.""" + ) + parser.add_argument( + '--min-mq', + metavar='INT', + type=int, + default=1, + help= +"""Minimum mapping quality for an alignment to be used +(default: 1).""" + ) + parser.add_argument( + '--max-depth', + metavar='INT', + type=int, + default=250, + help= +"""At a position, read maximally this number of reads +per input file (default: 250).""" + ) + +def main(args): + api.pyvcf.call( + args.fasta, args.bams, regions=args.regions, path='-', + min_mq=args.min_mq, max_depth=args.max_depth + ) diff --git a/fuc/cli/vcf_filter.py b/fuc/cli/vcf_filter.py index c80be07..5de1933 100644 --- a/fuc/cli/vcf_filter.py +++ b/fuc/cli/vcf_filter.py @@ -41,48 +41,56 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Filter a VCF file.', + help= +"""Filter a VCF file.""" ) parser.add_argument( 'vcf', - help='VCF file (compressed or uncompressed).' + help= +"""VCF file (compressed or uncompressed).""" ) parser.add_argument( '--expr', metavar='TEXT', - help='Expression to evaluate.' + help= +"""Expression to evaluate.""" ) parser.add_argument( '--samples', metavar='PATH', - help='File of sample names to apply the marking (one \n' - 'sample per line).' + help= +"""File of sample names to apply the marking (one +sample per line).""" ) parser.add_argument( '--drop_duplicates', metavar='TEXT', nargs='*', - help='Only consider certain columns for identifying \n' - 'duplicates, by default use all of the columns.' + help= +"""Only consider certain columns for identifying +duplicates, by default use all of the columns.""" ) parser.add_argument( '--greedy', action='store_true', - help='Use this flag to mark even ambiguous genotypes \n' - 'as missing.' 
+ help= +"""Use this flag to mark even ambiguous genotypes +as missing.""" ) parser.add_argument( '--opposite', action='store_true', - help='Use this flag to mark all genotypes that do not \n' - 'satisfy the query expression as missing and leave \n' - 'those that do intact.' + help= +"""Use this flag to mark all genotypes that do not +satisfy the query expression as missing and leave +those that do intact.""" ) parser.add_argument( '--filter_empty', action='store_true', - help='Use this flag to remove rows with no genotype \n' - 'calls at all.' + help= +"""Use this flag to remove rows with no genotype +calls at all.""" ) def main(args): diff --git a/fuc/cli/vcf_index.py b/fuc/cli/vcf_index.py index 4ecd621..6f77ce0 100644 --- a/fuc/cli/vcf_index.py +++ b/fuc/cli/vcf_index.py @@ -28,14 +28,16 @@ def create_parser(subparsers): ) parser.add_argument( 'vcf', - help='Input VCF file to be indexed. When an uncompressed file is \n' - 'given, the command will automatically create a BGZF \n' - 'compressed copy of the file (.gz) before indexing.' + help= +"""Input VCF file to be indexed. When an uncompressed file is +given, the command will automatically create a BGZF +compressed copy of the file (.gz) before indexing.""" ) parser.add_argument( '--force', action='store_true', - help='Force to overwrite the index file if it is already present.' + help= +"""Force to overwrite the index file if it is already present.""" ) def main(args): diff --git a/fuc/cli/vcf_merge.py b/fuc/cli/vcf_merge.py index 0f7a05b..710cd25 100644 --- a/fuc/cli/vcf_merge.py +++ b/fuc/cli/vcf_merge.py @@ -2,7 +2,7 @@ from .. import api -description = f""" +description = """ Merge two or more VCF files. """ @@ -25,36 +25,41 @@ def create_parser(subparsers): parser.add_argument( 'vcf_files', nargs='+', - help="VCF files (compressed or uncompressed). Note that the 'chr'\n" - "prefix in contig names (e.g. 'chr1' vs. 
'1') will be \n" - "automatically added or removed as necessary to match the \n" - "contig names of the first VCF." + help= +"""VCF files (compressed or uncompressed). Note that the 'chr' +prefix in contig names (e.g. 'chr1' vs. '1') will be +automatically added or removed as necessary to match the +contig names of the first VCF.""" ) parser.add_argument( '--how', metavar='TEXT', default='inner', - help="Type of merge as defined in pandas.DataFrame.merge \n" - "(default: 'inner')." + help= +"""Type of merge as defined in pandas.DataFrame.merge +(default: 'inner').""" ) parser.add_argument( '--format', metavar='TEXT', default='GT', - help="FORMAT subfields to be retained (e.g. 'GT:AD:DP') \n" - "(default: 'GT')." + help= +"""FORMAT subfields to be retained (e.g. 'GT:AD:DP') +(default: 'GT').""" ) parser.add_argument( '--sort', action='store_false', - help='Use this flag to turn off sorting of records \n' - '(default: True).' + help= +"""Use this flag to turn off sorting of records +(default: True).""" ) parser.add_argument( '--collapse', action='store_true', - help='Use this flag to collapse duplicate records \n' - '(default: False).' + help= +"""Use this flag to collapse duplicate records +(default: False).""" ) def main(args): diff --git a/fuc/cli/vcf_rename.py b/fuc/cli/vcf_rename.py index d03db01..3d0f7a0 100644 --- a/fuc/cli/vcf_rename.py +++ b/fuc/cli/vcf_rename.py @@ -33,38 +33,44 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Rename the samples in a VCF file.', + help= +"""Rename the samples in a VCF file.""" ) parser.add_argument( 'vcf', - help='VCF file (compressed or uncompressed).' + help= +"""VCF file (compressed or uncompressed).""" ) parser.add_argument( 'names', - help='Text file containing information for renaming the samples.' 
+ help= +"""Text file containing information for renaming the samples.""" ) parser.add_argument( '--mode', metavar='TEXT', default='MAP', choices=['MAP', 'INDEX', 'RANGE'], - help="Renaming mode (default: 'MAP') (choices: 'MAP', \n" - "'INDEX', 'RANGE')." + help= +"""Renaming mode (default: 'MAP') (choices: 'MAP', +'INDEX', 'RANGE').""" ) parser.add_argument( '--range', metavar='INT', type=int, nargs=2, - help="Index range to use when renaming the samples. \n" - "Applicable only with the 'RANGE' mode." + help= +"""Index range to use when renaming the samples. +Applicable only with the 'RANGE' mode.""" ) parser.add_argument( '--sep', metavar='TEXT', default='\t', - help="Delimiter to use for reading the 'names' file \n" - "(default: '\\t')." + help= +"""Delimiter to use for reading the 'names' file +(default: '\\t').""" ) def main(args): diff --git a/fuc/cli/vcf_slice.py b/fuc/cli/vcf_slice.py index 3acfba2..68d154c 100644 --- a/fuc/cli/vcf_slice.py +++ b/fuc/cli/vcf_slice.py @@ -2,7 +2,7 @@ from .. import api -description = f""" +description = """ Slice a VCF file for specified regions. """ @@ -23,26 +23,29 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Slice a VCF file for specified regions.', + help= +"""Slice a VCF file for specified regions.""" ) parser.add_argument( 'vcf', - help="Input VCF file must be already BGZF compressed (.gz) and \n" - "indexed (.tbi) to allow random access. A VCF file can be \n" - "compressed with the fuc-bgzip command and indexed with the \n" - "vcf-index command." + help= +"""Input VCF file must be already BGZF compressed (.gz) and +indexed (.tbi) to allow random access. A VCF file can be +compressed with the fuc-bgzip command and indexed with the +vcf-index command.""" ) parser.add_argument( 'regions', nargs='+', - help="One or more regions to be sliced. Each region must have the \n" - "format chrom:start-end and be a half-open interval with \n" - "(start, end]. 
This means, for example, chr1:100-103 will \n" - "extract positions 101, 102, and 103. Alternatively, you can \n" - "provide a BED file (compressed or uncompressed) to specify \n" - "regions. Note that the 'chr' prefix in contig names (e.g. \n" - "'chr1' vs. '1') will be automatically added or removed as \n" - "necessary to match the input VCF's contig names." + help= +"""One or more regions to be sliced. Each region must have the +format chrom:start-end and be a half-open interval with +(start, end]. This means, for example, chr1:100-103 will +extract positions 101, 102, and 103. Alternatively, you can +provide a BED file (compressed or uncompressed) to specify +regions. Note that the 'chr' prefix in contig names (e.g. +'chr1' vs. '1') will be automatically added or removed as +necessary to match the input VCF's contig names.""" ) def main(args): diff --git a/fuc/cli/vcf_split.py b/fuc/cli/vcf_split.py index 8761a09..23529f5 100644 --- a/fuc/cli/vcf_split.py +++ b/fuc/cli/vcf_split.py @@ -4,7 +4,7 @@ from .. import api -description = f""" +description = """ Split a VCF file by individual. """ @@ -19,28 +19,33 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Split a VCF file by individual.', + help= +"""Split a VCF file by individual.""" ) parser.add_argument( 'vcf', - help="VCF file to be split." + help= +"""VCF file to be split.""" ) parser.add_argument( 'output', type=os.path.abspath, - help='Output directory.' + help= +"""Output directory.""" ) parser.add_argument( '--clean', action='store_false', - help="By default, the command will only return variants present in \n" - "each individual. Use the tag to stop this behavior and make \n" - "sure that all individuals have the same number of variants." + help= +"""By default, the command will only return variants present in +each individual. 
Use the tag to stop this behavior and make +sure that all individuals have the same number of variants.""" ) parser.add_argument( '--force', action='store_true', - help='Overwrite the output directory if it already exists.' + help= +"""Overwrite the output directory if it already exists.""" ) def main(args): diff --git a/fuc/cli/vcf_vcf2bed.py b/fuc/cli/vcf_vcf2bed.py index bda3991..2fb07f9 100644 --- a/fuc/cli/vcf_vcf2bed.py +++ b/fuc/cli/vcf_vcf2bed.py @@ -17,11 +17,13 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Convert a VCF file to a BED file.', + help= +"""Convert a VCF file to a BED file.""" ) parser.add_argument( 'vcf', - help='VCF file (compressed or uncompressed).' + help= +"""VCF file (compressed or uncompressed).""" ) def main(args): diff --git a/fuc/cli/vcf_vep.py b/fuc/cli/vcf_vep.py index eb1774a..d0605c5 100644 --- a/fuc/cli/vcf_vep.py +++ b/fuc/cli/vcf_vep.py @@ -2,7 +2,7 @@ from .. import api -description = f""" +description = """ Filter a VCF file by annotations from Ensembl VEP. """ @@ -37,26 +37,31 @@ def create_parser(subparsers): api.common._script_name(), description=description, epilog=epilog, - help='Filter a VCF file by annotations from Ensembl VEP.', + help= +"""Filter a VCF file by annotations from Ensembl VEP.""" ) parser.add_argument( 'vcf', - help='VCF file annotated by Ensembl VEP (compressed or uncompressed).' + help= +"""VCF file annotated by Ensembl VEP (compressed or uncompressed).""" ) parser.add_argument( 'expr', - help='Query expression to evaluate.' + help= +"""Query expression to evaluate.""" ) parser.add_argument( '--opposite', action='store_true', - help="Use this flag to return only records that don't \n" - "meet the said criteria." + help= +"""Use this flag to return only records that don't +meet the said criteria.""" ) parser.add_argument( '--as_zero', action='store_true', - help='Use this flag to treat missing values as zero instead of NaN.' 
+ help= +"""Use this flag to treat missing values as zero instead of NaN.""" ) def main(args): diff --git a/fuc/version.py b/fuc/version.py index e187e0a..c3d10d7 100644 --- a/fuc/version.py +++ b/fuc/version.py @@ -1 +1 @@ -__version__ = '0.30.0' +__version__ = '0.31.0'