diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 6b5ab4f..abc8678 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,15 @@ Changelog ********* +0.34.0 (2022-06-08) +------------------- + +* Add new optional argument ``--stranded`` to :command:`ngs-quant` command. +* Add new method :meth:`pycov.CovFrame.merge`. +* Add new method :meth:`pycov.merge`. +* :issue:`61`: Update :meth:`pymaf.MafFrame.from_vcf` method to automatically detect CSQ field in INFO column (thanks `@lbeltrame <https://github.com/lbeltrame>`__). +* :issue:`63`: Update :meth:`pyvcf.VcfFrame.sort` method to handle contigs that are not pre-defined. + 0.33.1 (2022-05-03) ------------------- diff --git a/docs/cli.rst b/docs/cli.rst index 3857afe..6e753aa 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -1029,7 +1029,7 @@ ngs-quant $ fuc ngs-quant -h usage: fuc ngs-quant [-h] [--thread INT] [--bootstrap INT] [--job TEXT] - [--force] [--posix] + [--force] [--posix] [--stranded TEXT] manifest index output qsub Pipeline for running RNAseq quantification from FASTQ files with Kallisto. @@ -1058,6 +1058,8 @@ ngs-quant --posix Set the environment variable HDF5_USE_FILE_LOCKING=FALSE before running Kallisto. This is required for shared Posix Filesystems (e.g. NFS, Lustre). + --stranded TEXT Strand specific reads (default: 'none') (choices: + 'none', 'forward', 'reverse'). [Example] Specify queue: $ fuc ngs-quant \ diff --git a/fuc/api/pycov.py b/fuc/api/pycov.py index f32fb96..e9098ef 100644 --- a/fuc/api/pycov.py +++ b/fuc/api/pycov.py @@ -83,6 +83,90 @@ def simulate(mode='wgs', loc=30, scale=5, size=1000): return a +def merge( + cfs, how='inner' +): + """ + Merge CovFrame objects. + + Parameters + ---------- + cfs : list + List of CovFrames to be merged. Note that the 'chr' prefix in contig + names (e.g. 'chr1' vs. '1') will be automatically added or removed as + necessary to match the contig names of the first CovFrame. + how : str, default: 'inner' + Type of merge as defined in :meth:`pandas.merge`. 
+ + Returns + ------- + CovFrame + Merged CovFrame. + + See Also + -------- + CovFrame.merge + Merge self with another CovFrame. + + Examples + -------- + Assume we have the following data: + + >>> import numpy as np + >>> from fuc import pycov + >>> data1 = { + ... 'Chromosome': ['chr1'] * 5, + ... 'Position': np.arange(100, 105), + ... 'A': pycov.simulate(loc=35, scale=5, size=5), + ... 'B': pycov.simulate(loc=25, scale=7, size=5), + ... } + >>> data2 = { + ... 'Chromosome': ['1'] * 5, + ... 'Position': np.arange(102, 107), + ... 'C': pycov.simulate(loc=35, scale=5, size=5), + ... } + >>> cf1 = pycov.CovFrame.from_dict(data1) + >>> cf2 = pycov.CovFrame.from_dict(data2) + >>> cf1.df + Chromosome Position A B + 0 chr1 100 33 17 + 1 chr1 101 36 20 + 2 chr1 102 39 39 + 3 chr1 103 31 19 + 4 chr1 104 31 10 + >>> cf2.df + Chromosome Position C + 0 1 102 41 + 1 1 103 37 + 2 1 104 35 + 3 1 105 33 + 4 1 106 39 + + We can merge the two CovFrames with `how='inner'` (default): + + >>> pycov.merge([cf1, cf2]).df + Chromosome Position A B C + 0 chr1 102 39 39 41 + 1 chr1 103 31 19 37 + 2 chr1 104 31 10 35 + + We can also merge with `how='outer'`: + + >>> pycov.merge([cf1, cf2], how='outer').df + Chromosome Position A B C + 0 chr1 100 33.0 17.0 NaN + 1 chr1 101 36.0 20.0 NaN + 2 chr1 102 39.0 39.0 41.0 + 3 chr1 103 31.0 19.0 37.0 + 4 chr1 104 31.0 10.0 35.0 + 5 chr1 105 NaN NaN 33.0 + 6 chr1 106 NaN NaN 39.0 + """ + merged_cf = cfs[0] + for cf in cfs[1:]: + merged_cf = merged_cf.merge(cf, how=how) + return merged_cf + class CovFrame: """ Class for storing read depth data from one or more SAM/BAM/CRAM files. @@ -1128,3 +1212,97 @@ def rename(self, names, indicies=None): cf = self.copy() cf.df.columns = columns return cf + + def merge( + self, other, how='inner' + ): + """ + Merge with the other CovFrame. + + Parameters + ---------- + other : CovFrame + Other CovFrame. Note that the 'chr' prefix in contig names (e.g. + 'chr1' vs. 
'1') will be automatically added or removed as + necessary to match the contig names of ``self``. + how : str, default: 'inner' + Type of merge as defined in :meth:`pandas.DataFrame.merge`. + + Returns + ------- + CovFrame + Merged CovFrame. + + See Also + -------- + merge + Merge multiple CovFrame objects. + + Examples + -------- + Assume we have the following data: + + >>> import numpy as np + >>> from fuc import pycov + >>> data1 = { + ... 'Chromosome': ['chr1'] * 5, + ... 'Position': np.arange(100, 105), + ... 'A': pycov.simulate(loc=35, scale=5, size=5), + ... 'B': pycov.simulate(loc=25, scale=7, size=5), + ... } + >>> data2 = { + ... 'Chromosome': ['1'] * 5, + ... 'Position': np.arange(102, 107), + ... 'C': pycov.simulate(loc=35, scale=5, size=5), + ... } + >>> cf1 = pycov.CovFrame.from_dict(data1) + >>> cf2 = pycov.CovFrame.from_dict(data2) + >>> cf1.df + Chromosome Position A B + 0 chr1 100 40 27 + 1 chr1 101 32 33 + 2 chr1 102 32 22 + 3 chr1 103 32 29 + 4 chr1 104 37 22 + >>> cf2.df + Chromosome Position C + 0 1 102 33 + 1 1 103 29 + 2 1 104 35 + 3 1 105 27 + 4 1 106 25 + + We can merge the two CovFrames with `how='inner'` (default): + + >>> cf1.merge(cf2).df + Chromosome Position A B C + 0 chr1 102 32 22 33 + 1 chr1 103 32 29 29 + 2 chr1 104 37 22 35 + + We can also merge with `how='outer'`: + + >>> cf1.merge(cf2, how='outer').df + Chromosome Position A B C + 0 chr1 100 40.0 27.0 NaN + 1 chr1 101 32.0 33.0 NaN + 2 chr1 102 32.0 22.0 33.0 + 3 chr1 103 32.0 29.0 29.0 + 4 chr1 104 37.0 22.0 35.0 + 5 chr1 105 NaN NaN 27.0 + 6 chr1 106 NaN NaN 25.0 + """ + if self.has_chr_prefix and other.has_chr_prefix: + pass + elif self.has_chr_prefix and not other.has_chr_prefix: + other = other.update_chr_prefix('add') + elif not self.has_chr_prefix and other.has_chr_prefix: + other = other.update_chr_prefix('remove') + else: + pass + + df = self.df.merge(other.df, on=['Chromosome', 'Position'], how=how) + + merged = self.__class__(df) + + return merged diff --git 
a/fuc/api/pymaf.py b/fuc/api/pymaf.py index dfbe434..b666736 100644 --- a/fuc/api/pymaf.py +++ b/fuc/api/pymaf.py @@ -481,7 +481,10 @@ def one_row(r): tumor_seq_allele1 = r.ALT[1:] tumor_seq_allele2 = r.ALT[1:] - fields = r.INFO.replace('CSQ=', '').split(',')[0].split('|') + # Extract annotation fields. + if has_annot: + csq = [x for x in r.INFO.split(';') if x.startswith('CSQ=')][0] + fields = csq.replace('CSQ=', '').split('|') # Get the Strand data. if has_annot: diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index fabcee6..b5f74f8 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -750,7 +750,7 @@ def merge( names (e.g. 'chr1' vs. '1') will be automatically added or removed as necessary to match the contig names of the first VCF. how : str, default: 'inner' - Type of merge as defined in pandas.DataFrame.merge. + Type of merge as defined in :meth:`pandas.merge`. format : str, default: 'GT' FORMAT subfields to be retained (e.g. 'GT:AD:DP'). sort : bool, default: True @@ -2897,7 +2897,7 @@ def merge( 'chr1' vs. '1') will be automatically added or removed as necessary to match the contig names of ``self``. how : str, default: 'inner' - Type of merge as defined in `pandas.DataFrame.merge`. + Type of merge as defined in :meth:`pandas.DataFrame.merge`. format : str, default: 'GT' FORMAT subfields to be retained (e.g. 'GT:AD:DP'). sort : bool, default: True @@ -2910,6 +2910,11 @@ def merge( VcfFrame Merged VcfFrame. + See Also + -------- + merge + Merge multiple VcfFrame objects. + Examples -------- Assume we have the following data: @@ -5273,10 +5278,15 @@ def sort(self): 2 chr2 101 . T C . . . GT:DP 0/0:29 3 chr10 100 . G A . . . GT:DP ./.:. 
""" - df = self.df.sort_values(by=['CHROM', 'POS'], ignore_index=True, - key=lambda col: [CONTIGS.index(x) if isinstance(x, str) - else x for x in col]) + def f(col): + return [CONTIGS.index(x) if x in CONTIGS + else len(CONTIGS) if isinstance(x, str) + else x for x in col] + + df = self.df.sort_values(by=['CHROM', 'POS'], + ignore_index=True, key=f) vf = self.__class__(self.copy_meta(), df) + return vf def subset(self, samples, exclude=False): diff --git a/fuc/cli/ngs_quant.py b/fuc/cli/ngs_quant.py index e731995..70dc93f 100644 --- a/fuc/cli/ngs_quant.py +++ b/fuc/cli/ngs_quant.py @@ -98,6 +98,15 @@ def create_parser(subparsers): before running Kallisto. This is required for shared Posix Filesystems (e.g. NFS, Lustre).""" ) + parser.add_argument( + '--stranded', + metavar='TEXT', + default='none', + choices=['none', 'forward', 'reverse'], + help= +"""Strand specific reads (default: 'none') (choices: +'none', 'forward', 'reverse').""" + ) def main(args): if os.path.exists(args.output) and args.force: @@ -129,6 +138,12 @@ def main(args): command += f' -o {args.output}/{r.Name}' command += f' -b {args.bootstrap}' command += f' -t {args.thread}' + if args.stranded == 'forward': + command += ' --fr-stranded' + elif args.stranded == 'reverse': + command += ' --rf-stranded' + else: + pass command += f' {r.Read1}' command += f' {r.Read2}' diff --git a/fuc/version.py b/fuc/version.py index 52c1b08..cac7112 100644 --- a/fuc/version.py +++ b/fuc/version.py @@ -1 +1 @@ -__version__ = '0.33.1' +__version__ = '0.34.0'