Merge pull request #64 from sbslee/0.34.0-dev

0.34.0 dev
sbslee · Jun 7, 2022 · de8ce31 · de8ce31
2 parents 4df48f8 + 25e163a
commit de8ce31
Show file tree

Hide file tree

Showing 7 changed files with 225 additions and 8 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -1,6 +1,15 @@
 Changelog
 *********
 
+0.34.0 (2022-06-08)
+-------------------
+
+* Add new optional argument ``--stranded`` to :command:`ngs-quant` command.
+* Add new method :meth:`pycov.CovFrame.merge`.
+* Add new method :meth:`pycov.merge`.
+* :issue:`61`: Update :meth:`pymaf.MafFrame.from_vcf` method to automatically detect CSQ field in INFO column (thanks `@lbeltrame <https://github.com/lbeltrame>`__).
+* :issue:`63`: Update :meth:`pyvcf.VcfFrame.sort` method to handle contigs that are not pre-defined.
+
 0.33.1 (2022-05-03)
 -------------------
 

diff --git a/docs/cli.rst b/docs/cli.rst
@@ -1029,7 +1029,7 @@ ngs-quant
 
    $ fuc ngs-quant -h
    usage: fuc ngs-quant [-h] [--thread INT] [--bootstrap INT] [--job TEXT]
-                        [--force] [--posix]
+                        [--force] [--posix] [--stranded TEXT]
                         manifest index output qsub
    
    Pipeline for running RNAseq quantification from FASTQ files with Kallisto.
@@ -1058,6 +1058,8 @@ ngs-quant
      --posix          Set the environment variable HDF5_USE_FILE_LOCKING=FALSE
                       before running Kallisto. This is required for shared Posix
                       Filesystems (e.g. NFS, Lustre).
+     --stranded TEXT  Strand specific reads (default: 'none') (choices:
+                      'none', 'forward', 'reverse').
    
    [Example] Specify queue:
      $ fuc ngs-quant \

diff --git a/fuc/api/pycov.py b/fuc/api/pycov.py
@@ -83,6 +83,90 @@ def simulate(mode='wgs', loc=30, scale=5, size=1000):
 
     return a
 
+def merge(
+    cfs, how='inner'
+):
+    """
+    Merge CovFrame objects.
+
+    Parameters
+    ----------
+    cfs : list
+        List of CovFrames to be merged. Note that the 'chr' prefix in contig
+        names (e.g. 'chr1' vs. '1') will be automatically added or removed as
+        necessary to match the contig names of the first CovFrame.
+    how : str, default: 'inner'
+        Type of merge as defined in :func:`pandas.merge`.
+
+    Returns
+    -------
+    CovFrame
+        Merged CovFrame.
+
+    See Also
+    --------
+    CovFrame.merge
+        Merge self with another CovFrame.
+
+    Examples
+    --------
+    Assume we have the following data:
+
+    >>> import numpy as np
+    >>> from fuc import pycov
+    >>> data1 = {
+    ...     'Chromosome': ['chr1'] * 5,
+    ...     'Position': np.arange(100, 105),
+    ...     'A': pycov.simulate(loc=35, scale=5, size=5),
+    ...     'B': pycov.simulate(loc=25, scale=7, size=5),
+    ... }
+    >>> data2 = {
+    ...     'Chromosome': ['1'] * 5,
+    ...     'Position': np.arange(102, 107),
+    ...     'C': pycov.simulate(loc=35, scale=5, size=5),
+    ... }
+    >>> cf1 = pycov.CovFrame.from_dict(data1)
+    >>> cf2 = pycov.CovFrame.from_dict(data2)
+    >>> cf1.df
+      Chromosome  Position   A   B
+    0       chr1       100  33  17
+    1       chr1       101  36  20
+    2       chr1       102  39  39
+    3       chr1       103  31  19
+    4       chr1       104  31  10
+    >>> cf2.df
+      Chromosome  Position   C
+    0          1       102  41
+    1          1       103  37
+    2          1       104  35
+    3          1       105  33
+    4          1       106  39
+
+    We can merge the two CovFrames with `how='inner'` (default):
+
+    >>> pycov.merge([cf1, cf2]).df
+      Chromosome  Position   A   B   C
+    0       chr1       102  39  39  41
+    1       chr1       103  31  19  37
+    2       chr1       104  31  10  35
+
+    We can also merge with `how='outer'`:
+
+    >>> pycov.merge([cf1, cf2], how='outer').df
+      Chromosome  Position     A     B     C
+    0       chr1       100  33.0  17.0   NaN
+    1       chr1       101  36.0  20.0   NaN
+    2       chr1       102  39.0  39.0  41.0
+    3       chr1       103  31.0  19.0  37.0
+    4       chr1       104  31.0  10.0  35.0
+    5       chr1       105   NaN   NaN  33.0
+    6       chr1       106   NaN   NaN  39.0
+    """
+    merged_cf = cfs[0]
+    for cf in cfs[1:]:
+        merged_cf = merged_cf.merge(cf, how=how)
+    return merged_cf
+
 class CovFrame:
     """
     Class for storing read depth data from one or more SAM/BAM/CRAM files.
@@ -1128,3 +1212,97 @@ def rename(self, names, indicies=None):
         cf = self.copy()
         cf.df.columns = columns
         return cf
+
+    def merge(
+        self, other, how='inner'
+    ):
+        """
+        Merge with the other CovFrame.
+
+        Parameters
+        ----------
+        other : CovFrame
+            Other CovFrame. Note that the 'chr' prefix in contig names (e.g.
+            'chr1' vs. '1') will be automatically added or removed as
+            necessary to match the contig names of ``self``.
+        how : str, default: 'inner'
+            Type of merge as defined in :meth:`pandas.DataFrame.merge`.
+
+        Returns
+        -------
+        CovFrame
+            Merged CovFrame.
+
+        See Also
+        --------
+        merge
+            Merge multiple CovFrame objects.
+
+        Examples
+        --------
+        Assume we have the following data:
+
+        >>> import numpy as np
+        >>> from fuc import pycov
+        >>> data1 = {
+        ...     'Chromosome': ['chr1'] * 5,
+        ...     'Position': np.arange(100, 105),
+        ...     'A': pycov.simulate(loc=35, scale=5, size=5),
+        ...     'B': pycov.simulate(loc=25, scale=7, size=5),
+        ... }
+        >>> data2 = {
+        ...     'Chromosome': ['1'] * 5,
+        ...     'Position': np.arange(102, 107),
+        ...     'C': pycov.simulate(loc=35, scale=5, size=5),
+        ... }
+        >>> cf1 = pycov.CovFrame.from_dict(data1)
+        >>> cf2 = pycov.CovFrame.from_dict(data2)
+        >>> cf1.df
+          Chromosome  Position   A   B
+        0       chr1       100  40  27
+        1       chr1       101  32  33
+        2       chr1       102  32  22
+        3       chr1       103  32  29
+        4       chr1       104  37  22
+        >>> cf2.df
+          Chromosome  Position   C
+        0          1       102  33
+        1          1       103  29
+        2          1       104  35
+        3          1       105  27
+        4          1       106  25
+
+        We can merge the two CovFrames with `how='inner'` (default):
+
+        >>> cf1.merge(cf2).df
+          Chromosome  Position   A   B   C
+        0       chr1       102  32  22  33
+        1       chr1       103  32  29  29
+        2       chr1       104  37  22  35
+
+        We can also merge with `how='outer'`:
+
+        >>> cf1.merge(cf2, how='outer').df
+          Chromosome  Position     A     B     C
+        0       chr1       100  40.0  27.0   NaN
+        1       chr1       101  32.0  33.0   NaN
+        2       chr1       102  32.0  22.0  33.0
+        3       chr1       103  32.0  29.0  29.0
+        4       chr1       104  37.0  22.0  35.0
+        5       chr1       105   NaN   NaN  27.0
+        6       chr1       106   NaN   NaN  25.0
+        """
+        if self.has_chr_prefix and other.has_chr_prefix:
+            pass
+        elif self.has_chr_prefix and not other.has_chr_prefix:
+            other = other.update_chr_prefix('add')
+        elif not self.has_chr_prefix and other.has_chr_prefix:
+            other = other.update_chr_prefix('remove')
+        else:
+            pass
+
+        df = self.df.merge(other.df, on=['Chromosome', 'Position'], how=how)
+
+        merged = self.__class__(df)
+
+        return merged
diff --git a/fuc/api/pymaf.py b/fuc/api/pymaf.py
@@ -481,7 +481,10 @@ def one_row(r):
                 tumor_seq_allele1 = r.ALT[1:]
                 tumor_seq_allele2 = r.ALT[1:]
 
-            fields = r.INFO.replace('CSQ=', '').split(',')[0].split('|')
+            # Extract annotation fields.
+            if has_annot:
+                csq = [x for x in r.INFO.split(';') if x.startswith('CSQ=')][0]
+                fields = csq.replace('CSQ=', '').split('|')
 
             # Get the Strand data.
             if has_annot:

diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py
@@ -750,7 +750,7 @@ def merge(
         names (e.g. 'chr1' vs. '1') will be automatically added or removed as
         necessary to match the contig names of the first VCF.
     how : str, default: 'inner'
-        Type of merge as defined in pandas.DataFrame.merge.
+        Type of merge as defined in :func:`pandas.merge`.
     format : str, default: 'GT'
         FORMAT subfields to be retained (e.g. 'GT:AD:DP').
     sort : bool, default: True
@@ -2897,7 +2897,7 @@ def merge(
             'chr1' vs. '1') will be automatically added or removed as
             necessary to match the contig names of ``self``.
         how : str, default: 'inner'
-            Type of merge as defined in `pandas.DataFrame.merge`.
+            Type of merge as defined in :meth:`pandas.DataFrame.merge`.
         format : str, default: 'GT'
             FORMAT subfields to be retained (e.g. 'GT:AD:DP').
         sort : bool, default: True
@@ -2910,6 +2910,11 @@ def merge(
         VcfFrame
             Merged VcfFrame.
 
+        See Also
+        --------
+        merge
+            Merge multiple VcfFrame objects.
+
         Examples
         --------
         Assume we have the following data:
@@ -5273,10 +5278,15 @@ def sort(self):
         2   chr2  101  .   T   C    .      .    .  GT:DP  0/0:29
         3  chr10  100  .   G   A    .      .    .  GT:DP   ./.:.
         """
-        df = self.df.sort_values(by=['CHROM', 'POS'], ignore_index=True,
-            key=lambda col: [CONTIGS.index(x) if isinstance(x, str)
-                             else x for x in col])
+        def f(col):
+            return [CONTIGS.index(x) if x in CONTIGS
+                else len(CONTIGS) if isinstance(x, str)
+                else x for x in col]
+
+        df = self.df.sort_values(by=['CHROM', 'POS'],
+            ignore_index=True, key=f)
         vf = self.__class__(self.copy_meta(), df)
+
         return vf
 
     def subset(self, samples, exclude=False):

diff --git a/fuc/cli/ngs_quant.py b/fuc/cli/ngs_quant.py
@@ -98,6 +98,15 @@ def create_parser(subparsers):
 before running Kallisto. This is required for shared Posix
 Filesystems (e.g. NFS, Lustre)."""
     )
+    parser.add_argument(
+        '--stranded',
+        metavar='TEXT',
+        default='none',
+        choices=['none', 'forward', 'reverse'],
+        help=
+"""Strand specific reads (default: 'none') (choices:
+'none', 'forward', 'reverse')."""
+    )
 
 def main(args):
     if os.path.exists(args.output) and args.force:
@@ -129,6 +138,12 @@ def main(args):
             command += f' -o {args.output}/{r.Name}'
             command += f' -b {args.bootstrap}'
             command += f' -t {args.thread}'
+            if args.stranded == 'forward':
+                command += ' --fr-stranded'
+            elif args.stranded == 'reverse':
+                command += ' --rf-stranded'
+            else:
+                pass
             command += f' {r.Read1}'
             command += f' {r.Read2}'
 

diff --git a/fuc/version.py b/fuc/version.py
@@ -1 +1 @@
-__version__ = '0.33.1'
+__version__ = '0.34.0'