Merge pull request #59 from sbslee/0.32.0-dev
0.32.0 dev
sbslee authored Apr 2, 2022
2 parents 46dba50 + 8312f58 commit d362725
Showing 6 changed files with 160 additions and 48 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.rst
@@ -1,6 +1,15 @@
Changelog
*********

0.32.0 (2022-04-02)
-------------------

* Add new optional argument ``filter_off`` to the :class:`pykallisto.KallistoFrame` constructor, which is useful for generating a simple count or TPM matrix.
* Add new optional argument ``--dir-path`` to the :command:`vcf-call` command for storing intermediate files.
* Add new optional argument ``--gap_frac`` to the :command:`vcf-call` command so that users can control indel-calling sensitivity.
* Add new optional argument ``--group-samples`` to the :command:`vcf-call` command so that users can group samples into populations and apply the HWE assumption within, but not across, the populations.
* Fix minor bug in :meth:`pyvcf.call` method when a ``pybed.BedFrame`` object is given as ``regions`` (see the sketch below).

0.31.0 (2022-03-01)
-------------------

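
As a hedged illustration of the ``regions`` bug fix listed above, the sketch below passes a ``pybed.BedFrame`` to :meth:`pyvcf.call`. The file names are hypothetical, and ``pybed.BedFrame.from_file`` is assumed to be available as a constructor, as in other fuc frame classes:

from fuc import pyvcf, pybed

# Hypothetical inputs: a reference FASTA, two BAM files, and a BED file
# restricting calling to target regions.
bf = pybed.BedFrame.from_file('targets.bed')  # assumed constructor

# Previously the method referenced ``bf`` instead of ``regions`` when a
# BedFrame was supplied; the fix converts it with regions.to_regions().
pyvcf.call(
    'ref.fa', ['1.bam', '2.bam'],
    regions=bf,        # BedFrame is now converted to region strings
    path='out.vcf',    # write the final filtered VCF to disk
)
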
22 changes: 22 additions & 0 deletions docs/cli.rst
@@ -1227,6 +1227,8 @@ vcf-call
$ fuc vcf-call -h
usage: fuc vcf-call [-h] [-r TEXT [TEXT ...]] [--min-mq INT] [--max-depth INT]
[--dir-path PATH] [--gap_frac FLOAT]
[--group-samples PATH]
fasta bams [bams ...]
Call SNVs and indels from BAM files.
@@ -1259,6 +1261,26 @@ vcf-call
(default: 1).
--max-depth INT At a position, read maximally this number of reads
per input file (default: 250).
--dir-path PATH By default, intermediate files (likelihoods.bcf,
calls.bcf, and calls.normalized.bcf) will be stored
in a temporary directory, which is automatically
deleted after the final VCF is created. If you provide a
directory path, intermediate files will be stored
there.
--gap_frac FLOAT Minimum fraction of gapped reads for calling indels
(default: 0.002).
--group-samples PATH By default, all samples are assumed to come from a
single population. This option allows you to group
samples into populations and apply the HWE assumption
within, but not across, the populations. To use this
option, provide a tab-delimited text file with sample
names in the first column and group names in the
second column. If '--group-samples -' is given
instead, no HWE assumption is made at all and
single-sample calling is performed. Note that in
low-coverage data this inflates the rate of false
positives. Therefore, make sure you know what you are
doing.
[Example] Specify regions manually:
$ fuc vcf-call ref.fa 1.bam 2.bam \
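
To make the ``--group-samples`` file format concrete, here is a minimal sketch that writes such a file; the sample and population names are hypothetical, and pandas is used only for convenience:

import pandas as pd

# Hypothetical assignments: sample names in the first column, group
# (population) names in the second, tab-delimited, no header.
groups = pd.DataFrame({
    'sample': ['NA12878', 'NA12889', 'HG00733'],
    'population': ['CEU', 'CEU', 'PUR'],
})
groups.to_csv('groups.tsv', sep='\t', header=False, index=False)

# The file can then be passed to the command, for example:
#   $ fuc vcf-call ref.fa 1.bam 2.bam --group-samples groups.tsv
# and '--group-samples -' disables the HWE assumption entirely.
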
38 changes: 26 additions & 12 deletions fuc/api/pykallisto.py
@@ -28,10 +28,16 @@ class KallistoFrame:
default, the :meth:`pykallisto.basic_filter` method will be used.
filter_target_id : list, optional
Transcripts to filter using methods that can't be implemented using
``filter_fun``. If provided, this will override ``filter_fun``.
``filter_func``. If provided, this will override ``filter_func``.
filter_off : bool, default: False
If True, do not apply any filtering. Useful for generating a simple
count or tpm matrix.
"""

def _import_data(self, metadata, filter_func=None, filter_target_id=None):
def _import_data(
self, metadata, filter_func=None, filter_target_id=None,
filter_off=False
):
dfs = {}
for i, r in metadata.iterrows():
df = pd.read_table(r['path'] + '/abundance.tsv', index_col=0)
@@ -40,23 +46,29 @@ def _import_data(self, metadata, filter_func=None, filter_target_id=None):
df_tx_count.columns = metadata.index
df_tx_tpm = pd.concat([v['tpm'] for k, v in dfs.items()], axis=1)
df_tx_tpm.columns = metadata.index
if filter_target_id is None:
if filter_func is None:
filter_func = basic_filter
filtered_ids = df_tx_count.apply(filter_func, axis=1)

if filter_off:
filtered_ids = None
else:
filtered_ids = pd.Series(df_tx_count.index.isin(filter_target_id),
index=df_tx_count.index)
if filter_target_id is None:
if filter_func is None:
filter_func = basic_filter
filtered_ids = df_tx_count.apply(filter_func, axis=1)
else:
filtered_ids = pd.Series(df_tx_count.index.isin(filter_target_id),
index=df_tx_count.index)

return df_tx_count, df_tx_tpm, filtered_ids

def __init__(
self, metadata, tx2gene, aggregation_column, filter_func=None,
filter_target_id=None
filter_target_id=None, filter_off=False
):
self.metadata = metadata
self.tx2gene = tx2gene
self.aggregation_column = aggregation_column
results = self._import_data(metadata, filter_func, filter_target_id)
results = self._import_data(metadata, filter_func,
filter_target_id, filter_off)
self.df_tx_count = results[0]
self.df_tx_tpm = results[1]
self.filtered_ids = results[2]
@@ -80,7 +92,8 @@ def aggregate(self, filter=True):
gene_s = self.tx2gene[self.aggregation_column]
df = pd.concat([tx_df, gene_s], axis=1)
if filter:
df = df[self.filtered_ids]
if self.filtered_ids is not None:
df = df[self.filtered_ids]
df = df.groupby(['gene_symbol']).sum()
setattr(self, f'df_gene_{unit}', df)

@@ -131,7 +144,8 @@ def plot_differential_abundance(
else:
df = getattr(self, f'df_tx_{unit}')
if filter:
df = df[self.filtered_ids]
if self.filtered_ids is not None:
df = df[self.filtered_ids]
s = self.tx2gene[self.tx2gene[self.aggregation_column] == gene]
df = df[df.index.isin(s.index.to_list())]
if name != 'target_id':
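
A minimal sketch of the new ``filter_off`` flag follows, assuming two kallisto output directories (each containing an ``abundance.tsv``) already exist; every path, sample name, and gene name here is hypothetical:

import pandas as pd
from fuc import pykallisto

# Hypothetical metadata: the index holds sample names, the 'path' column
# points to kallisto output directories containing abundance.tsv.
metadata = pd.DataFrame(
    {'path': ['results/s1', 'results/s2']},
    index=['s1', 's2'],
)

# Hypothetical transcript-to-gene table indexed by target_id.
tx2gene = pd.DataFrame(
    {'gene_symbol': ['GENE1', 'GENE1', 'GENE2']},
    index=['tx1.1', 'tx1.2', 'tx2.1'],
)

# With filter_off=True, no expression filtering is applied and
# filtered_ids is set to None, so the raw matrices come straight out.
kf = pykallisto.KallistoFrame(
    metadata, tx2gene, 'gene_symbol', filter_off=True
)
counts = kf.df_tx_count  # simple transcript-level count matrix
tpm = kf.df_tx_tpm       # simple transcript-level TPM matrix
kf.aggregate()           # gene-level matrices; filtering is skipped
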
97 changes: 63 additions & 34 deletions fuc/api/pyvcf.py
@@ -216,7 +216,8 @@
}

def call(
fasta, bams, regions=None, path=None, min_mq=1, max_depth=250
fasta, bams, regions=None, path=None, min_mq=1, max_depth=250,
dir_path=None, gap_frac=0.002, group_samples=None
):
"""
Call SNVs and indels from BAM files.
@@ -249,6 +250,23 @@ def call(
Minimum mapping quality for an alignment to be used.
max_depth : int, default: 250
At a position, read maximally this number of reads per input file.
dir_path : str, optional
By default, intermediate files (likelihoods.bcf, calls.bcf, and
calls.normalized.bcf) will be stored in a temporary directory, which
is automatically deleted after the final VCF is created. If you provide a
directory path, intermediate files will be stored there.
gap_frac : float, default: 0.002
Minimum fraction of gapped reads for calling indels.
group_samples : str, optional
By default, all samples are assumed to come from a single population.
This option allows you to group samples into populations and apply the
HWE assumption within, but not across, the populations. To use this
option, provide a tab-delimited text file with sample names in the
first column and group names in the second column. If
``group_samples='-'`` is given instead, no HWE assumption is made at
all and single-sample calling is performed. Note that in low-coverage
data this inflates the rate of false positives. Therefore, make sure
you know what you are doing.
Returns
-------
@@ -271,7 +289,7 @@ def call(
elif isinstance(regions, list):
pass
elif isinstance(regions, pybed.BedFrame):
regions = bf.to_regions()
regions = regions.to_regions()
else:
raise TypeError("Incorrect type of argument 'regions'")
if '.bed' in regions[0]:
@@ -281,41 +299,52 @@
regions = common.sort_regions(regions)
regions = [chr_prefix + x.replace('chr', '') for x in regions]

with tempfile.TemporaryDirectory() as t:
# Step 1: Get genotype likelihoods.
args = ['-Ou', '-a', 'AD']
args += ['-q', str(min_mq)]
args += ['--max-depth', str(max_depth)]
args += ['-f', fasta]
if regions is not None:
args += ['-r', ','.join(regions)]
results = bcftools.mpileup(*(args + bams))
with open(f'{t}/likelihoods.bcf', 'wb') as f:
f.write(results)

# Step 2: Call variants.
args = [f'{t}/likelihoods.bcf', '-Oz', '-mv']
results = bcftools.call(*args)
with open(f'{t}/calls.bcf', 'wb') as f:
f.write(results)
if dir_path is None:
t = tempfile.TemporaryDirectory()
temp_dir = t.name
else:
temp_dir = dir_path

# Step 1: Get genotype likelihoods.
args = ['-Ou', '-a', 'AD']
args += ['-q', str(min_mq)]
args += ['--max-depth', str(max_depth)]
args += ['-f', fasta]
args += ['-F', str(gap_frac)]
if regions is not None:
args += ['-r', ','.join(regions)]
results = bcftools.mpileup(*(args + bams))
with open(f'{temp_dir}/likelihoods.bcf', 'wb') as f:
f.write(results)

# Step 2: Call variants.
args = [f'{temp_dir}/likelihoods.bcf', '-Ou', '-mv']
if group_samples is not None:
args += ['-G', group_samples]
results = bcftools.call(*args)
with open(f'{temp_dir}/calls.bcf', 'wb') as f:
f.write(results)

# Step 3: Normalize indels.
args = [f'{temp_dir}/calls.bcf', '-Ou', '-f', fasta]
results = bcftools.norm(*args)
with open(f'{temp_dir}/calls.normalized.bcf', 'wb') as f:
f.write(results)

# Step 4: Filter variant.
args = [f'{temp_dir}/calls.normalized.bcf', '-Ov', '--IndelGap', '5']
results = bcftools.filter(*args)

# Step 3: Normalize indels.
args = [f'{t}/calls.bcf', '-Ob', '-f', fasta]
results = bcftools.norm(*args)
with open(f'{t}/calls.normalized.bcf', 'wb') as f:
if path is None:
return results
elif path == '-':
sys.stdout.write(results)
else:
with open(path, 'w') as f:
f.write(results)

# Step 4: Filter variant.
args = [f'{t}/calls.normalized.bcf', '-Ov', '--IndelGap', '5']
results = bcftools.filter(*args)

if path is None:
return results
elif path == '-':
sys.stdout.write(results)
else:
with open(path, 'w') as f:
f.write(results)
if dir_path is None:
t.cleanup()

def rescue_filtered_variants(vfs, format='GT'):
"""
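
For the three new keyword arguments, a hedged end-to-end sketch of :meth:`pyvcf.call` is shown below; the file names are hypothetical, the ``work`` directory must already exist, and bcftools must be installed, as the rest of the module assumes:

from fuc import pyvcf

# Keep the intermediate BCFs (likelihoods.bcf, calls.bcf, and
# calls.normalized.bcf) in ./work instead of a throwaway temporary
# directory, require a higher gapped-read fraction before calling
# indels, and group samples into populations for the HWE assumption.
pyvcf.call(
    'ref.fa', ['1.bam', '2.bam'],
    regions=['chr1:100-200'],    # region strings, a BED file, or a BedFrame
    path='out.vcf',              # final filtered VCF
    dir_path='work',             # intermediate files are kept here
    gap_frac=0.01,               # default is 0.002
    group_samples='groups.tsv',  # tab-delimited sample/group file
)
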
40 changes: 39 additions & 1 deletion fuc/cli/vcf_call.py
@@ -78,9 +78,47 @@ def create_parser(subparsers):
"""At a position, read maximally this number of reads
per input file (default: 250)."""
)
parser.add_argument(
'--dir-path',
metavar='PATH',
help=
"""By default, intermediate files (likelihoods.bcf,
calls.bcf, and calls.normalized.bcf) will be stored
in a temporary directory, which is automatically
deleted after the final VCF is created. If you provide a
directory path, intermediate files will be stored
there."""
)
parser.add_argument(
'--gap_frac',
metavar='FLOAT',
type=float,
default=0.002,
help=
"""Minimum fraction of gapped reads for calling indels
(default: 0.002)."""
)
parser.add_argument(
'--group-samples',
metavar='PATH',
help=
"""By default, all samples are assumed to come from a
single population. This option allows you to group
samples into populations and apply the HWE assumption
within, but not across, the populations. To use this
option, provide a tab-delimited text file with sample
names in the first column and group names in the
second column. If '--group-samples -' is given
instead, no HWE assumption is made at all and
single-sample calling is performed. Note that in
low-coverage data this inflates the rate of false
positives. Therefore, make sure you know what you are
doing."""
)

def main(args):
api.pyvcf.call(
args.fasta, args.bams, regions=args.regions, path='-',
min_mq=args.min_mq, max_depth=args.max_depth
min_mq=args.min_mq, max_depth=args.max_depth, dir_path=args.dir_path,
gap_frac=args.gap_frac, group_samples=args.group_samples
)
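
One detail worth noting in the argparse additions: ``--dir-path`` and ``--group-samples`` use dashes while ``--gap_frac`` keeps an underscore, yet all three are reachable as attributes in ``main()`` because argparse converts dashes in long option names to underscores. A standalone sketch (with hypothetical values) of that mapping:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--dir-path', metavar='PATH')        # dest: dir_path
parser.add_argument('--gap_frac', metavar='FLOAT',
                    type=float, default=0.002)           # dest: gap_frac
parser.add_argument('--group-samples', metavar='PATH')   # dest: group_samples

args = parser.parse_args(['--dir-path', 'work', '--gap_frac', '0.01'])
print(args.dir_path, args.gap_frac, args.group_samples)  # work 0.01 None
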
2 changes: 1 addition & 1 deletion fuc/version.py
@@ -1 +1 @@
__version__ = '0.31.0'
__version__ = '0.32.0'
