diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 45de339..fb7c24b 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -1,6 +1,15 @@
 Changelog
 *********
 
+0.32.0 (2022-04-02)
+-------------------
+
+* Add new optional argument ``filter_off`` for :class:`pykallisto.KallistoFrame` constructor, which is useful for generating a simple count or tpm matrix.
+* Add new optional argument ``--dir-path`` to :command:`vcf-call` command for storing intermediate files.
+* Add new optional argument ``--gap_frac`` to :command:`vcf-call` command so that users can control indel calling sensitivity.
+* Add new optional argument ``--group-samples`` to :command:`vcf-call` command so that users can group samples into populations and apply the HWE assumption within but not across the populations.
+* Fix minor bug in :meth:`pyvcf.call` method when ``pybed.BedFrame`` object is given as ``regions``.
+
 0.31.0 (2022-03-01)
 -------------------
 
diff --git a/docs/cli.rst b/docs/cli.rst
index e815334..6f3ad11 100644
--- a/docs/cli.rst
+++ b/docs/cli.rst
@@ -1227,6 +1227,8 @@ vcf-call
 
    $ fuc vcf-call -h
    usage: fuc vcf-call [-h] [-r TEXT [TEXT ...]] [--min-mq INT] [--max-depth INT]
+                       [--dir-path PATH] [--gap_frac FLOAT]
+                       [--group-samples PATH]
                        fasta bams [bams ...]
 
    Call SNVs and indels from BAM files.
@@ -1259,6 +1261,26 @@ vcf-call
                            (default: 1).
      --max-depth INT       At a position, read maximally this number of reads per
                            input file (default: 250).
+     --dir-path PATH       By default, intermediate files (likelihoods.bcf,
+                           calls.bcf, and calls.normalized.bcf) will be stored
+                           in a temporary directory, which is automatically
+                           deleted after creating final VCF. If you provide a
+                           directory path, intermediate files will be stored
+                           there.
+     --gap_frac FLOAT      Minimum fraction of gapped reads for calling indels
+                           (default: 0.002).
+     --group-samples PATH  By default, all samples are assumed to come from a
+                           single population. This option allows to group
+                           samples into populations and apply the HWE assumption
+                           within but not across the populations. To use this
+                           option, provide a tab-delimited text file with sample
+                           names in the first column and group names in the
+                           second column. If '--group-samples -' is given
+                           instead, no HWE assumption is made at all and
+                           single-sample calling is performed. Note that in low
+                           coverage data this inflates the rate of false
+                           positives. Therefore, make sure you know what you are
+                           doing.
 
    [Example] Specify regions manually:
      $ fuc vcf-call ref.fa 1.bam 2.bam \
diff --git a/fuc/api/pykallisto.py b/fuc/api/pykallisto.py
index 1e8ed63..723fed6 100644
--- a/fuc/api/pykallisto.py
+++ b/fuc/api/pykallisto.py
@@ -28,10 +28,16 @@ class KallistoFrame:
         default, the :meth:`pykallisto.basic_filter` method will be used.
     filter_target_id : list, optional
         Transcripts to filter using methods that can't be implemented using
-        ``filter_fun``. If provided, this will override ``filter_fun``.
+        ``filter_func``. If provided, this will override ``filter_func``.
+    filter_off : bool, default: False
+        If True, do not apply any filtering. Useful for generating a simple
+        count or tpm matrix.
""" - def _import_data(self, metadata, filter_func=None, filter_target_id=None): + def _import_data( + self, metadata, filter_func=None, filter_target_id=None, + filter_off=False + ): dfs = {} for i, r in metadata.iterrows(): df = pd.read_table(r['path'] + '/abundance.tsv', index_col=0) @@ -40,23 +46,29 @@ def _import_data(self, metadata, filter_func=None, filter_target_id=None): df_tx_count.columns = metadata.index df_tx_tpm = pd.concat([v['tpm'] for k, v in dfs.items()], axis=1) df_tx_tpm.columns = metadata.index - if filter_target_id is None: - if filter_func is None: - filter_func = basic_filter - filtered_ids = df_tx_count.apply(filter_func, axis=1) + + if filter_off: + filtered_ids = None else: - filtered_ids = pd.Series(df_tx_count.index.isin(filter_target_id), - index=df_tx_count.index) + if filter_target_id is None: + if filter_func is None: + filter_func = basic_filter + filtered_ids = df_tx_count.apply(filter_func, axis=1) + else: + filtered_ids = pd.Series(df_tx_count.index.isin(filter_target_id), + index=df_tx_count.index) + return df_tx_count, df_tx_tpm, filtered_ids def __init__( self, metadata, tx2gene, aggregation_column, filter_func=None, - filter_target_id=None + filter_target_id=None, filter_off=False ): self.metadata = metadata self.tx2gene = tx2gene self.aggregation_column = aggregation_column - results = self._import_data(metadata, filter_func, filter_target_id) + results = self._import_data(metadata, filter_func, + filter_target_id, filter_off) self.df_tx_count = results[0] self.df_tx_tpm = results[1] self.filtered_ids = results[2] @@ -80,7 +92,8 @@ def aggregate(self, filter=True): gene_s = self.tx2gene[self.aggregation_column] df = pd.concat([tx_df, gene_s], axis=1) if filter: - df = df[self.filtered_ids] + if self.filtered_ids is not None: + df = df[self.filtered_ids] df = df.groupby(['gene_symbol']).sum() setattr(self, f'df_gene_{unit}', df) @@ -131,7 +144,8 @@ def plot_differential_abundance( else: df = getattr(self, f'df_tx_{unit}') if filter: - df = df[self.filtered_ids] + if self.filtered_ids is not None: + df = df[self.filtered_ids] s = self.tx2gene[self.tx2gene[self.aggregation_column] == gene] df = df[df.index.isin(s.index.to_list())] if name != 'target_id': diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index 1d4f1a3..341ce5e 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -216,7 +216,8 @@ } def call( - fasta, bams, regions=None, path=None, min_mq=1, max_depth=250 + fasta, bams, regions=None, path=None, min_mq=1, max_depth=250, + dir_path=None, gap_frac=0.002, group_samples=None ): """ Call SNVs and indels from BAM files. @@ -249,6 +250,23 @@ def call( Minimum mapping quality for an alignment to be used. max_depth : int, default: 250 At a position, read maximally this number of reads per input file. + dir_path : str, optional + By default, intermediate files (likelihoods.bcf, calls.bcf, and + calls.normalized.bcf) will be stored in a temporary directory, which + is automatically deleted after creating final VCF. If you provide a + directory path, intermediate files will be stored there. + gap_frac : float, default: 0.002 + Minimum fraction of gapped reads for calling indels. + group_samples : str, optional + By default, all samples are assumed to come from a single population. + This option allows to group samples into populations and apply the + HWE assumption within but not across the populations. To use this + option, provide a tab-delimited text file with sample names in the + first column and group names in the second column. 
+        ``group_samples='-'`` is given instead, no HWE assumption is made at
+        all and single-sample calling is performed. Note that in low coverage
+        data this inflates the rate of false positives. Therefore, make sure
+        you know what you are doing.
 
     Returns
     -------
@@ -271,7 +289,7 @@ def call(
     elif isinstance(regions, list):
         pass
     elif isinstance(regions, pybed.BedFrame):
-        regions = bf.to_regions()
+        regions = regions.to_regions()
     else:
         raise TypeError("Incorrect type of argument 'regions'")
     if '.bed' in regions[0]:
@@ -281,41 +299,52 @@ def call(
         regions = common.sort_regions(regions)
         regions = [chr_prefix + x.replace('chr', '') for x in regions]
 
-    with tempfile.TemporaryDirectory() as t:
-        # Step 1: Get genotype likelihoods.
-        args = ['-Ou', '-a', 'AD']
-        args += ['-q', str(min_mq)]
-        args += ['--max-depth', str(max_depth)]
-        args += ['-f', fasta]
-        if regions is not None:
-            args += ['-r', ','.join(regions)]
-        results = bcftools.mpileup(*(args + bams))
-        with open(f'{t}/likelihoods.bcf', 'wb') as f:
-            f.write(results)
-
-        # Step 2: Call variants.
-        args = [f'{t}/likelihoods.bcf', '-Oz', '-mv']
-        results = bcftools.call(*args)
-        with open(f'{t}/calls.bcf', 'wb') as f:
-            f.write(results)
+    if dir_path is None:
+        t = tempfile.TemporaryDirectory()
+        temp_dir = t.name
+    else:
+        temp_dir = dir_path
+
+    # Step 1: Get genotype likelihoods.
+    args = ['-Ou', '-a', 'AD']
+    args += ['-q', str(min_mq)]
+    args += ['--max-depth', str(max_depth)]
+    args += ['-f', fasta]
+    args += ['-F', str(gap_frac)]
+    if regions is not None:
+        args += ['-r', ','.join(regions)]
+    results = bcftools.mpileup(*(args + bams))
+    with open(f'{temp_dir}/likelihoods.bcf', 'wb') as f:
+        f.write(results)
+
+    # Step 2: Call variants.
+    args = [f'{temp_dir}/likelihoods.bcf', '-Ou', '-mv']
+    if group_samples is not None:
+        args += ['-G', group_samples]
+    results = bcftools.call(*args)
+    with open(f'{temp_dir}/calls.bcf', 'wb') as f:
+        f.write(results)
+
+    # Step 3: Normalize indels.
+    args = [f'{temp_dir}/calls.bcf', '-Ou', '-f', fasta]
+    results = bcftools.norm(*args)
+    with open(f'{temp_dir}/calls.normalized.bcf', 'wb') as f:
+        f.write(results)
+
+    # Step 4: Filter variant.
+    args = [f'{temp_dir}/calls.normalized.bcf', '-Ov', '--IndelGap', '5']
+    results = bcftools.filter(*args)
 
-        # Step 3: Normalize indels.
-        args = [f'{t}/calls.bcf', '-Ob', '-f', fasta]
-        results = bcftools.norm(*args)
-        with open(f'{t}/calls.normalized.bcf', 'wb') as f:
+    if path is None:
+        return results
+    elif path == '-':
+        sys.stdout.write(results)
+    else:
+        with open(path, 'w') as f:
             f.write(results)
 
-        # Step 4: Filter variant.
-        args = [f'{t}/calls.normalized.bcf', '-Ov', '--IndelGap', '5']
-        results = bcftools.filter(*args)
-
-        if path is None:
-            return results
-        elif path == '-':
-            sys.stdout.write(results)
-        else:
-            with open(path, 'w') as f:
-                f.write(results)
+    if dir_path is None:
+        t.cleanup()
 
 def rescue_filtered_variants(vfs, format='GT'):
     """
diff --git a/fuc/cli/vcf_call.py b/fuc/cli/vcf_call.py
index b200783..b08a6e8 100644
--- a/fuc/cli/vcf_call.py
+++ b/fuc/cli/vcf_call.py
@@ -78,9 +78,47 @@ def create_parser(subparsers):
 """At a position, read maximally this number of reads per
 input file (default: 250)."""
     )
+    parser.add_argument(
+        '--dir-path',
+        metavar='PATH',
+        help=
+"""By default, intermediate files (likelihoods.bcf,
+calls.bcf, and calls.normalized.bcf) will be stored
+in a temporary directory, which is automatically
+deleted after creating final VCF. If you provide a
+directory path, intermediate files will be stored
+there."""
+    )
+    parser.add_argument(
+        '--gap_frac',
+        metavar='FLOAT',
+        type=float,
+        default=0.002,
+        help=
+"""Minimum fraction of gapped reads for calling indels
+(default: 0.002)."""
+    )
+    parser.add_argument(
+        '--group-samples',
+        metavar='PATH',
+        help=
+"""By default, all samples are assumed to come from a
+single population. This option allows to group
+samples into populations and apply the HWE assumption
+within but not across the populations. To use this
+option, provide a tab-delimited text file with sample
+names in the first column and group names in the
+second column. If '--group-samples -' is given
+instead, no HWE assumption is made at all and
+single-sample calling is performed. Note that in low
+coverage data this inflates the rate of false
+positives. Therefore, make sure you know what you are
+doing."""
+    )
 
 def main(args):
     api.pyvcf.call(
         args.fasta, args.bams, regions=args.regions, path='-',
-        min_mq=args.min_mq, max_depth=args.max_depth
+        min_mq=args.min_mq, max_depth=args.max_depth, dir_path=args.dir_path,
+        gap_frac=args.gap_frac, group_samples=args.group_samples
     )
diff --git a/fuc/version.py b/fuc/version.py
index c3d10d7..2ef0c52 100644
--- a/fuc/version.py
+++ b/fuc/version.py
@@ -1 +1 @@
-__version__ = '0.31.0'
+__version__ = '0.32.0'
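To make the new ``pyvcf.call`` keyword arguments concrete, here is a minimal usage sketch in Python. It assumes the module can be imported as ``fuc.api.pyvcf`` (matching the file paths above); the file, directory, and sample names (``groups.tsv``, ``out.vcf``, ``intermediate_files``, ``sample_A``/``sample_B``) are hypothetical placeholders, and only the argument names and their documented behavior come from this changeset:

    import os
    from fuc.api import pyvcf

    # Tab-delimited file mapping sample names (column 1) to population
    # groups (column 2), as expected by group_samples / --group-samples.
    # The sample names here are placeholders.
    with open('groups.tsv', 'w') as f:
        f.write('sample_A\tpop1\n')
        f.write('sample_B\tpop2\n')

    # call() writes intermediate files directly into dir_path, so create
    # the directory up front.
    os.makedirs('intermediate_files', exist_ok=True)

    pyvcf.call(
        'ref.fa', ['1.bam', '2.bam'],
        path='out.vcf',                 # write the final VCF here instead of returning text
        dir_path='intermediate_files',  # keep likelihoods.bcf, calls.bcf, calls.normalized.bcf
        gap_frac=0.01,                  # minimum fraction of gapped reads for calling indels
        group_samples='groups.tsv',     # apply the HWE assumption within each group only
    )

The equivalent command line would be ``fuc vcf-call ref.fa 1.bam 2.bam --dir-path intermediate_files --gap_frac 0.01 --group-samples groups.tsv > out.vcf``; ``main()`` always passes ``path='-'``, so the CLI prints the final VCF to standard output.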
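Along the same lines, here is a rough sketch of the new ``filter_off`` flag on ``KallistoFrame``, assuming ``fuc.api.pykallisto`` is importable as above. The ``metadata`` and ``tx2gene`` objects are illustrative only, inferred from how ``_import_data`` and ``aggregate`` read them (a ``path`` column pointing at kallisto output directories containing ``abundance.tsv``, and a transcript-indexed table whose ``gene_symbol`` column maps transcripts to genes); the directory and file names are placeholders:

    import pandas as pd
    from fuc.api import pykallisto

    # One row per sample; 'path' points at a kallisto output directory that
    # contains abundance.tsv, and the index supplies the sample names.
    metadata = pd.DataFrame(
        {'path': ['kallisto/sample_A', 'kallisto/sample_B']},
        index=['sample_A', 'sample_B'],
    )

    # Transcript-to-gene map indexed by transcript ID.
    tx2gene = pd.read_table('tx2gene.tsv', index_col=0)

    # With filter_off=True no filtering is applied, so filtered_ids is None
    # and the raw matrices are kept as-is.
    kf = pykallisto.KallistoFrame(metadata, tx2gene, 'gene_symbol', filter_off=True)

    print(kf.df_tx_count.head())  # simple transcript-level count matrix
    print(kf.df_tx_tpm.head())    # simple transcript-level tpm matrix

    # aggregate() still works: because filtered_ids is None, the filter=True
    # default now falls through and every transcript is summed per gene
    # (attributes are named df_gene_count, df_gene_tpm, per df_gene_{unit}).
    kf.aggregate()
    print(kf.df_gene_count.head())

This is the "simple count or tpm matrix" use case the changelog entry refers to: no expression filter is computed at all, rather than being computed and then ignored.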