Merge pull request #59 from sbslee/0.32.0-dev
0.32.0 dev
sbslee authored Apr 2, 2022
2 parents 46dba50 + 8312f58 commit d362725
Showing 6 changed files with 160 additions and 48 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.rst
@@ -1,6 +1,15 @@
Changelog
*********

0.32.0 (2022-04-02)
-------------------

* Add new optional argument ``filter_off`` to the :class:`pykallisto.KallistoFrame` constructor, which is useful for generating a simple count or TPM matrix.
* Add new optional argument ``--dir-path`` to the :command:`vcf-call` command for storing intermediate files.
* Add new optional argument ``--gap_frac`` to the :command:`vcf-call` command so that users can control indel-calling sensitivity.
* Add new optional argument ``--group-samples`` to the :command:`vcf-call` command so that users can group samples into populations and apply the HWE assumption within, but not across, the populations.
* Fix minor bug in :meth:`pyvcf.call` method when a ``pybed.BedFrame`` object is given as ``regions`` (see the sketch below).

0.31.0 (2022-03-01)
-------------------

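
As a hedged illustration of the ``regions`` bug fix listed above, the sketch below passes a ``pybed.BedFrame`` to :meth:`pyvcf.call`. The file names are hypothetical, and ``pybed.BedFrame.from_file`` is assumed to be available as a constructor, as in other fuc frame classes:

from fuc import pyvcf, pybed

# Hypothetical inputs: a reference FASTA, two BAM files, and a BED file
# restricting calling to target regions.
bf = pybed.BedFrame.from_file('targets.bed')  # assumed constructor

# Previously the method referenced ``bf`` instead of ``regions`` when a
# BedFrame was supplied; the fix converts it with regions.to_regions().
pyvcf.call(
    'ref.fa', ['1.bam', '2.bam'],
    regions=bf,        # BedFrame is now converted to region strings
    path='out.vcf',    # write the final filtered VCF to disk
)
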
22 changes: 22 additions & 0 deletions docs/cli.rst
@@ -1227,6 +1227,8 @@ vcf-call
$ fuc vcf-call -h
usage: fuc vcf-call [-h] [-r TEXT [TEXT ...]] [--min-mq INT] [--max-depth INT]
[--dir-path PATH] [--gap_frac FLOAT]
[--group-samples PATH]
fasta bams [bams ...]
Call SNVs and indels from BAM files.
@@ -1259,6 +1261,26 @@ vcf-call
(default: 1).
--max-depth INT At a position, read maximally this number of reads
per input file (default: 250).
--dir-path PATH By default, intermediate files (likelihoods.bcf,
calls.bcf, and calls.normalized.bcf) will be stored
in a temporary directory, which is automatically
deleted after the final VCF is created. If you provide a
directory path, intermediate files will be stored
there.
--gap_frac FLOAT Minimum fraction of gapped reads for calling indels
(default: 0.002).
--group-samples PATH By default, all samples are assumed to come from a
single population. This option allows you to group
samples into populations and apply the HWE assumption
within, but not across, the populations. To use this
option, provide a tab-delimited text file with sample
names in the first column and group names in the
second column. If '--group-samples -' is given
instead, no HWE assumption is made at all and
single-sample calling is performed. Note that in
low-coverage data this inflates the rate of false
positives. Therefore, make sure you know what you are
doing.
[Example] Specify regions manually:
$ fuc vcf-call ref.fa 1.bam 2.bam \
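
To make the ``--group-samples`` file format concrete, here is a minimal sketch that writes such a file; the sample and population names are hypothetical, and pandas is used only for convenience:

import pandas as pd

# Hypothetical assignments: sample names in the first column, group
# (population) names in the second, tab-delimited, no header.
groups = pd.DataFrame({
    'sample': ['NA12878', 'NA12889', 'HG00733'],
    'population': ['CEU', 'CEU', 'PUR'],
})
groups.to_csv('groups.tsv', sep='\t', header=False, index=False)

# The file can then be passed to the command, for example:
#   $ fuc vcf-call ref.fa 1.bam 2.bam --group-samples groups.tsv
# and '--group-samples -' disables the HWE assumption entirely.
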
38 changes: 26 additions & 12 deletions fuc/api/pykallisto.py
@@ -28,10 +28,16 @@ class KallistoFrame:
default, the :meth:`pykallisto.basic_filter` method will be used.
filter_target_id : list, optional
Transcripts to filter using methods that can't be implemented using
``filter_fun``. If provided, this will override ``filter_fun``.
``filter_func``. If provided, this will override ``filter_func``.
filter_off : bool, default: False
If True, do not apply any filtering. Useful for generating a simple
count or tpm matrix.
"""

def _import_data(self, metadata, filter_func=None, filter_target_id=None):
def _import_data(
self, metadata, filter_func=None, filter_target_id=None,
filter_off=False
):
dfs = {}
for i, r in metadata.iterrows():
df = pd.read_table(r['path'] + '/abundance.tsv', index_col=0)
@@ -40,23 +46,29 @@ def _import_data(self, metadata, filter_func=None, filter_target_id=None):
df_tx_count.columns = metadata.index
df_tx_tpm = pd.concat([v['tpm'] for k, v in dfs.items()], axis=1)
df_tx_tpm.columns = metadata.index
if filter_target_id is None:
if filter_func is None:
filter_func = basic_filter
filtered_ids = df_tx_count.apply(filter_func, axis=1)

if filter_off:
filtered_ids = None
else:
filtered_ids = pd.Series(df_tx_count.index.isin(filter_target_id),
index=df_tx_count.index)
if filter_target_id is None:
if filter_func is None:
filter_func = basic_filter
filtered_ids = df_tx_count.apply(filter_func, axis=1)
else:
filtered_ids = pd.Series(df_tx_count.index.isin(filter_target_id),
index=df_tx_count.index)

return df_tx_count, df_tx_tpm, filtered_ids

def __init__(
self, metadata, tx2gene, aggregation_column, filter_func=None,
filter_target_id=None
filter_target_id=None, filter_off=False
):
self.metadata = metadata
self.tx2gene = tx2gene
self.aggregation_column = aggregation_column
results = self._import_data(metadata, filter_func, filter_target_id)
results = self._import_data(metadata, filter_func,
filter_target_id, filter_off)
self.df_tx_count = results[0]
self.df_tx_tpm = results[1]
self.filtered_ids = results[2]
@@ -80,7 +92,8 @@ def aggregate(self, filter=True):
gene_s = self.tx2gene[self.aggregation_column]
df = pd.concat([tx_df, gene_s], axis=1)
if filter:
df = df[self.filtered_ids]
if self.filtered_ids is not None:
df = df[self.filtered_ids]
df = df.groupby(['gene_symbol']).sum()
setattr(self, f'df_gene_{unit}', df)

@@ -131,7 +144,8 @@ def plot_differential_abundance(
else:
df = getattr(self, f'df_tx_{unit}')
if filter:
df = df[self.filtered_ids]
if self.filtered_ids is not None:
df = df[self.filtered_ids]
s = self.tx2gene[self.tx2gene[self.aggregation_column] == gene]
df = df[df.index.isin(s.index.to_list())]
if name != 'target_id':
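
A minimal sketch of the new ``filter_off`` flag follows, assuming two kallisto output directories (each containing an ``abundance.tsv``) already exist; every path, sample name, and gene name here is hypothetical:

import pandas as pd
from fuc import pykallisto

# Hypothetical metadata: the index holds sample names, the 'path' column
# points to kallisto output directories containing abundance.tsv.
metadata = pd.DataFrame(
    {'path': ['results/s1', 'results/s2']},
    index=['s1', 's2'],
)

# Hypothetical transcript-to-gene table indexed by target_id.
tx2gene = pd.DataFrame(
    {'gene_symbol': ['GENE1', 'GENE1', 'GENE2']},
    index=['tx1.1', 'tx1.2', 'tx2.1'],
)

# With filter_off=True, no expression filtering is applied and
# filtered_ids is set to None, so the raw matrices come straight out.
kf = pykallisto.KallistoFrame(
    metadata, tx2gene, 'gene_symbol', filter_off=True
)
counts = kf.df_tx_count  # simple transcript-level count matrix
tpm = kf.df_tx_tpm       # simple transcript-level TPM matrix
kf.aggregate()           # gene-level matrices; filtering is skipped
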
97 changes: 63 additions & 34 deletions fuc/api/pyvcf.py
@@ -216,7 +216,8 @@
}

def call(
fasta, bams, regions=None, path=None, min_mq=1, max_depth=250
fasta, bams, regions=None, path=None, min_mq=1, max_depth=250,
dir_path=None, gap_frac=0.002, group_samples=None
):
"""
Call SNVs and indels from BAM files.
@@ -249,6 +250,23 @@ def call(
Minimum mapping quality for an alignment to be used.
max_depth : int, default: 250
At a position, read maximally this number of reads per input file.
dir_path : str, optional
By default, intermediate files (likelihoods.bcf, calls.bcf, and
calls.normalized.bcf) will be stored in a temporary directory, which
is automatically deleted after the final VCF is created. If you provide a
directory path, intermediate files will be stored there.
gap_frac : float, default: 0.002
Minimum fraction of gapped reads for calling indels.
group_samples : str, optional
By default, all samples are assumed to come from a single population.
This option allows you to group samples into populations and apply the
HWE assumption within, but not across, the populations. To use this
option, provide a tab-delimited text file with sample names in the
first column and group names in the second column. If
``group_samples='-'`` is given instead, no HWE assumption is made at
all and single-sample calling is performed. Note that in low-coverage
data this inflates the rate of false positives. Therefore, make sure
you know what you are doing.
Returns
-------
@@ -271,7 +289,7 @@ def call(
elif isinstance(regions, list):
pass
elif isinstance(regions, pybed.BedFrame):
regions = bf.to_regions()
regions = regions.to_regions()
else:
raise TypeError("Incorrect type of argument 'regions'")
if '.bed' in regions[0]:
@@ -281,41 +299,52 @@
regions = common.sort_regions(regions)
regions = [chr_prefix + x.replace('chr', '') for x in regions]

with tempfile.TemporaryDirectory() as t:
# Step 1: Get genotype likelihoods.
args = ['-Ou', '-a', 'AD']
args += ['-q', str(min_mq)]
args += ['--max-depth', str(max_depth)]
args += ['-f', fasta]
if regions is not None:
args += ['-r', ','.join(regions)]
results = bcftools.mpileup(*(args + bams))
with open(f'{t}/likelihoods.bcf', 'wb') as f:
f.write(results)

# Step 2: Call variants.
args = [f'{t}/likelihoods.bcf', '-Oz', '-mv']
results = bcftools.call(*args)
with open(f'{t}/calls.bcf', 'wb') as f:
f.write(results)
if dir_path is None:
t = tempfile.TemporaryDirectory()
temp_dir = t.name
else:
temp_dir = dir_path

# Step 1: Get genotype likelihoods.
args = ['-Ou', '-a', 'AD']
args += ['-q', str(min_mq)]
args += ['--max-depth', str(max_depth)]
args += ['-f', fasta]
args += ['-F', str(gap_frac)]
if regions is not None:
args += ['-r', ','.join(regions)]
results = bcftools.mpileup(*(args + bams))
with open(f'{temp_dir}/likelihoods.bcf', 'wb') as f:
f.write(results)

# Step 2: Call variants.
args = [f'{temp_dir}/likelihoods.bcf', '-Ou', '-mv']
if group_samples is not None:
args += ['-G', group_samples]
results = bcftools.call(*args)
with open(f'{temp_dir}/calls.bcf', 'wb') as f:
f.write(results)

# Step 3: Normalize indels.
args = [f'{temp_dir}/calls.bcf', '-Ou', '-f', fasta]
results = bcftools.norm(*args)
with open(f'{temp_dir}/calls.normalized.bcf', 'wb') as f:
f.write(results)

# Step 4: Filter variant.
args = [f'{temp_dir}/calls.normalized.bcf', '-Ov', '--IndelGap', '5']
results = bcftools.filter(*args)

# Step 3: Normalize indels.
args = [f'{t}/calls.bcf', '-Ob', '-f', fasta]
results = bcftools.norm(*args)
with open(f'{t}/calls.normalized.bcf', 'wb') as f:
if path is None:
return results
elif path == '-':
sys.stdout.write(results)
else:
with open(path, 'w') as f:
f.write(results)

# Step 4: Filter variant.
args = [f'{t}/calls.normalized.bcf', '-Ov', '--IndelGap', '5']
results = bcftools.filter(*args)

if path is None:
return results
elif path == '-':
sys.stdout.write(results)
else:
with open(path, 'w') as f:
f.write(results)
if dir_path is None:
t.cleanup()

def rescue_filtered_variants(vfs, format='GT'):
"""
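
For the three new keyword arguments, a hedged end-to-end sketch of :meth:`pyvcf.call` is shown below; the file names are hypothetical, the ``work`` directory must already exist, and bcftools must be installed, as the rest of the module assumes:

from fuc import pyvcf

# Keep the intermediate BCFs (likelihoods.bcf, calls.bcf, and
# calls.normalized.bcf) in ./work instead of a throwaway temporary
# directory, require a higher gapped-read fraction before calling
# indels, and group samples into populations for the HWE assumption.
pyvcf.call(
    'ref.fa', ['1.bam', '2.bam'],
    regions=['chr1:100-200'],    # region strings, a BED file, or a BedFrame
    path='out.vcf',              # final filtered VCF
    dir_path='work',             # intermediate files are kept here
    gap_frac=0.01,               # default is 0.002
    group_samples='groups.tsv',  # tab-delimited sample/group file
)
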
40 changes: 39 additions & 1 deletion fuc/cli/vcf_call.py
@@ -78,9 +78,47 @@ def create_parser(subparsers):
"""At a position, read maximally this number of reads
per input file (default: 250)."""
)
parser.add_argument(
'--dir-path',
metavar='PATH',
help=
"""By default, intermediate files (likelihoods.bcf,
calls.bcf, and calls.normalized.bcf) will be stored
in a temporary directory, which is automatically
deleted after the final VCF is created. If you provide a
directory path, intermediate files will be stored
there."""
)
parser.add_argument(
'--gap_frac',
metavar='FLOAT',
type=float,
default=0.002,
help=
"""Minimum fraction of gapped reads for calling indels
(default: 0.002)."""
)
parser.add_argument(
'--group-samples',
metavar='PATH',
help=
"""By default, all samples are assumed to come from a
single population. This option allows you to group
samples into populations and apply the HWE assumption
within, but not across, the populations. To use this
option, provide a tab-delimited text file with sample
names in the first column and group names in the
second column. If '--group-samples -' is given
instead, no HWE assumption is made at all and
single-sample calling is performed. Note that in
low-coverage data this inflates the rate of false
positives. Therefore, make sure you know what you are
doing."""
)

def main(args):
api.pyvcf.call(
args.fasta, args.bams, regions=args.regions, path='-',
min_mq=args.min_mq, max_depth=args.max_depth
min_mq=args.min_mq, max_depth=args.max_depth, dir_path=args.dir_path,
gap_frac=args.gap_frac, group_samples=args.group_samples
)
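
One detail worth noting in the argparse additions: ``--dir-path`` and ``--group-samples`` use dashes while ``--gap_frac`` keeps an underscore, yet all three are reachable as attributes in ``main()`` because argparse converts dashes in long option names to underscores. A standalone sketch (with hypothetical values) of that mapping:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--dir-path', metavar='PATH')        # dest: dir_path
parser.add_argument('--gap_frac', metavar='FLOAT',
                    type=float, default=0.002)           # dest: gap_frac
parser.add_argument('--group-samples', metavar='PATH')   # dest: group_samples

args = parser.parse_args(['--dir-path', 'work', '--gap_frac', '0.01'])
print(args.dir_path, args.gap_frac, args.group_samples)  # work 0.01 None
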
2 changes: 1 addition & 1 deletion fuc/version.py
@@ -1 +1 @@
__version__ = '0.31.0'
__version__ = '0.32.0'
