-
Notifications
You must be signed in to change notification settings - Fork 8
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
CountMatrix bugfix and new tests #43
Merged
Merged
Changes from all commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,30 +13,36 @@ | |
|
||
Methods | ||
------- | ||
iter_tag_groups function to iterate over reads by an arbitrary tag | ||
iter_cell_barcodes wrapper for iter_tag_groups that iterates over cell barcode tags | ||
iter_genes wrapper for iter_tag_groups that iterates over gene tags | ||
iter_molecules wrapper for iter_tag_groups that iterates over molecule tags | ||
iter_tag_groups function to iterate over reads by an arbitrary tag | ||
iter_cell_barcodes wrapper for iter_tag_groups that iterates over cell barcode tags | ||
iter_genes wrapper for iter_tag_groups that iterates over gene tags | ||
iter_molecules wrapper for iter_tag_groups that iterates over molecule tags | ||
|
||
Classes | ||
------- | ||
SubsetAlignments class to extract reads specific to requested chromosome(s) | ||
Tagger class to add tags to sam/bam records from paired fastq records | ||
SubsetAlignments class to extract reads specific to requested chromosome(s) | ||
Tagger class to add tags to sam/bam records from paired fastq records | ||
AlignmentSortOrder abstract class to represent alignment sort orders | ||
QueryNameSortOrder alignment sort order by query name | ||
CellMoleculeGeneQueryNameSortOrder alignment sort order hierarchically cell > molecule > gene > query name | ||
|
||
References | ||
---------- | ||
htslib : https://github.com/samtools/htslib | ||
|
||
""" | ||
|
||
import warnings | ||
import os | ||
import math | ||
import os | ||
import warnings | ||
from abc import abstractmethod | ||
from itertools import cycle | ||
from typing import Iterator, Generator, List, Union, Tuple | ||
from typing import Iterator, Generator, List, Dict, Union, Tuple, Callable, Any, Optional | ||
|
||
import pysam | ||
|
||
from . import consts | ||
|
||
|
||
class SubsetAlignments: | ||
"""Wrapper for pysam/htslib that extracts reads corresponding to requested chromosome(s) | ||
|
@@ -74,9 +80,8 @@ def __init__(self, alignment_file: str, open_mode: str=None): | |
open_mode = 'r' | ||
else: | ||
raise ValueError( | ||
'could not autodetect file type for alignment_file %s (detectible suffixes: ' | ||
'.sam, .bam)' % alignment_file | ||
) | ||
f'Could not autodetect file type for alignment_file {alignment_file} (detectable suffixes: ' | ||
f'.sam, .bam)') | ||
self._file: str = alignment_file | ||
self._open_mode: str = open_mode | ||
|
||
|
@@ -164,7 +169,7 @@ class Tagger: | |
|
||
def __init__(self, bam_file: str) -> None: | ||
if not isinstance(bam_file, str): | ||
raise TypeError('bam_file must be type str, not %s' % type(bam_file)) | ||
raise TypeError(f'The argument "bam_file" must be of type str, not {type(bam_file)}') | ||
self.bam_file = bam_file | ||
|
||
# todo add type to tag_generators (make sure it doesn't introduce import issues | ||
|
@@ -193,7 +198,7 @@ def tag(self, output_bam_name: str, tag_generators) -> None: | |
outbam.write(sam_record) | ||
|
||
|
||
def split(in_bam, out_prefix, *tags, approx_mb_per_split=1000, raise_missing=True) -> List[str]: | ||
def split(in_bam: str, out_prefix: str, *tags, approx_mb_per_split=1000, raise_missing=True) -> List[str]: | ||
"""split `in_bam` by tag into files of `approx_mb_per_split` | ||
|
||
Parameters | ||
|
@@ -203,7 +208,7 @@ def split(in_bam, out_prefix, *tags, approx_mb_per_split=1000, raise_missing=Tru | |
out_prefix : str | ||
Prefix for all output files; output will be named as prefix_n where n is an integer equal | ||
to the chunk number. | ||
tags : list | ||
tags : tuple | ||
The bam tags to split on. The tags are checked in order, and sorting is done based on the | ||
first identified tag. Further tags are only checked if the first tag is missing. This is | ||
useful in cases where sorting is executed over a corrected barcode, but some records only | ||
|
@@ -231,44 +236,47 @@ def split(in_bam, out_prefix, *tags, approx_mb_per_split=1000, raise_missing=Tru | |
if len(tags) == 0: | ||
raise ValueError('At least one tag must be passed') | ||
|
||
def _cleanup(files_to_counts, files_to_names, rm_all=False) -> None: | ||
def _cleanup( | ||
_files_to_counts: Dict[pysam.AlignmentFile, int], _files_to_names: Dict[pysam.AlignmentFile, str], | ||
rm_all: bool=False) -> None: | ||
"""Closes file handles and remove any empty files. | ||
|
||
Parameters | ||
---------- | ||
files_to_counts : dict | ||
_files_to_counts : dict | ||
Dictionary of file objects to the number of reads each contains. | ||
files_to_names : dict | ||
_files_to_names : dict | ||
Dictionary of file objects to file handles. | ||
rm_all : bool, optional | ||
If True, indicates all files should be removed, regardless of count number, else only | ||
empty files without counts are removed (default = False) | ||
|
||
""" | ||
for bamfile, count in files_to_counts.items(): | ||
for bamfile, count in _files_to_counts.items(): | ||
# corner case: clean up files that were created, but didn't get data because | ||
# n_cell < n_barcode | ||
bamfile.close() | ||
if count == 0 or rm_all: | ||
os.remove(files_to_names[bamfile]) | ||
del files_to_names[bamfile] | ||
os.remove(_files_to_names[bamfile]) | ||
del _files_to_names[bamfile] | ||
|
||
# find correct number of subfiles to spawn | ||
bam_mb = os.path.getsize(in_bam) * 1e-6 | ||
n_subfiles = int(math.ceil(bam_mb / approx_mb_per_split)) | ||
if n_subfiles > 500: | ||
warnings.warn('Number of requested subfiles (%d) exceeds 500; this may cause OS errors by ' | ||
'exceeding fid limits' % n_subfiles) | ||
if n_subfiles > 1000: | ||
raise ValueError('Number of requested subfiles (%d) exceeds 1000; this will usually cause ' | ||
'OS errors, think about increasing max_mb_per_split.' % n_subfiles) | ||
if n_subfiles > consts.MAX_BAM_SPLIT_SUBFILES_TO_WARN: | ||
warnings.warn(f'Number of requested subfiles ({n_subfiles}) exceeds ' | ||
f'{consts.MAX_BAM_SPLIT_SUBFILES_TO_WARN}; this may cause OS errors by exceeding fid limits') | ||
if n_subfiles > consts.MAX_BAM_SPLIT_SUBFILES_TO_RAISE: | ||
raise ValueError(f'Number of requested subfiles ({n_subfiles}) exceeds ' | ||
f'{consts.MAX_BAM_SPLIT_SUBFILES_TO_RAISE}; this will usually cause OS errors, ' | ||
f'think about increasing max_mb_per_split.') | ||
|
||
# create all the output files | ||
with pysam.AlignmentFile(in_bam, 'rb', check_sq=False) as input_alignments: | ||
|
||
# map files to counts | ||
files_to_counts = {} | ||
files_to_names = {} | ||
files_to_counts: Dict[pysam.AlignmentFile, int] = {} | ||
files_to_names: Dict[pysam.AlignmentFile, str] = {} | ||
for i in range(n_subfiles): | ||
out_bam_name = out_prefix + '_%d.bam' % i | ||
open_bam = pysam.AlignmentFile(out_bam_name, 'wb', template=input_alignments) | ||
|
@@ -297,8 +305,7 @@ def _cleanup(files_to_counts, files_to_names, rm_all=False) -> None: | |
if tag_content is None: | ||
if raise_missing: | ||
_cleanup(files_to_counts, files_to_names, rm_all=True) | ||
raise RuntimeError( | ||
'alignment encountered that is missing %s tag(s).' % repr(tags)) | ||
raise RuntimeError('Alignment encountered that is missing {repr(tags)} tag(s).') | ||
else: | ||
continue # move on to next alignment | ||
|
||
|
@@ -379,12 +386,12 @@ def iter_molecule_barcodes(bam_iterator: Iterator[pysam.AlignedSegment]) -> Gene | |
Yields | ||
------ | ||
grouped_by_tag : Iterator[pysam.AlignedSegment] | ||
reads sharing a unique molecule barcode (``UB`` tag) | ||
reads sharing a unique molecule barcode tag | ||
current_tag : str | ||
the molecule barcode that records in the group all share | ||
|
||
""" | ||
return iter_tag_groups(tag='UB', bam_iterator=bam_iterator) | ||
return iter_tag_groups(tag=consts.MOLECULE_BARCODE_TAG_KEY, bam_iterator=bam_iterator) | ||
|
||
|
||
def iter_cell_barcodes(bam_iterator: Iterator[pysam.AlignedSegment]) -> Generator: | ||
|
@@ -398,12 +405,12 @@ def iter_cell_barcodes(bam_iterator: Iterator[pysam.AlignedSegment]) -> Generato | |
Yields | ||
------ | ||
grouped_by_tag : Iterator[pysam.AlignedSegment] | ||
reads sharing a unique cell barcode (``CB`` tag) | ||
reads sharing a unique cell barcode tag | ||
current_tag : str | ||
the cell barcode that reads in the group all share | ||
|
||
""" | ||
return iter_tag_groups(tag='CB', bam_iterator=bam_iterator) | ||
return iter_tag_groups(tag=consts.CELL_BARCODE_TAG_KEY, bam_iterator=bam_iterator) | ||
|
||
|
||
def iter_genes(bam_iterator: Iterator[pysam.AlignedSegment]) -> Generator: | ||
|
@@ -417,9 +424,69 @@ def iter_genes(bam_iterator: Iterator[pysam.AlignedSegment]) -> Generator: | |
Yields | ||
------ | ||
grouped_by_tag : Iterator[pysam.AlignedSegment] | ||
reads sharing a unique gene id (``GE`` tag) | ||
reads sharing a unique gene name tag | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yikes I thought these were gene ids, not gene names. Good catch. |
||
current_tag : str | ||
the gene id that reads in the group all share | ||
|
||
""" | ||
return iter_tag_groups(tag='GE', bam_iterator=bam_iterator) | ||
return iter_tag_groups(tag=consts.GENE_NAME_TAG_KEY, bam_iterator=bam_iterator) | ||
|
||
|
||
def get_tag_or_default(alignment: pysam.AlignedSegment, tag_key: str, default: Optional[str] = None) -> Optional[str]: | ||
"""Extracts the value associated to `tag_key` from `alignment`, and returns a default value | ||
if the tag is not present.""" | ||
try: | ||
return alignment.get_tag(tag_key) | ||
except KeyError: | ||
return default | ||
|
||
|
||
class AlignmentSortOrder: | ||
"""The base class of alignment sort orders.""" | ||
@property | ||
@abstractmethod | ||
def key_generator(self) -> Callable[[pysam.AlignedSegment], Any]: | ||
"""Returns a callable function that calculates a sort key from given pysam.AlignedSegment.""" | ||
raise NotImplementedError | ||
|
||
|
||
class QueryNameSortOrder(AlignmentSortOrder): | ||
"""Alignment record sort order by query name.""" | ||
@staticmethod | ||
def get_sort_key(alignment: pysam.AlignedSegment) -> str: | ||
return alignment.query_name | ||
|
||
@property | ||
def key_generator(self): | ||
return QueryNameSortOrder.get_sort_key | ||
|
||
def __repr__(self) -> str: | ||
return 'query_name' | ||
|
||
|
||
class CellMoleculeGeneQueryNameSortOrder(AlignmentSortOrder): | ||
"""Hierarchical alignment record sort order (cell barcode >= molecule barcode >= gene name >= query name).""" | ||
def __init__( | ||
self, | ||
cell_barcode_tag_key: str = consts.CELL_BARCODE_TAG_KEY, | ||
molecule_barcode_tag_key: str = consts.MOLECULE_BARCODE_TAG_KEY, | ||
gene_name_tag_key: str = consts.GENE_NAME_TAG_KEY) -> None: | ||
assert cell_barcode_tag_key, "Cell barcode tag key can not be None" | ||
assert molecule_barcode_tag_key, "Molecule barcode tag key can not be None" | ||
assert gene_name_tag_key, "Gene name tag key can not be None" | ||
self.cell_barcode_tag_key = cell_barcode_tag_key | ||
self.molecule_barcode_tag_key = molecule_barcode_tag_key | ||
self.gene_name_tag_key = gene_name_tag_key | ||
|
||
def _get_sort_key(self, alignment: pysam.AlignedSegment) -> Tuple[str, str, str, str]: | ||
return (get_tag_or_default(alignment, self.cell_barcode_tag_key, default='N'), | ||
get_tag_or_default(alignment, self.molecule_barcode_tag_key, default='N'), | ||
get_tag_or_default(alignment, self.gene_name_tag_key, default='N'), | ||
alignment.query_name) | ||
|
||
@property | ||
def key_generator(self) -> Callable[[pysam.AlignedSegment], Tuple[str, str, str, str]]: | ||
return self._get_sort_key | ||
|
||
def __repr__(self) -> str: | ||
return 'hierarchical__cell_molecule_gene_query_name' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
""" | ||
Global constants | ||
================ | ||
|
||
.. currentmodule:: sctools | ||
|
||
This module contains global constants, such as various barcoded BAM tags, and sctools-specific | ||
constants. | ||
""" | ||
|
||
# BAM tag constants | ||
|
||
RAW_SAMPLE_BARCODE_TAG_KEY = 'SR' | ||
QUALITY_SAMPLE_BARCODE_TAG_KEY = 'SY' | ||
|
||
MOLECULE_BARCODE_TAG_KEY = 'UB' | ||
RAW_MOLECULE_BARCODE_TAG_KEY = 'UR' | ||
QUALITY_MOLECULE_BARCODE_TAG_KEY = 'UY' | ||
|
||
CELL_BARCODE_TAG_KEY = 'CB' | ||
RAW_CELL_BARCODE_TAG_KEY = 'CR' | ||
QUALITY_CELL_BARCODE_TAG_KEY = 'CY' | ||
|
||
GENE_NAME_TAG_KEY = 'GE' | ||
NUMBER_OF_HITS_TAG_KEY = 'NH' | ||
|
||
ALIGNMENT_LOCATION_TAG_KEY = 'XF' | ||
INTRONIC_ALIGNMENT_LOCATION_TAG_VALUE = 'INTRONIC' | ||
CODING_ALIGNMENT_LOCATION_TAG_VALUE = 'CODING' | ||
UTR_ALIGNMENT_LOCATION_TAG_VALUE = 'UTR' | ||
INTERGENIC_ALIGNMENT_LOCATION_TAG_VALUE = 'INTERGENIC' | ||
|
||
# bam.py constants | ||
|
||
MAX_BAM_SPLIT_SUBFILES_TO_WARN = 500 | ||
MAX_BAM_SPLIT_SUBFILES_TO_RAISE = 1000 |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice 👍
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
;-)