Skip to content

Commit

Permalink
Merge pull request #118 from briney/development
Browse files Browse the repository at this point in the history
Development
  • Loading branch information
briney authored May 4, 2023
2 parents 00b3ad8 + 426b7bf commit 401d310
Show file tree
Hide file tree
Showing 4 changed files with 145 additions and 116 deletions.
2 changes: 1 addition & 1 deletion abstar/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@
warnings.simplefilter("ignore", BiopythonWarning)

from .core.abstar import run, run_standalone, main, create_parser, validate_args
from .preprocess import fastqc, adapter_trim, quality_trim
from .utils.preprocess import fastqc, adapter_trim, quality_trim

from .version import __version__
22 changes: 15 additions & 7 deletions abstar/core/abstar.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@
get_parquet_dtypes,
)

from ..utils.output import PARQUET_INCOMPATIBLE
# from ..utils.output import PARQUET_INCOMPATIBLE
from ..utils.queue.celery import celery


Expand All @@ -99,7 +99,6 @@
#####################################################################



def create_parser() -> ArgumentParser:
parser = ArgumentParser(
prog="abstar",
Expand Down Expand Up @@ -390,7 +389,9 @@ def __init__(
temp=None,
sequences=None,
chunksize=500,
output_type=["json",],
output_type=[
"json",
],
assigner="blastn",
merge=False,
pandaseq_algo="simple_bayesian",
Expand Down Expand Up @@ -423,7 +424,13 @@ def __init__(
self.log = os.path.abspath(log) if log is not None else log
self.temp = os.path.abspath(temp) if temp is not None else temp
self.chunksize = int(chunksize)
self.output_type = [output_type,] if output_type in STR_TYPES else output_type
self.output_type = (
[
output_type,
]
if output_type in STR_TYPES
else output_type
)
self.parquet = parquet
self.assigner = assigner
self.merge = True if basespace else merge
Expand Down Expand Up @@ -726,7 +733,7 @@ def concat_outputs(input_file, temp_output_file_dicts, output_dir, args):
for temp_file in temp_files:
with open(temp_file, "rb") as f:
shutil.copyfileobj(
f, out_file, length=16 * 1024 ** 2
f, out_file, length=16 * 1024**2
) # Increasing buffer size to 16MB for faster transfer
# For file formats with headers, only keep headers from the first file
elif output_type in ["imgt", "tabular", "airr"]:
Expand All @@ -737,7 +744,7 @@ def concat_outputs(input_file, temp_output_file_dicts, output_dir, args):
out_file.write(line)
elif j >= 1:
out_file.write(line)

if args.parquet:
logger.info("Converting concatenated output to parquet format")
# Make clear the output format from which the parquet file is generated.
Expand All @@ -764,7 +771,7 @@ def concat_outputs(input_file, temp_output_file_dicts, output_dir, args):
df.to_parquet(
pfile, engine="pyarrow", compression="snappy", write_index=False
)

ofiles.append(ofile)
return ofiles

Expand Down Expand Up @@ -1544,5 +1551,6 @@ def run_main(arg_list: Optional[Iterable[str]] = None):
output_dir = main(args)
sys.stdout.write("\n\n")


if __name__ == "__main__":
run_main()
81 changes: 38 additions & 43 deletions abstar/preprocess/umi.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,7 @@


class UMI:
"""
"""
""" """

def __init__(
self,
Expand All @@ -47,9 +45,7 @@ def __init__(
ignore_strand: bool = False,
extra_length_for_alignment: int = 25,
):
"""
"""
""" """
self.length = length
self.pattern = pattern.strip().upper()
self.ignore_strand = ignore_strand
Expand Down Expand Up @@ -138,7 +134,7 @@ def align(

def get_mismatches(self) -> Optional[int]:
"""
Returns the total number of mismatches between the pattern(s)
Returns the total number of mismatches between the pattern(s)
and the input sequence.
"""
total = len(self.pattern.replace("[UMI]", ""))
Expand Down Expand Up @@ -202,54 +198,54 @@ def parse_umis(
4. a list of lists/tuples, of the format ``[sequence_id, sequence]``
output_file : str, default=None
Path to an output file. Required if `sequences` is not a
file. If `sequences` is a file and `output_file` is not provided, UMI parsing is
Path to an output file. Required if `sequences` is not a
file. If `sequences` is a file and `output_file` is not provided, UMI parsing is
done in-place and the `sequences` file is updated.
pattern : str or iterable, default=None
Pattern (or iterable of patterns) for identifying the location of the UMI,
or the name of a built-in pattern. Built-in options include ``"smartseq-human-bcr"``.
Patterns may optionally contain leading and/or trailing conserved regions, with
the UMI position within the pattern represented by ``"[UMI]"``. As an example,
the built-in pattern for SmartSeq-BCR UMIs is::
"[UMI]TCAGCGGGAAGACATT"
which would be a UMI sequence followed immediately by ``"TCAGCGGGAAGACATT"``.
By default, the pattern is matched on the 5' -> 3' strand. This allows
users to more easily construct patterns from their amplification primers without
needing to worry about reverse-complementing patterns for UMIs at the 3' end of the
input sequence. To override this, set `ignore_strand` to ``True``. If `pattern`
is not provided, UMIs will be parsed using only `length`, starting at the start
or end of the sequence.
Pattern (or iterable of patterns) for identifying the location of the UMI,
or the name of a built-in pattern. Built-in options include ``"smartseq-human-bcr"``.
Patterns may optionally contain leading and/or trailing conserved regions, with
the UMI position within the pattern represented by ``"[UMI]"``. As an example,
the built-in pattern for SmartSeq-BCR UMIs is::
"[UMI]TCAGCGGGAAGACATT"
which would be a UMI sequence followed immediately by ``"TCAGCGGGAAGACATT"``.
By default, the pattern is matched on the 5' -> 3' strand. This allows
users to more easily construct patterns from their amplification primers without
needing to worry about reverse-complementing patterns for UMIs at the 3' end of the
input sequence. To override this, set `ignore_strand` to ``True``. If `pattern`
is not provided, UMIs will be parsed using only `length`, starting at the start
or end of the sequence.
.. note::
The UMIs for all `patterns` that meet the `allowed_mismatches` criteria in
the conserved leading/trailing portions will be concatenated into the final
The UMIs for all `patterns` that meet the `allowed_mismatches` criteria in
the conserved leading/trailing portions will be concatenated into the final
UMI. This allows the use of multiple `patterns` for either:
* different patterns designed to match one of several different types of
sequences in a heterogeneous sample. For example, if heavy, kappa and
lambda primers each have different conserved regions flanking the UMI
and the input file contains a mix of heavy, kappa and lambda chains,
sequences in a heterogeneous sample. For example, if heavy, kappa and
lambda primers each have different conserved regions flanking the UMI
and the input file contains a mix of heavy, kappa and lambda chains,
supplying all patterns (assuming they're sufficiently different from
each other) will allow parsing of UMIs from all chains.
* different patterns for sequences that contain multiple UMIs, either
at opposite ends of the sequence or on the same end of the sequence
* different patterns for sequences that contain multiple UMIs, either
at opposite ends of the sequence or on the same end of the sequence
but in different locations and with different conserved flanking regions.
length : int or iterable, default=None
Length of the UMI sequence, or iterable of UMI lengths. If multiple lengths are
provided, there must be an equal number of `patterns`, and they must be in the
same order (the first `pattern` should correspond to the first UMI `length`). If
`length` is positive, the UMI will be parsed from the start of the sequence. If
`length` is negative, the UMI will be parsed from the end of the sequence. If multiple
`patterns` are provided with a single `length`, that `length` will be used for all
`patterns`. Required if `pattern` does not have a conserved trailing region.
If `length` is not provided and a trailing conserved region is present in `pattern`,
the entire portion of `sequence` preceding the trailing region will be parsed as the
UMI. Ignored if both leading and trailing sequences are present in `pattern`,
as the entire region between the conserved flanking regions will be parsed as
Length of the UMI sequence, or iterable of UMI lengths. If multiple lengths are
provided, there must be an equal number of `patterns`, and they must be in the
same order (the first `pattern` should correspond to the first UMI `length`). If
`length` is positive, the UMI will be parsed from the start of the sequence. If
`length` is negative, the UMI will be parsed from the end of the sequence. If multiple
`patterns` are provided with a single `length`, that `length` will be used for all
`patterns`. Required if `pattern` does not have a conserved trailing region.
If `length` is not provided and a trailing conserved region is present in `pattern`,
the entire portion of `sequence` preceding the trailing region will be parsed as the
UMI. Ignored if both leading and trailing sequences are present in `pattern`,
as the entire region between the conserved flanking regions will be parsed as
the UMI regardless of what `length` is provided.
Expand Down Expand Up @@ -460,4 +456,3 @@ def _get_allowed_mismatches(name):
"allowed_mismatches": SMARTSEQ_HUMAN_BCR_MISMATCH,
},
}

Loading

0 comments on commit 401d310

Please sign in to comment.