Merge pull request #118 from briney/development

Development
brineylab · May 4, 2023 · 401d310 · 401d310
2 parents 00b3ad8 + 426b7bf
commit 401d310
Show file tree

Hide file tree

Showing 4 changed files with 145 additions and 116 deletions.
diff --git a/abstar/__init__.py b/abstar/__init__.py
@@ -4,6 +4,6 @@
 warnings.simplefilter("ignore", BiopythonWarning)
 
 from .core.abstar import run, run_standalone, main, create_parser, validate_args
-from .preprocess import fastqc, adapter_trim, quality_trim
+from .utils.preprocess import fastqc, adapter_trim, quality_trim
 
 from .version import __version__
diff --git a/abstar/core/abstar.py b/abstar/core/abstar.py
@@ -73,7 +73,7 @@
     get_parquet_dtypes,
 )
 
-from ..utils.output import PARQUET_INCOMPATIBLE
+# from ..utils.output import PARQUET_INCOMPATIBLE
 from ..utils.queue.celery import celery
 
 
@@ -99,7 +99,6 @@
 #####################################################################
 
 
-
 def create_parser() -> ArgumentParser:
     parser = ArgumentParser(
         prog="abstar",
@@ -390,7 +389,9 @@ def __init__(
         temp=None,
         sequences=None,
         chunksize=500,
-        output_type=["json",],
+        output_type=[
+            "json",
+        ],
         assigner="blastn",
         merge=False,
         pandaseq_algo="simple_bayesian",
@@ -423,7 +424,13 @@ def __init__(
         self.log = os.path.abspath(log) if log is not None else log
         self.temp = os.path.abspath(temp) if temp is not None else temp
         self.chunksize = int(chunksize)
-        self.output_type = [output_type,] if output_type in STR_TYPES else output_type
+        self.output_type = (
+            [
+                output_type,
+            ]
+            if output_type in STR_TYPES
+            else output_type
+        )
         self.parquet = parquet
         self.assigner = assigner
         self.merge = True if basespace else merge
@@ -726,7 +733,7 @@ def concat_outputs(input_file, temp_output_file_dicts, output_dir, args):
                 for temp_file in temp_files:
                     with open(temp_file, "rb") as f:
                         shutil.copyfileobj(
-                            f, out_file, length=16 * 1024 ** 2
+                            f, out_file, length=16 * 1024**2
                         )  # Increasing buffer size to 16MB for faster transfer
             # For file formats with headers, only keep headers from the first file
             elif output_type in ["imgt", "tabular", "airr"]:
@@ -737,7 +744,7 @@ def concat_outputs(input_file, temp_output_file_dicts, output_dir, args):
                                 out_file.write(line)
                             elif j >= 1:
                                 out_file.write(line)
-                                
+
         if args.parquet:
             logger.info("Converting concatenated output to parquet format")
             # Make clear the output format from which the parquet file is generated.
@@ -764,7 +771,7 @@ def concat_outputs(input_file, temp_output_file_dicts, output_dir, args):
                 df.to_parquet(
                     pfile, engine="pyarrow", compression="snappy", write_index=False
                 )
-                
+
         ofiles.append(ofile)
     return ofiles
 
@@ -1544,5 +1551,6 @@ def run_main(arg_list: Optional[Iterable[str]] = None):
     output_dir = main(args)
     sys.stdout.write("\n\n")
 
+
 if __name__ == "__main__":
     run_main()
diff --git a/abstar/preprocess/umi.py b/abstar/preprocess/umi.py
@@ -35,9 +35,7 @@
 
 
 class UMI:
-    """
-    
-    """
+    """ """
 
     def __init__(
         self,
@@ -47,9 +45,7 @@ def __init__(
         ignore_strand: bool = False,
         extra_length_for_alignment: int = 25,
     ):
-        """
-        
-        """
+        """ """
         self.length = length
         self.pattern = pattern.strip().upper()
         self.ignore_strand = ignore_strand
@@ -138,7 +134,7 @@ def align(
 
     def get_mismatches(self) -> Optional[int]:
         """
-        Returns the total number of mismatches between the pattern(s) 
+        Returns the total number of mismatches between the pattern(s)
         and the input sequence.
         """
         total = len(self.pattern.replace("[UMI]", ""))
@@ -202,54 +198,54 @@ def parse_umis(
             4. a list of lists/tuples, of the format ``[sequence_id, sequence]``
 
     output_file : str, default=None
-        Path to an output file. Required if `sequences` is not a 
-        file. If `sequences` is a file and `output` is not provided, UMI parsing is 
+        Path to an output file. Required if `sequences` is not a
+        file. If `sequences` is a file and `output_file` is not provided, UMI parsing is
         done in-place and the `sequences` file is updated.
 
     pattern : str or iterable, default=None
-        Pattern (or iterable of patterns) for identifying the location of the UMI, 
-        or the name of a built-in pattern. Built-in options include ``"smartseq-human-bcr"``. 
-        Patterns may optionally contain leading and/or trailing conserved regions, with 
-        the UMI position within the pattern represented by ``"[UMI]"``. As an example, 
-        the built-in pattern for SmartSeq-BCR UMIs is::  
-
-            "[UMI]TCAGCGGGAAGACATT"  
-        
-        which would be a UMI sequence followed immediately by ``"TCAGCGGGAAGACATT"``. 
-        By default, the pattern is matched on the 5' -> 3' strand. This allows 
-        users to more easily construct patterns from their amplification primers without 
-        needing to worry about reverse-complementing patterns for UMIs at the 3' end of the 
-        input sequence. To override this, set `ignore_strand` to ``True``. If `pattern` 
-        is not provided, UMIs will be parsed using only `length`, starting at the start 
-        or end of the sequence.  
+        Pattern (or iterable of patterns) for identifying the location of the UMI,
+        or the name of a built-in pattern. Built-in options include ``"smartseq-human-bcr"``.
+        Patterns may optionally contain leading and/or trailing conserved regions, with
+        the UMI position within the pattern represented by ``"[UMI]"``. As an example,
+        the built-in pattern for SmartSeq-BCR UMIs is::
+
+            "[UMI]TCAGCGGGAAGACATT"
+
+        which would be a UMI sequence followed immediately by ``"TCAGCGGGAAGACATT"``.
+        By default, the pattern is matched on the 5' -> 3' strand. This allows
+        users to more easily construct patterns from their amplification primers without
+        needing to worry about reverse-complementing patterns for UMIs at the 3' end of the
+        input sequence. To override this, set `ignore_strand` to ``True``. If `pattern`
+        is not provided, UMIs will be parsed using only `length`, starting at the start
+        or end of the sequence.
 
         .. note::
 
-            The UMIs for all `patterns` that meet the `allowed_mismatches` criteria in 
-            the conserved leading/trailing portions will be concatenated into the final 
+            The UMIs for all `patterns` that meet the `allowed_mismatches` criteria in
+            the conserved leading/trailing portions will be concatenated into the final
             UMI. This allows the use of multiple `patterns` for either:
               * different patterns designed to match one of several different types of
-                sequences in a heterogeneous sample. For example, if heavy, kappa and 
-                lambda primers each have different conserved regions flanking the UMI 
-                and the input file contains a mix of heavy, kappa and lambda chains, 
+                sequences in a heterogeneous sample. For example, if heavy, kappa and
+                lambda primers each have different conserved regions flanking the UMI
+                and the input file contains a mix of heavy, kappa and lambda chains,
                 supplying all patterns (assuming they're sufficiently different from
                 each other) will allow parsing of UMIs from all chains.
-              * different patterns for sequences that contain multiple UMIs, either 
-                at opposite ends of the sequence or on the same end of the sequence 
+              * different patterns for sequences that contain multiple UMIs, either
+                at opposite ends of the sequence or on the same end of the sequence
                 but in different locations and with different conserved flanking regions.
 
     length : int or iterable, default=None
-        Length of the UMI sequence, or iterable of UMI lengths. If multiple lengths are 
-        provided, there must be an equal number of `patterns`, and they must be in the 
-        same order (the first `pattern` should correspond to the first UMI `length`). If 
-        `length` is positive, the UMI will be parsed from the start of the sequence. If 
-        `length` is negative, the UMI will be parsed from the end of the sequence. If multiple 
-        `patterns` are provided with a single `length`, that `length` will be used for all 
-        `patterns`. Required if `pattern` does not have a conserved trailing region. 
-        If `length` is not provided and a trailing conserved region is present in `pattern`, 
-        the entire portion of `sequence` preceeding the trailing region will be parsed as the 
-        UMI. Ignored if both leading and trailing sequences are present in `pattern`, 
-        as the entire region between the conserved flanking regions will be parsed as 
+        Length of the UMI sequence, or iterable of UMI lengths. If multiple lengths are
+        provided, there must be an equal number of `patterns`, and they must be in the
+        same order (the first `pattern` should correspond to the first UMI `length`). If
+        `length` is positive, the UMI will be parsed from the start of the sequence. If
+        `length` is negative, the UMI will be parsed from the end of the sequence. If multiple
+        `patterns` are provided with a single `length`, that `length` will be used for all
+        `patterns`. Required if `pattern` does not have a conserved trailing region.
+        If `length` is not provided and a trailing conserved region is present in `pattern`,
+        the entire portion of `sequence` preceding the trailing region will be parsed as the
+        UMI. Ignored if both leading and trailing sequences are present in `pattern`,
+        as the entire region between the conserved flanking regions will be parsed as
         the UMI regardless of what `length` is provided.
 
 
@@ -460,4 +456,3 @@ def _get_allowed_mismatches(name):
         "allowed_mismatches": SMARTSEQ_HUMAN_BCR_MISMATCH,
     },
 }
-