From 2f99709e3f7af6f7b857fe647470da61dcccbdc4 Mon Sep 17 00:00:00 2001
From: Alex Kanitz <alexander.kanitz@alumni.ethz.ch>
Date: Wed, 18 May 2022 15:09:03 +0200
Subject: [PATCH] fix(orient): handle sources (#79)

---
 htsinfer/get_read_orientation.py   | 39 ++++++++++++++++++++++--------
 htsinfer/htsinfer.py               |  2 +-
 tests/test_get_read_orientation.py | 34 ++++++++++++++++++++++++--
 3 files changed, 62 insertions(+), 13 deletions(-)

diff --git a/htsinfer/get_read_orientation.py b/htsinfer/get_read_orientation.py
index 7aafc51d..1de49a13 100644
--- a/htsinfer/get_read_orientation.py
+++ b/htsinfer/get_read_orientation.py
@@ -17,6 +17,7 @@
 )
 from htsinfer.models import (
     ResultsOrientation,
+    ResultsSource,
     ResultsType,
     StatesOrientation,
     StatesOrientationRelationship,
@@ -33,7 +34,10 @@ class GetOrientation:
     Args:
         paths: Tuple of one or two paths for single-end and paired end library
             files.
-        library_type: Library type and mate relationship.
+        library_type: ResultsType object with library type and mate
+            relationship.
+        library_source: ResultsSource object with source information on each
+            library file.
         transcripts_file: File path to an uncompressed transcripts file in
             FASTA format.
         tmp_dir: Path to directory where temporary output is written to.
@@ -49,7 +53,10 @@ class GetOrientation:
     Attributes:
         paths: Tuple of one or two paths for single-end and paired end library
             files.
-        library_type: Library type and mate relationship.
+        library_type: ResultsType object with library type and mate
+            relationship.
+        library_source: ResultsSource object with source information on each
+            library file.
         transcripts_file: File path to an uncompressed transcripts file in
             FASTA format.
         tmp_dir: Path to directory where temporary output is written to.
@@ -66,20 +73,20 @@ def __init__(
         self,
         paths: Tuple[Path, Optional[Path]],
         library_type: ResultsType,
+        library_source: ResultsSource,
         transcripts_file: Path,
         tmp_dir: Path = Path(tempfile.gettempdir()) / 'tmp_htsinfer',
         threads_star: int = 1,
-        source: str = "hsapiens",
         min_mapped_reads: int = 20,
         min_fraction: float = 0.75,
     ):
         """Class contructor."""
         self.paths = paths
         self.library_type = library_type
+        self.library_source = library_source
         self.transcripts_file = transcripts_file
         self.tmp_dir = tmp_dir
         self.threads_star = threads_star
-        self.source = source
         self.min_mapped_reads = min_mapped_reads
         self.min_fraction = min_fraction
 
@@ -111,7 +118,14 @@ def evaluate(self) -> ResultsOrientation:
         return orientation
 
     def subset_transcripts_by_organism(self) -> Path:
-        """Filter FASTA file of transcripts by current organism.
+        """Filter FASTA file of transcripts by current sources.
+
+        The filtered file contains records from the indicated sources.
+            Typically, this is one source. However, if two input files
+            were supplied that originate from different sources (i.e.,
+            not from a valid paired-end library), records may come from two
+            different sources. If no source is supplied (because it could
+            not be inferred), no filtering is done.
 
         Returns:
             Path to filtered FASTA file.
@@ -120,18 +134,23 @@ def subset_transcripts_by_organism(self) -> Path:
             FileProblem: Could not open input/output FASTA file for
                 reading/writing.
         """
-        LOGGER.debug(f"Subsetting transcripts for: {self.source}")
+        LOGGER.debug(f"Subsetting transcripts for: {self.library_source}")
 
-        outfile = self.tmp_dir / f"{self.source}.fasta"
+        outfile = self.tmp_dir / f"{self.library_source}.fasta"
 
         def yield_filtered_seqs():
-            """Generator yielding sequence records for specified organism.
+            """Generator yielding sequence records for specified sources.
 
             Yields:
-                Next FASTA sequence record of the specified organism.
+                Next FASTA sequence record of the specified sources.
 
             Raises: Could not process input FASTA file.
             """
+            sources = []
+            if self.library_source.file_1.short_name is not None:
+                sources.append(self.library_source.file_1.short_name)
+            if self.library_source.file_2.short_name is not None:
+                sources.append(self.library_source.file_2.short_name)
             try:
                 for record in SeqIO.parse(
                     handle=self.transcripts_file,
@@ -141,7 +160,7 @@ def yield_filtered_seqs():
                         org_name = record.description.split("|")[3]
                     except ValueError:
                         continue
-                    if org_name == self.source:
+                    if org_name in sources or len(sources) == 0:
                         yield record
 
             except OSError as exc:
diff --git a/htsinfer/htsinfer.py b/htsinfer/htsinfer.py
index 91633d2f..e46636e6 100755
--- a/htsinfer/htsinfer.py
+++ b/htsinfer/htsinfer.py
@@ -348,9 +348,9 @@ def get_read_orientation(self):
         get_read_orientation = GetOrientation(
             paths=(self.path_1_processed, self.path_2_processed),
             library_type=self.results.library_type,
+            library_source=self.results.library_source,
             transcripts_file=self.transcripts_file_processed,
             threads_star=self.threads,
-            source="hsapiens",
             tmp_dir=self.tmp_dir,
             min_mapped_reads=self.read_orientation_min_mapped_reads,
             min_fraction=self.read_orientation_min_fraction,
diff --git a/tests/test_get_read_orientation.py b/tests/test_get_read_orientation.py
index 3dbc4153..4d51fd6d 100644
--- a/tests/test_get_read_orientation.py
+++ b/tests/test_get_read_orientation.py
@@ -3,7 +3,9 @@
 from htsinfer.get_read_orientation import GetOrientation
 from htsinfer.models import (
     ResultsOrientation,
+    ResultsSource,
     ResultsType,
+    Source,
     StatesOrientation,
     StatesOrientationRelationship,
     StatesTypeRelationship,
@@ -35,6 +37,7 @@ def test_init_required(self):
         test_instance = GetOrientation(
             paths=(FILE_MATE_1, None),
             library_type=ResultsType(),
+            library_source=ResultsSource(),
             transcripts_file=FILE_TRANSCRIPTS,
         )
         assert test_instance.paths[0] == FILE_MATE_1
@@ -46,6 +49,7 @@ def test_init_required_paired(self):
         test_instance = GetOrientation(
             paths=(FILE_MATE_1, FILE_MATE_2),
             library_type=ResultsType(),
+            library_source=ResultsSource(),
             transcripts_file=FILE_TRANSCRIPTS,
         )
         assert test_instance.paths[0] == FILE_MATE_1
@@ -59,9 +63,9 @@ def test_init_all(self, tmpdir):
         test_instance = GetOrientation(
             paths=(FILE_MATE_1, FILE_MATE_2),
             library_type=ResultsType(),
+            library_source=ResultsSource(),
             transcripts_file=FILE_TRANSCRIPTS,
             tmp_dir=tmp_dir,
-            source="hsapiens",
             threads_star=1,
             min_mapped_reads=20,
             min_fraction=0.75,
@@ -70,9 +74,9 @@ def test_init_all(self, tmpdir):
         assert test_instance.paths[0] == FILE_MATE_1
         assert test_instance.paths[1] == FILE_MATE_2
         assert test_instance.library_type == ResultsType()
+        assert test_instance.library_source == ResultsSource()
         assert test_instance.transcripts_file == FILE_TRANSCRIPTS
         assert test_instance.tmp_dir == tmp_dir
-        assert test_instance.source == "hsapiens"
         assert test_instance.threads_star == 1
         assert test_instance.min_mapped_reads == 20
         assert test_instance.min_fraction == 0.75
@@ -84,6 +88,7 @@ def test_evaluate_single_unmapped(self, tmpdir):
         test_instance = GetOrientation(
             paths=(FILE_UNMAPPED_SINGLE, None),
             library_type=ResultsType(),
+            library_source=ResultsSource(),
             transcripts_file=FILE_TRANSCRIPTS,
             tmp_dir=tmpdir,
         )
@@ -99,6 +104,10 @@ def test_evaluate_single_sf(self, tmpdir):
         test_instance = GetOrientation(
             paths=(FILE_ORIENTATION_SF, None),
             library_type=ResultsType(),
+            library_source=ResultsSource(
+                file_1=Source(short_name="hsapiens", taxon_id=9606),
+                file_2=Source(),
+            ),
             transcripts_file=FILE_TRANSCRIPTS,
             tmp_dir=tmpdir,
         )
@@ -114,6 +123,10 @@ def test_evaluate_single_sr(self, tmpdir):
         test_instance = GetOrientation(
             paths=(FILE_ORIENTATION_SR, None),
             library_type=ResultsType(),
+            library_source=ResultsSource(
+                file_1=Source(short_name="hsapiens", taxon_id=9606),
+                file_2=Source(),
+            ),
             transcripts_file=FILE_TRANSCRIPTS,
             tmp_dir=tmpdir,
         )
@@ -129,6 +142,10 @@ def test_evaluate_single_u(self, tmpdir):
         test_instance = GetOrientation(
             paths=(FILE_ORIENTATION_U, None),
             library_type=ResultsType(),
+            library_source=ResultsSource(
+                file_1=Source(short_name="hsapiens", taxon_id=9606),
+                file_2=Source(),
+            ),
             transcripts_file=FILE_TRANSCRIPTS,
             tmp_dir=tmpdir,
         )
@@ -148,6 +165,7 @@ def test_evaluate_paired_unmapped(self, tmpdir):
             library_type=ResultsType(
                 relationship=StatesTypeRelationship.split_mates,
             ),
+            library_source=ResultsSource(),
             transcripts_file=FILE_TRANSCRIPTS,
             tmp_dir=tmpdir,
         )
@@ -165,6 +183,10 @@ def test_evaluate_paired_isf(self, tmpdir):
             library_type=ResultsType(
                 relationship=StatesTypeRelationship.split_mates,
             ),
+            library_source=ResultsSource(
+                file_1=Source(short_name="hsapiens", taxon_id=9606),
+                file_2=Source(short_name="hsapiens", taxon_id=9606),
+            ),
             transcripts_file=FILE_TRANSCRIPTS,
             tmp_dir=tmpdir,
         )
@@ -182,6 +204,10 @@ def test_evaluate_paired_isr(self, tmpdir):
             library_type=ResultsType(
                 relationship=StatesTypeRelationship.split_mates,
             ),
+            library_source=ResultsSource(
+                file_1=Source(short_name="hsapiens", taxon_id=9606),
+                file_2=Source(short_name="hsapiens", taxon_id=9606),
+            ),
             transcripts_file=FILE_TRANSCRIPTS,
             tmp_dir=tmpdir,
         )
@@ -199,6 +225,10 @@ def test_evaluate_paired_iu(self, tmpdir):
             library_type=ResultsType(
                 relationship=StatesTypeRelationship.split_mates,
             ),
+            library_source=ResultsSource(
+                file_1=Source(short_name="hsapiens", taxon_id=9606),
+                file_2=Source(short_name="hsapiens", taxon_id=9606),
+            ),
             transcripts_file=FILE_TRANSCRIPTS,
             tmp_dir=tmpdir,
         )