From 2f99709e3f7af6f7b857fe647470da61dcccbdc4 Mon Sep 17 00:00:00 2001 From: Alex Kanitz Date: Wed, 18 May 2022 15:09:03 +0200 Subject: [PATCH] fix(orient): handle sources (#79) --- htsinfer/get_read_orientation.py | 39 ++++++++++++++++++++++-------- htsinfer/htsinfer.py | 2 +- tests/test_get_read_orientation.py | 34 ++++++++++++++++++++++++-- 3 files changed, 62 insertions(+), 13 deletions(-) diff --git a/htsinfer/get_read_orientation.py b/htsinfer/get_read_orientation.py index 7aafc51d..1de49a13 100644 --- a/htsinfer/get_read_orientation.py +++ b/htsinfer/get_read_orientation.py @@ -17,6 +17,7 @@ ) from htsinfer.models import ( ResultsOrientation, + ResultsSource, ResultsType, StatesOrientation, StatesOrientationRelationship, @@ -33,7 +34,10 @@ class GetOrientation: Args: paths: Tuple of one or two paths for single-end and paired end library files. - library_type: Library type and mate relationship. + library_type: ResultsType object with library type and mate + relationship. + library_source: ResultsSource object with source information on each + library file. transcripts_file: File path to an uncompressed transcripts file in FASTA format. tmp_dir: Path to directory where temporary output is written to. @@ -49,7 +53,10 @@ class GetOrientation: Attributes: paths: Tuple of one or two paths for single-end and paired end library files. - library_type: Library type and mate relationship. + library_type: ResultsType object with library type and mate + relationship. + library_source: ResultsSource object with source information on each + library file. transcripts_file: File path to an uncompressed transcripts file in FASTA format. tmp_dir: Path to directory where temporary output is written to. 
@@ -66,20 +73,20 @@ def __init__( self, paths: Tuple[Path, Optional[Path]], library_type: ResultsType, + library_source: ResultsSource, transcripts_file: Path, tmp_dir: Path = Path(tempfile.gettempdir()) / 'tmp_htsinfer', threads_star: int = 1, - source: str = "hsapiens", min_mapped_reads: int = 20, min_fraction: float = 0.75, ): """Class contructor.""" self.paths = paths self.library_type = library_type + self.library_source = library_source self.transcripts_file = transcripts_file self.tmp_dir = tmp_dir self.threads_star = threads_star - self.source = source self.min_mapped_reads = min_mapped_reads self.min_fraction = min_fraction @@ -111,7 +118,14 @@ def evaluate(self) -> ResultsOrientation: return orientation def subset_transcripts_by_organism(self) -> Path: - """Filter FASTA file of transcripts by current organism. + """Filter FASTA file of transcripts by current sources. + + The filtered file contains records from the indicated sources. + Typically, this is one source. However, if two input files + were supplied that originate from different sources (i.e., + not from a valid paired-end library), records may come from two + different sources. If no source is supplied (because it could + not be inferred), no filtering is done. Returns: Path to filtered FASTA file. @@ -120,18 +134,23 @@ def subset_transcripts_by_organism(self) -> Path: FileProblem: Could not open input/output FASTA file for reading/writing. """ - LOGGER.debug(f"Subsetting transcripts for: {self.source}") + LOGGER.debug(f"Subsetting transcripts for: {self.library_source}") - outfile = self.tmp_dir / f"{self.source}.fasta" + outfile = self.tmp_dir / f"{self.library_source}.fasta" def yield_filtered_seqs(): - """Generator yielding sequence records for specified organism. + """Generator yielding sequence records for specified sources. Yields: - Next FASTA sequence record of the specified organism. + Next FASTA sequence record of the specified sources. Raises: Could not process input FASTA file. 
""" + sources = [] + if self.library_source.file_1.short_name is not None: + sources.append(self.library_source.file_1.short_name) + if self.library_source.file_2.short_name is not None: + sources.append(self.library_source.file_2.short_name) try: for record in SeqIO.parse( handle=self.transcripts_file, @@ -141,7 +160,7 @@ def yield_filtered_seqs(): org_name = record.description.split("|")[3] except ValueError: continue - if org_name == self.source: + if org_name in sources or len(sources) == 0: yield record except OSError as exc: diff --git a/htsinfer/htsinfer.py b/htsinfer/htsinfer.py index 91633d2f..e46636e6 100755 --- a/htsinfer/htsinfer.py +++ b/htsinfer/htsinfer.py @@ -348,9 +348,9 @@ def get_read_orientation(self): get_read_orientation = GetOrientation( paths=(self.path_1_processed, self.path_2_processed), library_type=self.results.library_type, + library_source=self.results.library_source, transcripts_file=self.transcripts_file_processed, threads_star=self.threads, - source="hsapiens", tmp_dir=self.tmp_dir, min_mapped_reads=self.read_orientation_min_mapped_reads, min_fraction=self.read_orientation_min_fraction, diff --git a/tests/test_get_read_orientation.py b/tests/test_get_read_orientation.py index 3dbc4153..4d51fd6d 100644 --- a/tests/test_get_read_orientation.py +++ b/tests/test_get_read_orientation.py @@ -3,7 +3,9 @@ from htsinfer.get_read_orientation import GetOrientation from htsinfer.models import ( ResultsOrientation, + ResultsSource, ResultsType, + Source, StatesOrientation, StatesOrientationRelationship, StatesTypeRelationship, @@ -35,6 +37,7 @@ def test_init_required(self): test_instance = GetOrientation( paths=(FILE_MATE_1, None), library_type=ResultsType(), + library_source=ResultsSource(), transcripts_file=FILE_TRANSCRIPTS, ) assert test_instance.paths[0] == FILE_MATE_1 @@ -46,6 +49,7 @@ def test_init_required_paired(self): test_instance = GetOrientation( paths=(FILE_MATE_1, FILE_MATE_2), library_type=ResultsType(), + 
library_source=ResultsSource(), transcripts_file=FILE_TRANSCRIPTS, ) assert test_instance.paths[0] == FILE_MATE_1 @@ -59,9 +63,9 @@ def test_init_all(self, tmpdir): test_instance = GetOrientation( paths=(FILE_MATE_1, FILE_MATE_2), library_type=ResultsType(), + library_source=ResultsSource(), transcripts_file=FILE_TRANSCRIPTS, tmp_dir=tmp_dir, - source="hsapiens", threads_star=1, min_mapped_reads=20, min_fraction=0.75, @@ -70,9 +74,9 @@ def test_init_all(self, tmpdir): assert test_instance.paths[0] == FILE_MATE_1 assert test_instance.paths[1] == FILE_MATE_2 assert test_instance.library_type == ResultsType() + assert test_instance.library_source == ResultsSource() assert test_instance.transcripts_file == FILE_TRANSCRIPTS assert test_instance.tmp_dir == tmp_dir - assert test_instance.source == "hsapiens" assert test_instance.threads_star == 1 assert test_instance.min_mapped_reads == 20 assert test_instance.min_fraction == 0.75 @@ -84,6 +88,7 @@ def test_evaluate_single_unmapped(self, tmpdir): test_instance = GetOrientation( paths=(FILE_UNMAPPED_SINGLE, None), library_type=ResultsType(), + library_source=ResultsSource(), transcripts_file=FILE_TRANSCRIPTS, tmp_dir=tmpdir, ) @@ -99,6 +104,10 @@ def test_evaluate_single_sf(self, tmpdir): test_instance = GetOrientation( paths=(FILE_ORIENTATION_SF, None), library_type=ResultsType(), + library_source=ResultsSource( + file_1=Source(short_name="hsapiens", taxon_id=9606), + file_2=Source(), + ), transcripts_file=FILE_TRANSCRIPTS, tmp_dir=tmpdir, ) @@ -114,6 +123,10 @@ def test_evaluate_single_sr(self, tmpdir): test_instance = GetOrientation( paths=(FILE_ORIENTATION_SR, None), library_type=ResultsType(), + library_source=ResultsSource( + file_1=Source(short_name="hsapiens", taxon_id=9606), + file_2=Source(), + ), transcripts_file=FILE_TRANSCRIPTS, tmp_dir=tmpdir, ) @@ -129,6 +142,10 @@ def test_evaluate_single_u(self, tmpdir): test_instance = GetOrientation( paths=(FILE_ORIENTATION_U, None), library_type=ResultsType(), + 
library_source=ResultsSource( + file_1=Source(short_name="hsapiens", taxon_id=9606), + file_2=Source(), + ), transcripts_file=FILE_TRANSCRIPTS, tmp_dir=tmpdir, ) @@ -148,6 +165,7 @@ def test_evaluate_paired_unmapped(self, tmpdir): library_type=ResultsType( relationship=StatesTypeRelationship.split_mates, ), + library_source=ResultsSource(), transcripts_file=FILE_TRANSCRIPTS, tmp_dir=tmpdir, ) @@ -165,6 +183,10 @@ def test_evaluate_paired_isf(self, tmpdir): library_type=ResultsType( relationship=StatesTypeRelationship.split_mates, ), + library_source=ResultsSource( + file_1=Source(short_name="hsapiens", taxon_id=9606), + file_2=Source(short_name="hsapiens", taxon_id=9606), + ), transcripts_file=FILE_TRANSCRIPTS, tmp_dir=tmpdir, ) @@ -182,6 +204,10 @@ def test_evaluate_paired_isr(self, tmpdir): library_type=ResultsType( relationship=StatesTypeRelationship.split_mates, ), + library_source=ResultsSource( + file_1=Source(short_name="hsapiens", taxon_id=9606), + file_2=Source(short_name="hsapiens", taxon_id=9606), + ), transcripts_file=FILE_TRANSCRIPTS, tmp_dir=tmpdir, ) @@ -199,6 +225,10 @@ def test_evaluate_paired_iu(self, tmpdir): library_type=ResultsType( relationship=StatesTypeRelationship.split_mates, ), + library_source=ResultsSource( + file_1=Source(short_name="hsapiens", taxon_id=9606), + file_2=Source(short_name="hsapiens", taxon_id=9606), + ), transcripts_file=FILE_TRANSCRIPTS, tmp_dir=tmpdir, )