From 7fb27d86217a07986c1ba97d0328d72a71289f36 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Sun, 8 Sep 2024 14:11:51 +0300 Subject: [PATCH] switch to simple id distributor as thread safety is not needed across different processes --- src/common.py | 19 +++++++------------ src/gene_info.py | 7 +++---- src/graph_based_model_construction.py | 19 ++++++++----------- src/id_policy.py | 21 ++++++++++++++++----- src/isoform_assignment.py | 5 +++-- 5 files changed, 37 insertions(+), 34 deletions(-) diff --git a/src/common.py b/src/common.py index 9be377b6..c0df0348 100644 --- a/src/common.py +++ b/src/common.py @@ -9,7 +9,6 @@ import os import re import subprocess -import threading import math from collections import defaultdict from enum import Enum @@ -17,17 +16,6 @@ logger = logging.getLogger('IsoQuant') -class AtomicIDDistributor(object): - def __init__(self): - self.value = 0 - self._lock = threading.Lock() - - def increment(self): - with self._lock: - self.value += 1 - return self.value - - class CigarEvent(Enum): match = 0 insertion = 1 @@ -48,6 +36,13 @@ def get_ins_del_match_events(cls): return {cls.match, cls.insertion, cls.deletion, cls.seq_match, cls.seq_mismatch} +class TranscriptNaming: + transcript_prefix = "transcript" + novel_gene_prefix = "novel_gene_" + nic_transcript_suffix = ".nic" + nnic_transcript_suffix = ".nnic" + + # key, value def get_first_best_from_sorted(sorted_list_of_pairs): if not sorted_list_of_pairs: diff --git a/src/gene_info.py b/src/gene_info.py index 277f3c21..469bceae 100644 --- a/src/gene_info.py +++ b/src/gene_info.py @@ -16,11 +16,10 @@ equal_ranges, get_intron_strand, intervals_total_length, - is_subprofile, overlaps, - junctions_from_blocks, - AtomicIDDistributor + junctions_from_blocks ) +from .id_policy import SimpleIDDistributor logger = logging.getLogger('IsoQuant') @@ -121,7 +120,7 @@ def print_debug(self): # exon/intron info class FeatureInfo: - feature_id_counter = AtomicIDDistributor() + feature_id_counter = 
SimpleIDDistributor() def __init__(self, chr_id, start, end, strand, type, gene_ids): self.id = FeatureInfo.feature_id_counter.increment() self.chr_id = chr_id diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index d9f71c86..1273ac18 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -11,6 +11,7 @@ from enum import unique, Enum from .common import ( + TranscriptNaming, cmp, get_exons, intersection_len, @@ -46,10 +47,6 @@ class StrandnessReportingLevel(Enum): class GraphBasedModelConstructor: - transcript_prefix = "transcript" - novel_gene_prefix = "novel_gene_" - nic_transcript_suffix = ".nic" - nnic_transcript_suffix = ".nnic" detected_known_isoforms = set() extended_transcript_ids = set() @@ -411,7 +408,7 @@ def construct_fl_isoforms(self): transcript_range = (path[0][1], path[-1][1]) novel_exons = get_exons(transcript_range, list(intron_path)) count = self.path_storage.paths[path] - new_transcript_id = self.transcript_prefix + str(self.get_transcript_id()) + new_transcript_id = TranscriptNaming.transcript_prefix + str(self.get_transcript_id()) # logger.debug("uuu %s: %s" % (new_transcript_id, str(novel_exons))) reference_isoform = None @@ -482,17 +479,17 @@ def construct_fl_isoforms(self): transcript_gene = self.select_reference_gene(intron_path, transcript_range, transcript_strand) if transcript_gene is None: - transcript_gene = (GraphBasedModelConstructor.novel_gene_prefix + self.gene_info.chr_id + + transcript_gene = (TranscriptNaming.novel_gene_prefix + self.gene_info.chr_id + "_" + str(self.get_transcript_id())) elif transcript_strand == '.': transcript_strand = self.gene_info.gene_strands[transcript_gene] if all(intron in self.known_introns for intron in intron_path): transcript_type = TranscriptModelType.novel_in_catalog - id_suffix = self.nic_transcript_suffix + id_suffix = TranscriptNaming.nic_transcript_suffix else: transcript_type = 
TranscriptModelType.novel_not_in_catalog - id_suffix = self.nnic_transcript_suffix + id_suffix = TranscriptNaming.nnic_transcript_suffix new_model = TranscriptModel(self.gene_info.chr_id, transcript_strand, new_transcript_id + ".%s" % self.gene_info.chr_id + id_suffix, @@ -644,11 +641,11 @@ def generate_monoexon_from_clustered(self, clustered_reads, forward=True): strand = '+' if forward else '-' coordinates = (five_prime_pos, three_prime_pos) if forward else (three_prime_pos, five_prime_pos) - new_transcript_id = self.transcript_prefix + str(self.get_transcript_id()) - transcript_gene = (GraphBasedModelConstructor.novel_gene_prefix + self.gene_info.chr_id + + new_transcript_id = TranscriptNaming.transcript_prefix + str(self.get_transcript_id()) + transcript_gene = (TranscriptNaming.novel_gene_prefix + self.gene_info.chr_id + "_" + str(self.get_transcript_id())) transcript_type = TranscriptModelType.novel_not_in_catalog - id_suffix = self.nnic_transcript_suffix + id_suffix = TranscriptNaming.nnic_transcript_suffix is_valid = True half_len = interval_len(coordinates) / 2 diff --git a/src/id_policy.py b/src/id_policy.py index 029da66b..9c725830 100644 --- a/src/id_policy.py +++ b/src/id_policy.py @@ -3,9 +3,9 @@ # # All Rights Reserved # See file LICENSE for details. 
############################################################################ +import threading - -from .graph_based_model_construction import GraphBasedModelConstructor +from src.common import TranscriptNaming class SimpleIDDistributor(object): @@ -25,7 +25,7 @@ def __init__(self, genedb, chr_id): return for g in genedb.region(seqid=chr_id, start=1, featuretype="gene"): - if g.id.startswith(GraphBasedModelConstructor.novel_gene_prefix): + if g.id.startswith(TranscriptNaming.novel_gene_prefix): try: gene_num = int(g.id.split("_")[-1]) self.forbidden_ids.add(gene_num) @@ -34,9 +34,9 @@ def __init__(self, genedb, chr_id): except ValueError: pass - transcript_num_start_pos = len(GraphBasedModelConstructor.transcript_prefix) + transcript_num_start_pos = len(TranscriptNaming.transcript_prefix) for t in genedb.region(seqid=chr_id, start=1, featuretype=("transcript", "mRNA")): - if t.id.startswith(GraphBasedModelConstructor.transcript_prefix): + if t.id.startswith(TranscriptNaming.transcript_prefix): try: transcript_num = int(t.id.split(".")[0][transcript_num_start_pos:]) self.forbidden_ids.add(transcript_num) @@ -78,3 +78,14 @@ def get_id(self, chr_id, feature, strand): feature_id = self.id_dict[feature_tuple] return feature_id + + +class AtomicIDDistributor(object): + def __init__(self): + self.value = 0 + self._lock = threading.Lock() + + def increment(self): + with self._lock: + self.value += 1 + return self.value diff --git a/src/isoform_assignment.py b/src/isoform_assignment.py index 1f6250f1..db1ba011 100644 --- a/src/isoform_assignment.py +++ b/src/isoform_assignment.py @@ -7,7 +7,8 @@ import logging from enum import Enum, unique -from src.common import AtomicIDDistributor, junctions_from_blocks +from src.common import junctions_from_blocks +from src.id_policy import SimpleIDDistributor from src.serialization import * from src.polya_finder import PolyAInfo @@ -619,7 +620,7 @@ def serialize(self, outfile): class ReadAssignment: - assignment_id_generator = 
AtomicIDDistributor() + assignment_id_generator = SimpleIDDistributor() def __init__(self, read_id, assignment_type, match=None): self.assignment_id = ReadAssignment.assignment_id_generator.increment()