FIX: pass length_dis option to fetch_transcript_id_by_gene function

GenomiqueENS · Jan 4, 2025 · a324e80 · a324e80
1 parent fecd642
commit a324e80
Show file tree

Hide file tree

Showing 2 changed files with 13 additions and 83 deletions.
diff --git a/bin/template_maker.py b/bin/template_maker.py
@@ -201,7 +201,7 @@ def make_template(self, cell, umi_counts, transcripts_index, features, seed=2024
                 count=int(count)
                 if count > 0:
                     total_matrix_counts += count
-                    trns = trns if transcript_id else fetch_transcript_id_by_gene(trns, transcripts_index)
+                    trns = trns if transcript_id else fetch_transcript_id_by_gene(trns, transcripts_index, self.length_dist)
                     if trns:
                         if self.unspliced_ratio > 0 and random.random() < self.unspliced_ratio:
                                 cDNA = get_unspliced_sequence(trns,
@@ -282,19 +282,16 @@ def template_maker(args):
             sys.exit(1)
 
     if args.full_length:
-        length_dist = None
         truncation_model = None
-
     else :
-        try:
-            shape, loc, scale = args.length_dist.split(",")
-            length_dist = [float(x) for x in [shape, loc, scale]]
-        except:
-            length_dist = [0.37, 0.0, 824.94]
-
-    if not args.full_length:
         truncation_model = pd.read_csv(args.truncation_model)
 
+    try:
+        shape, loc, scale = args.length_dist.split(",")
+        length_dist = [float(x) for x in [shape, loc, scale]]
+    except:
+        length_dist = [0.37, 0.0, 824.94]
+
     dict_ref_structure = None
     IR_markov_model = None
     genome_fai = None

diff --git a/bin/toolkit.py b/bin/toolkit.py
@@ -12,62 +12,35 @@
 ont_3p = [0.46713188751686213, 0.16673847625320887, 0.03520399186107891, 0.01779148300706967, 0.011167330623367366, 0.007589634154594385, 0.0057037865834126626, 0.005360905206834167, 0.004714618927954307, 0.005824471804774172, 0.003999530793905735, 0.003578824368038042, 0.00447776008229153, 0.003863054982833373, 0.004495806470532503, 0.0033385818245800832, 0.003102850878182368, 0.0023415188742663016, 0.002119322719049316, 0.0030261537281582305, 0.002912235902387086, 0.002373100053688005, 0.003551754785676582, 0.007077567888256763, 0.00419578526602632, 0.002210682559519244, 0.004627770684544622, 0.002146392301410776, 0.0036092776481946843, 0.002611086798615842, 0.0025039363684350624, 0.003856287587243008, 0.0038607991843032513, 0.0028445619464834357, 0.003017130534037744, 0.004093146432905784, 0.002603191503760416, 0.002035858173434814, 0.003538219994495851, 0.003646498323941692, 0.004468736888171044, 0.0032054897113029043, 0.0035348362967006688, 0.0019862306057721376, 0.002995700448001588, 0.0024700993904832372, 0.002491529476519393, 0.002064055655061335, 0.003814555314435757, 0.0023054260977843546, 0.004129239209387731, 0.0024678435919531154, 0.0019704400160612855, 0.001982846907976955, 0.002121578517579438, 0.0027633531993990555, 0.00470559573383382, 0.0034919761246283566, 0.0025039363684350624, 0.004561224627906033, 0.0024982968721097582, 0.0020798462447721867, 0.002897573211941295, 0.0023437746727964235, 0.0026392842802423627, 0.0032111292076282085, 0.006500083464545616, 0.006404212027015444, 0.0049221523927255005, 0.0023291119823506323, 0.002362948960302458, 0.003120897266423342, 0.0026697375603990056, 0.0038326017026767308, 0.003440092758435558, 0.008549476429161158, 0.003399488384893368, 0.0025400291449170095, 0.002512959562555549, 0.001722302177747901, 0.002155415495531263, 0.00236633265809764, 0.0031242809642185237, 0.0027306441207122912, 0.0022715891198325294, 0.003955542722568362, 0.0019287077432540345, 0.001819301514543133, 0.003299105350302954, 0.005128557958231634, 0.0014031066857356834, 0.0016433492291936425, 0.0014741643394345165, 0.0014403273614826914, 0.0015813147696152963, 0.0037209396754357077, 0.0004489039074942139, 0.000371078858205016, 0.00019851027065070765, 4.286017207231188e-05, 0.0]
 ont_5p = [0.5807485641842356, 0.13738151418220534, 0.05887183003911555, 0.031937595589462714, 0.023225701666132793, 0.014341239155248566, 0.010606764688632128, 0.008649859463751573, 0.00788063216498008, 0.008692719635823885, 0.006399700429955199, 0.004554457232315667, 0.0038506480909177043, 0.004070588447604567, 0.0035111504121343915, 0.0030069794406521965, 0.002865992032519592, 0.0024012975353145264, 0.0024464135059169595, 0.004743944308845889, 0.002703574538350831, 0.0030520954112546305, 0.0021384970065553505, 0.0024520530022422637, 0.0018644174851455667, 0.0016072564527116957, 0.0018802080748564186, 0.002547924439772435, 0.0015249198063622543, 0.0016659072144948593, 0.002093381035952917, 0.0028253876589774012, 0.0018181736152780722, 0.0014234088725067786, 0.002099020532278221, 0.0011854221275789418, 0.0010275162304704242, 0.0011504572503620557, 0.0021892524734830883, 0.0013636302114585542, 0.0013884439952898927, 0.0010320278275306675, 0.0010884227907837095, 0.0010884227907837095, 0.001266630874663322, 0.0011075970782897437, 0.0010252604319403024, 0.0010184930363499375, 0.0008526918443859941, 0.0009000636135185493, 0.0008696103333619067, 0.0008831451245426367, 0.0008989357142534885, 0.0008921683186631235, 0.001091806488578892, 0.0009260052966149487, 0.00098127236060293, 0.0009429237855908614, 0.0011042133804945614, 0.0009654817708920781, 0.0008323896576148991, 0.0007365182200847278, 0.0006756116597714426, 0.0012790377665789913, 0.0007466693134702754, 0.000650797875940104, 0.0006271119913738264, 0.0006756116597714426, 0.0006429025810846782, 0.0010230046334101807, 0.0009891676554583558, 0.0009316447929402529, 0.0006000424090123664, 0.000609065603132853, 0.0005662054310605412, 0.0005797402222412712, 0.0005425195464942635, 0.0005154499641328033, 0.0005256010575183509, 0.0006282398906388872, 0.0008380291539402033, 0.0006598210700605907, 0.0005650775317954804, 0.0004049158361568412, 0.0004274738214580579, 0.0003879973471809286, 0.00049740357589183, 0.0004658223964701265, 0.0004116832317472062, 0.00032370708907246073, 0.0002594168309639929, 0.00023685884566277616, 0.0002436262412531412, 0.00020414976697601188, 0.0001917428750603426, 0.00012970841548199646, 0.00012632471768681395, 7.105765369883285e-05, 4.737176913255523e-05, 6.767395590365033e-06, 0.0]
 
-def adjust_lambda_cut(base_lambda, seq_len):
-    return base_lambda / seq_len
-
-
-def cut_sequence_proba(sequence, base_lambda=1):
-        lambda_param = adjust_lambda_cut(base_lambda, len(sequence))
-        prob = selection_probas(range(len(sequence)), lambda_param)
-        cut_length = np.random.choice(range(len(sequence)), p=prob[::-1])
-        return sequence[-cut_length:]
-
-
-def adjust_lambda(base_lambda, lengths):
-    std_dev = np.std(lengths)
-    return base_lambda / std_dev
-
-
-def selection_probas(lengths, lambda_param):
-    probabilities = np.exp(-lambda_param * np.array(lengths)**2)
-    return probabilities / probabilities.sum()
-
-
 def logN_probas(transcript_lengths, shape = 0.39146, scale = 830):
     probabilities = [stats.lognorm.pdf(length, shape, scale=scale) for length in transcript_lengths]
     return probabilities/sum(probabilities)
 
 
-def fetch_transcript_id_by_gene(gene, transcripts_index, base_lambda=0.001):
-    transcript_ids = transcripts_index.get(gene, [])
+def fetch_transcript_id_by_gene(gene, transcripts_index, length_dist, seed=2024):
+
+    shape, _, scale = length_dist
+
+    transcript_ids = transcripts_index.get(gene, None)
 
     if transcript_ids:
         trans_len_dict = {trns: length for trns, length in transcript_ids}
 
         lengths = list(trans_len_dict.values())
         transcripts = list(trans_len_dict.keys())
 
-        probabilities = logN_probas(lengths)
+        probabilities = logN_probas(lengths, shape, scale)
 
         if np.any(np.isnan(probabilities)):
             return random.choice(transcripts)
 
         selected_index = np.random.choice(range(len(lengths)), p=probabilities)
-
         return transcripts[selected_index]
 
     else:
         return None
 
 
-def fetch_transcript_id_by_gene_old(gene, transcripts_index):
-    transcript_ids = transcripts_index.get(gene, [])
-    if transcript_ids:
-        return transcript_ids[0]
-    else:
-        return None
-
-
 def truncate_cDNA(cDNA, probs_3p=ont_3p, probs_5p=ont_5p):
     cDNA_len = len(cDNA)
     bp_list=list(range(101))
@@ -193,46 +166,6 @@ def parse_gtf_small(gtf_file, index_by, protein_coding=False):
     return transcripts
 
 
-def parse_gtf_old(gtf_file, index_by, protein_coding=True):
-
-    transcripts = {}
-
-    with open(gtf_file, 'r') as f:
-        for line in f:
-            if line.startswith('#'):
-                continue
-
-            columns = line.strip().split("\t")
-            if len(columns) > 8:
-                Attribute = columns[8].replace('"', '')
-                feature = columns[2]
-                if feature != 'transcript': 
-                    continue
-
-                attributes = dict(item.strip().split(' ',1) for item in Attribute.strip().split(';') if item.strip())
-                transcript_id = attributes['transcript_id']
-                gene_id = attributes['gene_id']
-                gene_name = attributes['gene_name']
-                transcript_type = attributes.get('transcript_biotype', attributes.get('transcript_type', ''))
-
-                index_key = None
-                if index_by == 'gene_id':
-                    index_key = gene_id
-                elif index_by == 'gene_name':
-                    index_key = gene_name
-
-                if index_key:
-                    if index_key not in transcripts:
-                        transcripts[index_key] = []
-
-                    if not protein_coding:
-                        transcripts[index_key].append(transcript_id)
-                    elif protein_coding and transcript_type == 'protein_coding':
-                        transcripts[index_key].append(transcript_id)
-
-    return transcripts
-
-
 def custom_key_function(header):
     """Extrait l'identifiant sans la version."""
     return header.split('.')[0]