-
Notifications
You must be signed in to change notification settings - Fork 0
/
reads_per_ins_each_gene.py
46 lines (31 loc) · 1.46 KB
/
reads_per_ins_each_gene.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 4 16:14:01 2021
@author: floor
"""
import numpy as np
import pandas as pd
from ast import literal_eval
# Import pergene_insertions.txt
df = pd.read_csv(r"C:\Users\floor\PycharmProjects\LaanLab-SATAY-DataAnalysis\satay_analysis_testdata\Output_Processing_WT1_KornmannLab\ERR1533147_trimmed.sorted.bam_pergene_insertions.txt", sep = "\t")
df.columns = ["gene","chromosome", "start bp", "stop bp", "insertions", "reads"]
# Convert entire column to a list
df.loc[:,'insertions'] = df.loc[:,'insertions'].apply(lambda x: literal_eval(x))
df.loc[:,'reads'] = df.loc[:,'reads'].apply(lambda x: literal_eval(x))
# Compute the average number of reads per insertion for each gene by adding up all the reads in 1 gene
# and dividing them by the number of insertions
y = [ ]
for index, row in df.iterrows():
if row['insertions'] != []:
x = [row['gene'], sum(row['reads'])/len(row['reads'])]
y.append(x)
else:
x = [row['gene'], 0 ]
y.append(x)
gene_readsperins = pd.DataFrame(y)
gene_readsperins.columns = ["gene", "reads per insertions"]
del (y, x, index, row)
# compute the average reads per insertions over all the data.
# so we can compare this number with all the genes
avrg_reads_per_ins = gene_readsperins["reads per insertions"].sum()/len(gene_readsperins)
# compare to general reads per insertion