-
Notifications
You must be signed in to change notification settings - Fork 0
/
cell_states_from_final.R
176 lines (68 loc) · 4.53 KB
/
cell_states_from_final.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#Hufsah Ashraf
#this script uses the background cell state for each segment in a cell using the 'final.txt' file generated by mosaiCatcher
#then translates the 'watson' and 'crick ' notations to 'forward' or 'inverted' based on the background cellstate, to have one 4 cell table per within inversion snp per single cell
#furthermore, combines all cells and outputs for each sample one 4 cell table per inversion
oldw <- getOption("warn")
options(warn = -1)
suppressMessages(library("optparse"))
suppressMessages(library(tidyverse))
option_list = list(
make_option(c("-f", "--file"), type="character", default=NULL,
help="final.txt files produced by mosaicatcher", metavar="character"),
make_option(c("-b", "--bed"), type="character", default=NULL,
help="output file snp_strand_counts from recuurence script", metavar="character"),
make_option(c("-g", "--gts"), type="character", default=NULL,
help="genotypes from the regenotyper", metavar="character"),
make_option(c("-o", "--outdir"), type="character", default="./outputcorr/",
help="output dir name [default= %default]", metavar="character")
);
opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);
states_link = opt$file
counts_link = opt$bed
outdir_raw = opt$outdir
states<- read.table(states_link, header=T)
snp_counts<- read.table(counts_link, header=T)
sample = str_match(states_link, "([HG|NA|GM]+[0-9]{3,5})")[,2]
print(sample)
# getting state info from 'final.txt' the same way mosaiCatcher does
# Assign strand states to all chromosomes (ignoring the position, for now)
snp_counts<-left_join(snp_counts, states, by = c("cell","chrom"))
snp_counts <- snp_counts[(snp_counts$inv_start >= snp_counts$start & snp_counts$inv_end <=snp_counts$end),]
# filter out the entries where class is anything other than CC and WW (it includes removing the cells where the class entries are NA or '?')
snp_counts_filtered<-data.frame(snp_counts[!is.na(snp_counts$class) & (snp_counts$class=='CC'| snp_counts$class=='WW'),])
# for WW cells, translate W-->forward and C-->inverted (and vice versa for CC)
snp_counts_filtered$strand<-NA
snp_counts_filtered$strand<-ifelse(((snp_counts_filtered$class=='CC'& snp_counts_filtered$orientation=='Crick')|
(snp_counts_filtered$class=='WW'& snp_counts_filtered$orientation=='Watson')),'forward','inverted')
#get consensus across cells
snp_counts_cons<-snp_counts_filtered %>% group_by(sample.x, chrom, inv_start, inv_end, inv_ID, snp_pos,ref_allele, alt_allele, seen_allele, strand, snp_AF) %>%
summarize(count = sum(count), NumFrames = n())
sub<-data.frame(snp_counts_cons$sample.x,snp_counts_cons$chrom, snp_counts_cons$inv_start, snp_counts_cons$inv_end, snp_counts_cons$inv_ID,
snp_counts_cons$snp_pos, snp_counts_cons$ref_allele, snp_counts_cons$alt_allele, snp_counts_cons$snp_AF)
#store each within inversion unique snp
unique<-sub[!duplicated(sub), ]
colnames(unique)<-c('sample','chrom','inv_start','inv_end', 'inv_ID','snp_pos','ref_allele','alt_allele', 'snp_AF')
#to get tables with absolute read counts
#(a)
unique$ref_fwd<-NA
unique$ref_inv<-NA
unique$alt_fwd<-NA
unique$alt_inv<-NA
snp_counts_cons<-data.frame(snp_counts_cons)#otherwise the column entries behave weirdly because of the grouping done above
for (i in 1:length(snp_counts_cons$chrom)){
index<-which(unique$sample==snp_counts_cons[i,'sample.x'] & unique$inv_ID==snp_counts_cons[i,'inv_ID']&
unique$snp_pos==snp_counts_cons[i,'snp_pos']&unique$ref_allele==snp_counts_cons[i,'ref_allele'] &
unique$alt_allele==snp_counts_cons[i,'alt_allele'])
if(snp_counts_cons[i,'seen_allele']==snp_counts_cons[i,'ref_allele'] & snp_counts_cons[i,'strand']=='forward'){
unique[index, 'ref_fwd']<-snp_counts_cons[i,'count']
}else if (snp_counts_cons[i,'seen_allele']==snp_counts_cons[i,'ref_allele'] & snp_counts_cons[i,'strand']=='inverted'){
unique[index, 'ref_inv']<-snp_counts_cons[i,'count']
}else if (snp_counts_cons[i,'seen_allele']==snp_counts_cons[i,'alt_allele'] & snp_counts_cons[i,'strand']=='forward'){
unique[index, 'alt_fwd']<-snp_counts_cons[i,'count']
}else if (snp_counts_cons[i,'seen_allele']==snp_counts_cons[i,'alt_allele'] & snp_counts_cons[i,'strand']=='inverted'){
unique[index, 'alt_inv']<-snp_counts_cons[i,'count']
}
}
unique[is.na(unique)]<-0
write.table(unique, file=outdir_raw, row.names = F, col.names = T, quote = F )