-
Notifications
You must be signed in to change notification settings - Fork 0
/
dupcrd.R
executable file
·34 lines (32 loc) · 1.35 KB
/
dupcrd.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#!/usr/bin/env Rscript
# dupcrd.R duplicate CSV read ... uses the output of csvrdm.c
# args <- commandArgs(trailingOnly = TRUE)
# numargs <- length(args)
# enumargs <- 1 # expected numargs
# if(numargs != enumargs) {
# print("This script processes the output of csvrdm ... needs one argument, a filename.")
# stop("Stopping right here")
# }
# csv <- read.csv("sigu.csv")
# csv <- read.csv("sigcsvrd.csv")
# this next one is where I corrected for av3->vsz < av4->vsz and av3->vsz>1, which was a problem.
# csv <- read.csv("sig22.csv")
# unfortunately a blight with the above was csvrdm retain a typeo GpgGrp instead of CpgGrp so
# yes, it was this that caused fewer genes in dupcrd.R
csv <- read.csv("sig222.csv") # hardcoded output of csvrdm.c
doon <- paste0(csv$Genename, "__", csv$CpgGrp)
doon2 <- paste0(csv$Genename, "__", csv$CpgGrp, "__", csv$Relation_to_Island)
d3 <- paste0(csv$Genegrp, "__", csv$Relation_to_Island)
# the duplicated annotation provide no extra information.
w <- which(!duplicated(doon))
w2 <- which(!duplicated(doon2))
w2m <- setdiff(w2,w)
w2mi <- which(!w2 %in% w)
csvw <- csv[w,]
csvw <- csvw[order(csvw$Genename),]
csvw2 <- csv[w2,]
csvw2 <- csvw2[order(csvw2$Genename),]
# first one ignores Relation_to_Island
write.csv(csvw, "sigcdd_.csv", quote=F, row.names=F)
# second csv does not.
write.csv(csvw2, "sigcdd3_.csv", quote=F, row.names=F)