From aa281c03a04a9eb199bb80f219a6ca69a9963d71 Mon Sep 17 00:00:00 2001 From: Shifu Chen Date: Mon, 21 Feb 2022 14:07:30 +0800 Subject: [PATCH] remove scripts/gen_fusion_file.jl since it's no longer useful --- scripts/gen_fusion_file.jl | 119 ------------------------------------- 1 file changed, 119 deletions(-) delete mode 100644 scripts/gen_fusion_file.jl diff --git a/scripts/gen_fusion_file.jl b/scripts/gen_fusion_file.jl deleted file mode 100644 index 01aaeaf..0000000 --- a/scripts/gen_fusion_file.jl +++ /dev/null @@ -1,119 +0,0 @@ -using OpenGene -using OpenGene.Reference -using ArgParse - -function main(args) - - s = ArgParseSettings(description = "generate a fusion CSV file for given gene list") - - @add_arg_table s begin - "--genes", "-g" - help = "the file giving gene list (separated by space, tab or line break)" - required = true - "--ref", "-r" - help = "the reference genome to use (hg19/hg37/grch38)" - required = true - "--fusion", "-f" - help = "the fusion file to output" - required = true - end - - options = parse_args(s) # the result is a Dict{String,Any} - detect(options["ref"], options["genes"], options["fusion"]) -end - -function detect(ref, genes, fusion) - ref = lowercase(ref) - if !(ref in ["hg19", "grch37", "grch38"]) - println("reference <-f/--ref> should be one of hg19/grch37/grch38") - return - end - input = open(genes) - output = open(fusion, "w") - genestr = readstring(input) - allgenes = split(genestr) - - println("loading gencode " * ref) - index = gencode_load(ref) - println("loading done") - - processed_genes = [] - - passed_num = 0 - for genestr in allgenes - genelist = split(genestr, '-') - for genename in genelist - if genename in processed_genes - #println("----skipped ", genename, ", already processed") - continue - end - gene_attrs = split(genename, '_') - gene = gene_attrs[1] - transcript = "" - if length(gene_attrs)>1 - transcript = gene_attrs[2] - end - gene_matches = gencode_genes(index, gene) - if length(gene_matches)==0 - #println("----skipped ", genename, ", gene name not found in gencode") - continue - end - println(genename) - g = gene_matches[1] - - # get the correct transcript if it is specified - target_tr = nothing - if transcript!="" - for tr in g.transcripts - if startswith(tr.id, transcript) - target_tr = tr - break - end - end - if target_tr == nothing - #println("----skipped ", genename, ", transcript not found") - end - end - - # get the 001 transcript - if target_tr == nothing - for tr in g.transcripts - tname = tr.attributes["transcript_name"] - if endswith(tname, "001\"") - target_tr = tr - break - end - end - end - - # get the longest transcript if it is not specified - if target_tr == nothing - longest = 0 - for tr in g.transcripts - if length(tr.exons)>longest - longest = length(tr.exons) - target_tr = tr - end - end - end - - write(output, ">", gene, "_", target_tr.id, ",", g.chr, ":", string(g.start_pos), "-", string(g.end_pos), "\n") - - exons = target_tr.exons - id = 1 - for e in exons - write(output, string(id), ",", string(e.start_pos), ",", string(e.end_pos), "\n") - id += 1 - end - passed_num += 1 - write(output, "\n") - push!(processed_genes, genename) - end - end - println("successfully processed genes: ", passed_num) - flush(output) -end - -# REPL is not supported - -main(ARGS)