From 31c6207d7b6dae9e53ae729234542fcaf5e24cd3 Mon Sep 17 00:00:00 2001 From: MatteoSchiavinato Date: Thu, 20 Apr 2023 10:32:30 +0200 Subject: [PATCH] Updated plotting script --- src/plot | 302 +++++++++++++++--- src/scripts/loh-bin-plots_one-ref.Rscript | 53 +++ ....Rscript => loh-bin-plots_two-ref.Rscript} | 2 +- 3 files changed, 312 insertions(+), 45 deletions(-) create mode 100644 src/scripts/loh-bin-plots_one-ref.Rscript rename src/scripts/{loh-bin-plots.Rscript => loh-bin-plots_two-ref.Rscript} (99%) diff --git a/src/plot b/src/plot index 60b2d73..955bd8f 100755 --- a/src/plot +++ b/src/plot @@ -41,22 +41,33 @@ if (sys.argv[1] in ["--help", "-h", "-help", "help", "getopt", "usage"]): Plot LOH propensity over all or specific chromosomes from multiple JLOH extract TSV output files -Usage: +Usage (one ref): +jloh plot --loh [options] + +Usage (two ref): jloh plot --loh ... --names ... [options] -[I/O] ---loh Input TSV file(s) containing LOH blocks from JLOH extract [!] +[one reference] +--one-ref Activate "one reference" mode [off] +--loh Input TSV file containing LOH blocks from JLOH extract [!] +--name Name to use in plot [!] +--het Input BED file containing het regions from JLOH extract [!] +--output-dir Output directory, created if not existitng [JLOH_plot_out] + +[two references] +--two-ref Activate "two references" mode [off] +--loh Input TSV files containing LOH blocks from JLOH extract [!] --names Names to use in plot for each of the files (same order) [!] +--ref-name Name to use in plot for REF allele [REF] +--alt-name Name to use in plot for ALT allele [ALT] --output-dir Output directory, created if not existitng [JLOH_plot_out] [parameters] --threads Number of parallel threads [12] ---ref-name Name to use in plot for REF allele [REF] ---alt-name Name to use in plot for ALT allele [ALT] --chr Restrict the analysis to this specific chromosome [off] --window-size Size of window for plotting, the shorter the slower [10000] -[ggplot2] +[ggplot2 options] --aspect-ratio Ratio between y / x for the output plot [0.35] --width Width (px) of the output plot [2000] --height Height (px) of the output plot [750] @@ -68,7 +79,11 @@ jloh plot --loh ... --names ... [options] # parser p = ap.ArgumentParser() +p.add_argument("--one-ref", action="store_true") +p.add_argument("--two-ref", action="store_true") p.add_argument("--loh", nargs="+") +p.add_argument("--het", type=str) +p.add_argument("--name", nargs="+") p.add_argument("--names", nargs="+") p.add_argument("--ref-name", default="REF") p.add_argument("--alt-name", default="ALT") @@ -87,6 +102,32 @@ ss = sys.exit pandarallel.initialize(progress_bar=False, nb_workers=args.threads, use_memory_fs=False) # functions +def check_conditions(args): + + if (args.one_ref and args.two_ref) or \ + (not args.one_ref and not args.two_ref): + sys.exit(""" + +ERROR: you should use either --one-ref or --two-ref + +""") + + if args.one_ref and not args.name: + sys.exit(""" + +ERROR: you should specify a --name for the --loh file when using --one-ref + +""") + + if args.two_ref and not (args.loh and args.names): + sys.exit(""" + +ERROR: when using --two-ref you should pass both the --loh files and the +--names you want to use with them in the plots + +""") + + def organize_input(loh, names): """ @@ -117,10 +158,10 @@ def organize_input(loh, names): return df -def split_into_windows(df_sub, w_size, q): +def split_into_windows(df_sub, w_size, q, args): """ - Last update: 16/01/2023 + Last update: 18/04/2023 """ start, end = df_sub["Start"], df_sub["End"] @@ -161,10 +202,97 @@ def split_into_windows(df_sub, w_size, q): sys.exit() -def count_ratios(df_sub, w_size): +def split_into_windows_het(df_het, w_size, q, args): """ - 16/01/2023 + Last update: 18/04/2023 + """ + + start, end = df_het["Start"], df_het["End"] + w_start = start - (start % w_size) + w_end = w_start + w_size + + if (w_start <= start <= w_end) and (w_start <= end <= w_end): + df_het["Window"] = (w_start, w_end) + df_het["LOH_pos"] = df_het["End"] - df_het["Start"] + q.put(df_het) + + elif (w_start <= start <= w_end) and (end > w_end): + out = [] + df_tmp = df_het.copy() + df_tmp.End = w_end + df_tmp["Window"] = (w_start, w_end) + df_tmp["LOH_pos"] = df_tmp["End"] - df_tmp["Start"] + out.append(df_tmp.copy()) + + while (end > w_end): + w_start += w_size + w_end += w_size + df_tmp.Start = max(start, w_start) + df_tmp.End = min(end, w_end) + df_tmp["Window"] = (w_start, w_end) + df_tmp["LOH_pos"] = df_tmp["End"] - df_tmp["Start"] + out.append(df_tmp.copy()) + + for x in out: + q.put(x) + + elif (start < w_start) and (start <= end <= w_end): + sys.stderr.write("ERROR: debug!\n") + sys.exit() + + elif (start < w_start) and (end > w_end): + sys.stderr.write("ERROR: debug!\n") + sys.exit() + + +def count_ratios_oneref(df_sub, w_size, df_het): + + """ + 18/04/2023 + """ + + sample = df_sub.loc[:,"Sample"].tolist()[0] + chrom = df_sub.loc[:,"Chromosome"].tolist()[0] + window = df_sub.loc[:,"Window"].tolist()[0] + + df_sub_het = df_het.loc[(df_het["Sample"]==sample) & + (df_het["Chromosome"]==chrom) & + (df_het["Window"]==window), :] + + + if df_sub_het.shape[0] > 0: + df_sub_het = df_sub_het.loc[:,["Het_pos"]].sum() + het_pos = df_sub_het["Het_pos"] + het_ratio = het_pos / w_size + + else: + het_pos = 0 + het_ratio = 0 + + if df_sub.shape[0] > 0: + df_sub = df_sub.loc[:,["LOH_pos"]].sum() + loh_pos = df_sub["LOH_pos"] + loh_ratio = loh_pos / w_size + + else: + loh_pos = 0 + loh_ratio = 0 + + s = pd.Series( + { + "Het_pos":het_pos, "Het_ratio":het_ratio, + "LOH_pos":loh_pos, "LOH_ratio":loh_ratio + } + ) + + return s + + +def count_ratios_tworef(df_sub, w_size): + + """ + 18/04/2023 """ window = df_sub.loc[:,"Window"].tolist()[0] @@ -174,7 +302,7 @@ def count_ratios(df_sub, w_size): df_sub["LOH_ratio"] = df_sub["LOH_pos"] / w_size df_sub["W_start"] = window[0] df_sub["W_end"] = window[1] - + for loh_type in ["REF", "ALT"]: if loh_type not in df_sub["Allele"].tolist(): df_sub = df_sub.append({"Allele":loh_type, "LOH_pos":0, "LOH_ratio":0, "W_start":window[0], "W_end":window[1]}, ignore_index=True) @@ -187,21 +315,21 @@ def count_ratios(df_sub, w_size): "LOH_ratio_ALT" : [df_sub.loc["ALT", "LOH_ratio"]] } df_sub = pd.DataFrame.from_dict(d) - return df_sub -def quantize_intervals(df, window_size, threads): +def quantize_intervals(df, window_size, threads, args): """ - Last update: 16/01/2023 + Last update: 18/04/2023 """ + # quantize windows pool = mp.Pool(processes=threads) q = mp.Manager().Queue() for index, row in df.iterrows(): - pool.apply_async(split_into_windows, args=(row, window_size, q)) + pool.apply_async(split_into_windows, args=(row, window_size, q, args)) pool.close() pool.join() @@ -212,34 +340,126 @@ def quantize_intervals(df, window_size, threads): df = pd.DataFrame(out).sort_values(by=["Sample", "Chromosome", "Window"]) - df = df.groupby(["Sample", "Chromosome", "Window"]).parallel_apply(count_ratios, (window_size)) + # read heterozygosity and quantize it + if args.one_ref: + pool = mp.Pool(processes=threads) + q = mp.Manager().Queue() + + df_het = pd.read_csv(args.het, sep="\t", usecols=[0,1,2]) + df_het.columns = ["Chromosome", "Start", "End"] + df_het.insert(loc=0, column="Sample", value=args.name[0]) + + for index, row in df_het.iterrows(): + pool.apply_async(split_into_windows_het, args=(row, window_size, q, args)) + + pool.close() + pool.join() + + out = [] + while q.empty() == False: + out.append(q.get()) + + df_het = pd.DataFrame(out).sort_values(by=["Sample", "Chromosome", "Window"]) + df_het = df_het.rename({"LOH_pos":"Het_pos"}, axis=1) + df_het = df_het.loc[:,["Sample", "Chromosome", "Window", "Het_pos"]] + + df = df.loc[:,["Sample", "Chromosome", "Window", "LOH_pos"]] + + # quantize ratios + if args.one_ref: + df = df.groupby(["Sample", "Chromosome", "Window"]).parallel_apply(lambda df_sub : count_ratios_oneref(df_sub, window_size, df_het)) + + elif args.two_ref: + df = df.groupby(["Sample", "Chromosome", "Window"]).parallel_apply(lambda df_sub : count_ratios_tworef(df_sub, window_size)) + return df -def reorganise_output_dataframe(df): +def reorganise_output_dataframe(df, args): """ - Last update: 16/01/2023 + Last update: 18/04/2023 """ - df = df.reset_index(drop=False) - df["W_start"] = df["Window"].str[0] - df["W_end"] = df["Window"].str[1] - df = df.loc[:,[ "Sample", "Chromosome", "W_start", "W_end", - "LOH_pos_REF", "LOH_ratio_REF", "LOH_pos_ALT", "LOH_ratio_ALT"]] + if args.one_ref: + df = df.reset_index(drop=False) + df["W_start"] = df["Window"].str[0] + df["W_end"] = df["Window"].str[1] + df = df.loc[:,[ "Sample", "Chromosome", "W_start", "W_end", + "Het_pos", "Het_ratio", "LOH_pos", "LOH_ratio"]] + + elif args.two_ref: + df = df.reset_index(drop=False) + df["W_start"] = df["Window"].str[0] + df["W_end"] = df["Window"].str[1] + df = df.loc[:,[ "Sample", "Chromosome", "W_start", "W_end", + "LOH_pos_REF", "LOH_ratio_REF", "LOH_pos_ALT", "LOH_ratio_ALT"]] return df +def run_one_ref_script(args): + + """ + 18/04/2023 + """ + + # plot + sys.stderr.write(f"[{at()}] Plotting\n") + src_dir = "/".join(sys.argv[0].split("/")[0:-1]) + "/" + "scripts" + cmd = " ".join([ + "Rscript", f"{src_dir}/loh-bin-plots_one-ref.Rscript", + f"{args.output_dir}/LOH_rate_by_window.tsv", + f"{args.output_dir}/plots", + str(args.aspect_ratio), + str(args.width), + str(args.height), + str(args.res), + str(args.name[0]) ]) + + sys.stderr.write(f"\nPlot command that was run:\n{cmd}\n\n") + os.system(cmd) + + +def run_two_ref_script(args): + + """ + 18/04/2023 + """ + + # plot + sys.stderr.write(f"[{at()}] Plotting\n") + src_dir = "/".join(sys.argv[0].split("/")[0:-1]) + "/" + "scripts" + cmd = " ".join([ + "Rscript", f"{src_dir}/loh-bin-plots_two-ref.Rscript", + f"{args.output_dir}/LOH_rate_by_window.tsv", + f"{args.output_dir}/plots", + str(args.aspect_ratio), + str(args.width), + str(args.height), + str(args.res), + args.ref_name, args.alt_name ]) + + sys.stderr.write(f"\nPlot command that was run:\n{cmd}\n\n") + os.system(cmd) + + def main(args): + # check conditions + check_conditions(args) + # organise workspace os.makedirs(args.output_dir, exist_ok=True) # create input table - sys.stderr.write(f"[{at()}] Reading input information\n") - df = organize_input(args.loh, args.names) + sys.stderr.write(f"[{at()}] Reading input information\n") + if args.one_ref: + df = organize_input(args.loh, args.name) + + elif args.two_ref: + df = organize_input(args.loh, args.names) # select specific chromosome if user wants if args.chr != "ALL": @@ -247,33 +467,27 @@ def main(args): df = df.loc[df["Chromosome"]==args.chr, :] if df.shape[0] == 0: - sys.stderr.write(f"ERROR: dataframe is empty after selecting only chromosome {args.chr}\n") + sys.stderr.write(f""" + +ERROR: dataframe is empty after selecting only chromosome {args.chr}\n + +""") ss() # quantize in windows sys.stderr.write(f"[{at()}] Quantizing in windows of {args.window_size} bp\n") - df = quantize_intervals(df, args.window_size, args.threads) - df = reorganise_output_dataframe(df) - + df = quantize_intervals(df, args.window_size, args.threads, args) + df = reorganise_output_dataframe(df, args) + # write table to output sys.stderr.write(f"[{at()}] Writing table to output\n") df.to_csv(f"{args.output_dir}/LOH_rate_by_window.tsv", sep="\t", header=True, index=False) - # plot - sys.stderr.write(f"[{at()}] Plotting\n") - src_dir = "/".join(sys.argv[0].split("/")[0:-1]) + "/" + "scripts" - cmd = " ".join([ - "Rscript", f"{src_dir}/loh-bin-plots.Rscript", - f"{args.output_dir}/LOH_rate_by_window.tsv", - f"{args.output_dir}/plots", - str(args.aspect_ratio), - str(args.width), - str(args.height), - str(args.res), - args.ref_name, args.alt_name ]) - - sys.stderr.write(f"\nPlot command that was run:\n{cmd}\n\n") - os.system(cmd) + if args.one_ref: + run_one_ref_script(args) + + elif args.two_ref: + run_two_ref_script(args) # main diff --git a/src/scripts/loh-bin-plots_one-ref.Rscript b/src/scripts/loh-bin-plots_one-ref.Rscript new file mode 100644 index 0000000..f3ab58e --- /dev/null +++ b/src/scripts/loh-bin-plots_one-ref.Rscript @@ -0,0 +1,53 @@ +#!/usr/bin/env Rscript + +# libraries +library(ggplot2) +library(reshape2) +library(hash) +library(scales) + +args <- commandArgs(trailingOnly=T) +infile <- args[1] +outdir <- args[2] +myRatio <- as.numeric(args[3]) +myWidth <- as.numeric(args[4]) +myHeight <- as.numeric(args[5]) +myRes <- as.numeric(args[6]) +myName <- args[7] + +# read input file +dir.create(outdir, showWarnings = FALSE) +df <- read.table(infile, header=T) + +# fix positions and names +df$W_start <- df$W_start / 1000000 +df$W_end <- df$W_end / 1000000 +df$Sample <- gsub("_.*", "", df$Sample) +df$Sample <- factor(as.character(df$Sample), levels=unique(as.character(df$Sample))) +df$LOH_propensity <- df$LOH_ratio - df$Het_ratio + +df$Chromosome <- factor(df$Chromosome, levels=unique(df$Chromosome)) + +x <- df[ , c("Chromosome", "W_end", "LOH_propensity")] + +P1 <- ggplot(data=x, aes(x=W_end, y=Chromosome, fill=LOH_propensity)) + +theme_bw() + +theme(aspect.ratio=myRatio, + axis.text=element_text(family="Helvetica", face="plain", size=8), + axis.title=element_text(family="Helvetica", face="plain", size=9), + legend.text=element_text(family="Helvetica", face="italic", size=7), + legend.title=element_text(family="Helvetica", face="plain", size=9), + plot.title=element_text(family="Helvetica", face="plain", size=9)) + +geom_tile() + +xlab("Position [Mbp]") + +scale_fill_gradient2(low="#f7c35c", mid="white", high="#EF6F6C", midpoint=0, + na.value = "grey50", breaks=c(-1,1), limits=c(-1,1), + labels=c("Het", "LOH"), + name="") + +ylab("") + +outfile <- paste(myName, "LOH", "png", sep=".") +out <- paste(outdir, outfile, sep="/") +png(out, height=myHeight, width=myWidth, res=myRes, units="px") +plot(P1) +dev.off() \ No newline at end of file diff --git a/src/scripts/loh-bin-plots.Rscript b/src/scripts/loh-bin-plots_two-ref.Rscript similarity index 99% rename from src/scripts/loh-bin-plots.Rscript rename to src/scripts/loh-bin-plots_two-ref.Rscript index d7dc89d..7245f6f 100755 --- a/src/scripts/loh-bin-plots.Rscript +++ b/src/scripts/loh-bin-plots_two-ref.Rscript @@ -50,7 +50,7 @@ for (chrom in Chroms) { legend.text=element_text(family="Helvetica", face="italic", size=7), legend.title=element_text(family="Helvetica", face="plain", size=9), plot.title=element_text(family="Helvetica", face="plain", size=9)) + - geom_raster() + + geom_tile() + xlab("Position [Mbp]") + scale_fill_gradient2(low="#EF6F6C", mid="white", high="#64B6AC", midpoint=0, na.value = "grey50", breaks=c(-1,0,1), limits=c(-1,1),