diff --git a/jloh b/jloh index d2b8e0a..6956309 100755 --- a/jloh +++ b/jloh @@ -41,26 +41,28 @@ if (sys.argv[1] in ["--help", "-h", "-help", "help", "getopt", "usage"]): Barcelona Supercomputing Center (BSC) 2023 - v0.22.1 + v0.23.0 #### -- Extraction + stats Estimate heterozygous and homozygous SNP statistics g2g Align two genomes to find regions that should carry SNPs extract Extract LOH blocks from VCF, BAM and FASTA files + + -- Operations filter Filter extracted LOH blocks intersect Perform intersection/removal operations with output files + cluster Cluster different runs by overlap chimeric Extract genes featuring LOH blocks from different haplotypes - - -- Calculations - stats Estimate heterozygous and homozygous SNP statistics junctions Calculate number of block-to-block junctions over the genome + -- Visualization + plot Make an LOH propensity plot from "extract" output file(s) + -- Simulation sim Simulate a divergent copy of a genome - -- Visualization - plot Make an LOH propensity plot from "extract" output file(s) """) else: diff --git a/src/cluster b/src/cluster new file mode 100755 index 0000000..37d0d0b --- /dev/null +++ b/src/cluster @@ -0,0 +1,223 @@ +#!/usr/bin/env python3 + +""" +### +JLOH - Inferring Loss of Heterozygosity Blocks from Short-read sequencing data + +Copyright (C) 2023 Matteo Schiavinato + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +### +""" + + +import argparse as ap +import sys +import pandas as pd +import pybedtools +from pybedtools import BedTool +import multiprocessing as mp +import scipy +import scipy.cluster +from time import asctime as at + + +ss = sys.exit + + +# help section +if len(sys.argv) == 1: + sys.argv.append("--help") + +if (sys.argv[1] in ["--help", "-h", "-help", "help", "getopt", "usage"]): + sys.stderr.write(""" + +Cluster different runs by overlap + +Usage: +jloh cluster [options] --loh ... + +[I/O/E] +--loh TSV files produced by "jloh extract" [!] +--out-prefix Prefix to use for output files [jloh_clust_out] + +[parameters] +--max-dist Maximum distance (float, 0-1) between elements in cluster [0.1] +--threads Number of parallel threads [4] + +""") + sys.exit(0) + +# parser +p = ap.ArgumentParser() +p.add_argument("--loh", nargs="+", required=True) +p.add_argument("--out-prefix", default="jloh_clust_out", type=str) +p.add_argument("--threads", default=4, type=int) +p.add_argument("--max-dist", default=0.1, type=float) +args = p.parse_args() + +# functions +def dump_queue(q): + + """ + 16/03/2022 + """ + + out = [] + while not q.empty(): + x = q.get() + out.append(x) + + return out + + +def read_input_files(Lohs): + + """ + 31/05/2023 + """ + + Bts = [(infile, BedTool([line for line in open(infile)])) for infile in Lohs] + return Bts + + +def get_jaccard_distance(bt1, infile_1, bt2, infile_2, queue): + + """ + 31/05/2023 + """ + + jaccard = bt1.jaccard(b=bt2)["jaccard"] + queue.put((infile_1, infile_2, jaccard)) + + +def get_distances(Bts, args): + + """ + 31/05/2023 + """ + + # calculate tot distances + tot_dists = len(Bts) * len(Bts) + + # create pool and queue for multiprocessing + pool = mp.Pool(processes=args.threads) + queue = mp.Manager().Queue() + + # for each file in the list of BedTool objects + # calculate distance from every other file including itself + counter = 0 + for i in range(0,len(Bts)): + infile_1 = Bts[i][0] + bt1 = Bts[i][1] + for k in range(0,len(Bts)): + counter += 1 + infile_2 = Bts[k][0] + bt2 = Bts[k][1] + pool.apply_async(get_jaccard_distance, args=(bt1, infile_1, bt2, infile_2, queue,)) + sys.stderr.write(f"[{at()}] Analysing pairwise distance {counter} of {tot_dists}\r") + + pool.close() + pool.join() + sys.stderr.write("\n") + + out = dump_queue(queue) + return out + + +def get_distance_matrix(Distances): + + """ + 31/05/2023 + """ + + Dist_dict = {} + + # create dictionary of dictionaries which will be used by + # pandas to create a matrix + for x in Distances: + if x[0] in Dist_dict.keys(): + Dist_dict[x[0]][x[1]] = x[2] + else: + Dist_dict[x[0]] = {x[1] : x[2]} + + # create pandas dataframe + df = pd.DataFrame(Dist_dict) + + # sort columns as row names so that the matrix is symmetrical + df = df.loc[: , df.index] + + # invert values to have diagonal 0 + # jaccard index measures proximity but to cluster we want distance + df = abs(df -1) + + return df + + +def get_clusters(df_dist, args): + + """ + 31/05/2023 + """ + + # get condensed matrix + df_dist_condensed = scipy.spatial.distance.squareform(df_dist) + + # obtain linkage between objects in matrix + clust = scipy.cluster.hierarchy.linkage(df_dist_condensed, method='single', metric='euclidean') + + # get belonging of items to clusters + clust = scipy.cluster.hierarchy.fcluster(clust, args.max_dist, criterion="distance") + + # subdivide objects by cluster belonging + df_clust = pd.DataFrame({"Sample":df_dist.index.tolist(), "Cluster":clust}) + df_clust = df_clust.sort_values(by="Cluster", ascending=True) + + return df_clust + + +def main(args): + + """ + 31/05/2023 + """ + + # read input files into BedTool objects + sys.stderr.write(f"[{at()}] Reading input files\n") + Bts = read_input_files(args.loh) + sys.stderr.write(f"[{at()}] Read {len(Bts)} files\n") + + # estimate distances using a jaccard index + sys.stderr.write(f"[{at()}] Calculating pairwise distances\n") + Distances = get_distances(Bts, args) + sys.stderr.write(f"[{at()}] Calculated {len(Distances)} pairwise distances\n") + + # convert into a distance matrix + sys.stderr.write(f"[{at()}] Getting a distance matrix\n") + df_dist = get_distance_matrix(Distances) + + # cluster samples based on distance matrix + sys.stderr.write(f"[{at()}] Inferring clusters from distance matrix\n") + df_clust = get_clusters(df_dist, args) + + # write to output + sys.stderr.write(f"[{at()}] Writing to output\n") + df_dist.round(3).reset_index(drop=False).to_csv(f"{args.out_prefix}.dist.tsv", index=False, header=True, sep="\t") + df_clust.to_csv(f"{args.out_prefix}.clust.tsv", index=False, header=True, sep="\t") + + sys.stderr.write(f"[{at()}] Done\n") + +# main +if __name__ == "__main__": + main(args) \ No newline at end of file diff --git a/src/plot b/src/plot index b1b11ed..844b8db 100755 --- a/src/plot +++ b/src/plot @@ -45,33 +45,22 @@ Usage: jloh plot --loh ... --names ... [options] -[one reference] ---one-ref Activate "one reference" mode [off] ---loh Input TSV file containing LOH blocks from JLOH extract [!] ---names Name to use in plot [!] +[modes] +--one-ref Plot LOH vs Het propensity (use --het in combination) [off] +--two-ref Plot LOH towards genome A vs B [off] + +[parameters] --het Input BED file containing het regions from JLOH extract [!] ---ref-name Name to use in plot for REF allele [REF] ---alt-name Name to use in plot for ALT allele [ALT] ---output-dir Output directory, created if not existitng [JLOH_plot_out] - -[two references] ---two-ref Activate "two references" mode [off] ---loh Input TSV files containing LOH blocks from JLOH extract [!] ---names Names to use in plot for each of the files (same order) [!] ---ref-name Name to use in plot for REF allele [REF] ---alt-name Name to use in plot for ALT allele [ALT] ---output-dir Output directory, created if not existitng [JLOH_plot_out] - -[by sample] + For "--one-ref" mode only --by-sample Use "by sample" mode (each plot is a different sample) [off] Plots will be organised by sample rather than by chromosome - -[parameters] +--clusters "clust" file from jloh cluster to define plotting order [off] --threads Number of parallel threads [12] --chr Restrict the analysis to this specific chromosome [off] --window-size Size of window for plotting, the shorter the slower [10000] -[ggplot2 options] +[R/ggplot2 options] +--r-exec R executable name to run plotting scripts [Rscript] --aspect-ratio Ratio between y / x for the output plot [0.35] --width Width (px) of the output plot [2000] --height Height (px) of the output plot [750] @@ -87,6 +76,7 @@ p.add_argument("--one-ref", action="store_true") p.add_argument("--two-ref", action="store_true") p.add_argument("--by-sample", action="store_true") p.add_argument("--loh", nargs="+") +p.add_argument("--clusters") p.add_argument("--het", nargs="+") p.add_argument("--name", nargs="+") p.add_argument("--names", nargs="+") @@ -96,6 +86,7 @@ p.add_argument("--chr", default="ALL") p.add_argument("--window-size", default=10000, type=int) p.add_argument("--threads", type=int, default=12) p.add_argument("--output-dir", default="JLOH_plot_out") +p.add_argument("--r-exec", default="Rscript", type=str) p.add_argument("--aspect-ratio", default=0.35, type=float) p.add_argument("--width", default=2000, type=int) p.add_argument("--height", default=750, type=int) @@ -117,11 +108,10 @@ ERROR: you should use either --one-ref or --two-ref """) - if (args.one_ref or args.two_ref) and not (args.loh and args.names): + if (args.one_ref or args.two_ref) and not (args.loh): sys.exit(""" -ERROR: when using --two-ref you should pass both the --loh files and the ---names you want to use with them in the plots +ERROR: No --loh files were passed """) @@ -203,6 +193,79 @@ def organise_input(loh, names, het, run_mode): return df, df_het +def sort_input_files(args): + + """ + 31/05/2023 + """ + + if args.one_ref: + + if args.clusters: + + df_clusters = pd.read_csv(args.clusters, header="infer", sep="\t") + sorting_order = df_clusters["Sample"].tolist() + + if args.names: + Loh = [] + Names = [] + Het = [] + for x in sorting_order: + Loh.append(x) + idx = args.loh.index(x) + Names.append(args.names[idx]) + Het.append(args.het[idx]) + else: + Loh = sorting_order + Names = sorting_order + Het = [] + for x in sorting_order: + idx = args.loh.index(x) + Het.append(args.het[idx]) + + else: + if args.names: + Loh = args.loh + Names = args.names + Het = args.het + else: + Loh = args.loh + Names = args.loh + Het = args.het + + elif args.two_ref: + + Het = None + + if args.clusters: + + df_clusters = pd.read_csv(args.clusters, header="infer", sep="\t") + sorting_order = df_clusters["Sample"].tolist() + + if args.names: + Loh = [] + Names = [] + for x in sorting_order: + Loh.append(x) + idx = args.loh.index(x) + Names.append(args.names[idx]) + else: + Loh = sorting_order + Names = sorting_order + for x in sorting_order: + idx = args.loh.index(x) + + else: + if args.names: + Loh = args.loh + Names = args.names + else: + Loh = args.loh + Names = args.loh + + return Loh, Names, Het + + def select_specific_chromosomes(df, df_het, args, run_mode): """ @@ -709,7 +772,7 @@ def fill_missing_windows(df, run_mode, args): -def run_one_ref_script(loh_table, args): +def run_one_ref_script(loh_table, Names, args): """ 12/05/2023 @@ -724,12 +787,12 @@ def run_one_ref_script(loh_table, args): sys.stderr.write(f"[{at()}] Plotting\n") src_dir = "/".join(sys.argv[0].split("/")[0:-1]) + "/" + "scripts" cmd = " ".join([ - "Rscript", + str(args.r_exec), f"{src_dir}/loh-bin-plots_one-ref.Rscript", str(loh_table), f"{args.output_dir}/plots", str(plot_type), - str(",".join(args.names)), + str(",".join(Names)), str(args.aspect_ratio), str(args.width), str(args.height), @@ -740,7 +803,7 @@ def run_one_ref_script(loh_table, args): os.system(cmd) -def run_two_ref_script(loh_table, args): +def run_two_ref_script(loh_table, Names, args): """ 12/05/2023 @@ -755,11 +818,11 @@ def run_two_ref_script(loh_table, args): sys.stderr.write(f"[{at()}] Plotting\n") src_dir = "/".join(sys.argv[0].split("/")[0:-1]) + "/" + "scripts" cmd = " ".join([ - "Rscript", f"{src_dir}/loh-bin-plots_two-ref.Rscript", + str(args.r_exec), f"{src_dir}/loh-bin-plots_two-ref.Rscript", str(loh_table), f"{args.output_dir}/plots", str(plot_type), - str(",".join(args.names)), + str(",".join(Names)), str(args.aspect_ratio), str(args.width), str(args.height), @@ -773,13 +836,16 @@ def run_two_ref_script(loh_table, args): def run_oneref_mode(args): """ - 12/05/2023 + 31/05/2023 """ - # read input information and put it into a single dataframe + # read input files and names and sort them by cluster if provided + Loh, Names, Het = sort_input_files(args) + + # put information into a single dataframe # create another one for heterozygosity files sys.stderr.write(f"[{at()}] Reading input information\n") - df, df_het = organise_input(args.loh, args.names, args.het, "one_ref") + df, df_het = organise_input(Loh, Names, Het, "one_ref") # select specific chromosome if user wants if args.chr: @@ -795,26 +861,30 @@ def run_oneref_mode(args): # fill the gaps df = fill_missing_windows(df, "one_ref", args) - + # write table to output sys.stderr.write(f"[{at()}] Writing table to output\n") loh_table = f"{args.output_dir}/LOH_rate.tsv" df.to_csv(loh_table, sep="\t", header=True, index=False) - run_one_ref_script(loh_table, args) + run_one_ref_script(loh_table, Names, args) def run_tworef_mode(args): """ - 12/05/2023 + 31/05/2023 """ + # read input files and names and sort them by cluster if provided + # het is useless, and in fact assigned to "None" by the function in two_ref mode + Loh, Names, Het = sort_input_files(args) + # read input information and put it into a single dataframe # ignore heterozygosity sys.stderr.write(f"[{at()}] Reading input information\n") - df, df_het = organise_input(args.loh, args.names, None, "two_ref") + df, df_het = organise_input(Loh, Names, Het, "two_ref") # select specific chromosome if user wants if args.chr: @@ -836,7 +906,7 @@ def run_tworef_mode(args): loh_table = f"{args.output_dir}/LOH_rate.tsv" df.to_csv(loh_table, sep="\t", header=True, index=False) - run_two_ref_script(loh_table, args) + run_two_ref_script(loh_table, Names, args) diff --git a/src/scripts/loh-bin-plots_one-ref.Rscript b/src/scripts/loh-bin-plots_one-ref.Rscript index a646af3..1a22d65 100755 --- a/src/scripts/loh-bin-plots_one-ref.Rscript +++ b/src/scripts/loh-bin-plots_one-ref.Rscript @@ -32,8 +32,7 @@ plotOrder <- rev(unlist(strsplit(plotOrder, ",", fixed=T))) # fix positions and names df$W_start <- df$W_start / 1000000 -df$W_end <- df$W_end / 1000000 -df$Sample <- gsub("_.*", "", df$Sample) +df$W_end <- df$W_end / 1000000 df$Sample <- factor(as.character(df$Sample), levels=plotOrder) df$LOH_propensity <- df$LOH_ratio - df$Het_ratio diff --git a/src/scripts/loh-bin-plots_two-ref.Rscript b/src/scripts/loh-bin-plots_two-ref.Rscript index df13fed..9d64c16 100755 --- a/src/scripts/loh-bin-plots_two-ref.Rscript +++ b/src/scripts/loh-bin-plots_two-ref.Rscript @@ -33,7 +33,6 @@ plotOrder <- rev(unlist(strsplit(plotOrder, ",", fixed=T))) # fix positions and names df$W_start <- df$W_start / 1000000 df$W_end <- df$W_end / 1000000 -df$Sample <- gsub("_.*", "", df$Sample) df$Sample <- factor(as.character(df$Sample), levels=plotOrder) df$LOH_ratio <- df$LOH_ratio_ALT - df$LOH_ratio_REF