diff --git a/jloh b/jloh
index d2b8e0a..6956309 100755
--- a/jloh
+++ b/jloh
@@ -41,26 +41,28 @@ if (sys.argv[1] in ["--help", "-h", "-help", "help", "getopt", "usage"]):
Barcelona Supercomputing Center (BSC)
2023
- v0.22.1
+ v0.23.0
####
-- Extraction
+ stats Estimate heterozygous and homozygous SNP statistics
g2g Align two genomes to find regions that should carry SNPs
extract Extract LOH blocks from VCF, BAM and FASTA files
+
+ -- Operations
filter Filter extracted LOH blocks
intersect Perform intersection/removal operations with output files
+ cluster Cluster different runs by overlap
chimeric Extract genes featuring LOH blocks from different haplotypes
-
- -- Calculations
- stats Estimate heterozygous and homozygous SNP statistics
junctions Calculate number of block-to-block junctions over the genome
+ -- Visualization
+ plot Make an LOH propensity plot from "extract" output file(s)
+
-- Simulation
sim Simulate a divergent copy of a genome
- -- Visualization
- plot Make an LOH propensity plot from "extract" output file(s)
""")
else:
diff --git a/src/cluster b/src/cluster
new file mode 100755
index 0000000..37d0d0b
--- /dev/null
+++ b/src/cluster
@@ -0,0 +1,223 @@
+#!/usr/bin/env python3
+
+"""
+###
+JLOH - Inferring Loss of Heterozygosity Blocks from Short-read sequencing data
+
+Copyright (C) 2023 Matteo Schiavinato
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see .
+###
+"""
+
+
+import argparse as ap
+import sys
+import pandas as pd
+import pybedtools
+from pybedtools import BedTool
+import multiprocessing as mp
+import scipy
+import scipy.cluster
+from time import asctime as at
+
+
+ss = sys.exit
+
+
+# help section
+if len(sys.argv) == 1:
+ sys.argv.append("--help")
+
+if (sys.argv[1] in ["--help", "-h", "-help", "help", "getopt", "usage"]):
+ sys.stderr.write("""
+
+Cluster different runs by overlap
+
+Usage:
+jloh cluster [options] --loh ...
+
+[I/O/E]
+--loh TSV files produced by "jloh extract" [!]
+--out-prefix Prefix to use for output files [jloh_clust_out]
+
+[parameters]
+--max-dist Maximum distance (float, 0-1) between elements in cluster [0.1]
+--threads Number of parallel threads [4]
+
+""")
+ sys.exit(0)
+
+# parser
+p = ap.ArgumentParser()
+p.add_argument("--loh", nargs="+", required=True)
+p.add_argument("--out-prefix", default="jloh_clust_out", type=str)
+p.add_argument("--threads", default=4, type=int)
+p.add_argument("--max-dist", default=0.1, type=float)
+args = p.parse_args()
+
+# functions
+def dump_queue(q):
+
+ """
+ 16/03/2022
+ """
+
+ out = []
+ while not q.empty():
+ x = q.get()
+ out.append(x)
+
+ return out
+
+
+def read_input_files(Lohs):
+
+ """
+ 31/05/2023
+ """
+
+ Bts = [(infile, BedTool([line for line in open(infile)])) for infile in Lohs]
+ return Bts
+
+
+def get_jaccard_distance(bt1, infile_1, bt2, infile_2, queue):
+
+ """
+ 31/05/2023
+ """
+
+ jaccard = bt1.jaccard(b=bt2)["jaccard"]
+ queue.put((infile_1, infile_2, jaccard))
+
+
+def get_distances(Bts, args):
+
+ """
+ 31/05/2023
+ """
+
+ # calculate tot distances
+ tot_dists = len(Bts) * len(Bts)
+
+ # create pool and queue for multiprocessing
+ pool = mp.Pool(processes=args.threads)
+ queue = mp.Manager().Queue()
+
+ # for each file in the list of BedTool objects
+ # calculate distance from every other file including itself
+ counter = 0
+ for i in range(0,len(Bts)):
+ infile_1 = Bts[i][0]
+ bt1 = Bts[i][1]
+ for k in range(0,len(Bts)):
+ counter += 1
+ infile_2 = Bts[k][0]
+ bt2 = Bts[k][1]
+ pool.apply_async(get_jaccard_distance, args=(bt1, infile_1, bt2, infile_2, queue,))
+ sys.stderr.write(f"[{at()}] Analysing pairwise distance {counter} of {tot_dists}\r")
+
+ pool.close()
+ pool.join()
+ sys.stderr.write("\n")
+
+ out = dump_queue(queue)
+ return out
+
+
+def get_distance_matrix(Distances):
+
+ """
+ 31/05/2023
+ """
+
+ Dist_dict = {}
+
+ # create dictionary of dictionaries which will be used by
+ # pandas to create a matrix
+ for x in Distances:
+ if x[0] in Dist_dict.keys():
+ Dist_dict[x[0]][x[1]] = x[2]
+ else:
+ Dist_dict[x[0]] = {x[1] : x[2]}
+
+ # create pandas dataframe
+ df = pd.DataFrame(Dist_dict)
+
+ # sort columns as row names so that the matrix is symmetrical
+ df = df.loc[: , df.index]
+
+ # invert values to have diagonal 0
+ # jaccard index measures proximity but to cluster we want distance
+ df = abs(df -1)
+
+ return df
+
+
+def get_clusters(df_dist, args):
+
+ """
+ 31/05/2023
+ """
+
+ # get condensed matrix
+ df_dist_condensed = scipy.spatial.distance.squareform(df_dist)
+
+ # obtain linkage between objects in matrix
+ clust = scipy.cluster.hierarchy.linkage(df_dist_condensed, method='single', metric='euclidean')
+
+ # get belonging of items to clusters
+ clust = scipy.cluster.hierarchy.fcluster(clust, args.max_dist, criterion="distance")
+
+ # subdivide objects by cluster belonging
+ df_clust = pd.DataFrame({"Sample":df_dist.index.tolist(), "Cluster":clust})
+ df_clust = df_clust.sort_values(by="Cluster", ascending=True)
+
+ return df_clust
+
+
+def main(args):
+
+ """
+ 31/05/2023
+ """
+
+ # read input files into BedTool objects
+ sys.stderr.write(f"[{at()}] Reading input files\n")
+ Bts = read_input_files(args.loh)
+ sys.stderr.write(f"[{at()}] Read {len(Bts)} files\n")
+
+ # estimate distances using a jaccard index
+ sys.stderr.write(f"[{at()}] Calculating pairwise distances\n")
+ Distances = get_distances(Bts, args)
+ sys.stderr.write(f"[{at()}] Calculated {len(Distances)} pairwise distances\n")
+
+ # convert into a distance matrix
+ sys.stderr.write(f"[{at()}] Getting a distance matrix\n")
+ df_dist = get_distance_matrix(Distances)
+
+ # cluster samples based on distance matrix
+ sys.stderr.write(f"[{at()}] Inferring clusters from distance matrix\n")
+ df_clust = get_clusters(df_dist, args)
+
+ # write to output
+ sys.stderr.write(f"[{at()}] Writing to output\n")
+ df_dist.round(3).reset_index(drop=False).to_csv(f"{args.out_prefix}.dist.tsv", index=False, header=True, sep="\t")
+ df_clust.to_csv(f"{args.out_prefix}.clust.tsv", index=False, header=True, sep="\t")
+
+ sys.stderr.write(f"[{at()}] Done\n")
+
+# main
+if __name__ == "__main__":
+ main(args)
\ No newline at end of file
diff --git a/src/plot b/src/plot
index b1b11ed..844b8db 100755
--- a/src/plot
+++ b/src/plot
@@ -45,33 +45,22 @@ Usage:
jloh plot --loh ... --names ... [options]
-[one reference]
---one-ref Activate "one reference" mode [off]
---loh Input TSV file containing LOH blocks from JLOH extract [!]
---names Name to use in plot [!]
+[modes]
+--one-ref Plot LOH vs Het propensity (use --het in combination) [off]
+--two-ref Plot LOH towards genome A vs B [off]
+
+[parameters]
--het Input BED file containing het regions from JLOH extract [!]
---ref-name Name to use in plot for REF allele [REF]
---alt-name Name to use in plot for ALT allele [ALT]
---output-dir Output directory, created if not existitng [JLOH_plot_out]
-
-[two references]
---two-ref Activate "two references" mode [off]
---loh Input TSV files containing LOH blocks from JLOH extract [!]
---names Names to use in plot for each of the files (same order) [!]
---ref-name Name to use in plot for REF allele [REF]
---alt-name Name to use in plot for ALT allele [ALT]
---output-dir Output directory, created if not existitng [JLOH_plot_out]
-
-[by sample]
+ For "--one-ref" mode only
--by-sample Use "by sample" mode (each plot is a different sample) [off]
Plots will be organised by sample rather than by chromosome
-
-[parameters]
+--clusters "clust" file from jloh cluster to define plotting order [off]
--threads Number of parallel threads [12]
--chr Restrict the analysis to this specific chromosome [off]
--window-size Size of window for plotting, the shorter the slower [10000]
-[ggplot2 options]
+[R/ggplot2 options]
+--r-exec R executable name to run plotting scripts [Rscript]
--aspect-ratio Ratio between y / x for the output plot [0.35]
--width Width (px) of the output plot [2000]
--height Height (px) of the output plot [750]
@@ -87,6 +76,7 @@ p.add_argument("--one-ref", action="store_true")
p.add_argument("--two-ref", action="store_true")
p.add_argument("--by-sample", action="store_true")
p.add_argument("--loh", nargs="+")
+p.add_argument("--clusters")
p.add_argument("--het", nargs="+")
p.add_argument("--name", nargs="+")
p.add_argument("--names", nargs="+")
@@ -96,6 +86,7 @@ p.add_argument("--chr", default="ALL")
p.add_argument("--window-size", default=10000, type=int)
p.add_argument("--threads", type=int, default=12)
p.add_argument("--output-dir", default="JLOH_plot_out")
+p.add_argument("--r-exec", default="Rscript", type=str)
p.add_argument("--aspect-ratio", default=0.35, type=float)
p.add_argument("--width", default=2000, type=int)
p.add_argument("--height", default=750, type=int)
@@ -117,11 +108,10 @@ ERROR: you should use either --one-ref or --two-ref
""")
- if (args.one_ref or args.two_ref) and not (args.loh and args.names):
+ if (args.one_ref or args.two_ref) and not (args.loh):
sys.exit("""
-ERROR: when using --two-ref you should pass both the --loh files and the
---names you want to use with them in the plots
+ERROR: No --loh files were passed
""")
@@ -203,6 +193,79 @@ def organise_input(loh, names, het, run_mode):
return df, df_het
+def sort_input_files(args):
+
+ """
+ 31/05/2023
+ """
+
+ if args.one_ref:
+
+ if args.clusters:
+
+ df_clusters = pd.read_csv(args.clusters, header="infer", sep="\t")
+ sorting_order = df_clusters["Sample"].tolist()
+
+ if args.names:
+ Loh = []
+ Names = []
+ Het = []
+ for x in sorting_order:
+ Loh.append(x)
+ idx = args.loh.index(x)
+ Names.append(args.names[idx])
+ Het.append(args.het[idx])
+ else:
+ Loh = sorting_order
+ Names = sorting_order
+ Het = []
+ for x in sorting_order:
+ idx = args.loh.index(x)
+ Het.append(args.het[idx])
+
+ else:
+ if args.names:
+ Loh = args.loh
+ Names = args.names
+ Het = args.het
+ else:
+ Loh = args.loh
+ Names = args.loh
+ Het = args.het
+
+ elif args.two_ref:
+
+ Het = None
+
+ if args.clusters:
+
+ df_clusters = pd.read_csv(args.clusters, header="infer", sep="\t")
+ sorting_order = df_clusters["Sample"].tolist()
+
+ if args.names:
+ Loh = []
+ Names = []
+ for x in sorting_order:
+ Loh.append(x)
+ idx = args.loh.index(x)
+ Names.append(args.names[idx])
+ else:
+ Loh = sorting_order
+ Names = sorting_order
+ for x in sorting_order:
+ idx = args.loh.index(x)
+
+ else:
+ if args.names:
+ Loh = args.loh
+ Names = args.names
+ else:
+ Loh = args.loh
+ Names = args.loh
+
+ return Loh, Names, Het
+
+
def select_specific_chromosomes(df, df_het, args, run_mode):
"""
@@ -709,7 +772,7 @@ def fill_missing_windows(df, run_mode, args):
-def run_one_ref_script(loh_table, args):
+def run_one_ref_script(loh_table, Names, args):
"""
12/05/2023
@@ -724,12 +787,12 @@ def run_one_ref_script(loh_table, args):
sys.stderr.write(f"[{at()}] Plotting\n")
src_dir = "/".join(sys.argv[0].split("/")[0:-1]) + "/" + "scripts"
cmd = " ".join([
- "Rscript",
+ str(args.r_exec),
f"{src_dir}/loh-bin-plots_one-ref.Rscript",
str(loh_table),
f"{args.output_dir}/plots",
str(plot_type),
- str(",".join(args.names)),
+ str(",".join(Names)),
str(args.aspect_ratio),
str(args.width),
str(args.height),
@@ -740,7 +803,7 @@ def run_one_ref_script(loh_table, args):
os.system(cmd)
-def run_two_ref_script(loh_table, args):
+def run_two_ref_script(loh_table, Names, args):
"""
12/05/2023
@@ -755,11 +818,11 @@ def run_two_ref_script(loh_table, args):
sys.stderr.write(f"[{at()}] Plotting\n")
src_dir = "/".join(sys.argv[0].split("/")[0:-1]) + "/" + "scripts"
cmd = " ".join([
- "Rscript", f"{src_dir}/loh-bin-plots_two-ref.Rscript",
+ str(args.r_exec), f"{src_dir}/loh-bin-plots_two-ref.Rscript",
str(loh_table),
f"{args.output_dir}/plots",
str(plot_type),
- str(",".join(args.names)),
+ str(",".join(Names)),
str(args.aspect_ratio),
str(args.width),
str(args.height),
@@ -773,13 +836,16 @@ def run_two_ref_script(loh_table, args):
def run_oneref_mode(args):
"""
- 12/05/2023
+ 31/05/2023
"""
- # read input information and put it into a single dataframe
+ # read input files and names and sort them by cluster if provided
+ Loh, Names, Het = sort_input_files(args)
+
+ # put information into a single dataframe
# create another one for heterozygosity files
sys.stderr.write(f"[{at()}] Reading input information\n")
- df, df_het = organise_input(args.loh, args.names, args.het, "one_ref")
+ df, df_het = organise_input(Loh, Names, Het, "one_ref")
# select specific chromosome if user wants
if args.chr:
@@ -795,26 +861,30 @@ def run_oneref_mode(args):
# fill the gaps
df = fill_missing_windows(df, "one_ref", args)
-
+
# write table to output
sys.stderr.write(f"[{at()}] Writing table to output\n")
loh_table = f"{args.output_dir}/LOH_rate.tsv"
df.to_csv(loh_table, sep="\t", header=True, index=False)
- run_one_ref_script(loh_table, args)
+ run_one_ref_script(loh_table, Names, args)
def run_tworef_mode(args):
"""
- 12/05/2023
+ 31/05/2023
"""
+ # read input files and names and sort them by cluster if provided
+ # het is useless, and in fact assigned to "None" by the function in two_ref mode
+ Loh, Names, Het = sort_input_files(args)
+
# read input information and put it into a single dataframe
# ignore heterozygosity
sys.stderr.write(f"[{at()}] Reading input information\n")
- df, df_het = organise_input(args.loh, args.names, None, "two_ref")
+ df, df_het = organise_input(Loh, Names, Het, "two_ref")
# select specific chromosome if user wants
if args.chr:
@@ -836,7 +906,7 @@ def run_tworef_mode(args):
loh_table = f"{args.output_dir}/LOH_rate.tsv"
df.to_csv(loh_table, sep="\t", header=True, index=False)
- run_two_ref_script(loh_table, args)
+ run_two_ref_script(loh_table, Names, args)
diff --git a/src/scripts/loh-bin-plots_one-ref.Rscript b/src/scripts/loh-bin-plots_one-ref.Rscript
index a646af3..1a22d65 100755
--- a/src/scripts/loh-bin-plots_one-ref.Rscript
+++ b/src/scripts/loh-bin-plots_one-ref.Rscript
@@ -32,8 +32,7 @@ plotOrder <- rev(unlist(strsplit(plotOrder, ",", fixed=T)))
# fix positions and names
df$W_start <- df$W_start / 1000000
-df$W_end <- df$W_end / 1000000
-df$Sample <- gsub("_.*", "", df$Sample)
+df$W_end <- df$W_end / 1000000
df$Sample <- factor(as.character(df$Sample), levels=plotOrder)
df$LOH_propensity <- df$LOH_ratio - df$Het_ratio
diff --git a/src/scripts/loh-bin-plots_two-ref.Rscript b/src/scripts/loh-bin-plots_two-ref.Rscript
index df13fed..9d64c16 100755
--- a/src/scripts/loh-bin-plots_two-ref.Rscript
+++ b/src/scripts/loh-bin-plots_two-ref.Rscript
@@ -33,7 +33,6 @@ plotOrder <- rev(unlist(strsplit(plotOrder, ",", fixed=T)))
# fix positions and names
df$W_start <- df$W_start / 1000000
df$W_end <- df$W_end / 1000000
-df$Sample <- gsub("_.*", "", df$Sample)
df$Sample <- factor(as.character(df$Sample), levels=plotOrder)
df$LOH_ratio <- df$LOH_ratio_ALT - df$LOH_ratio_REF