diff --git a/jloh b/jloh
index d2b8e0a..6956309 100755
--- a/jloh
+++ b/jloh
@@ -41,26 +41,28 @@ if (sys.argv[1] in ["--help", "-h", "-help", "help", "getopt", "usage"]):
     Barcelona Supercomputing Center (BSC) 
     2023 
 
-    v0.22.1
+    v0.23.0
 
     ####
 
  -- Extraction
+    stats               Estimate heterozygous and homozygous SNP statistics
     g2g                 Align two genomes to find regions that should carry SNPs
     extract             Extract LOH blocks from VCF, BAM and FASTA files
+
+ -- Operations
     filter              Filter extracted LOH blocks
     intersect           Perform intersection/removal operations with output files
+    cluster             Cluster different runs by overlap 
     chimeric            Extract genes featuring LOH blocks from different haplotypes
-
- -- Calculations
-    stats               Estimate heterozygous and homozygous SNP statistics
     junctions           Calculate number of block-to-block junctions over the genome
 
+ -- Visualization
+    plot                Make an LOH propensity plot from "extract" output file(s)
+
  -- Simulation
     sim                 Simulate a divergent copy of a genome
 
- -- Visualization
-    plot                Make an LOH propensity plot from "extract" output file(s)
 
 """)
 else:
diff --git a/src/cluster b/src/cluster
new file mode 100755
index 0000000..37d0d0b
--- /dev/null
+++ b/src/cluster
@@ -0,0 +1,223 @@
+#!/usr/bin/env python3
+
+"""
+###
+JLOH - Inferring Loss of Heterozygosity Blocks from Short-read sequencing data
+
+Copyright (C) 2023 Matteo Schiavinato
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <https://www.gnu.org/licenses/>.
+###
+"""
+
+
+import argparse as ap
+import sys
+import pandas as pd
+import pybedtools
+from pybedtools import BedTool
+import multiprocessing as mp 
+import scipy
+import scipy.cluster 
+from time import asctime as at
+
+
+ss = sys.exit  # short alias for sys.exit
+
+
+# help section: a bare invocation is treated as a request for help
+if len(sys.argv) == 1:
+    sys.argv.append("--help")
+
+if (sys.argv[1] in ["--help", "-h", "-help", "help", "getopt", "usage"]):
+    sys.stderr.write("""
+
+Cluster different runs by overlap 
+
+Usage:
+jloh cluster [options] --loh <LOH_A.tsv> ... <LOH_N.tsv> 
+
+[I/O/E]
+--loh               TSV files produced by "jloh extract"                        [!]
+--out-prefix        Prefix to use for output files                              [jloh_clust_out]
+
+[parameters]
+--max-dist          Maximum distance (float, 0-1) between elements in cluster   [0.1]
+--threads           Number of parallel threads                                  [4]
+
+""")
+    sys.exit(0)
+
+# parser: same options as the usage text; --loh takes one or more files
+p = ap.ArgumentParser()
+p.add_argument("--loh", nargs="+", required=True)
+p.add_argument("--out-prefix", default="jloh_clust_out", type=str)
+p.add_argument("--threads", default=4, type=int)
+p.add_argument("--max-dist", default=0.1, type=float)
+args = p.parse_args()
+
+# functions
+def dump_queue(q):
+
+    """
+    Drain a queue and return its items as a list, in retrieval order
+    Stops as soon as q.empty() reports True (16/03/2022)
+    """
+
+    items = []
+    while True:
+        if q.empty():
+            return items
+        items.append(q.get())
+
+
+def read_input_files(Lohs):
+
+    """Read each file's lines into a BedTool; return (path, BedTool) tuples"""
+    Bts = []
+    for infile in Lohs:
+        with open(infile) as handle:  # context manager closes the file promptly
+            Bts.append((infile, BedTool(handle.readlines())))
+    return Bts
+
+
+def get_jaccard_distance(bt1, infile_1, bt2, infile_2, queue):
+
+    """
+    Queue (infile_1, infile_2, value), where value is the "jaccard" entry
+    of bt1.jaccard(b=bt2) (31/05/2023)
+    """
+    stats = bt1.jaccard(b=bt2)
+    queue.put((infile_1, infile_2, stats["jaccard"]))
+
+
+def get_distances(Bts, args):
+
+    """
+    Jaccard index of every ordered pair of inputs, self-pairs included (31/05/2023)
+    Bts: (path, BedTool) tuples; args.threads: number of worker processes
+    Returns a list of (path_1, path_2, jaccard) tuples as the workers queued them
+    """
+
+    tot_dists = len(Bts) * len(Bts)
+
+    # workers report back through a manager queue shared with the pool
+    pool = mp.Pool(processes=args.threads)
+    queue = mp.Manager().Queue()
+
+    # keep every AsyncResult: a job that raises is otherwise lost without a trace
+    jobs = []
+    for infile_1, bt1 in Bts:
+        for infile_2, bt2 in Bts:
+            job_args = (bt1, infile_1, bt2, infile_2, queue,)
+            jobs.append(pool.apply_async(get_jaccard_distance, args=job_args))
+            sys.stderr.write(f"[{at()}] Analysing pairwise distance {len(jobs)} of {tot_dists}\r")
+
+    pool.close()
+    pool.join()
+    sys.stderr.write("\n")
+
+    # .get() re-raises any exception from the worker, so a failed comparison
+    # stops the run here instead of leaving a hole in the distance matrix
+    for job in jobs:
+        job.get()
+
+    return dump_queue(queue)
+
+
+def get_distance_matrix(Distances):
+
+    """
+    Arrange (file_1, file_2, jaccard) tuples as a square table (31/05/2023)
+    Returns a DataFrame of |jaccard - 1| with columns ordered like the rows
+    """
+
+    # nested mapping for pandas: the outer key becomes a column and the
+    # inner key a row label
+    nested = {}
+    for entry in Distances:
+        column, row, value = entry[0], entry[1], entry[2]
+        nested.setdefault(column, {})[row] = value
+
+    # build the table from the nested mapping
+    table = pd.DataFrame(nested)
+
+    # reorder the columns to follow the row labels, so that row i and
+    # column i refer to the same file
+    table = table.loc[:, table.index]
+
+    # the jaccard index measures proximity, clustering needs a distance:
+    # |x - 1| maps an index of 1 to a distance of 0
+    # (this is what puts zeros on the diagonal)
+    distance = abs(table - 1)
+
+    return distance
+
+
+def get_clusters(df_dist, args):
+
+    """
+    Single-linkage clustering of a square distance matrix (31/05/2023)
+    Returns a Sample/Cluster table cut at args.max_dist, sorted by cluster
+    """
+    # explicit import: do not rely on scipy.cluster pulling in scipy.spatial
+    from scipy.spatial.distance import squareform
+
+    samples = df_dist.index.tolist()
+    if len(samples) < 2:
+        # linkage() rejects an empty condensed matrix: lone sample = cluster 1
+        labels = [1] * len(samples)
+    else:
+        # condensed (upper-triangle) form is what linkage() expects
+        tree = scipy.cluster.hierarchy.linkage(squareform(df_dist), method='single')
+        labels = scipy.cluster.hierarchy.fcluster(tree, args.max_dist, criterion="distance")
+
+    df_clust = pd.DataFrame({"Sample": samples, "Cluster": labels})
+    return df_clust.sort_values(by="Cluster", ascending=True)
+
+
+def main(args):
+
+    """
+    Run the clustering workflow: read, compare, cluster, write (31/05/2023)
+    Writes <out_prefix>.dist.tsv and <out_prefix>.clust.tsv
+    """
+    def log(msg):
+        sys.stderr.write(f"[{at()}] {msg}\n")
+
+    log("Reading input files")
+    Bts = read_input_files(args.loh)
+    log(f"Read {len(Bts)} files")
+
+    # jaccard index for every pair of inputs
+    log("Calculating pairwise distances")
+    Distances = get_distances(Bts, args)
+    log(f"Calculated {len(Distances)} pairwise distances")
+
+    # square distance matrix first, cluster membership second
+    log("Getting a distance matrix")
+    df_dist = get_distance_matrix(Distances)
+    log("Inferring clusters from distance matrix")
+    df_clust = get_clusters(df_dist, args)
+
+    # two tab-separated tables: rounded distances and cluster membership
+    log("Writing to output")
+    df_dist.round(3).reset_index(drop=False).to_csv(f"{args.out_prefix}.dist.tsv", index=False, header=True, sep="\t")
+    df_clust.to_csv(f"{args.out_prefix}.clust.tsv", index=False, header=True, sep="\t")
+
+    log("Done")
+
+# entry point: run the workflow only when this file is executed as a script
+if __name__ == "__main__":
+    main(args)
\ No newline at end of file
diff --git a/src/plot b/src/plot
index b1b11ed..844b8db 100755
--- a/src/plot
+++ b/src/plot
@@ -45,33 +45,22 @@ Usage:
 jloh plot --loh <TSV_1> ... <TSV_n> --names <STR_1> ... <STR_n> [options]
 
 
-[one reference]
---one-ref           Activate "one reference" mode                               [off]
---loh               Input TSV file containing LOH blocks from JLOH extract      [!]
---names             Name to use in plot                                         [!]
+[modes]
+--one-ref           Plot LOH vs Het propensity (use --het in combination)       [off]
+--two-ref           Plot LOH towards genome A vs B                              [off]
+
+[parameters]
 --het               Input BED file containing het regions from JLOH extract     [!]
---ref-name          Name to use in plot for REF allele                          [REF]
---alt-name          Name to use in plot for ALT allele                          [ALT]
---output-dir        Output directory, created if not existitng                  [JLOH_plot_out]
-
-[two references]
---two-ref           Activate "two references" mode                              [off]
---loh               Input TSV files containing LOH blocks from JLOH extract     [!]
---names             Names to use in plot for each of the files (same order)     [!]
---ref-name          Name to use in plot for REF allele                          [REF]
---alt-name          Name to use in plot for ALT allele                          [ALT]
---output-dir        Output directory, created if not existitng                  [JLOH_plot_out]
-
-[by sample]
+                    For "--one-ref" mode only 
 --by-sample         Use "by sample" mode (each plot is a different sample)      [off]
                     Plots will be organised by sample rather than by chromosome
-
-[parameters]
+--clusters          "clust" file from jloh cluster to define plotting order     [off]
 --threads           Number of parallel threads                                  [12]
 --chr               Restrict the analysis to this specific chromosome           [off]
 --window-size       Size of window for plotting, the shorter the slower         [10000]
 
-[ggplot2 options]
+[R/ggplot2 options]
+--r-exec            R executable name to run plotting scripts                   [Rscript] 
 --aspect-ratio      Ratio between y / x for the output plot                     [0.35]
 --width             Width (px) of the output plot                               [2000]
 --height            Height (px) of the output plot                              [750]
@@ -87,6 +76,7 @@ p.add_argument("--one-ref", action="store_true")
 p.add_argument("--two-ref", action="store_true")
 p.add_argument("--by-sample", action="store_true")
 p.add_argument("--loh", nargs="+")
+p.add_argument("--clusters")
 p.add_argument("--het", nargs="+")
 p.add_argument("--name", nargs="+")
 p.add_argument("--names", nargs="+")
@@ -96,6 +86,7 @@ p.add_argument("--chr", default="ALL")
 p.add_argument("--window-size", default=10000, type=int)
 p.add_argument("--threads", type=int, default=12)
 p.add_argument("--output-dir", default="JLOH_plot_out")
+p.add_argument("--r-exec", default="Rscript", type=str)
 p.add_argument("--aspect-ratio", default=0.35, type=float)
 p.add_argument("--width", default=2000, type=int)
 p.add_argument("--height", default=750, type=int)
@@ -117,11 +108,10 @@ ERROR: you should use either --one-ref or --two-ref
 
 """)
 
-    if (args.one_ref or args.two_ref) and not (args.loh and args.names):
+    if (args.one_ref or args.two_ref) and not (args.loh):
         sys.exit("""
 
-ERROR: when using --two-ref you should pass both the --loh files and the 
---names you want to use with them in the plots 
+ERROR: No --loh files were passed 
 
 """)
 
@@ -203,6 +193,79 @@ def organise_input(loh, names, het, run_mode):
     return df, df_het 
 
 
+def sort_input_files(args):
+
+    """
+    Decide the order in which samples are handed on for plotting (31/05/2023)
+
+    --names and --het are paired with --loh by position. Without --clusters
+    the command-line order is kept. With --clusters, the "Sample" column of
+    that table sets the order; its values must be paths passed via --loh,
+    and only the files listed there are returned. When --names is absent,
+    the file paths double as plot names.
+
+    The input lists in args are never modified.
+
+    Parameters:
+        args: parsed command line; reads one_ref, two_ref, loh, names, het
+              and clusters
+
+    Returns:
+        (Loh, Names, Het): lists in matching order. Het is None in --two-ref
+        mode, and in --one-ref mode when --het was not given.
+
+    Exits with an error message when neither mode is active or, while
+    reordering, when --names / --het differ from --loh in length or a
+    clustered sample is not one of the --loh files.
+    """
+
+    def fail(message):
+        # blank lines around the message, like the other error exits here
+        sys.exit(f"\n\nERROR: {message}\n\n")
+
+    # without a mode there is nothing sensible to return
+    if not (args.one_ref or args.two_ref):
+        fail("you should use either --one-ref or --two-ref")
+
+    # work on copies so the parsed arguments stay untouched
+    Loh = list(args.loh or [])
+
+    # fall back on the file paths when no plot names were supplied
+    Names = list(args.names) if args.names else list(Loh)
+
+    # het files are only used in --one-ref mode; --one-ref wins if both are set
+    Het = list(args.het) if (args.one_ref and args.het) else None
+
+    # no cluster table: keep the command-line order
+    if not args.clusters:
+        return Loh, Names, Het
+
+    # reordering pairs the lists by index, so their lengths must agree
+    for option, values in (("--names", Names), ("--het", Het)):
+        if values is not None and len(values) != len(Loh):
+            fail(f"{option} and --loh differ in number of items")
+
+    # the "Sample" column of the cluster table defines the plotting order
+    df_clusters = pd.read_csv(args.clusters, header="infer", sep="\t")
+    sorting_order = df_clusters["Sample"].tolist()
+
+    # index of each clustered sample among the --loh files
+    # list.index() raises ValueError for a sample that was never passed
+    try:
+        order = [Loh.index(sample) for sample in sorting_order]
+    except ValueError as err:
+        fail(f"--clusters sample missing from --loh files ({err})")
+
+    # apply the same permutation to every list so the pairing survives
+    Loh = [Loh[idx] for idx in order]
+    Names = [Names[idx] for idx in order]
+
+    # Het stays None when there are no het files to reorder
+    if Het is not None:
+        Het = [Het[idx] for idx in order]
+    return Loh, Names, Het
+
+
 def select_specific_chromosomes(df, df_het, args, run_mode):
 
     """
@@ -709,7 +772,7 @@ def fill_missing_windows(df, run_mode, args):
 
 
 
-def run_one_ref_script(loh_table, args):
+def run_one_ref_script(loh_table, Names, args):
 
     """
     12/05/2023
@@ -724,12 +787,12 @@ def run_one_ref_script(loh_table, args):
     sys.stderr.write(f"[{at()}] Plotting\n")
     src_dir = "/".join(sys.argv[0].split("/")[0:-1]) + "/" + "scripts"
     cmd = " ".join([
-        "Rscript", 
+        str(args.r_exec), 
         f"{src_dir}/loh-bin-plots_one-ref.Rscript", 
         str(loh_table), 
         f"{args.output_dir}/plots",
         str(plot_type), 
-        str(",".join(args.names)),
+        str(",".join(Names)),
         str(args.aspect_ratio), 
         str(args.width),
         str(args.height),
@@ -740,7 +803,7 @@ def run_one_ref_script(loh_table, args):
     os.system(cmd)
 
 
-def run_two_ref_script(loh_table, args):
+def run_two_ref_script(loh_table, Names, args):
 
     """
     12/05/2023
@@ -755,11 +818,11 @@ def run_two_ref_script(loh_table, args):
     sys.stderr.write(f"[{at()}] Plotting\n") 
     src_dir = "/".join(sys.argv[0].split("/")[0:-1]) + "/" + "scripts"
     cmd = " ".join([
-        "Rscript", f"{src_dir}/loh-bin-plots_two-ref.Rscript", 
+        str(args.r_exec), f"{src_dir}/loh-bin-plots_two-ref.Rscript", 
         str(loh_table), 
         f"{args.output_dir}/plots",
         str(plot_type),
-        str(",".join(args.names)),
+        str(",".join(Names)),
         str(args.aspect_ratio), 
         str(args.width),
         str(args.height),
@@ -773,13 +836,16 @@ def run_two_ref_script(loh_table, args):
 def run_oneref_mode(args):
 
     """
-    12/05/2023
+    31/05/2023
     """
 
-    # read input information and put it into a single dataframe 
+    # read input files and names and sort them by cluster if provided 
+    Loh, Names, Het = sort_input_files(args)
+
+    # put information into a single dataframe 
     # create another one for heterozygosity files 
     sys.stderr.write(f"[{at()}] Reading input information\n")
-    df, df_het = organise_input(args.loh, args.names, args.het, "one_ref")
+    df, df_het = organise_input(Loh, Names, Het, "one_ref")
 
     # select specific chromosome if user wants 
     if args.chr:
@@ -795,26 +861,30 @@ def run_oneref_mode(args):
 
     # fill the gaps
     df = fill_missing_windows(df, "one_ref", args)
-
+    
     # write table to output
     sys.stderr.write(f"[{at()}] Writing table to output\n") 
 
     loh_table = f"{args.output_dir}/LOH_rate.tsv"
     df.to_csv(loh_table, sep="\t", header=True, index=False)
 
-    run_one_ref_script(loh_table, args)
+    run_one_ref_script(loh_table, Names, args)
 
 
 def run_tworef_mode(args):
 
     """
-    12/05/2023
+    31/05/2023
     """
 
+    # read input files and names and sort them by cluster if provided 
+    # het is useless, and in fact assigned to "None" by the function in two_ref mode
+    Loh, Names, Het = sort_input_files(args)
+
     # read input information and put it into a single dataframe 
     # ignore heterozygosity 
     sys.stderr.write(f"[{at()}] Reading input information\n")
-    df, df_het = organise_input(args.loh, args.names, None, "two_ref")
+    df, df_het = organise_input(Loh, Names, Het, "two_ref")
     
     # select specific chromosome if user wants 
     if args.chr:
@@ -836,7 +906,7 @@ def run_tworef_mode(args):
     loh_table = f"{args.output_dir}/LOH_rate.tsv"
     df.to_csv(loh_table, sep="\t", header=True, index=False)
 
-    run_two_ref_script(loh_table, args)
+    run_two_ref_script(loh_table, Names, args)
 
 
 
diff --git a/src/scripts/loh-bin-plots_one-ref.Rscript b/src/scripts/loh-bin-plots_one-ref.Rscript
index a646af3..1a22d65 100755
--- a/src/scripts/loh-bin-plots_one-ref.Rscript
+++ b/src/scripts/loh-bin-plots_one-ref.Rscript
@@ -32,8 +32,7 @@ plotOrder <- rev(unlist(strsplit(plotOrder, ",", fixed=T)))
 
 # fix positions and names 
 df$W_start <- df$W_start / 1000000
-df$W_end <- df$W_end / 1000000
-df$Sample <- gsub("_.*", "", df$Sample)
+df$W_end <- df$W_end / 1000000  
 df$Sample <- factor(as.character(df$Sample), levels=plotOrder)
 df$LOH_propensity <- df$LOH_ratio - df$Het_ratio 
 
diff --git a/src/scripts/loh-bin-plots_two-ref.Rscript b/src/scripts/loh-bin-plots_two-ref.Rscript
index df13fed..9d64c16 100755
--- a/src/scripts/loh-bin-plots_two-ref.Rscript
+++ b/src/scripts/loh-bin-plots_two-ref.Rscript
@@ -33,7 +33,6 @@ plotOrder <- rev(unlist(strsplit(plotOrder, ",", fixed=T)))
 # fix positions and names 
 df$W_start <- df$W_start / 1000000
 df$W_end <- df$W_end / 1000000
-df$Sample <- gsub("_.*", "", df$Sample)
 df$Sample <- factor(as.character(df$Sample), levels=plotOrder)
 df$LOH_ratio <- df$LOH_ratio_ALT - df$LOH_ratio_REF