From 9c661993a488b9d2eb7277bd15363f2ff7cd0148 Mon Sep 17 00:00:00 2001
From: MatteoSchiavinato <matteo.schiavinato.90@gmail.com>
Date: Fri, 26 Nov 2021 16:55:00 +0100
Subject: [PATCH] Fixed issue arising at the bed complement step

---
 jloh | 193 +++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 115 insertions(+), 78 deletions(-)

diff --git a/jloh b/jloh
index a339d10..9a12c60 100755
--- a/jloh
+++ b/jloh
@@ -16,7 +16,7 @@ Last change: 11/05/2018
 v3
 Matteo Schiavinato
 Notes: upgrade to python3, integrated three scripts in one, improved code readability
-Last change: 22/11/2021
+Last change: 26/11/2021
 ###
 """
 
@@ -59,14 +59,16 @@ Usage:
 --sample            Sample name / Strain name for output files                  [loh_blocks]
 --output-dir        Output directory                                            [loh_blocks_out]
 --t0-vcf            VCF with variants to ignore from --vcf                      [off]
---window            Window for heterozygous blocks definition                   [100]
+--snp-distance      Min. distance (bp between SNPs for blocks definition        [100]
 --min-size          Min. LOH block size                                         [100]
---block-distances   Comma-sep. list of desired distances between LOH blocks     [100,1000,5000]
---min-af            Min. allele frequency to consider a variant heterozygous    [off]
---max-af            Max. allele frequency to consider a variant heterozygous    [off]
+--block-lengths     Comma-sep. list of desired distances between LOH blocks     [100,1000,5000]
+--min-af            Min. allele frequency to consider a variant heterozygous    [0.3]
+--max-af            Max. allele frequency to consider a variant heterozygous    [0.7]
 --bam               BAM file (only required when filtering by coverage)         [off]
---min-frac-cov      Min. mean coverage fraction for LOH blocks                  [off]
---max-frac-cov      Max. mean coverage fraction for LOH blocks                  [off]
+--min-frac-cov      Min. mean coverage fraction for LOH blocks                  [0.75]
+                    (used only if --bam specified)
+--max-frac-cov      Max. mean coverage fraction for LOH blocks                  [1.25]
+                    (used only if --bam specified)
 --bedtools          Path to the bedtools executable                             [bedtools]
 --print-info        Show authors and edits with dates                           [off]
 
@@ -80,14 +82,14 @@ p.add_argument("--genome-file")
 p.add_argument("--sample", default="loh_blocks")
 p.add_argument("--output-dir", default="loh_blocks_out")
 p.add_argument("--t0-vcf", type=str, default=None)
-p.add_argument("--window", default=100, type=int)
+p.add_argument("--snp-distance", default=100, type=int)
 p.add_argument("--min-size", default=100, type=int)
-p.add_argument("--block-distances", default="100,1000,5000", type=str)
-p.add_argument("--min-af", default=0, type=float)
-p.add_argument("--max-af", default=1, type=float)
+p.add_argument("--block-lengths", default="100,1000,5000", type=str)
+p.add_argument("--min-af", default=0.3, type=float)
+p.add_argument("--max-af", default=0.7, type=float)
 p.add_argument("--bam", type=str)
-p.add_argument("--min-frac-cov", type=float)
-p.add_argument("--max-frac-cov", type=float)
+p.add_argument("--min-frac-cov", type=float, default=0.75)
+p.add_argument("--max-frac-cov", type=float, default=1.25)
 p.add_argument("--bedtools", default="bedtools")
 p.add_argument("--print-info", action="store_true")
 args = p.parse_args()
@@ -232,27 +234,46 @@ def extract_heterozygous_snps(vcf, min_af, max_af, output_dir, sample):
 	return (True, het_snps_vcf)
 
 
-def merge_bed_intervals(sample, output_dir, window, bedtools, het_snps_vcf):
+def merge_bed_intervals(sample, output_dir, snp_distance, bedtools, het_snps_vcf, bed_out):
 
 	"""
 	Create BED regions based on the input VCF rows after their filtering
-	These regions are --window large and centered on the SNPs
-	For each window, the number of variants contained in it are counted
-	Sort of like a coverage
+	These regions are --snp-distance large and centered on the SNPs
+	For each snp_distance, the number of variants contained in it are counted
+	The output is: chr | start | end | count
 	"""
 
-	merged_bed_out = f"{output_dir}/{sample}.d{args.window}bp_provisory.bed"
-	cmd = f"{bedtools} merge -d {window} -c 1 -o count -i {het_snps_vcf} > {merged_bed_out}"
+	cmd = f"{bedtools} merge -d {snp_distance} -c 1 -o count -i {het_snps_vcf} > {bed_out}"
 	try:
 	    subprocess.run([cmd], check=True, shell=True)
 	except subprocess.CalledProcessError:
 		sys.stderr.write("ERROR: bedtools merge didn't work\n")
 		sys.exit(1)
 
-	return (True, merged_bed_out, window)
+	return (True, snp_distance)
 
 
-def bed2count_filter(in_bed, out_bed, output_dir, sample, window, minC, maxC, minL):
+def add_length(bed, output_bed):
+
+	"""
+	Compute length of BED interval and add it as 5th column
+	"""
+
+	INPUT = open(bed, "r")
+	Lines = [line.rstrip("\b\r\n").split("\t") for line in INPUT]
+	INPUT.close()
+
+	OUTPUT = open(output_bed, "w")
+	for lst in Lines:
+		chrom, start, end, count = lst
+		length = int(end)-int(start)
+		OUTPUT.write("\t".join([chrom, start, end, count, str(length)]) + "\n")
+
+	OUTPUT.close()
+
+	return True
+
+def filter_by_snp_density(in_bed, out_bed, output_dir, sample, snp_distance, minC, maxC, minL):
 
 	"""
 	Filter regions based on the number of variants found in them
@@ -266,7 +287,7 @@ def bed2count_filter(in_bed, out_bed, output_dir, sample, window, minC, maxC, mi
 	v2
 	Author: Matteo Schiavinato
 	Notes: included a return statement to fit the new code, update to python3
-	Last change: 22/11/2021
+	Last change: 26/11/2021
 	"""
 
 	handle = open(in_bed, "r")
@@ -276,11 +297,9 @@ def bed2count_filter(in_bed, out_bed, output_dir, sample, window, minC, maxC, mi
 	for l in handle:
 		i += 1
 		c = 0
-		ref,start,end = l.split()[:3]
-		if len(l.split())>3:
-			c = float(l.split()[3])
-		start,end = int(start),int(end)
-		if minC <= float(c) <= maxC and int(end) - int(start) >= minL:
+		ref,start,end,count,length = l.split()
+		ref,start,end,count,length = str(ref),int(start),int(end),float(count),int(length)
+		if minC <= count <= maxC and length >= minL:
 			out.append(l)
 		else:
 			skipped += 1
@@ -291,10 +310,10 @@ def bed2count_filter(in_bed, out_bed, output_dir, sample, window, minC, maxC, mi
 		OUT.write(line)
 	OUT.close()
 
-	return (True, out_bed, minC, maxC, minL)
+	return (True, minC, maxC, minL)
 
 
-def get_complementary_regions(output_dir, sample, window, bedtools, filt_bed, genome_file):
+def get_complementary_regions(complement_bed, bedtools, filt_bed, genome_file):
 
 	"""
 	Run bedtools complement to obtain BED intervals
@@ -303,7 +322,6 @@ def get_complementary_regions(output_dir, sample, window, bedtools, filt_bed, ge
 	Hence, the output of this function are the homozygous blocks
 	"""
 
-	complement_bed = f"{output_dir}/{sample}.d{window}bp.complement.bed"
 	cmd_complement = f"{bedtools} complement -i {filt_bed} -g {genome_file} > {complement_bed}"
 	try:
 	    subprocess.run([cmd_complement], check=True, shell=True)
@@ -311,7 +329,28 @@ def get_complementary_regions(output_dir, sample, window, bedtools, filt_bed, ge
 		sys.stderr.write("ERROR: bedtools complement didn't work\n")
 		sys.exit(1)
 
-	return (True, complement_bed)
+	return True
+
+
+def filter_by_length(in_bed, out_bed, min_size):
+
+	"""
+	Keep only candidate LOH regions of a certain minimum size
+	"""
+
+	INPUT = open(in_bed, "r")
+	OUTPUT = open(out_bed, "w")
+
+	for line in INPUT:
+		chr,start,end = line.rstrip("\n\r\b").split("\t")
+		length = int(end) - int(start)
+		out_lst = [str(i) for i in [chr,start,end,length]]
+		OUTPUT.write("\t".join(out_lst) + "\n")
+
+	INPUT.close()
+	OUTPUT.close()
+
+	return True
 
 
 def filter_by_coverage(bam_file, bed_file, covfr, cmax):
@@ -325,7 +364,7 @@ def filter_by_coverage(bam_file, bed_file, covfr, cmax):
 	v2
 	Author: Matteo Schiavinato
 	Notes: included a return statement to fit the new code, update to python3
-	Last change: 05/11/2021
+	Last change: 26/11/2021
 	"""
 
 	# open bed file
@@ -341,18 +380,19 @@ def filter_by_coverage(bam_file, bed_file, covfr, cmax):
 	Out = []
 	for bed in bedstream:
 		#unload bed coordinate
-		chrom, start, end = bed.split('\t')[:3]
+		chrom, start, end, length = bed.rstrip("\n\b\r").split('\t')
 		start, end = int(start), int(end)
-		#get read count
+		#get read count within range of bed file 
 		c = sam.count(chrom, start, end)
 		#check if correct coverage
 		cov = c *1.0 / (end-start)
 		if cov < float(covfr) * meancov or cov > float(cmax) * meancov:
 			cov = round(cov, 2)
-			Out.append(f"{chrom}\t{start}\t{end}\t{cov}\n")
+			Out.append(f"{chrom}\t{start}\t{end}\t{cov}\t{length}\n")
 
 	return Out
 
+
 def main():
 
 	"""
@@ -395,53 +435,51 @@ def main():
 	# -------------------------------------------
 	# Get heterozygous blocks with bedtools merge
 	# merge bed file
-	sys.stderr.write(f"Defining heterozygous blocks for {sample}\n")
-	(result, merged_bed_out, window) = merge_bed_intervals(	sample,
-															output_dir,
-															args.window,
-															args.bedtools,
-															het_snps_vcf)
+	sys.stderr.write(f"Defining heterozygous blocks for {sample}...\n")
+	merged_bed_out = f"{output_dir}/{sample}.d{args.snp_distance}bp_provisory.bed"
+	(result, snp_distance) = merge_bed_intervals(sample,
+												output_dir,
+												args.snp_distance,
+												args.bedtools,
+												het_snps_vcf,
+												merged_bed_out)
+
+	# add length to BED file
+	sys.stderr.write(f"Calculating length of BED intervals...\n")
+	bed_with_length = f"{output_dir}/{sample}.d{args.snp_distance}bp_provisory_w_len.bed"
+	result = add_length(merged_bed_out, bed_with_length)
 
 	# ---------------------------------------------
 	# run filtering function on the merged bed file
-	# setting loose parameters for filtering
-	# set by Veronica in her script
 	sys.stderr.write("Keeping only blocks with > 1 SNP...\n")
 	minC, maxC, minL = float(2), float("inf"), float(0)
-	filt_bed = f"{output_dir}/{sample}.d{window}bp.bed"
-	(result, filt_bed, minC, maxC, minL) = bed2count_filter(merged_bed_out,
-															filt_bed,
-															output_dir,
-															sample,
-															window,
-															minC,
-															maxC,
-															minL)
+	filt_bed = f"{output_dir}/{sample}.d{snp_distance}bp.bed"
+	(result, minC, maxC, minL) = filter_by_snp_density(bed_with_length,
+														filt_bed,
+														output_dir,
+														sample,
+														snp_distance,
+														minC,
+														maxC,
+														minL)
 
 	# ------------------------------------------------------
 	# Obtaining complementary regions to heterozygous blocks
 	# get complementary bed file
 	sys.stderr.write("Getting complementary regions...\n")
-	(result, complement_bed) = get_complementary_regions(	output_dir,
-															sample,
-															window,
-															args.bedtools,
-															filt_bed,
-															args.genome_file)
+	complement_bed = f"{output_dir}/{sample}.d{snp_distance}bp.complement.bed"
+	result = get_complementary_regions(complement_bed,
+										args.bedtools,
+										filt_bed,
+										args.genome_file)
 
 	# --------------------------------------------------
 	# Filter the complement to get the homozygous blocks
 	sys.stderr.write("Getting candidate LOH blocks...\n")
-	minC, maxC, minL = float("-inf"), float("inf"), 0
-	filt_compl_bed = f"{output_dir}/{sample}.homo.d{window}bp.bed"
-	(result, filt_bed, minC, maxC, minL) = bed2count_filter(merged_bed_out,
-															filt_compl_bed,
-															output_dir,
-															sample,
-															window,
-															minC,
-															maxC,
-															minL)
+	filt_compl_bed = f"{output_dir}/{sample}.homo.d{snp_distance}bp.bed"
+	result = filter_by_length(complement_bed,
+							filt_compl_bed,
+							args.min_size)
 
 	# -----------------------------------------
 	# filter by real coverage from the BAM file
@@ -465,7 +503,7 @@ def main():
 
 		# use the filtered bed file as input for next step
 		Final_bed_lines = filter_by_coverage(args.bam, filt_compl_bed, min_frac_cov, max_frac_cov)
-		final_bed_file = f"{output_dir}/{sample}.homo.d{window}bp.cov_flt.bed"
+		final_bed_file = f"{output_dir}/{sample}.homo.d{snp_distance}bp.cov_flt.bed"
 
 		# write final bed file lines to an actual bed file output
 		OUT = open(final_bed_file, "w")
@@ -474,7 +512,7 @@ def main():
 		OUT.close()
 
 	else:
-		final_bed_file = f"{output_dir}/{sample}.homo.d{window}bp.cov_flt.bed"
+		final_bed_file = f"{output_dir}/{sample}.homo.d{snp_distance}bp.cov_flt.bed"
 		try:
 		    subprocess.run([f"cp {filt_compl_bed} {final_bed_file}"], check=True, shell=True)
 		except subprocess.CalledProcessError:
@@ -490,20 +528,19 @@ def main():
 	blocks_file.close()
 
 	# iterate over predefined ranges
-	for block_distance in [int(i) for i in args.block_distances.split(",")]:
+	for block_length in [int(i) for i in args.block_lengths.split(",")]:
 
-		sys.stderr.write(f"Generating output file filtered at min. {block_distance} range\n")
+		sys.stderr.write(f"Generating output file filtered at min. {block_length} range\n")
 
 		# open output file
-		OUTPUT = open(f"{output_dir}/{sample}.homo.d{window}bp.cov_flt.{block_distance}bp.bed", "w")
+		OUTPUT = open(f"{output_dir}/{sample}.homo.d{snp_distance}bp.cov_flt.{block_length}bp.bed", "w")
 
 		# extract corresponding regions
 		for line in blocks:
-			l = line.split("\t")
-			new = line.split("\n")
-			dist = int(float(l[2]) - float(l[1]))
-			if dist >= block_distance:
-				OUTPUT.write(new[0] + "\t" + str(dist) + "\n")
+			chrom, start, end, cov, length = line.rstrip("\b\r\n").split("\t")
+			length = int(length)
+			if length >= block_length:
+				OUTPUT.write(line)
 
 		# close output file
 		OUTPUT.close()