From 9c661993a488b9d2eb7277bd15363f2ff7cd0148 Mon Sep 17 00:00:00 2001 From: MatteoSchiavinato Date: Fri, 26 Nov 2021 16:55:00 +0100 Subject: [PATCH] Fixed issue arising at the bed complement step --- jloh | 193 +++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 115 insertions(+), 78 deletions(-) diff --git a/jloh b/jloh index a339d10..9a12c60 100755 --- a/jloh +++ b/jloh @@ -16,7 +16,7 @@ Last change: 11/05/2018 v3 Matteo Schiavinato Notes: upgrade to python3, integrated three scripts in one, improved code readability -Last change: 22/11/2021 +Last change: 26/11/2021 ### """ @@ -59,14 +59,16 @@ Usage: --sample Sample name / Strain name for output files [loh_blocks] --output-dir Output directory [loh_blocks_out] --t0-vcf VCF with variants to ignore from --vcf [off] ---window Window for heterozygous blocks definition [100] +--snp-distance Min. distance (bp between SNPs for blocks definition [100] --min-size Min. LOH block size [100] ---block-distances Comma-sep. list of desired distances between LOH blocks [100,1000,5000] ---min-af Min. allele frequency to consider a variant heterozygous [off] ---max-af Max. allele frequency to consider a variant heterozygous [off] +--block-lengths Comma-sep. list of desired distances between LOH blocks [100,1000,5000] +--min-af Min. allele frequency to consider a variant heterozygous [0.3] +--max-af Max. allele frequency to consider a variant heterozygous [0.7] --bam BAM file (only required when filtering by coverage) [off] ---min-frac-cov Min. mean coverage fraction for LOH blocks [off] ---max-frac-cov Max. mean coverage fraction for LOH blocks [off] +--min-frac-cov Min. mean coverage fraction for LOH blocks [0.75] + (used only if --bam specified) +--max-frac-cov Max. mean coverage fraction for LOH blocks [1.25] + (used only if --bam specified) --bedtools Path to the bedtools executable [bedtools] --print-info Show authors and edits with dates [off] @@ -80,14 +82,14 @@ p.add_argument("--genome-file") p.add_argument("--sample", default="loh_blocks") p.add_argument("--output-dir", default="loh_blocks_out") p.add_argument("--t0-vcf", type=str, default=None) -p.add_argument("--window", default=100, type=int) +p.add_argument("--snp-distance", default=100, type=int) p.add_argument("--min-size", default=100, type=int) -p.add_argument("--block-distances", default="100,1000,5000", type=str) -p.add_argument("--min-af", default=0, type=float) -p.add_argument("--max-af", default=1, type=float) +p.add_argument("--block-lengths", default="100,1000,5000", type=str) +p.add_argument("--min-af", default=0.3, type=float) +p.add_argument("--max-af", default=0.7, type=float) p.add_argument("--bam", type=str) -p.add_argument("--min-frac-cov", type=float) -p.add_argument("--max-frac-cov", type=float) +p.add_argument("--min-frac-cov", type=float, default=0.75) +p.add_argument("--max-frac-cov", type=float, default=1.25) p.add_argument("--bedtools", default="bedtools") p.add_argument("--print-info", action="store_true") args = p.parse_args() @@ -232,27 +234,46 @@ def extract_heterozygous_snps(vcf, min_af, max_af, output_dir, sample): return (True, het_snps_vcf) -def merge_bed_intervals(sample, output_dir, window, bedtools, het_snps_vcf): +def merge_bed_intervals(sample, output_dir, snp_distance, bedtools, het_snps_vcf, bed_out): """ Create BED regions based on the input VCF rows after their filtering - These regions are --window large and centered on the SNPs - For each window, the number of variants contained in it are counted - Sort of like a coverage + These regions are --snp-distance large and centered on the SNPs + For each snp_distance, the number of variants contained in it are counted + The output is: chr | start | end | count """ - merged_bed_out = f"{output_dir}/{sample}.d{args.window}bp_provisory.bed" - cmd = f"{bedtools} merge -d {window} -c 1 -o count -i {het_snps_vcf} > {merged_bed_out}" + cmd = f"{bedtools} merge -d {snp_distance} -c 1 -o count -i {het_snps_vcf} > {bed_out}" try: subprocess.run([cmd], check=True, shell=True) except subprocess.CalledProcessError: sys.stderr.write("ERROR: bedtools merge didn't work\n") sys.exit(1) - return (True, merged_bed_out, window) + return (True, snp_distance) -def bed2count_filter(in_bed, out_bed, output_dir, sample, window, minC, maxC, minL): +def add_length(bed, output_bed): + + """ + Compute length of BED interval and add it as 5th column + """ + + INPUT = open(bed, "r") + Lines = [line.rstrip("\b\r\n").split("\t") for line in INPUT] + INPUT.close() + + OUTPUT = open(output_bed, "w") + for lst in Lines: + chrom, start, end, count = lst + length = int(end)-int(start) + OUTPUT.write("\t".join([chrom, start, end, count, str(length)]) + "\n") + + OUTPUT.close() + + return True + +def filter_by_snp_density(in_bed, out_bed, output_dir, sample, snp_distance, minC, maxC, minL): """ Filter regions based on the number of variants found in them @@ -266,7 +287,7 @@ def bed2count_filter(in_bed, out_bed, output_dir, sample, window, minC, maxC, mi v2 Author: Matteo Schiavinato Notes: included a return statement to fit the new code, update to python3 - Last change: 22/11/2021 + Last change: 26/11/2021 """ handle = open(in_bed, "r") @@ -276,11 +297,9 @@ def bed2count_filter(in_bed, out_bed, output_dir, sample, window, minC, maxC, mi for l in handle: i += 1 c = 0 - ref,start,end = l.split()[:3] - if len(l.split())>3: - c = float(l.split()[3]) - start,end = int(start),int(end) - if minC <= float(c) <= maxC and int(end) - int(start) >= minL: + ref,start,end,count,length = l.split() + ref,start,end,count,length = str(ref),int(start),int(end),float(count),int(length) + if minC <= count <= maxC and length >= minL: out.append(l) else: skipped += 1 @@ -291,10 +310,10 @@ def bed2count_filter(in_bed, out_bed, output_dir, sample, window, minC, maxC, mi OUT.write(line) OUT.close() - return (True, out_bed, minC, maxC, minL) + return (True, minC, maxC, minL) -def get_complementary_regions(output_dir, sample, window, bedtools, filt_bed, genome_file): +def get_complementary_regions(complement_bed, bedtools, filt_bed, genome_file): """ Run bedtools complement to obtain BED intervals @@ -303,7 +322,6 @@ def get_complementary_regions(output_dir, sample, window, bedtools, filt_bed, ge Hence, the output of this function are the homozygous blocks """ - complement_bed = f"{output_dir}/{sample}.d{window}bp.complement.bed" cmd_complement = f"{bedtools} complement -i {filt_bed} -g {genome_file} > {complement_bed}" try: subprocess.run([cmd_complement], check=True, shell=True) @@ -311,7 +329,28 @@ def get_complementary_regions(output_dir, sample, window, bedtools, filt_bed, ge sys.stderr.write("ERROR: bedtools complement didn't work\n") sys.exit(1) - return (True, complement_bed) + return True + + +def filter_by_length(in_bed, out_bed, min_size): + + """ + Keep only candidate LOH regions of a certain minimum size + """ + + INPUT = open(in_bed, "r") + OUTPUT = open(out_bed, "w") + + for line in INPUT: + chr,start,end = line.rstrip("\n\r\b").split("\t") + length = int(end) - int(start) + out_lst = [str(i) for i in [chr,start,end,length]] + OUTPUT.write("\t".join(out_lst) + "\n") + + INPUT.close() + OUTPUT.close() + + return True def filter_by_coverage(bam_file, bed_file, covfr, cmax): @@ -325,7 +364,7 @@ def filter_by_coverage(bam_file, bed_file, covfr, cmax): v2 Author: Matteo Schiavinato Notes: included a return statement to fit the new code, update to python3 - Last change: 05/11/2021 + Last change: 26/11/2021 """ # open bed file @@ -341,18 +380,19 @@ def filter_by_coverage(bam_file, bed_file, covfr, cmax): Out = [] for bed in bedstream: #unload bed coordinate - chrom, start, end = bed.split('\t')[:3] + chrom, start, end, length = bed.rstrip("\n\b\r").split('\t') start, end = int(start), int(end) - #get read count + #get read count within range of bed file c = sam.count(chrom, start, end) #check if correct coverage cov = c *1.0 / (end-start) if cov < float(covfr) * meancov or cov > float(cmax) * meancov: cov = round(cov, 2) - Out.append(f"{chrom}\t{start}\t{end}\t{cov}\n") + Out.append(f"{chrom}\t{start}\t{end}\t{cov}\t{length}\n") return Out + def main(): """ @@ -395,53 +435,51 @@ def main(): # ------------------------------------------- # Get heterozygous blocks with bedtools merge # merge bed file - sys.stderr.write(f"Defining heterozygous blocks for {sample}\n") - (result, merged_bed_out, window) = merge_bed_intervals( sample, - output_dir, - args.window, - args.bedtools, - het_snps_vcf) + sys.stderr.write(f"Defining heterozygous blocks for {sample}...\n") + merged_bed_out = f"{output_dir}/{sample}.d{args.snp_distance}bp_provisory.bed" + (result, snp_distance) = merge_bed_intervals(sample, + output_dir, + args.snp_distance, + args.bedtools, + het_snps_vcf, + merged_bed_out) + + # add length to BED file + sys.stderr.write(f"Calculating length of BED intervals...\n") + bed_with_length = f"{output_dir}/{sample}.d{args.snp_distance}bp_provisory_w_len.bed" + result = add_length(merged_bed_out, bed_with_length) # --------------------------------------------- # run filtering function on the merged bed file - # setting loose parameters for filtering - # set by Veronica in her script sys.stderr.write("Keeping only blocks with > 1 SNP...\n") minC, maxC, minL = float(2), float("inf"), float(0) - filt_bed = f"{output_dir}/{sample}.d{window}bp.bed" - (result, filt_bed, minC, maxC, minL) = bed2count_filter(merged_bed_out, - filt_bed, - output_dir, - sample, - window, - minC, - maxC, - minL) + filt_bed = f"{output_dir}/{sample}.d{snp_distance}bp.bed" + (result, minC, maxC, minL) = filter_by_snp_density(bed_with_length, + filt_bed, + output_dir, + sample, + snp_distance, + minC, + maxC, + minL) # ------------------------------------------------------ # Obtaining complementary regions to heterozygous blocks # get complementary bed file sys.stderr.write("Getting complementary regions...\n") - (result, complement_bed) = get_complementary_regions( output_dir, - sample, - window, - args.bedtools, - filt_bed, - args.genome_file) + complement_bed = f"{output_dir}/{sample}.d{snp_distance}bp.complement.bed" + result = get_complementary_regions(complement_bed, + args.bedtools, + filt_bed, + args.genome_file) # -------------------------------------------------- # Filter the complement to get the homozygous blocks sys.stderr.write("Getting candidate LOH blocks...\n") - minC, maxC, minL = float("-inf"), float("inf"), 0 - filt_compl_bed = f"{output_dir}/{sample}.homo.d{window}bp.bed" - (result, filt_bed, minC, maxC, minL) = bed2count_filter(merged_bed_out, - filt_compl_bed, - output_dir, - sample, - window, - minC, - maxC, - minL) + filt_compl_bed = f"{output_dir}/{sample}.homo.d{snp_distance}bp.bed" + result = filter_by_length(complement_bed, + filt_compl_bed, + args.min_size) # ----------------------------------------- # filter by real coverage from the BAM file @@ -465,7 +503,7 @@ def main(): # use the filtered bed file as input for next step Final_bed_lines = filter_by_coverage(args.bam, filt_compl_bed, min_frac_cov, max_frac_cov) - final_bed_file = f"{output_dir}/{sample}.homo.d{window}bp.cov_flt.bed" + final_bed_file = f"{output_dir}/{sample}.homo.d{snp_distance}bp.cov_flt.bed" # write final bed file lines to an actual bed file output OUT = open(final_bed_file, "w") @@ -474,7 +512,7 @@ def main(): OUT.close() else: - final_bed_file = f"{output_dir}/{sample}.homo.d{window}bp.cov_flt.bed" + final_bed_file = f"{output_dir}/{sample}.homo.d{snp_distance}bp.cov_flt.bed" try: subprocess.run([f"cp {filt_compl_bed} {final_bed_file}"], check=True, shell=True) except subprocess.CalledProcessError: @@ -490,20 +528,19 @@ def main(): blocks_file.close() # iterate over predefined ranges - for block_distance in [int(i) for i in args.block_distances.split(",")]: + for block_length in [int(i) for i in args.block_lengths.split(",")]: - sys.stderr.write(f"Generating output file filtered at min. {block_distance} range\n") + sys.stderr.write(f"Generating output file filtered at min. {block_length} range\n") # open output file - OUTPUT = open(f"{output_dir}/{sample}.homo.d{window}bp.cov_flt.{block_distance}bp.bed", "w") + OUTPUT = open(f"{output_dir}/{sample}.homo.d{snp_distance}bp.cov_flt.{block_length}bp.bed", "w") # extract corresponding regions for line in blocks: - l = line.split("\t") - new = line.split("\n") - dist = int(float(l[2]) - float(l[1])) - if dist >= block_distance: - OUTPUT.write(new[0] + "\t" + str(dist) + "\n") + chrom, start, end, cov, length = line.rstrip("\b\r\n").split("\t") + length = int(length) + if length >= block_length: + OUTPUT.write(line) # close output file OUTPUT.close()