Skip to content

Commit

Permalink
Fixed issue arising at the bed complement step
Browse files Browse the repository at this point in the history
  • Loading branch information
MatteoSchiavinato committed Nov 26, 2021
1 parent 0b4e2b5 commit 9c66199
Showing 1 changed file with 115 additions and 78 deletions.
193 changes: 115 additions & 78 deletions jloh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ Last change: 11/05/2018
v3
Matteo Schiavinato
Notes: upgrade to python3, integrated three scripts in one, improved code readability
Last change: 22/11/2021
Last change: 26/11/2021
###
"""

Expand Down Expand Up @@ -59,14 +59,16 @@ Usage:
--sample Sample name / Strain name for output files [loh_blocks]
--output-dir Output directory [loh_blocks_out]
--t0-vcf VCF with variants to ignore from --vcf [off]
--window Window for heterozygous blocks definition [100]
--snp-distance Min. distance (bp between SNPs for blocks definition [100]
--min-size Min. LOH block size [100]
--block-distances Comma-sep. list of desired distances between LOH blocks [100,1000,5000]
--min-af Min. allele frequency to consider a variant heterozygous [off]
--max-af Max. allele frequency to consider a variant heterozygous [off]
--block-lengths Comma-sep. list of desired distances between LOH blocks [100,1000,5000]
--min-af Min. allele frequency to consider a variant heterozygous [0.3]
--max-af Max. allele frequency to consider a variant heterozygous [0.7]
--bam BAM file (only required when filtering by coverage) [off]
--min-frac-cov Min. mean coverage fraction for LOH blocks [off]
--max-frac-cov Max. mean coverage fraction for LOH blocks [off]
--min-frac-cov Min. mean coverage fraction for LOH blocks [0.75]
(used only if --bam specified)
--max-frac-cov Max. mean coverage fraction for LOH blocks [1.25]
(used only if --bam specified)
--bedtools Path to the bedtools executable [bedtools]
--print-info Show authors and edits with dates [off]
Expand All @@ -80,14 +82,14 @@ p.add_argument("--genome-file")
p.add_argument("--sample", default="loh_blocks")
p.add_argument("--output-dir", default="loh_blocks_out")
p.add_argument("--t0-vcf", type=str, default=None)
p.add_argument("--window", default=100, type=int)
p.add_argument("--snp-distance", default=100, type=int)
p.add_argument("--min-size", default=100, type=int)
p.add_argument("--block-distances", default="100,1000,5000", type=str)
p.add_argument("--min-af", default=0, type=float)
p.add_argument("--max-af", default=1, type=float)
p.add_argument("--block-lengths", default="100,1000,5000", type=str)
p.add_argument("--min-af", default=0.3, type=float)
p.add_argument("--max-af", default=0.7, type=float)
p.add_argument("--bam", type=str)
p.add_argument("--min-frac-cov", type=float)
p.add_argument("--max-frac-cov", type=float)
p.add_argument("--min-frac-cov", type=float, default=0.75)
p.add_argument("--max-frac-cov", type=float, default=1.25)
p.add_argument("--bedtools", default="bedtools")
p.add_argument("--print-info", action="store_true")
args = p.parse_args()
Expand Down Expand Up @@ -232,27 +234,46 @@ def extract_heterozygous_snps(vcf, min_af, max_af, output_dir, sample):
return (True, het_snps_vcf)


def merge_bed_intervals(sample, output_dir, window, bedtools, het_snps_vcf):
def merge_bed_intervals(sample, output_dir, snp_distance, bedtools, het_snps_vcf, bed_out):

"""
Create BED regions based on the input VCF rows after their filtering
These regions are --window large and centered on the SNPs
For each window, the number of variants contained in it are counted
Sort of like a coverage
These regions are --snp-distance large and centered on the SNPs
For each snp_distance, the number of variants contained in it are counted
The output is: chr | start | end | count
"""

merged_bed_out = f"{output_dir}/{sample}.d{args.window}bp_provisory.bed"
cmd = f"{bedtools} merge -d {window} -c 1 -o count -i {het_snps_vcf} > {merged_bed_out}"
cmd = f"{bedtools} merge -d {snp_distance} -c 1 -o count -i {het_snps_vcf} > {bed_out}"
try:
subprocess.run([cmd], check=True, shell=True)
except subprocess.CalledProcessError:
sys.stderr.write("ERROR: bedtools merge didn't work\n")
sys.exit(1)

return (True, merged_bed_out, window)
return (True, snp_distance)


def bed2count_filter(in_bed, out_bed, output_dir, sample, window, minC, maxC, minL):
def add_length(bed, output_bed):

"""
Compute length of BED interval and add it as 5th column
"""

INPUT = open(bed, "r")
Lines = [line.rstrip("\b\r\n").split("\t") for line in INPUT]
INPUT.close()

OUTPUT = open(output_bed, "w")
for lst in Lines:
chrom, start, end, count = lst
length = int(end)-int(start)
OUTPUT.write("\t".join([chrom, start, end, count, str(length)]) + "\n")

OUTPUT.close()

return True

def filter_by_snp_density(in_bed, out_bed, output_dir, sample, snp_distance, minC, maxC, minL):

"""
Filter regions based on the number of variants found in them
Expand All @@ -266,7 +287,7 @@ def bed2count_filter(in_bed, out_bed, output_dir, sample, window, minC, maxC, mi
v2
Author: Matteo Schiavinato
Notes: included a return statement to fit the new code, update to python3
Last change: 22/11/2021
Last change: 26/11/2021
"""

handle = open(in_bed, "r")
Expand All @@ -276,11 +297,9 @@ def bed2count_filter(in_bed, out_bed, output_dir, sample, window, minC, maxC, mi
for l in handle:
i += 1
c = 0
ref,start,end = l.split()[:3]
if len(l.split())>3:
c = float(l.split()[3])
start,end = int(start),int(end)
if minC <= float(c) <= maxC and int(end) - int(start) >= minL:
ref,start,end,count,length = l.split()
ref,start,end,count,length = str(ref),int(start),int(end),float(count),int(length)
if minC <= count <= maxC and length >= minL:
out.append(l)
else:
skipped += 1
Expand All @@ -291,10 +310,10 @@ def bed2count_filter(in_bed, out_bed, output_dir, sample, window, minC, maxC, mi
OUT.write(line)
OUT.close()

return (True, out_bed, minC, maxC, minL)
return (True, minC, maxC, minL)


def get_complementary_regions(output_dir, sample, window, bedtools, filt_bed, genome_file):
def get_complementary_regions(complement_bed, bedtools, filt_bed, genome_file):

"""
Run bedtools complement to obtain BED intervals
Expand All @@ -303,15 +322,35 @@ def get_complementary_regions(output_dir, sample, window, bedtools, filt_bed, ge
Hence, the output of this function are the homozygous blocks
"""

complement_bed = f"{output_dir}/{sample}.d{window}bp.complement.bed"
cmd_complement = f"{bedtools} complement -i {filt_bed} -g {genome_file} > {complement_bed}"
try:
subprocess.run([cmd_complement], check=True, shell=True)
except subprocess.CalledProcessError:
sys.stderr.write("ERROR: bedtools complement didn't work\n")
sys.exit(1)

return (True, complement_bed)
return True


def filter_by_length(in_bed, out_bed, min_size):

"""
Keep only candidate LOH regions of a certain minimum size
"""

INPUT = open(in_bed, "r")
OUTPUT = open(out_bed, "w")

for line in INPUT:
chr,start,end = line.rstrip("\n\r\b").split("\t")
length = int(end) - int(start)
out_lst = [str(i) for i in [chr,start,end,length]]
OUTPUT.write("\t".join(out_lst) + "\n")

INPUT.close()
OUTPUT.close()

return True


def filter_by_coverage(bam_file, bed_file, covfr, cmax):
Expand All @@ -325,7 +364,7 @@ def filter_by_coverage(bam_file, bed_file, covfr, cmax):
v2
Author: Matteo Schiavinato
Notes: included a return statement to fit the new code, update to python3
Last change: 05/11/2021
Last change: 26/11/2021
"""

# open bed file
Expand All @@ -341,18 +380,19 @@ def filter_by_coverage(bam_file, bed_file, covfr, cmax):
Out = []
for bed in bedstream:
#unload bed coordinate
chrom, start, end = bed.split('\t')[:3]
chrom, start, end, length = bed.rstrip("\n\b\r").split('\t')
start, end = int(start), int(end)
#get read count
#get read count within range of bed file
c = sam.count(chrom, start, end)
#check if correct coverage
cov = c *1.0 / (end-start)
if cov < float(covfr) * meancov or cov > float(cmax) * meancov:
cov = round(cov, 2)
Out.append(f"{chrom}\t{start}\t{end}\t{cov}\n")
Out.append(f"{chrom}\t{start}\t{end}\t{cov}\t{length}\n")

return Out


def main():

"""
Expand Down Expand Up @@ -395,53 +435,51 @@ def main():
# -------------------------------------------
# Get heterozygous blocks with bedtools merge
# merge bed file
sys.stderr.write(f"Defining heterozygous blocks for {sample}\n")
(result, merged_bed_out, window) = merge_bed_intervals( sample,
output_dir,
args.window,
args.bedtools,
het_snps_vcf)
sys.stderr.write(f"Defining heterozygous blocks for {sample}...\n")
merged_bed_out = f"{output_dir}/{sample}.d{args.snp_distance}bp_provisory.bed"
(result, snp_distance) = merge_bed_intervals(sample,
output_dir,
args.snp_distance,
args.bedtools,
het_snps_vcf,
merged_bed_out)

# add length to BED file
sys.stderr.write(f"Calculating length of BED intervals...\n")
bed_with_length = f"{output_dir}/{sample}.d{args.snp_distance}bp_provisory_w_len.bed"
result = add_length(merged_bed_out, bed_with_length)

# ---------------------------------------------
# run filtering function on the merged bed file
# setting loose parameters for filtering
# set by Veronica in her script
sys.stderr.write("Keeping only blocks with > 1 SNP...\n")
minC, maxC, minL = float(2), float("inf"), float(0)
filt_bed = f"{output_dir}/{sample}.d{window}bp.bed"
(result, filt_bed, minC, maxC, minL) = bed2count_filter(merged_bed_out,
filt_bed,
output_dir,
sample,
window,
minC,
maxC,
minL)
filt_bed = f"{output_dir}/{sample}.d{snp_distance}bp.bed"
(result, minC, maxC, minL) = filter_by_snp_density(bed_with_length,
filt_bed,
output_dir,
sample,
snp_distance,
minC,
maxC,
minL)

# ------------------------------------------------------
# Obtaining complementary regions to heterozygous blocks
# get complementary bed file
sys.stderr.write("Getting complementary regions...\n")
(result, complement_bed) = get_complementary_regions( output_dir,
sample,
window,
args.bedtools,
filt_bed,
args.genome_file)
complement_bed = f"{output_dir}/{sample}.d{snp_distance}bp.complement.bed"
result = get_complementary_regions(complement_bed,
args.bedtools,
filt_bed,
args.genome_file)

# --------------------------------------------------
# Filter the complement to get the homozygous blocks
sys.stderr.write("Getting candidate LOH blocks...\n")
minC, maxC, minL = float("-inf"), float("inf"), 0
filt_compl_bed = f"{output_dir}/{sample}.homo.d{window}bp.bed"
(result, filt_bed, minC, maxC, minL) = bed2count_filter(merged_bed_out,
filt_compl_bed,
output_dir,
sample,
window,
minC,
maxC,
minL)
filt_compl_bed = f"{output_dir}/{sample}.homo.d{snp_distance}bp.bed"
result = filter_by_length(complement_bed,
filt_compl_bed,
args.min_size)

# -----------------------------------------
# filter by real coverage from the BAM file
Expand All @@ -465,7 +503,7 @@ def main():

# use the filtered bed file as input for next step
Final_bed_lines = filter_by_coverage(args.bam, filt_compl_bed, min_frac_cov, max_frac_cov)
final_bed_file = f"{output_dir}/{sample}.homo.d{window}bp.cov_flt.bed"
final_bed_file = f"{output_dir}/{sample}.homo.d{snp_distance}bp.cov_flt.bed"

# write final bed file lines to an actual bed file output
OUT = open(final_bed_file, "w")
Expand All @@ -474,7 +512,7 @@ def main():
OUT.close()

else:
final_bed_file = f"{output_dir}/{sample}.homo.d{window}bp.cov_flt.bed"
final_bed_file = f"{output_dir}/{sample}.homo.d{snp_distance}bp.cov_flt.bed"
try:
subprocess.run([f"cp {filt_compl_bed} {final_bed_file}"], check=True, shell=True)
except subprocess.CalledProcessError:
Expand All @@ -490,20 +528,19 @@ def main():
blocks_file.close()

# iterate over predefined ranges
for block_distance in [int(i) for i in args.block_distances.split(",")]:
for block_length in [int(i) for i in args.block_lengths.split(",")]:

sys.stderr.write(f"Generating output file filtered at min. {block_distance} range\n")
sys.stderr.write(f"Generating output file filtered at min. {block_length} range\n")

# open output file
OUTPUT = open(f"{output_dir}/{sample}.homo.d{window}bp.cov_flt.{block_distance}bp.bed", "w")
OUTPUT = open(f"{output_dir}/{sample}.homo.d{snp_distance}bp.cov_flt.{block_length}bp.bed", "w")

# extract corresponding regions
for line in blocks:
l = line.split("\t")
new = line.split("\n")
dist = int(float(l[2]) - float(l[1]))
if dist >= block_distance:
OUTPUT.write(new[0] + "\t" + str(dist) + "\n")
chrom, start, end, cov, length = line.rstrip("\b\r\n").split("\t")
length = int(length)
if length >= block_length:
OUTPUT.write(line)

# close output file
OUTPUT.close()
Expand Down

0 comments on commit 9c66199

Please sign in to comment.