Skip to content

Commit

Permalink
Updated plotting scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
Matteo Schiavinato committed Oct 18, 2023
1 parent a949f06 commit e50065b
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 29 deletions.
3 changes: 3 additions & 0 deletions src/extract
Original file line number Diff line number Diff line change
Expand Up @@ -1248,6 +1248,7 @@ def run_in_default_mode(args, tmp_bams):

# write candidates to output, pre filtering
out = open(f"{args.output_dir}/{args.sample}.LOH_candidates.tsv", "w")
out.write("\t".join(["#Chrom", "Start", "End", "Cov", "Cov_frac", "Cov_ratio_up", "Cov_ratio_down", "Zygosity", "Length", "Allele", "Homo_snps", "Hetero_snps"]) + "\n")

for row in LOH:
item = str(row).rstrip("\b\n\r").split("\t")
Expand Down Expand Up @@ -1394,6 +1395,8 @@ def run_in_block_assignment_mode(args, tmp_bams):

out_A = open(f"{args.output_dir}/{args.sample}.LOH_candidates.A.tsv", "w")
out_B = open(f"{args.output_dir}/{args.sample}.LOH_candidates.B.tsv", "w")
out_A.write("\t".join(["#Chrom", "Start", "End", "Cov", "Cov_frac", "Cov_ratio_up", "Cov_ratio_down", "Zygosity", "Length", "Allele", "Homo_snps", "Hetero_snps"]) + "\n")
out_B.write("\t".join(["#Chrom", "Start", "End", "Cov", "Cov_frac", "Cov_ratio_up", "Cov_ratio_down", "Zygosity", "Length", "Allele", "Homo_snps", "Hetero_snps"]) + "\n")

for row in LOH_A:
item = str(row).rstrip("\b\n\r").split("\t")
Expand Down
2 changes: 1 addition & 1 deletion src/plot
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ ERROR: No --loh files were passed
def organise_input(loh, names, het, run_mode):

"""
Last update: 12/05/2023
Last update: 18/10/2023
"""

if run_mode == "one_ref":
Expand Down
26 changes: 11 additions & 15 deletions src/scripts/loh-bin-plots_one-ref.Rscript
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ dir.create(outdir, showWarnings = FALSE)
df <- read.table(infile, header=T)

df$Sample <- gsub(".*/", "", df$Sample)
plotOrder <- unlist(strsplit(plotOrder, ","))

# contrast
if (loh_contrast == "low") {
Expand All @@ -57,9 +58,14 @@ if (loh_contrast == "mid") {
print("Leaving contrast untouched")
}

if (do_we_merge == "yes") {
# fix positions and names
df$W_start <- df$W_start / 1000000
df$W_end <- df$W_end / 1000000
df$Sample <- factor(as.character(df$Sample), levels=plotOrder)
df$LOH_ratio <- df$LOH_ratio - df$Het_ratio


midpoint_color = het_color
if (do_we_merge == "yes") {

# merging combination
df["Merged_key"] <- paste(df$Chromosome, df$W_start, sep="__")
Expand All @@ -68,12 +74,6 @@ if (do_we_merge == "yes") {
# read plot order
plotOrder <- rev(unlist(strsplit(plotOrder, ",", fixed=T)))

# fix positions and names
df$W_start <- df$W_start / 1000000
df$W_end <- df$W_end / 1000000
df$Sample <- factor(as.character(df$Sample), levels=plotOrder)
df$LOH_ratio <- df$LOH_ratio - df$Het_ratio

# plotting the merged version
x <- df[ , c("Merged_key", "Sample", "LOH_ratio")]

Expand All @@ -88,7 +88,7 @@ if (do_we_merge == "yes") {
geom_tile() +
scale_x_discrete(name="Concatenated chromosomes [Mbp]", labels=NULL) +
scale_fill_gradient2(low=het_color, mid=midpoint_color, high=ref_color, midpoint=0,
na.value = "grey50", breaks=c(-1,1), limits=c(-1,1),
na.value = "grey80", breaks=c(-1,1), limits=c(-1,1),
labels=c("Het", "LOH"),
name="") +
ylab("")
Expand All @@ -105,8 +105,6 @@ if (do_we_merge == "yes") {

if (plotType == "by_chromosome") {

midpoint_color = het_color

# identify chromosomes
Chroms <- unique(as.character(df$Chromosome))
chrom_counter = 0
Expand All @@ -133,7 +131,7 @@ if (do_we_merge == "yes") {
geom_tile() +
xlab("Position [Mbp]") +
scale_fill_gradient2(low=het_color, mid=midpoint_color, high=ref_color, midpoint=0,
na.value = "grey50", breaks=c(-1,1), limits=c(-1,1),
na.value = "grey80", breaks=c(-1,1), limits=c(-1,1),
labels=c("Het", "LOH"),
name="") +
ylab("")
Expand All @@ -147,8 +145,6 @@ if (do_we_merge == "yes") {

} else if (plotType == "by_sample") {

midpoint_color = het_color

# identify samples
Samples <- unique(as.character(df$Sample))
sample_counter = 0
Expand Down Expand Up @@ -176,7 +172,7 @@ if (do_we_merge == "yes") {
geom_tile() +
xlab("Position [Mbp]") +
scale_fill_gradient2(low=het_color, mid=midpoint_color, high=ref_color, midpoint=0,
na.value = "grey50", breaks=c(-1,1), limits=c(-1,1),
na.value = "grey80", breaks=c(-1,1), limits=c(-1,1),
labels=c("Het", "LOH"),
name="") +
ylab("")
Expand Down
68 changes: 55 additions & 13 deletions src/sim
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ jloh sim --fasta <FASTA> [options]
--mean-haplotype-size Generated haplotypes will have this average size [25000]
--min-haplotype-length Minimum length of generated haplotypes [1000]
--divergence How much divergence in the mutated genome (0.0-1.0) [0.01]
--max-divergence Max divergence value to allow in haplotypes [0.2]
--loh Apply this level of LOH (0.0-1.0) [0.0]
--chrom-name-replace Pattern and replacement for chrom names in mutated out [off]
(space separated, --chrom-name-replace <pat> <rep>)
Expand All @@ -75,6 +76,7 @@ p.add_argument("--hybrid", action="store_true")
p.add_argument("--mean-haplotype-size", default=25000, type=int)
p.add_argument("--min-haplotype-length", default=1000, type=int)
p.add_argument("--divergence", default=0.01, type=float)
p.add_argument("--max-divergence", default=0.2, type=float)
p.add_argument("--loh", default=0.00, type=float)
p.add_argument("--chrom-name-replace", nargs=2)
args = p.parse_args()
Expand Down Expand Up @@ -162,10 +164,10 @@ def get_chrom_random_intervals(mean_haplotype_size, min_haplotype_len, stdev_fac
return Intervals


def get_chrom_interval_divergences(Chrom_intervals, divergence, stdev_factor):
def get_chrom_interval_divergences(Chrom_intervals, divergence, max_divergence, stdev_factor):

"""
03/03/2023
18/10/2023
"""

# beta distribution
Expand All @@ -183,16 +185,29 @@ def get_chrom_interval_divergences(Chrom_intervals, divergence, stdev_factor):

if divergence > 0:

if divergence > max_divergence:
sys.stderr.write("--divergence cannot be higher than --max-divergence\n")
sys.exit()

# establish beta distribution parameters
mu = divergence
maxdiv = max_divergence
stdev = mu * stdev_factor
var = stdev ** 2
alpha = ((1-mu)/var - 1/mu) * mu ** 2
beta = alpha * ((1 / mu) - 1)

for chrom in Chrom_intervals.keys():
num_intervals = len(Chrom_intervals[chrom])

# select only divergences within 0 and max-divergence
Divs = list(np.random.beta(a=alpha, b=beta, size=num_intervals))
while (sum([div > max_divergence for div in Divs]) >= 0.5 * len(Divs)):
Divs = list(np.random.beta(a=alpha, b=beta, size=num_intervals))
sys.stderr.write(f"Redrafting divergences with div={mu} and maxdiv={maxdiv}...\n")

Divs = [i for i in Divs if i <= maxdiv]

for i in range(0,num_intervals):
Chrom_intervals_w_div.append(
(
Expand Down Expand Up @@ -245,10 +260,10 @@ def nullify_fraction_of_intervals(Chrom_intervals_w_div, loh):
return Intervals_w_loh


def correct_divergences_after_loh(Int_w_loh, divergence):
def correct_divergences_after_loh(Int_w_loh, divergence, max_divergence):

"""
03/03/2023
18/10/2023
"""

if divergence > 0:
Expand All @@ -267,12 +282,12 @@ def correct_divergences_after_loh(Int_w_loh, divergence):
idx = np.random.choice(len(Int_w_loh_corr))

if (mean_div < 0.999 * divergence):
# increase divergence by 5% in that slot
Int_w_loh_corr[idx][3] = round(min(1, Int_w_loh_corr[idx][3] + Int_w_loh_corr[idx][3] * 0.05), 3)
# increase divergence by 10% in that slot
Int_w_loh_corr[idx][3] = round(min(max_divergence, Int_w_loh_corr[idx][3] + Int_w_loh_corr[idx][3] * 0.1), 3)

elif (mean_div > 1.001 * divergence):
# decrease divergence by 5% in that slot
Int_w_loh_corr[idx][3] = round(max(0, Int_w_loh_corr[idx][3] - Int_w_loh_corr[idx][3] * 0.05), 3)
# decrease divergence by 10% in that slot
Int_w_loh_corr[idx][3] = round(max(0, Int_w_loh_corr[idx][3] - Int_w_loh_corr[idx][3] * 0.1), 3)

# re-calculate mean divergence
mean_div = mean([x[3] for x in Int_w_loh_corr])
Expand Down Expand Up @@ -373,46 +388,72 @@ def join_chrom_subseqs(Int_w_loh_corr_seqs_mut):
def main(args):

"""
08/02/2023
18/10/2023
"""

# script variables
stdev_factor = 0.25

# get assembly chromosome lengths
sys.stderr.write(f"[{at()}] Getting chromosome sizes ... \r")
Chrom_sizes = get_chrom_sizes(args.fasta)
Chrom_intervals = { chrom : get_chrom_random_intervals(args.mean_haplotype_size, args.min_haplotype_length, stdev_factor, Chrom_sizes[chrom]) for chrom in Chrom_sizes.keys() }
sys.stderr.write(f"[{at()}] Getting chromosome sizes ... DONE \n")

sys.stderr.write(f"[{at()}] Reading their sequences ... \r")
Chrom_seqs = get_chrom_sequences(args.fasta)
INPUT = open(args.fasta, "r")
sys.stderr.write(f"[{at()}] Reading their sequences ... DONE \n")

sys.stderr.write(f"[{at()}] Storing their order in input file ... \r")
Chrom_order = [line.lstrip(">").rstrip("\b\r\n") for line in INPUT if line[0] == ">"]
INPUT.close()
sys.stderr.write(f"[{at()}] Storing their order in input file ... DONE \n")

# obtain list of intervals based on random lengths centered at
# mean haplotype size (--mean-haplotype-size)
# use numpy's random normal distribution centered at mean haplo size
Chrom_intervals = { chrom : get_chrom_random_intervals(args.mean_haplotype_size, args.min_haplotype_length, stdev_factor, Chrom_sizes[chrom]) for chrom in Chrom_sizes.keys() }


# obtain simulated distribution of haplotype divergence based on --divergence
# for each haplotype
Chrom_intervals_w_div = get_chrom_interval_divergences(Chrom_intervals, args.divergence, stdev_factor)
sys.stderr.write(f"[{at()}] Modelling internal divergences ... \r")
Chrom_intervals_w_div = get_chrom_interval_divergences(Chrom_intervals,
args.divergence,
args.max_divergence,
stdev_factor)
sys.stderr.write(f"[{at()}] Modelling internal divergences ... DONE \n")

# introduce loh based on the --loh argument
# turn to 0 the corresponding % of the sampled distribution
sys.stderr.write(f"[{at()}] Introducing required loh if specified ... \r")
Int_w_loh = nullify_fraction_of_intervals(Chrom_intervals_w_div, args.loh)
sys.stderr.write(f"[{at()}] Introducing required loh if specified ... DONE \n")

# recalculate the distribution mean
# randomly increase divergence in non-zero haplotypes to match --divergence
Int_w_loh_corr = correct_divergences_after_loh(Int_w_loh, args.divergence)
if args.loh > 0:
sys.stderr.write(f"[{at()}] Recalculating divergence after loh ... \r")
Int_w_loh = correct_divergences_after_loh(Int_w_loh, args.divergence, args.max_divergence)
sys.stderr.write(f"[{at()}] Recalculating divergence after loh ... DONE \n")

# break down genome in chunks like the haplotypes
Int_w_loh_corr_seqs = get_chrom_subseqs(Int_w_loh_corr, Chrom_seqs)
sys.stderr.write(f"[{at()}] Extracting subsequences of chromosomes ... \r")
Int_w_loh_corr_seqs = get_chrom_subseqs(Int_w_loh, Chrom_seqs)
sys.stderr.write(f"[{at()}] Extracting subsequences of chromosomes ... DONE \n")

# introduce mutations according to simulated divergence rates
sys.stderr.write(f"[{at()}] Extracting mutated sequence fragments ... \r")
Int_w_loh_corr_seqs_mut = get_mutate_sequence_pieces(Int_w_loh_corr_seqs, args.threads)
sys.stderr.write(f"[{at()}] Extracting mutated sequence fragments ... DONE \n")

# join sequences
sys.stderr.write(f"[{at()}] Joining chromosomal subsequences after mutation ... \r")
Mut_chrom_seqs = join_chrom_subseqs(Int_w_loh_corr_seqs_mut)
sys.stderr.write(f"[{at()}] Joining chromosomal subsequences after mutation ... DONE \n")

# write to output a tsv with introduced haplotypes and blocks
sys.stderr.write(f"[{at()}] Writing to output ... \r")
OUTPUT = open(args.out_haplotypes, "w")
OUTPUT.write("\t".join(["Chrom", "Start", "End", "Length", "Divergence", "Haplotype"]) + "\n")
for x in Int_w_loh_corr_seqs_mut:
Expand All @@ -437,6 +478,7 @@ def main(args):
# write sequence and name to output
OUTPUT.write(f">{chrom}\n{mut_seq}\n")
OUTPUT.close()
sys.stderr.write(f"[{at()}] Writing to output ... DONE \n")


if __name__=='__main__':
Expand Down

0 comments on commit e50065b

Please sign in to comment.