Skip to content

Commit

Permalink
DEV: Resolve linear genomes with terminal repeats to fix #6
Browse files Browse the repository at this point in the history
  • Loading branch information
Vini2 committed Jun 14, 2024
1 parent bad7be0 commit 4cf77f2
Showing 1 changed file with 169 additions and 56 deletions.
225 changes: 169 additions & 56 deletions reneo/workflow/scripts/reneo.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,42 +109,53 @@ def worker_resolve_components(component_queue, results_queue, **kwargs):
# Case 2 components
if len(candidate_nodes) == 2:
all_self_looped = True
one_circular = False

for node in candidate_nodes:
if kwargs["unitig_names"][node] not in kwargs["self_looped_nodes"]:
if (
kwargs["unitig_names"][candidate_nodes[0]] in kwargs["self_looped_nodes"]
and kwargs["unitig_names"][candidate_nodes[1]] in kwargs["self_looped_nodes"]
):
all_self_looped = True
else:
if kwargs["unitig_names"][candidate_nodes[0]] in kwargs["self_looped_nodes"]:
one_circular = True
all_self_looped = False
if kwargs["unitig_names"][candidate_nodes[1]] in kwargs["self_looped_nodes"]:
one_circular = True
all_self_looped = False

if all_self_looped:
results["case2_found"].add(my_count)
unitig1 = ""
unitig2 = ""

results["cycle_components"].add(my_count)
for edge in pruned_graph.es:
source_vertex_id = edge.source
target_vertex_id = edge.target

results["virus_like_edges"] = results["virus_like_edges"].union(
set(candidate_nodes)
)
comp_resolved_edges = comp_resolved_edges.union(set(candidate_nodes))

unitig1 = ""
unitig2 = ""
if source_vertex_id != target_vertex_id:
unitig1 = candidate_nodes[source_vertex_id]
unitig2 = candidate_nodes[target_vertex_id]

for edge in pruned_graph.es:
source_vertex_id = edge.source
target_vertex_id = edge.target
unitig1_name = unitig_names[unitig1]
unitig2_name = unitig_names[unitig2]

if source_vertex_id != target_vertex_id:
unitig1 = candidate_nodes[source_vertex_id]
unitig2 = candidate_nodes[target_vertex_id]
unitig1_len = len(str(kwargs["graph_unitigs"][unitig1_name]))
unitig2_len = len(str(kwargs["graph_unitigs"][unitig2_name]))

if unitig1 != "" and unitig2 != "":
if unitig1 != "" and unitig2 != "":

# Case 2 - both are circular
# Case 2 - both are circular
if all_self_looped:

case_name = "case2_circular"

unitig1_name = kwargs["unitig_names"][unitig1]
unitig2_name = kwargs["unitig_names"][unitig2]
results["case2_found"].add(my_count)

unitig1_len = len(str(kwargs["graph_unitigs"][unitig1_name]))
unitig2_len = len(str(kwargs["graph_unitigs"][unitig2_name]))
results["cycle_components"].add(my_count)

results["virus_like_edges"] = results["virus_like_edges"].union(
set(candidate_nodes)
)
comp_resolved_edges = comp_resolved_edges.union(set(candidate_nodes))

unitig_to_consider = -1
unitig_name = ""
Expand Down Expand Up @@ -194,18 +205,120 @@ def worker_resolve_components(component_queue, results_queue, **kwargs):
)

genome_path = GenomePath(
f"virus_comp_{my_count}_cycle_{cycle_number}",
case_name,
[
id = f"virus_comp_{my_count}_cycle_{cycle_number}",
bubble_case = case_name,
node_order = [
f"{repeat_unitig_name}+",
f"{unitig_name}+",
f"{repeat_unitig_name}-",
],
[repeat_unitig, unitig_to_consider, repeat_unitig],
path_string,
int(kwargs["unitig_coverages"][unitig_name]),
len(path_string),
(path_string.count("G") + path_string.count("C"))
node_id_order = [repeat_unitig, unitig_to_consider, repeat_unitig],
path = path_string,
coverage = int(kwargs["unitig_coverages"][unitig_name]),
length = len(path_string),
gc = (path_string.count("G") + path_string.count("C"))
/ len(path_string)
* 100,
)
my_genomic_paths.append(genome_path)
results["resolved_components"].add(my_count)
results["resolved_cyclic"].add(my_count)
results["case2_resolved"].add(my_count)

# Case 2 - only one is circular
elif one_circular:

case_name = "case2_linear"

results["case2_found"].add(my_count)

results["cycle_components"].add(my_count)

results["virus_like_edges"] = results["virus_like_edges"].union(
set(candidate_nodes)
)
comp_resolved_edges = comp_resolved_edges.union(set(candidate_nodes))

unitig_to_consider = -1
unitig_name = ""

repeat_unitig = -1
repeat_unitig_name = ""

if (
unitig1_len > unitig2_len
and unitig1_len > minlength
and unitig2_name in self_looped_nodes
):
unitig_to_consider = unitig1
unitig_name = unitig1_name
repeat_unitig = unitig2
repeat_unitig_name = unitig2_name
elif (
unitig2_len > unitig1_len
and unitig2_len > minlength
and unitig1_name in self_looped_nodes
):
unitig_to_consider = unitig2
unitig_name = unitig2_name
repeat_unitig = unitig1
repeat_unitig_name = unitig1_name

if unitig_to_consider != -1:
kwargs["logger"].debug(
f"Case 2 component: {unitig1_name} is {unitig1_len} bp long and {unitig2_name} is {unitig2_len} bp long."
)
cycle_number = 1
kwargs["resolved_edges"].add(unitig_to_consider)
kwargs["resolved_edges"].add(repeat_unitig)

# Get repeat count
repeat_count = max(
int(
kwargs["unitig_coverages"][repeat_unitig_name]
/ kwargs["unitig_coverages"][unitig_name]
),
1,
)
kwargs["logger"].debug(f"Repeat count: {repeat_count}")

path_string = (
str(
kwargs["graph_unitigs"][unitig_name][
kwargs["link_overlap"][(repeat_unitig, unitig_to_consider)] :
]
)
+ str(
kwargs["graph_unitigs"][repeat_unitig_name][
kwargs["link_overlap"][(unitig_to_consider, repeat_unitig)] :
]
)
* repeat_count
)
kwargs["logger"].debug(
f"Terminal repeat detected is {repeat_unitig_name}"
)

# Format path node order
path_with_repeats = [f"{unitig_name}+"] + [
f"{repeat_unitig_name}+" for x in range(repeat_count)
]

repeat_order = f"{repeat_unitig_name}:fwd," * repeat_count
path_with_repeats_human = f"{unitig_name}:fwd,{repeat_order[:-1]}"
node_id_order_with_repeats = [unitig_to_consider] + [
repeat_unitig for x in range(repeat_count)
]

genome_path = GenomePath(
id = f"virus_comp_{my_count}_cycle_{cycle_number}",
bubble_case = case_name,
node_order = path_with_repeats,
node_id_order = node_id_order_with_repeats,
path = path_string,
coverage = int(kwargs["unitig_coverages"][unitig_name]),
length = len(path_string),
gc = (path_string.count("G") + path_string.count("C"))
/ len(path_string)
* 100,
)
Expand Down Expand Up @@ -705,17 +818,17 @@ def worker_resolve_components(component_queue, results_queue, **kwargs):

# Create GenomePath object with path details
genome_path = GenomePath(
f"virus_comp_{my_count}_cycle_{cycle_number}",
case_name,
[x for x in path_order],
[
id = f"virus_comp_{my_count}_cycle_{cycle_number}",
bubble_case = case_name,
node_order = [x for x in path_order],
node_id_order = [
kwargs["unitig_names_rev"][x[:-1]]
for x in path_order
],
path_string,
int(coverage_val),
total_length,
(
path = path_string,
coverage = int(coverage_val),
length = total_length,
gc = (
path_string.count("G")
+ path_string.count("C")
)
Expand Down Expand Up @@ -1146,17 +1259,17 @@ def worker_resolve_components(component_queue, results_queue, **kwargs):

# Create GenomePath object with path details
genome_path = GenomePath(
f"virus_comp_{my_count}_cycle_{cycle_number}",
case_name,
[x for x in path_order],
[
id = f"virus_comp_{my_count}_cycle_{cycle_number}",
bubble_case = case_name,
node_order = [x for x in path_order],
node_id_order = [
kwargs["unitig_names_rev"][x[:-1]]
for x in path_order
],
path_string,
int(coverage_val),
total_length,
(
path = path_string,
coverage = int(coverage_val),
length = total_length,
gc = (
path_string.count("G")
+ path_string.count("C")
)
Expand Down Expand Up @@ -1208,14 +1321,14 @@ def worker_resolve_components(component_queue, results_queue, **kwargs):

# Create GenomePath object with path details
genome_path = GenomePath(
f"virus_comp_{my_count}_cycle_{cycle_number}",
case_name,
[kwargs["unitig_names"][candidate_nodes[0]]],
[candidate_nodes[0]],
path_string,
int(kwargs["unitig_coverages"][unitig_name]),
len(kwargs["graph_unitigs"][unitig_name]),
(path_string.count("G") + path_string.count("C"))
id = f"virus_comp_{my_count}_cycle_{cycle_number}",
bubble_case = case_name,
node_order = [kwargs["unitig_names"][candidate_nodes[0]]],
node_id_order = [candidate_nodes[0]],
path = path_string,
coverage = int(kwargs["unitig_coverages"][unitig_name]),
length = len(kwargs["graph_unitigs"][unitig_name]),
gc = (path_string.count("G") + path_string.count("C"))
/ len(path_string)
* 100,
)
Expand Down

0 comments on commit 4cf77f2

Please sign in to comment.