From 7510d969df02349c97f526c64f30f66149e8acfa Mon Sep 17 00:00:00 2001
From: JLSteenwyk
Date: Mon, 19 Aug 2024 16:02:20 -0700
Subject: [PATCH] massive change

---
 Makefile            |   2 +
 orthohmm/files.py   |  76 ++++++++++
 orthohmm/helpers.py | 340 ++++++++++++++++++++++++++++----------------
 orthohmm/writer.py  |   1 +
 requirements.txt    |   3 +-
 setup.py            |   2 +-
 6 files changed, 301 insertions(+), 123 deletions(-)
 create mode 100644 orthohmm/files.py

diff --git a/Makefile b/Makefile
index bdce6e6..7830e0f 100644
--- a/Makefile
+++ b/Makefile
@@ -22,11 +22,13 @@ test.unit:
 
 test.integration:
 	rm -rf ./tests/samples/orthohmm_*
+	rm -rf /tmp/orthohmm-test/
 	python3 -m pytest --basetemp=output -m "integration"
 
 test.fast:
 	python3 -m pytest -m "not (integration or slow)"
 	rm -rf ./tests/samples/orthohmm_*
+	rm -rf /tmp/orthohmm-test/
 	python3 -m pytest --basetemp=output -m "integration and not slow"
 
 # used by GitHub actions during CI workflow

diff --git a/orthohmm/files.py b/orthohmm/files.py
new file mode 100644
index 0000000..b1e4b4b
--- /dev/null
+++ b/orthohmm/files.py
@@ -0,0 +1,76 @@
+import os
+from typing import (
+    Dict, List, Tuple
+)
+
+import numpy as np
+
+
+def write_clusters_file(
+    output_directory: str,
+    clustering_res: List[List[str]],
+) -> None:
+    with open(f"{output_directory}/orthohmm_orthogroups.txt", 'w') as file:
+        for cluster in clustering_res:
+            genes_in_cluster = cluster[1:]
+            genes_in_cluster.sort()
+            file.write(
+                cluster[0] + " " + " ".join(map(str, genes_in_cluster)) + "\n"
+            )
+
+
+def write_copy_number_file(
+    output_directory: str,
+    og_cn: Dict[str, List[str]]
+) -> None:
+
+    with open(f"{output_directory}/orthohmm_gene_count.txt", 'w') as file:
+        for key, value in og_cn.items():
+            file.write(f"{key} {' '.join(value)}\n")
+
+
+def write_file_of_single_copy_ortholog_names(
+    output_directory: str,
+    og_cn: Dict[str, List[str]]
+) -> None:
+    with open(
+        f"{output_directory}/orthohmm_single_copy_orthogroups.txt", 'w'
+    ) as file:
+        for key in og_cn.keys():
+            if key != "files:":
+                file.write(f"{key[:-1]}\n")
+
+
+def write_fasta_files_for_all_ogs(
+    output_directory: str,
+    ogs_dat: Dict[str, List[str]],
+) -> None:
+    if not os.path.isdir(f"{output_directory}/orthohmm_orthogroups"):
+        os.mkdir(f"{output_directory}/orthohmm_orthogroups")
+    for og_id, fasta_dat in ogs_dat.items():
+        with open(
+            f"{output_directory}/orthohmm_orthogroups/{og_id}.fa", "w"
+        ) as file:
+            file.write("\n".join(fasta_dat)+"\n")
+
+
+def write_fasta_files_for_single_copy_orthologs(
+    output_directory: str,
+    ogs_dat: Dict[str, List[str]],
+    gene_lengths: np.ndarray,
+    single_copy_ogs: List[str],
+    extensions: Tuple[str, ...],
+) -> None:
+    # write single-copy og files
+    if not os.path.isdir(f"{output_directory}/orthohmm_single_copy_orthogroups"):
+        os.mkdir(f"{output_directory}/orthohmm_single_copy_orthogroups")
+    for single_copy_og in single_copy_ogs:
+        for idx in range(len(ogs_dat[single_copy_og])):
+            if ogs_dat[single_copy_og][idx][0] == ">":
+                taxon_name = \
+                    gene_lengths[gene_lengths["name"] == ogs_dat[single_copy_og][idx][1:]]["spp"][0]
+                # remove extension
+                taxon_name = next((taxon_name[:-len(ext)] for ext in extensions if taxon_name.endswith(ext)), taxon_name)
+                ogs_dat[single_copy_og][idx] = f">{taxon_name}|{ogs_dat[single_copy_og][idx][1:]}"
+        with open(f"{output_directory}/orthohmm_single_copy_orthogroups/{single_copy_og}.fa", "w") as file:
+            file.write("\n".join(ogs_dat[single_copy_og]) + "\n")
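Note for reviewers: the writers in the new files.py take plain Python data and can be exercised in isolation. A minimal sketch with toy cluster data and a scratch directory (gene and OG names are illustrative only; not part of the patch or test suite):

    import tempfile

    from orthohmm.files import write_clusters_file

    # each cluster is the OG identifier followed by its member genes
    clusters = [
        ["OG0:", "spp1_gene2", "spp1_gene1"],
        ["OG1:", "spp2_gene1"],
    ]
    outdir = tempfile.mkdtemp()
    write_clusters_file(outdir, clusters)
    # orthohmm_orthogroups.txt now holds one line per OG, members sorted:
    # OG0: spp1_gene1 spp1_gene2
    # OG1: spp2_gene1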
diff --git a/orthohmm/helpers.py b/orthohmm/helpers.py
index a7a386f..1eefe6c 100644
--- a/orthohmm/helpers.py
+++ b/orthohmm/helpers.py
@@ -1,9 +1,15 @@
-import os
 from typing import Tuple, List, Dict
 
-from Bio import SeqIO
 import numpy as np
 
+from .files import (
+    write_fasta_files_for_all_ogs,
+    write_fasta_files_for_single_copy_orthologs,
+    write_file_of_single_copy_ortholog_names,
+    write_clusters_file,
+    write_copy_number_file,
+)
+
 
 def get_sequence_lengths(
     fasta_directory: str,
@@ -84,6 +90,41 @@ def normalize_by_gene_length(res_merged: np.ndarray) -> np.ndarray:
     return res_merged
 
 
+def correct_by_phylogenetic_distance(
+    best_hits_A_to_B: Dict[np.str_, Dict[np.str_, np.float64]],
+    best_hits_B_to_A: Dict[np.str_, Dict[np.str_, np.float64]],
+    pair: Tuple[str, str],
+    pairwise_rbh_corr: Dict[frozenset, np.float64],
+) -> Tuple[
+    Dict[np.str_, np.float64],
+    Dict[np.str_, np.float64],
+    Dict[frozenset, np.float64],
+]:
+    # get rbh scores
+    rbh_scores = []
+    rbh_pairs_identified = 0
+    for query, best_hit in best_hits_A_to_B.items():
+        target = best_hit['target']
+        # Check if the reciprocal hit is also the best hit
+        if target in best_hits_B_to_A and best_hits_B_to_A[target]["target"] == query:
+            score_between_rbh_pair = (
+                (best_hit["score"] + best_hits_B_to_A[target]["score"]) / 2
+            )
+            rbh_scores.append(score_between_rbh_pair)
+            rbh_pairs_identified += 1
+
+    if frozenset(pair) not in pairwise_rbh_corr:
+        pairwise_rbh_corr[frozenset(pair)] = np.mean(rbh_scores)
+    else:
+        pairwise_rbh_corr[frozenset(pair)] = (pairwise_rbh_corr[frozenset(pair)] + np.mean(rbh_scores)) / 2
+
+    # Phylogenetic correction
+    best_hit_scores_A_to_B = {key: value["score"] / pairwise_rbh_corr[frozenset(pair)] for key, value in best_hits_A_to_B.items()}
+    best_hit_scores_B_to_A = {key: value["score"] / pairwise_rbh_corr[frozenset(pair)] for key, value in best_hits_B_to_A.items()}
+
+    return best_hit_scores_A_to_B, best_hit_scores_B_to_A, pairwise_rbh_corr
+
+
 def get_best_hits_and_scores(
     res_merged: np.ndarray
 ) -> Dict[np.str_, Dict[np.str_, np.float64]]:
@@ -102,22 +143,52 @@ def get_best_hits_and_scores(
     return best_hits
 
 
+def get_threshold_per_gene(
+    best_hits_A_to_B: Dict[np.str_, Dict[np.str_, np.float64]],
+    best_hits_B_to_A: Dict[np.str_, Dict[np.str_, np.float64]],
+    best_hit_scores_A_to_B: Dict[np.str_, np.float64],
+    best_hit_scores_B_to_A: Dict[np.str_, np.float64],
+    reciprocal_best_hit_thresholds: Dict[np.str_, np.float64],
+):
+    for geneA, geneB in best_hits_A_to_B.items():
+        geneB = geneB["target"]
+        if best_hits_B_to_A.get(geneB, {}).get("target") == geneA:
+            score = ((best_hit_scores_A_to_B[geneA] + best_hit_scores_B_to_A[geneB]) / 2)
+            if geneA in reciprocal_best_hit_thresholds:
+                if score < reciprocal_best_hit_thresholds[geneA]:
+                    reciprocal_best_hit_thresholds[geneA] = score
+            else:
+                reciprocal_best_hit_thresholds[geneA] = score
+
+    return reciprocal_best_hit_thresholds
+
+
 def determine_edge_thresholds(
     files: List[str],
     fasta_directory: str,
     temporary_directory: str,
-) -> Tuple[np.ndarray, Dict[np.str_, np.float64], Dict[frozenset, np.float64]]:
-    gene_lengths = get_sequence_lengths(fasta_directory, files)
+) -> Tuple[
+    np.ndarray,
+    Dict[np.str_, np.float64],
+    Dict[frozenset, np.float64],
+]:
+    pairwise_rbh_corr = dict()
+    reciprocal_best_hit_thresholds = dict()
 
-    reciprocal_best_hit_thresholds = {}
-    pairwise_rbh_corr = {}
+    gene_lengths = get_sequence_lengths(fasta_directory, files)
 
     for file in files:
        file_pairs = [(file, i) for i in files]
        for pair in file_pairs:
-            fwd_res = read_and_filter_phmmer_output(pair[0], pair[1], temporary_directory)
-            rev_res = read_and_filter_phmmer_output(pair[1], pair[0], temporary_directory)
+            fwd_res = read_and_filter_phmmer_output(
+                pair[0], pair[1],
+                temporary_directory
+            )
+            rev_res = read_and_filter_phmmer_output(
+                pair[1], pair[0],
+                temporary_directory
+            )
 
             fwd_res_merged = merge_with_gene_lengths(fwd_res, gene_lengths)
             rev_res_merged = merge_with_gene_lengths(rev_res, gene_lengths)
 
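The correction in correct_by_phylogenetic_distance divides each best-hit score by the mean reciprocal-best-hit score of the species pair, so edge weights become comparable between closely and distantly related genomes. A toy check of the arithmetic (hand-built one-gene dictionaries mirroring the structure the code uses; not part of the patch):

    import numpy as np

    best_A_to_B = {"a1": {"target": "b1", "score": 4.0}}
    best_B_to_A = {"b1": {"target": "a1", "score": 2.0}}
    rbh_corr = np.mean([(4.0 + 2.0) / 2])  # mean RBH score = 3.0
    corrected = {q: h["score"] / rbh_corr for q, h in best_A_to_B.items()}
    assert corrected["a1"] == 4.0 / 3.0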
@@ -125,41 +196,23 @@ def determine_edge_thresholds(
             fwd_res_merged = normalize_by_gene_length(fwd_res_merged)
             rev_res_merged = normalize_by_gene_length(rev_res_merged)
 
-            # get dictionaries of scores for best hit and best hit
             best_hits_A_to_B = get_best_hits_and_scores(fwd_res_merged)
             best_hits_B_to_A = get_best_hits_and_scores(rev_res_merged)
 
-            # TODO: continue refactoring here
-            # get rbh scores
-            rbh_scores = []
-            rbh_pairs_identified = 0
-            for query, best_hit in best_hits_A_to_B.items():
-                target = best_hit['target']
-                # Check if the reciprocal hit is also the best hit
-                if target in best_hits_B_to_A and best_hits_B_to_A[target]['target'] == query:
-                    score_between_rbh_pair = ((best_hit['score'] + best_hits_B_to_A[target]['score']) / 2)
-                    rbh_scores.append(score_between_rbh_pair)
-                    rbh_pairs_identified += 1
-
-            if frozenset(pair) not in pairwise_rbh_corr:
-                pairwise_rbh_corr[frozenset(pair)] = np.mean(rbh_scores)
-            else:
-                pairwise_rbh_corr[frozenset(pair)] = (pairwise_rbh_corr[frozenset(pair)] + np.mean(rbh_scores)) / 2
-
-            # Phylogenetic correction
-            best_hit_scores_A_to_B = {key: value["score"] / pairwise_rbh_corr[frozenset(pair)] for key, value in best_hits_A_to_B.items()}
-            best_hit_scores_B_to_A = {key: value["score"] / pairwise_rbh_corr[frozenset(pair)] for key, value in best_hits_B_to_A.items()}
-
-            # get lower bound threshold, which is sequence length and distance corrected
-            for geneA, geneB in best_hits_A_to_B.items():
-                geneB = geneB["target"]
-                if best_hits_B_to_A.get(geneB)["target"] == geneA:
-                    score = ((best_hit_scores_A_to_B[geneA] + best_hit_scores_B_to_A[geneB]) / 2)
-                    if geneA in reciprocal_best_hit_thresholds:
-                        if score < reciprocal_best_hit_thresholds[geneA]:
-                            reciprocal_best_hit_thresholds[geneA] = score
-                    else:
-                        reciprocal_best_hit_thresholds[geneA] = score
+            best_hit_scores_A_to_B, best_hit_scores_B_to_A, pairwise_rbh_corr = \
+                correct_by_phylogenetic_distance(
+                    best_hits_A_to_B,
+                    best_hits_B_to_A,
+                    pair,
+                    pairwise_rbh_corr
+                )
+
+            reciprocal_best_hit_thresholds = \
+                get_threshold_per_gene(
+                    best_hits_A_to_B, best_hits_B_to_A,
+                    best_hit_scores_A_to_B, best_hit_scores_B_to_A,
+                    reciprocal_best_hit_thresholds
+                )
 
     return gene_lengths, reciprocal_best_hit_thresholds, pairwise_rbh_corr
 
@@ -172,32 +225,42 @@ def determine_network_edges(
     reciprocal_best_hit_thresholds: Dict[np.str_, np.float64],
 ) -> Dict[frozenset, np.float64]:
     edges = dict()
-    gene_lengths = {str(row['name']): int(row['length']) for row in gene_lengths}
+
+    gene_lengths = {
+        str(row["name"]): int(row["length"]) for row in gene_lengths
+    }
+
     for file in files:
         file_pairs = [(file, i) for i in files]
         for pair in file_pairs:
-            res = f"{temporary_directory}/{pair[0]}_2_{pair[1]}.phmmerout.txt"
+            res = read_and_filter_phmmer_output(
+                pair[0], pair[1],
+                temporary_directory
+            )
 
-            dtype_res = [("target_name", "U50"), ("query_name", "U50"), ("evalue", float), ("score", float)]
-            res = np.genfromtxt(res, comments="#", dtype=dtype_res, usecols=[0, 2, 4, 5], encoding='utf-8')
+            for hit in res:
+                query_length = gene_lengths[hit["query_name"]]
+                target_length = gene_lengths[hit["target_name"]]
 
-            res = res[res["evalue"] < 0.0001]
+                norm_score = (
+                    hit['score'] / (query_length + target_length)
+                ) / pairwise_rbh_corr[frozenset(pair)]
 
-            for hit in res:
-                query_length = gene_lengths[hit['query_name']]
-                target_length = gene_lengths[hit['target_name']]
-                norm_score = (hit['score'] / (query_length + target_length)) / pairwise_rbh_corr[frozenset(pair)]
                 try:
                     if norm_score >= reciprocal_best_hit_thresholds[hit["query_name"]]:
-                        genes = frozenset([hit['query_name'], hit['target_name']])
+                        genes = frozenset(
+                            [hit["query_name"], hit["target_name"]]
+                        )
+
                         if len(genes) == 2:
                             if genes in edges:
                                 if edges[genes] <= norm_score:
                                     edges[genes] = norm_score
                             else:
                                 edges[genes] = norm_score
+                # the query gene has no reciprocal-best-hit threshold, so skip it
                 except KeyError:
                     continue
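Keying edges on a frozenset of the two gene names makes forward and reverse hits collapse onto a single undirected edge, with the larger normalized score winning; a self-hit collapses to a size-1 frozenset and is dropped. A small sketch of that bookkeeping, simplified from the loop above (toy gene names and scores):

    edges = {}
    hits = [("g1", "g2", 0.8), ("g2", "g1", 0.9), ("g1", "g1", 1.5)]
    for query, target, norm_score in hits:
        genes = frozenset([query, target])
        if len(genes) == 2:  # drops the self-hit ("g1", "g1")
            if genes not in edges or edges[genes] <= norm_score:
                edges[genes] = norm_score
    assert edges == {frozenset(["g1", "g2"]): 0.9}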
@@ -210,54 +273,78 @@ def determine_network_edges(
 
     return edges
 
 
-def generate_orthogroup_files(
-    output_directory: str,
+def get_singletons(
     gene_lengths: np.ndarray,
-    files: list,
-    fasta_directory: str,
-    single_copy_threshold: float,
-    extensions: Tuple[str],
-    temporary_directory: str,
-) -> Tuple[list, list, dict]:
-    clustering_res = []
-    with open(
-        f"{temporary_directory}/orthohmm_edges_clustered.txt",
-        "r"
-    ) as file:
-        for line in file:
-            line = line.strip()
-            if line:
-                clustering_res.append(line.split())
-
-    # get singletons - genes that aren't in groups with other genes
-    singletons = list(set(gene_lengths["name"]) - set([j for i in clustering_res for j in i]))
-    singletons = [[x] for x in singletons]
+    clustering_res: List[List[str]],
+) -> Tuple[List[List[str]], List[List[str]]]:
+    singletons = list(
+        set(gene_lengths["name"]) - set([j for i in clustering_res for j in i])
+    )
+    singletons = [[str(i)] for i in singletons]
     clustering_res.extend(singletons)
 
-    total_ogs = len(clustering_res)
-    width = len(str(total_ogs))  # Determine the width needed for the largest number
-    total_number_of_taxa = len(np.unique(gene_lengths["spp"]))
+    return clustering_res, singletons
 
-    # get all FASTA entries
+
+def get_all_fasta_entries(
+    fasta_directory: str,
+    files: List[str]
+) -> Dict[str, Dict[str, str]]:
     entries = {}
     for fasta_file in files:
         fasta_file_entries = {}
-        for record in SeqIO.parse(f"{fasta_directory}/{fasta_file}", "fasta"):
-            # Store gene ID and sequence in the dictionary
-            fasta_file_entries[record.id] = str(record.seq)
+        header = None
+        sequence = []
+
+        with open(f"{fasta_directory}/{fasta_file}", "r") as file:
+            for line in file:
+                line = line.strip()
+                if line.startswith(">"):
+                    if header:
+                        fasta_file_entries[header] = ''.join(sequence)
+                    header = line[1:]  # Remove the '>' character
+                    sequence = []
+                else:
+                    sequence.append(line)
+
+        # Don't forget to add the last entry to the dictionary
+        if header:
+            fasta_file_entries[header] = ''.join(sequence)
 
-        # Add the file_entries dictionary to the all_entries dictionary
         entries[fasta_file] = fasta_file_entries
 
+    return entries
+
+
+def get_orthogroup_information(
+    files: List[str],
+    gene_lengths: np.ndarray,
+    clustering_res: List[List[str]],
+    single_copy_threshold: float,
+    entries: Dict[str, Dict[str, str]],
+) -> Tuple[
+    List[List[str]],
+    Dict[str, List[str]],
+    Dict[str, List[str]],
+    List[str],
+]:
     ogs_dat = dict()
     og_cn = dict()
     single_copy_ogs = list()
+
+    total_ogs = len(clustering_res)
+    width = len(str(total_ogs))
+    total_number_of_taxa = len(np.unique(gene_lengths["spp"]))
+
     og_cn["files:"] = files
+
     for i in range(total_ogs):
         # Format the OG number with leading zeros
         og_id = f"OG{i:0{width}}:"
-        og_rows = gene_lengths[np.isin(gene_lengths['name'], clustering_res[i])]
+        og_rows = gene_lengths[
+            np.isin(gene_lengths["name"], clustering_res[i])
+        ]
         # test if single-copy
         if len(np.unique(og_rows["spp"])) == len(og_rows["spp"]):
             # test if sufficient occupancy
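get_all_fasta_entries replaces the previous Bio.SeqIO.parse call with a plain-text FASTA parser, which is what lets the patch drop the biopython dependency below. A quick way to sanity-check the parser against a throwaway two-record file (hypothetical file and gene names, assuming the package is importable; not part of the test suite):

    import os
    import tempfile

    from orthohmm.helpers import get_all_fasta_entries

    tmpdir = tempfile.mkdtemp()
    with open(os.path.join(tmpdir, "toy.fa"), "w") as fh:
        fh.write(">geneA\nMKT\nLLV\n>geneB\nMSS\n")

    entries = get_all_fasta_entries(tmpdir, ["toy.fa"])
    # multi-line sequences are joined; the '>' is stripped from each header
    assert entries["toy.fa"] == {"geneA": "MKTLLV", "geneB": "MSS"}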
np.isin(gene_lengths["name"], clustering_res[i]) + ] # test if single-copy if len(np.unique(og_rows["spp"])) == len(og_rows["spp"]): # test if sufficient occupancy @@ -276,7 +363,6 @@ def generate_orthogroup_files( cnts = [] for file in files: try: - # cnts.append(str(spp_counts[spp_counts["spp"] == file]["cnt"].values[0])) cnts.append(str(spp_counts.get(file, 0))) except IndexError: cnts.append("0") @@ -284,44 +370,58 @@ def generate_orthogroup_files( clustering_res[i].insert(0, og_id) - # write clusters file - with open(f"{output_directory}/orthohmm_orthogroups.txt", 'w') as file: - for cluster in clustering_res: - # Join the elements of the sublist with spaces and write to the file - genes_in_cluster = cluster[1:] - genes_in_cluster.sort() - file.write(cluster[0] + " " + " ".join(map(str, genes_in_cluster)) + "\n") - - # write copy number per species - with open(f"{output_directory}/orthohmm_gene_count.txt", 'w') as file: - for key, value in og_cn.items(): - # Write each key-value pair as a space-delimited string - file.write(f"{key} {' '.join(value)}\n") - - # write list of single-copy orthologs - with open(f"{output_directory}/orthohmm_single_copy_orthogroups.txt", 'w') as file: - for key in og_cn.keys(): - if key != "files:": - file.write(f"{key[:-1]}\n") - - # write all og files - if not os.path.isdir(f"{output_directory}/orthohmm_orthogroups"): - os.mkdir(f"{output_directory}/orthohmm_orthogroups") - for og_id, fasta_dat in ogs_dat.items(): - with open(f"{output_directory}/orthohmm_orthogroups/{og_id}.fa", "w") as file: - file.write("\n".join(fasta_dat)+"\n") - - # write single-copy og files - if not os.path.isdir(f"{output_directory}/orthohmm_single_copy_orthogroups"): - os.mkdir(f"{output_directory}/orthohmm_single_copy_orthogroups") - for single_copy_og in single_copy_ogs: - for idx in range(len(ogs_dat[single_copy_og])): - if ogs_dat[single_copy_og][idx][0] == ">": - taxon_name = gene_lengths[gene_lengths["name"] == ogs_dat[single_copy_og][idx][1:]]["spp"][0] - # remove extension - taxon_name = next((taxon_name[:-len(ext)] for ext in extensions if taxon_name.endswith(ext)), taxon_name) - ogs_dat[single_copy_og][idx] = f">{taxon_name}|{ogs_dat[single_copy_og][idx][1:]}" - with open(f"{output_directory}/orthohmm_single_copy_orthogroups/{single_copy_og}.fa", "w") as file: - file.write("\n".join(ogs_dat[single_copy_og]) + "\n") + return clustering_res, og_cn, ogs_dat, single_copy_ogs + + +def generate_orthogroup_files( + output_directory: str, + gene_lengths: np.ndarray, + files: list, + fasta_directory: str, + single_copy_threshold: float, + extensions: Tuple[str], + temporary_directory: str, +) -> Tuple[ + List[str], + List[List[str]], + Dict[str, List[str]] +]: + clustering_res = list() + + with open( + f"{temporary_directory}/orthohmm_edges_clustered.txt", + "r", + ) as file: + for line in file: + line = line.strip() + if line: + clustering_res.append(line.split()) + + # get singletons - i.e., genes that aren't in groups with other genes + clustering_res, singletons = get_singletons( + gene_lengths, + clustering_res, + ) + + # get all FASTA entries + entries = get_all_fasta_entries(fasta_directory, files) + + clustering_res, og_cn, ogs_dat, single_copy_ogs = \ + get_orthogroup_information( + files, + gene_lengths, + clustering_res, + single_copy_threshold, + entries + ) + + # write results files + write_clusters_file(output_directory, clustering_res) + write_copy_number_file(output_directory, og_cn) + write_file_of_single_copy_ortholog_names(output_directory, og_cn) + 
diff --git a/orthohmm/writer.py b/orthohmm/writer.py
index bb6a1f8..b908556 100644
--- a/orthohmm/writer.py
+++ b/orthohmm/writer.py
@@ -57,6 +57,7 @@ def write_output_stats(
 Number of edges in network: {len(edges)}
 Number of single-copy orthogroups: {len(single_copy_ogs)}
 Number of singletons: {len(singletons)}
+Execution time: {round(time.time() - start_time, 3)}s
 """  # noqa
     )
 
diff --git a/requirements.txt b/requirements.txt
index 11851ca..ebd06fc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,2 @@
-biopython==1.84
 Cython==3.0.4
-pandas==2.2.2
+numpy>=2.0.1
diff --git a/setup.py b/setup.py
index f521432..291236d 100644
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,7 @@
     "Topic :: Scientific/Engineering",
 ]
 
-REQUIRES = ["biopython>=1.84", "pandas>=2.2.2", "numpy>=2.0.1", "cython"]
+REQUIRES = ["numpy>=2.0.1", "cython"]
 
 setup(
     name="orthohmm",
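With biopython and pandas removed from both requirements.txt and setup.py, a fresh environment containing only numpy and Cython should be able to load the refactored modules. A minimal smoke check (assumes orthohmm is installed in the active environment):

    import orthohmm.files    # must not pull in Bio or pandas
    import orthohmm.helpers
    import orthohmm.writer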