From 3d1a0102d879b7d2164e64d702030e698036e897 Mon Sep 17 00:00:00 2001
From: Valerio Arnaboldi <valearna@caltech.edu>
Date: Thu, 5 Jul 2018 08:33:21 -0700
Subject: [PATCH] added orthology sentences

- added orthology sentences and configuration options for WB
- select best orthologs based on number of methods
- if more than 3 orthologs share the same (highest) number of methods, use gene class to group them
- special case for human orthologs - use gene families instead of classes
---
 config_wb.yml                           | 142 +++++++++---------------
 genedescriptions/config_parser.py       |   2 +
 genedescriptions/data_fetcher.py        |  87 ++++++++++++++-
 genedescriptions/descriptions_rules.py  |  87 ++++++++++++++-
 genedescriptions/descriptions_writer.py |  22 ++++
 generate_desc_from_raw_files.py         |  53 +++++++--
 requirements.txt                        |   1 +
 tests/test_data_fetcher.py              |  26 +++++
 tests/test_descriptions_rules.py        |   2 +-
 9 files changed, 319 insertions(+), 103 deletions(-)

diff --git a/config_wb.yml b/config_wb.yml
index c71b439..ff76aac 100644
--- a/config_wb.yml
+++ b/config_wb.yml
@@ -5,44 +5,89 @@ generic_data_fetcher:
 
 # options for WormBase gene description generation - used when data_fetcher is set to 'wb_data_fetcher'
 wb_data_fetcher:
-    release: "WS265"
+    release: "WS266"
     raw_files_source: "ftp://ftp.ebi.ac.uk/pub/databases/wormbase/releases"
     organisms_to_process:
         # add the species to be processed to the following list. Entries must have the same text as in 'organisms'
         # option
         - c_elegans
-        #- b_malayi
-        #- c_brenneri
+        - b_malayi
+        - c_brenneri
         - c_briggsae
-        #- c_japonica
-        #- c_remanei
-        #- o_volvulus
-        #- p_pacificus
-        #- s_ratti
-        #- t_muris
+        - c_japonica
+        - c_remanei
+        - o_volvulus
+        - p_pacificus
+        - s_ratti
+        - t_muris
     organisms:
+        human:
+            full_name: "Homo sapiens"
         b_malayi:
+            full_name: "Brugia malayi"
             project_id: "PRJNA10729"
+            ortholog:
+                - c_elegans
+                - o_volvulus
+            main_sister_species: c_elegans
         c_brenneri:
+            full_name: "Caenorhabditis brenneri"
             project_id: "PRJNA20035"
+            ortholog:
+                - c_elegans
+            main_sister_species: c_elegans
         c_briggsae:
+            full_name: "Caenorhabditis briggsae"
             project_id: "PRJNA10731"
             main_sister_species: c_elegans
+            ortholog:
+                - c_elegans
         c_elegans:
+            full_name: "Caenorhabditis elegans"
             project_id: "PRJNA13758"
             name: "C. elegans"
+            ortholog:
+                - human
         c_japonica:
+            full_name: "Caenorhabditis japonica"
             project_id: "PRJNA12591"
+            main_sister_species: c_elegans
+            ortholog:
+                - c_elegans
         c_remanei:
+            full_name: "Caenorhabditis remanei"
             project_id: "PRJNA53967"
+            main_sister_species: c_elegans
+            ortholog:
+                - c_elegans
         o_volvulus:
+            full_name: "Onchocerca volvulus"
             project_id: "PRJEB513"
+            ortholog:
+                - c_elegans
+                - b_malayi
+            main_sister_species: c_elegans
         p_pacificus:
+            full_name: "Pristionchus pacificus"
             project_id: "PRJNA12644"
+            ortholog:
+                - c_elegans
+            main_sister_species: c_elegans
         s_ratti:
+            full_name: "Strongyloides ratti"
             project_id: "PRJEB125"
+            ortholog:
+                - c_elegans
+                - b_malayi
+                - o_volvulus
+            main_sister_species: c_elegans
         t_muris:
+            full_name: "Trichuris muris"
             project_id: "PRJEB126"
+            ortholog:
+                - c_elegans
+                - b_malayi
+            main_sister_species: c_elegans
 
 agr_data_fetcher:
     raw_files_source: "https://s3.amazonaws.com/mod-datadumps"
@@ -107,6 +152,7 @@ go_sentences_options:
         "(.*)embryo development ending in birth or egg hatching(.*)": "\\1embryo development\\2"
         "(.*)synaptic transmission, (\\w+)(.*)": "\\1\\2 synaptic transmission\\3"
         "(.*)postsynaptic synaptic(.*)": "\\1postsynaptic\\2"
+        "(.*)binding(.*)": "\\1binding activity\\2"
     evidence_codes:
         EXP:
             group: EXPERIMENTAL
@@ -290,24 +336,6 @@ go_sentences_options:
           prefix: "is involved in"
           postfix: ""
           special_cases:
-        - aspect: P
-          group: PHYLOGENETIC_ANALYSIS_AND_SEQUENCE_BASED_ANALYSIS
-          qualifier: ""
-          prefix: "is predicted to be involved in"
-          postfix: ""
-          special_cases:
-        - aspect: P
-          group: INFERRED_BY_CURATORS_AND_AUTHORS
-          qualifier: ""
-          prefix: "is predicted to be involved in"
-          postfix: ""
-          special_cases:
-        - aspect: P
-          group: ELECTRONIC_AND_COMPUTATIONAL_ANALYSIS
-          qualifier: ""
-          prefix: "is predicted to be involved in"
-          postfix: ""
-          special_cases:
         - aspect: C
           group: EXPERIMENTAL
           qualifier: ""
@@ -348,66 +376,6 @@ go_sentences_options:
                 match_regex: "intracellular$"
                 prefix: "is"
                 postfix: ""
-        - aspect: C
-          group: PHYLOGENETIC_ANALYSIS_AND_SEQUENCE_BASED_ANALYSIS
-          qualifier: ""
-          prefix: "is predicted to localize to"
-          postfix: ""
-          special_cases:
-              - id: 1
-                match_regex: "intracellular$"
-                prefix: "is predicted to be"
-                postfix: ""
-        - aspect: C
-          group: PHYLOGENETIC_ANALYSIS_AND_SEQUENCE_BASED_ANALYSIS
-          qualifier: "colocalizes_with"
-          prefix: "is predicted to colocalize with"
-          postfix: ""
-          special_cases:
-              - id: 1
-                match_regex: "intracellular$"
-                prefix: "is predicted to be"
-                postfix: ""
-        - aspect: C
-          group: INFERRED_BY_CURATORS_AND_AUTHORS
-          qualifier: ""
-          prefix: "is predicted to localize to"
-          postfix: ""
-          special_cases:
-              - id: 1
-                match_regex: "intracellular$"
-                prefix: "is predicted to be"
-                postfix: ""
-        - aspect: C
-          group: INFERRED_BY_CURATORS_AND_AUTHORS
-          qualifier: "colocalizes_with"
-          prefix: "is predicted to colocalize with"
-          postfix: ""
-          special_cases:
-              - id: 1
-                match_regex: "intracellular$"
-                prefix: "is predicted to be"
-                postfix: ""
-        - aspect: C
-          group: ELECTRONIC_AND_COMPUTATIONAL_ANALYSIS
-          qualifier: ""
-          prefix: "is predicted to localize to"
-          postfix: ""
-          special_cases:
-              - id: 1
-                match_regex: "intracellular$"
-                prefix: "is predicted to be"
-                postfix: ""
-        - aspect: C
-          group: ELECTRONIC_AND_COMPUTATIONAL_ANALYSIS
-          qualifier: "colocalize_with"
-          prefix: "is predicted to colocalize with"
-          postfix: ""
-          special_cases:
-              - id: 1
-                match_regex: "intracellular$"
-                prefix: "is predicted to be"
-                postfix: ""
 
     go_truncate_others_aggregation_word: several
     go_truncate_others_terms:
diff --git a/genedescriptions/config_parser.py b/genedescriptions/config_parser.py
index 54e91e3..08ec66c 100644
--- a/genedescriptions/config_parser.py
+++ b/genedescriptions/config_parser.py
@@ -338,4 +338,6 @@ def get_genedesc_writer(self):
     def get_genedesc_output_dir(self, genedesc_writer: str):
         return self.config[genedesc_writer + "_options"]["output_dir"]
 
+    def get_ortholog_species(self):
+        return  # NOTE(review): stub - nothing is read from the config and callers get None; TODO implement
 
diff --git a/genedescriptions/data_fetcher.py b/genedescriptions/data_fetcher.py
index ffb5c15..02c6944 100644
--- a/genedescriptions/data_fetcher.py
+++ b/genedescriptions/data_fetcher.py
@@ -3,8 +3,9 @@
 import shutil
 import os
 import re
+
 from enum import Enum
-from collections import namedtuple
+from collections import namedtuple, defaultdict
 from typing import List, Iterable, Dict
 from ontobio import AssociationSetFactory
 from genedescriptions.descriptions_rules import SingleDescStats, set_all_depths_in_subgraph
@@ -260,12 +261,37 @@ def set_gene_data(self, gene_data: List[Gene]):
     def load_gene_data_from_file(self):
         pass
 
+    @staticmethod
+    def get_human_gene_props():
+        """retrieve symbol, name, and family name of human genes from the HGNC download service
+
+        Returns:
+            Dict[str, List[str]]: [symbol, name, family name] of each human gene, indexed by Ensembl ID
+        """
+        human_genes_props = defaultdict(list)
+        hgnc_url = ("https://www.genenames.org/cgi-bin/download?col=gd_hgnc_id&col=g"
+                    "d_app_sym&col=gd_app_name&col=gd_pub_ensembl_id&col=family.id&c"
+                    "ol=family.name&status=Approved&status=Entry+Withdrawn&status_op"
+                    "t=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbt"
+                    "ag=on&submit=submit")
+        # the context manager closes the HTTP connection even if parsing fails half way through
+        with urllib.request.urlopen(hgnc_url) as human_content:
+            next(human_content, None)  # skip the header row
+            for line in human_content:
+                # columns: HGNC ID, symbol, name, Ensembl ID, family ID, family name
+                linearr = line.decode("utf-8").rstrip("\r\n").split("\t")
+                # skip truncated rows and genes without an Ensembl ID, which is the index of the result
+                if len(linearr) > 5 and linearr[3] != "":
+                    human_genes_props[linearr[3]] = [linearr[1], linearr[2], linearr[5].strip()]
+        return human_genes_props
+
 
 class WBDataFetcher(DataFetcher):
     """data fetcher for WormBase raw files for a single species"""
 
     def __init__(self, raw_files_source: str, cache_location: str, release_version: str, species: str, project_id: str,
-                 go_relations: List[str] = None, do_relations: List[str] = None, use_cache: bool = False):
+                 go_relations: List[str] = None, do_relations: List[str] = None, use_cache: bool = False,
+                 sister_sp_fullname: str = ""):
         """create a new data fetcher for WormBase. Files will be downloaded from WB ftp site. For convenience, file
         locations are automatically generated and stored in class variables ending in _url for remote filed and
         _cache_path for caching
@@ -308,6 +334,13 @@ def __init__(self, raw_files_source: str, cache_location: str, release_version:
                                                           ".do_annotations.daf.txt")
         self.do_associations_new_url = raw_files_source + '/' + release_version + '/ONTOLOGY/disease_association.' + \
                                        release_version + '.daf.txt'
+        self.orthology_url = raw_files_source + '/' + release_version + '/species/' + species + '/' + project_id + \
+                             '/annotation/' + species + '.' + project_id + '.' + release_version + '.orthologs.txt.gz'
+        self.orthology_cache_path = os.path.join(cache_location, "wormbase", release_version, "species", species,
+                                                 project_id, "annotation", species + '.' + project_id + '.' +
+                                                 release_version + ".orthologs.txt.gz")
+        self.orthologs = defaultdict(lambda: defaultdict(list))
+        self.sister_sp_fullname = sister_sp_fullname
 
     def load_gene_data_from_file(self) -> None:
         """load gene list from pre-set file location"""
@@ -382,6 +415,55 @@ def load_associations_from_file(self, associations_type: DataType, associations_
                                                                        ontology=self.do_ontology,
                                                                        terms_blacklist=exclusion_list)
 
+    def load_orthology_from_file(self):
+        """load orthologs from the file returned by _get_cached_file into self.orthologs, which maps each
+        'WB:'-prefixed gene ID to a dictionary of lists of ortholog fields (columns 2 to 4) indexed by column 1"""
+        orthology_file = self._get_cached_file(cache_path=self.orthology_cache_path,
+                                               file_source_url=self.orthology_url)
+        gene_id = None  # None while waiting for the header line of the next record
+        with open(orthology_file) as orth_file:  # closes the file handle, which a bare open() call would leak
+            for line in orth_file:
+                line = line.strip()
+                if not line or line.startswith("#"):
+                    continue  # skip comments and empty lines, which cannot be parsed as record headers
+                if line == "=":  # record separator
+                    gene_id = None
+                elif gene_id is None:  # first line of a record: the gene ID is its first field
+                    gene_id = "WB:" + line.split()[0]
+                else:  # stored immediately, so that the last record does not need a closing separator
+                    ortholog_arr = line.split("\t")
+                    self.orthologs[gene_id][ortholog_arr[0]].append(ortholog_arr[1:4])
+
+    def get_best_orthologs_for_gene(self, gene_id: str, orth_species_full_name: List[str],
+                                    sister_species_data_fetcher: DataFetcher = None,
+                                    ecode_priority_list: List[str] = None):
+        """get the [id, symbol] pairs of the best orthologs of a gene and the full name of the last examined species
+
+        Species are examined in the provided order, stopping at the first one with orthologs for the gene, which
+        are ranked by number of methods. With a sister species data fetcher, ties are broken by number of GO
+        annotations and one ortholog only is returned. The list of orthologs is None if no ortholog is found
+        """
+        best_orthologs = None
+        curr_orth_fullname = None
+        for curr_orth_fullname in orth_species_full_name:
+            orthologs = self.orthologs[gene_id].get(curr_orth_fullname)
+            if not orthologs:
+                continue
+            # each ortholog is a list with fields [id, symbol, semicolon-separated methods]
+            if len(orthologs) == 1:
+                best_orthologs = [orthologs[0][0:2]]
+            elif sister_species_data_fetcher:
+                # annotations are looked up by 'WB:'-prefixed ID, whereas IDs in the orthology data have no prefix
+                best_orthologs = [max(orthologs, key=lambda orth: (len(orth[2].split(";")), len(
+                    sister_species_data_fetcher.get_annotations_for_gene(
+                        gene_id="WB:" + orth[0], annot_type=DataType.GO,
+                        priority_list=ecode_priority_list))))[0:2]]
+            else:
+                max_methods = max(len(orth[2].split(";")) for orth in orthologs)
+                best_orthologs = [orth[0:2] for orth in orthologs if len(orth[2].split(";")) == max_methods]
+            break
+        return best_orthologs, curr_orth_fullname
+
     def load_all_data_from_file(self, go_terms_replacement_regex: Dict[str, str] = None,
                                 go_terms_exclusion_list: List[str] = None,
                                 do_terms_replacement_regex: Dict[str, str] = None,
@@ -413,4 +495,5 @@ def load_all_data_from_file(self, go_terms_replacement_regex: Dict[str, str] = N
         self.load_associations_from_file(associations_type=DataType.DO, associations_url=self.do_associations_url,
                                          associations_cache_path=self.do_associations_cache_path,
                                          exclusion_list=do_terms_exclusion_list)
+        self.load_orthology_from_file()
 
diff --git a/genedescriptions/descriptions_rules.py b/genedescriptions/descriptions_rules.py
index 193caf9..eb24412 100644
--- a/genedescriptions/descriptions_rules.py
+++ b/genedescriptions/descriptions_rules.py
@@ -1,5 +1,9 @@
+import json
+
 import inflect
 import re
+import urllib.request
+
 from namedlist import namedlist
 from genedescriptions.ontology_tools import *
 from ontobio.ontol import Ontology
@@ -24,12 +28,17 @@ def __init__(self):
 class GeneDesc(object):
     """gene description"""
     def __init__(self, gene_id: str, gene_name: str = "", description: str = "", go_description: str = "",
-                 disease_description: str = "", stats: SingleDescStats = None):
+                 disease_description: str = "", stats: SingleDescStats = None, publications: str = "", refs: str = "",
+                 species: str = "", release_version: str = ""):
         self.gene_id = gene_id
         self.gene_name = gene_name
         self.description = description
         self.go_description = go_description
         self.disease_description = disease_description
+        self.publications = publications
+        self.refs = refs
+        self.species = species
+        self.release_version = release_version
         if stats:
             self.stats = stats
         else:
@@ -372,3 +381,79 @@ def _get_single_sentence(node_ids: List[str], ontology: Ontology, aspect: str, e
                         ancestors_covering_multiple_terms=ancestors_with_multiple_children)
     else:
         return None
+
+
+def _join_terms(terms: List[str]) -> str:
+    """join terms in a readable enumeration: "a and b" for up to two terms, "a, b, and c" for three or more"""
+    if len(terms) > 2:
+        return ", ".join(terms[0:-1]) + ", and " + terms[-1]
+    return " and ".join(terms)
+
+
+def generate_ortholog_sentence(orthologs: List[List[str]], orthologs_sp_fullname: str, human_genes_props):
+    """generate the orthology sentence of a gene description
+
+    Up to three orthologs are listed individually. When there are more than three orthologs, they are summarized
+    by gene family (human orthologs) or by gene class (orthologs in other species, retrieved from the WormBase
+    REST API with one request per ortholog). If no family or class is available, the first three orthologs are
+    listed
+
+    Args:
+        orthologs (List[List[str]]): the orthologs, as [id, symbol] pairs
+        orthologs_sp_fullname (str): the full name of the species of the orthologs
+        human_genes_props (Dict[str, List[str]]): [symbol, name, family name] of human genes, indexed by gene ID,
+            used for human orthologs only
+    Returns:
+        str: the sentence, or None if none of the human orthologs is present in human_genes_props
+    """
+    if orthologs_sp_fullname == "Homo sapiens":
+        # .get() is used because subscripting a defaultdict would add an empty entry for each unknown gene
+        known_genes = [props for props in (human_genes_props.get(orth[0]) for orth in orthologs) if props]
+        if not known_genes:
+            return None
+        # more than three orthologs: summarize them by gene family
+        if len(orthologs) > 3:
+            gene_families = defaultdict(list)
+            for gene_props in known_genes:
+                # genes that do not belong to any family have an empty family name, which must not be reported
+                if gene_props[2]:
+                    gene_families[gene_props[2]].append(gene_props)
+            if gene_families:
+                gene_family_names = list(gene_families.keys())[0:3]
+                gene_names = [gene_props[0] + " (" + gene_props[1] + ")" for family_genes in
+                              gene_families.values() for gene_props in family_genes][0:3]
+                family_word = "family"
+                if len(gene_family_names) > 1:
+                    family_word = "families"
+                return "is an ortholog of members of the human " + _join_terms(gene_family_names) + " gene " + \
+                       family_word + " including " + _join_terms(gene_names)
+        # three orthologs or less, or no gene family available: list the genes individually, up to three of them
+        symbol_name_arr = sorted([gene_props[0] + " (" + gene_props[1] + ")" for gene_props in known_genes])
+        return "is an ortholog of human " + _join_terms(symbol_name_arr[0:3])
+
+    fullname_arr = orthologs_sp_fullname.split(" ")
+    if len(fullname_arr[0]) > 2:
+        # abbreviate the genus, e.g., "Caenorhabditis elegans" becomes "C. elegans"
+        fullname_arr[0] = fullname_arr[0][0] + "."
+        orthologs_sp_fullname = " ".join(fullname_arr)
+    # more than three orthologs: summarize them by gene class
+    if len(orthologs) > 3:
+        gene_classes = defaultdict(list)
+        for ortholog in orthologs:
+            # the context manager closes the connection, which would otherwise be leaked at each request
+            with urllib.request.urlopen("http://rest.wormbase.org/rest/field/gene/" + ortholog[0] +
+                                        "/gene_class") as response:
+                gene_class_data = json.loads(response.read())
+            class_data = gene_class_data["gene_class"]["data"] if "gene_class" in gene_class_data else None
+            if class_data and "tag" in class_data and "label" in class_data["tag"]:
+                gene_classes[class_data["tag"]["label"]].append(ortholog)
+        classes_gene_symbols = list(gene_classes.keys())
+        if classes_gene_symbols:
+            classes_word = "class"
+            if len(classes_gene_symbols) > 1:
+                classes_word = "classes"
+            return "is an ortholog of " + orthologs_sp_fullname + " " + _join_terms(classes_gene_symbols) + \
+                   " gene " + classes_word
+    # three orthologs or less, or no gene class available: list the orthologs individually, up to three of them
+    orthologs_symbols = [orth[1] for orth in orthologs][0:3]
+    return "is an ortholog of " + orthologs_sp_fullname + " " + _join_terms(orthologs_symbols)
diff --git a/genedescriptions/descriptions_writer.py b/genedescriptions/descriptions_writer.py
index 8ea14b2..0b9a9d9 100644
--- a/genedescriptions/descriptions_writer.py
+++ b/genedescriptions/descriptions_writer.py
@@ -1,3 +1,4 @@
+import datetime
 import json
 from abc import ABCMeta, abstractmethod
 import numpy as np
@@ -71,3 +72,24 @@ def write(self, file_path: str, pretty: bool = False, include_single_gene_stats:
         with open(file_path, "w") as outfile:
             json.dump(vars(json_serializable_self), outfile, indent=indent)
 
+
+class WBWriter(DescriptionsWriter):
+    """writer for WB files, which contain one tab-separated gene description record per line"""
+    def __init__(self):
+        super().__init__()
+
+    def write(self, file_path: str):
+        """write the descriptions to a WB file, one record per line, all stamped with the date of the call
+
+        Args:
+            file_path (str): the path to the file to write
+        """
+        now = datetime.datetime.now()
+        date_str = str(now.year) + "-" + str(now.month) + "-" + str(now.day)
+        with open(file_path, "w") as outfile:
+            for genedesc in self.data:
+                note = "This description was generated automatically by a script based on homology/orthology " \
+                       "data, Gene Ontology (GO) annotations, Disease ontology (DO) annotations, and tissue " \
+                       "expression data from the " + genedesc.release_version + " version of WormBase"
+                outfile.write("\t".join([genedesc.gene_id, date_str, genedesc.publications, genedesc.refs,
+                                         genedesc.description, genedesc.species, note]) + "\n")
diff --git a/generate_desc_from_raw_files.py b/generate_desc_from_raw_files.py
index 295a341..208d602 100755
--- a/generate_desc_from_raw_files.py
+++ b/generate_desc_from_raw_files.py
@@ -4,7 +4,7 @@
 import os
 
 from genedescriptions.config_parser import GenedescConfigParser
-from genedescriptions.data_fetcher import WBDataFetcher, DataType
+from genedescriptions.data_fetcher import WBDataFetcher, DataType, DataFetcher
 from genedescriptions.descriptions_rules import *
 from genedescriptions.descriptions_writer import JsonGDWriter, GeneDesc
 
@@ -53,17 +53,24 @@ def main():
         organisms_list = conf_parser.get_agr_organisms_to_process()
     else:
         organisms_list = conf_parser.get_wb_organisms_to_process()
+    human_genes_props = DataFetcher.get_human_gene_props()
     for organism in organisms_list:
         logging.info("processing organism " + organism)
-        sister_gene_name_id_map = {}
         sister_df = None
-        species = None
         species = conf_parser.get_wb_species()
+        sister_sp_fullname = ""
+        if "main_sister_species" in species[organism] and "full_name" in \
+                species[species[organism]["main_sister_species"]]:
+            sister_sp_fullname = species[species[organism]["main_sister_species"]]["full_name"]
+        orthologs_sp_fullname = ""
+        if "ortholog" in species[organism] and all(["full_name" in species[ortholog_sp] for ortholog_sp in
+                                                    species[organism]["ortholog"]]):
+            orthologs_sp_fullname = [species[ortholog_sp]["full_name"] for ortholog_sp in species[organism]["ortholog"]]
         df = WBDataFetcher(raw_files_source=conf_parser.get_raw_file_sources("wb_data_fetcher"),
                            release_version=conf_parser.get_release("wb_data_fetcher"),
                            species=organism, project_id=species[organism]["project_id"],
                            cache_location=conf_parser.get_cache_location(), do_relations=None,
-                           go_relations=["subClassOf", "BFO:0000050"])
+                           go_relations=["subClassOf", "BFO:0000050"], sister_sp_fullname=sister_sp_fullname)
         if "main_sister_species" in species[organism] and species[organism]["main_sister_species"]:
             sister_df = WBDataFetcher(raw_files_source=conf_parser.get_raw_file_sources("wb_data_fetcher"),
                                       release_version=conf_parser.get_release("wb_data_fetcher"),
@@ -71,9 +78,10 @@ def main():
                                       project_id=species[species[organism]["main_sister_species"]]["project_id"],
                                       cache_location=conf_parser.get_cache_location(), do_relations=None,
                                       go_relations=["subClassOf", "BFO:0000050"])
-            sister_df.load_all_data_from_file()
-            for gene in sister_df.get_gene_data():
-                sister_gene_name_id_map[gene.name] = gene.id
+            sister_df.load_all_data_from_file(go_terms_replacement_regex=conf_parser.get_go_rename_terms(),
+                                              go_terms_exclusion_list=conf_parser.get_go_terms_exclusion_list(),
+                                              do_terms_replacement_regex=None,
+                                              do_terms_exclusion_list=conf_parser.get_do_terms_exclusion_list())
         df.load_all_data_from_file(go_terms_replacement_regex=conf_parser.get_go_rename_terms(),
                                    go_terms_exclusion_list=conf_parser.get_go_terms_exclusion_list(),
                                    do_terms_replacement_regex=None,
@@ -81,8 +89,23 @@ def main():
         desc_writer = JsonGDWriter()
         for gene in df.get_gene_data():
             logging.debug("processing gene " + gene.name)
-            gene_desc = GeneDesc(gene_id=gene.id, gene_name=gene.name)
+            gene_desc = GeneDesc(gene_id=gene.id, gene_name=gene.name,
+                                 publications=", ".join([annot["publication"] for annot in df.get_annotations_for_gene(
+                                     gene.id, annot_type=DataType.GO,
+                                     priority_list=conf_parser.get_go_evidence_groups_priority_list())]),
+                                 refs=", ".join([annot["refs"] for annot in df.get_annotations_for_gene(
+                                     gene.id, annot_type=DataType.GO,
+                                     priority_list=conf_parser.get_go_evidence_groups_priority_list())]),
+                                 species=species[organism]["full_name"],
+                                 release_version=conf_parser.get_release("wb_data_fetcher"))
             joined_sent = []
+
+            best_orthologs, selected_orth_name = df.get_best_orthologs_for_gene(
+                gene.id, orth_species_full_name=orthologs_sp_fullname)
+            if best_orthologs:
+                orth_sent = generate_ortholog_sentence(best_orthologs, selected_orth_name, human_genes_props)
+                if orth_sent:
+                    joined_sent.append(orth_sent)
             go_sent_generator = SentenceGenerator(
                 annotations=df.get_annotations_for_gene(gene_id=gene.id, annot_type=DataType.GO,
                                                         priority_list=conf_parser.get_go_annotations_priority(),
@@ -125,17 +148,23 @@ def main():
                 joined_sent.append(disease_sent)
 
             if conf_parser.get_data_fetcher() == "wb_data_fetcher" and "main_sister_species" in species[organism] and \
-                    species[organism]["main_sister_species"] and gene.name.startswith("Cbr-") and gene.name[4:] in \
-                    sister_gene_name_id_map:
+                    species[organism]["main_sister_species"] and df.get_best_orthologs_for_gene(
+                    gene.id, orth_species_full_name=[sister_sp_fullname], sister_species_data_fetcher=sister_df,
+                    ecode_priority_list=["EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "HTP", "HDA", "HMP", "HGI",
+                                         "HEP"])[0]:
+                best_ortholog = df.get_best_orthologs_for_gene(
+                    gene.id, orth_species_full_name=[sister_sp_fullname], sister_species_data_fetcher=sister_df,
+                    ecode_priority_list=["EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "HTP", "HDA", "HMP", "HGI",
+                                         "HEP"])[0][0]
                 sister_sentences_generator = SentenceGenerator(sister_df.get_annotations_for_gene(
-                    annot_type=DataType.GO, gene_id=sister_gene_name_id_map[gene.name[4:]],
+                    annot_type=DataType.GO, gene_id="WB:" + best_ortholog[0],
                     priority_list=("EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "HTP", "HDA", "HMP", "HGI", "HEP"),
                     desc_stats=gene_desc.stats), ontology=df.go_ontology, **go_sent_gen_common_props)
                 sister_proc_sent = " and ".join([sentence.text for sentence in sister_sentences_generator.get_sentences(
                     aspect='P', merge_groups_with_same_prefix=True, keep_only_best_group=True, **go_sent_common_props)])
                 if sister_proc_sent:
                     joined_sent.append("in " + species[species[organism]["main_sister_species"]]["name"] + ", " +
-                                       gene.name[4:] + " " + sister_proc_sent)
+                                       best_ortholog[1] + " " + sister_proc_sent)
             if len(joined_sent) > 0:
                 desc = "; ".join(joined_sent) + "."
                 if len(desc) > 0:
diff --git a/requirements.txt b/requirements.txt
index 3269d56..d48f3e7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+urllib3
 namedlist==1.7
 inflect==0.3.1
 PyYAML==3.12
diff --git a/tests/test_data_fetcher.py b/tests/test_data_fetcher.py
index 0e039ef..9daa7cc 100644
--- a/tests/test_data_fetcher.py
+++ b/tests/test_data_fetcher.py
@@ -54,4 +54,30 @@ def test_load_do_associations_from_file(self):
                                             exclusion_list=self.conf_parser.get_do_terms_exclusion_list())
         self.assertTrue(self.df.do_associations is not None)
 
+    def test_load_orthology_from_file(self):
+        species = self.conf_parser.get_wb_species()
+        df = WBDataFetcher(raw_files_source=self.conf_parser.get_raw_file_sources("wb_data_fetcher"),
+                           release_version="WS265", species="c_briggsae",
+                           project_id=species["c_briggsae"]["project_id"],
+                           cache_location=self.conf_parser.get_cache_location(), do_relations=None,
+                           go_relations=["subClassOf", "BFO:0000050"], sister_sp_fullname="Caenorhabditis elegans")
+        sister_df = WBDataFetcher(raw_files_source=self.conf_parser.get_raw_file_sources("wb_data_fetcher"),
+                                  release_version="WS265", species="c_elegans",
+                                  project_id=species["c_elegans"]["project_id"],
+                                  cache_location=self.conf_parser.get_cache_location(), do_relations=None,
+                                  go_relations=["subClassOf", "BFO:0000050"])
+        sister_df.load_gene_data_from_file()
+        sister_df.load_ontology_from_file(ontology_type=DataType.GO, ontology_url=sister_df.go_ontology_url,
+                                          ontology_cache_path=sister_df.go_ontology_cache_path,
+                                          terms_replacement_regex=self.conf_parser.get_go_rename_terms())
+        sister_df.load_associations_from_file(associations_type=DataType.GO,
+                                              associations_url=sister_df.go_associations_url,
+                                              associations_cache_path=sister_df.go_associations_cache_path,
+                                              exclusion_list=self.conf_parser.get_go_terms_exclusion_list())
+        df.load_orthology_from_file()  # takes no arguments: the sister species is used at selection time only
+        self.assertTrue(len(df.orthologs) > 0)
+        best_orthologs, _ = df.get_best_orthologs_for_gene("WB:WBGene00000307", [df.sister_sp_fullname], sister_df, [
+            "EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "HTP", "HDA", "HMP", "HGI", "HEP"])
+        self.assertTrue(best_orthologs is None or len(best_orthologs) == 1)  # one ortholog at most with a sister
+
 
diff --git a/tests/test_descriptions_rules.py b/tests/test_descriptions_rules.py
index 17d0660..6fff700 100644
--- a/tests/test_descriptions_rules.py
+++ b/tests/test_descriptions_rules.py
@@ -10,7 +10,7 @@ class TestDescriptionsRules(unittest.TestCase):
 
     def setUp(self):
         this_dir = os.path.split(__file__)[0]
-        self.conf_parser = GenedescConfigParser(os.path.join(this_dir, os.path.pardir, "config_wb.yml"))
+        self.conf_parser = GenedescConfigParser(os.path.join(this_dir, os.path.pardir, "config_wb_non_c_elegans.yml"))
 
     def test_generate_sentences(self):
         df = WBDataFetcher(raw_files_source=self.conf_parser.get_raw_file_sources("wb_data_fetcher"),