diff --git a/genedescriptions/data_fetcher.py b/genedescriptions/data_fetcher.py index 6015145..ffc471d 100644 --- a/genedescriptions/data_fetcher.py +++ b/genedescriptions/data_fetcher.py @@ -249,27 +249,37 @@ def load_gene_data_from_file(self): @staticmethod def get_human_gene_props(): - """ retrieve data for human genes, including Ensembl ID, symbol, name, and family name + """ retrieve data for human genes, including Ensembl ID, symbol, name, and family symbol and name Returns: Dict[List[str]]: a dictionary of all human genes properties, indexed by Ensembl ID """ human_genes_props = defaultdict(list) - human_content = urllib.request.urlopen("https://www.genenames.org/cgi-bin/download?col=gd_hgnc_id&col=g" - "d_app_sym&col=gd_app_name&col=gd_pub_ensembl_id&col=family.id&c" - "ol=family.name&status=Approved&status=Entry+Withdrawn&status_op" - "t=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbt" - "ag=on&submit=submit") + human_content_w_ensmbl = urllib.request.urlopen("https://www.genenames.org/cgi-bin/download?col=gd_hgnc_id&col=" + "gd_pub_ensembl_id&status=Approved&status=Entry+Withdrawn&statu" + "s_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgn" + "c_dbtag=on&submit=submit") + human_content_w_fam_sym = urllib.request.urlopen( + "https://www.genenames.org/cgi-bin/genefamilies/download-all/tsv") + + header = True + for line in human_content_w_ensmbl: + if not header: + linearr = line.decode("utf-8").split("\t") + linearr[-1] = linearr[-1].strip() + if linearr[1] != "": + human_genes_props[linearr[0][5:]] = [linearr[1]] + else: + header = False header = True - for line in human_content: + for line in human_content_w_fam_sym: if not header: linearr = line.decode("utf-8").split("\t") linearr[-1] = linearr[-1].strip() - if linearr[3] != "": - human_genes_props[linearr[3]] = [linearr[1], linearr[2], linearr[5]] + human_genes_props[linearr[0]].extend([linearr[1], linearr[2], linearr[9], linearr[10]]) else: header = False - return human_genes_props + return {v[0]: v[1:] for k, v in human_genes_props.items()} class WBDataFetcher(DataFetcher): diff --git a/genedescriptions/descriptions_rules.py b/genedescriptions/descriptions_rules.py index 10ce30e..f58c216 100644 --- a/genedescriptions/descriptions_rules.py +++ b/genedescriptions/descriptions_rules.py @@ -438,27 +438,33 @@ def _generate_ortholog_sentence_wormbase_human(orthologs: List[List[str]], human """ if len(orthologs) > 3: gene_families = defaultdict(list) - gene_symbols_wo_family = [] + gene_symbols_wo_family = set() for ortholog in orthologs: - if human_genes_props[ortholog[0]]: + if ortholog[0] in human_genes_props and human_genes_props[ortholog[0]] and \ + len(human_genes_props[ortholog[0]]) == 4: gene_families[human_genes_props[ortholog[0]][2]].append(human_genes_props[ortholog[0]]) else: - gene_symbols_wo_family.append(ortholog[1]) + gene_symbols_wo_family.add(ortholog[1]) if len(list(gene_families.keys())) == 1: - gene_symbols_wo_family.extend([human_p[0] + " (" + human_p[1] + ")" for human_p in - gene_families[list(gene_families.keys())[0]]]) + gene_symbols_wo_family.update(set([human_p[0] + " (" + human_p[1] + ")" for human_p in + gene_families[list(gene_families.keys())[0]]])) gene_families = {} else: - for family_name, human_ps in gene_families.items(): - if family_name == "" or len(orthologs) == 1: - gene_symbols_wo_family.append(human_ps[0][0] + " (" + human_ps[0][1] + ")") + for family_symbol, human_ps in gene_families.items(): + if family_symbol == "" or len(human_ps) == 1: + for human_p in human_ps: + gene_symbols_wo_family.add(human_p[0] + " (" + human_p[1] + ")") gene_families = {family_name: human_ps for family_name, human_ps in gene_families.items() if len(human_ps) > 1 and family_name != ""} - gene_family_names = list(gene_families.keys()) - genes_in_families = [hps[0][0] for hps in gene_families.values()] + gene_family_names = [human_ps[0][2] + " (" + human_ps[0][3] + ")" for human_ps in gene_families.values()] + genes_in_families = list(set([hps[0] for gene_list in gene_families.values() for hps in gene_list])) + gene_symbols_wo_family = list(gene_symbols_wo_family) if len(gene_family_names) > 3: gene_family_names = gene_family_names[0:3] + if len(genes_in_families) > 3: genes_in_families = genes_in_families[0:3] + if len(gene_symbols_wo_family) > 3: + gene_symbols_wo_family = gene_symbols_wo_family[0:3] family_word = "family" if len(gene_family_names) > 1: family_word = "families" @@ -472,7 +478,8 @@ def _generate_ortholog_sentence_wormbase_human(orthologs: List[List[str]], human orth_sentence = "is an ortholog of " + " and ".join(sentences_arr) else: symbol_name_arr = sorted([human_genes_props[best_orth[0]][0] + " (" + human_genes_props[best_orth[0]][1] + - ")" if human_genes_props[best_orth[0]] else best_orth[1] for best_orth in orthologs]) + ")" if best_orth[0] in human_genes_props and human_genes_props[best_orth[0]] else + best_orth[1] for best_orth in orthologs]) orth_sentence = "is an ortholog of human " + concatenate_words_with_oxford_comma(symbol_name_arr) return orth_sentence