Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Demonstrate using TAXRANK for ranks and properties #120

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
177 changes: 96 additions & 81 deletions src/ncbitaxon.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from datetime import date
from textwrap import dedent

from tabulate import tabulate

oboInOwl = {
"SynonymTypeProperty": "synonym_type_property",
"hasAlternativeId": "has_alternative_id",
Expand Down Expand Up @@ -44,52 +46,54 @@
"teleomorph": (related_synonym, None, None),
}


# Rank names known to the converter, spelled as they appear in the
# "rank" field of a node (multi-word ranks keep their spaces).
# NOTE(review): this list is the pre-change side of the diff; a dict with
# the same name is introduced immediately after it.
ranks = [
"class",
"cohort",
"family",
"forma",
"genus",
"infraclass",
"infraorder",
"kingdom",
"order",
"parvorder",
"phylum",
"section",
"series",
"species group",
"species subgroup",
"species",
"subclass",
"subcohort",
"subfamily",
"subgenus",
"subkingdom",
"suborder",
"subphylum",
"subsection",
"subspecies",
"subtribe",
"superclass",
"superfamily",
"superkingdom",
"superorder",
"superphylum",
"tribe",
"varietas",
"strain",
"serogroup",
"biotype",
"clade",
"forma specialis",
"isolate",
"serotype",
"genotype",
"morph",
"pathogroup",
]
#: A mapping from ranks appearing in the NCBI Taxonomy database
#: to CURIEs for terms in the TAXRANK ontology.
#:
#: Keys are looked up with the node's rank string as-is, so they keep the
#: database spelling (including spaces, e.g. "species group"). Values are
#: CURIEs written with the ``TAXRANK:`` prefix.
ranks: dict[str, str] = {
"class": "TAXRANK:0000002",
"family": "TAXRANK:0000004",
"forma": "TAXRANK:0000026",
"genus": "TAXRANK:0000005",
"infraclass": "TAXRANK:0000019",
"infraorder": "TAXRANK:0000013",
"kingdom": "TAXRANK:0000017",
"order": "TAXRANK:0000003",
"parvorder": "TAXRANK:0000021",
"phylum": "TAXRANK:0000001",
"section": "TAXRANK:0000030",
"series": "TAXRANK:0000031",
"species group": "TAXRANK:0000010",
"species subgroup": "TAXRANK:0000011",
"species": "TAXRANK:0000006",
"subclass": "TAXRANK:0000007",
"subfamily": "TAXRANK:0000024",
"subgenus": "TAXRANK:0000009",
"subkingdom": "TAXRANK:0000029",
"suborder": "TAXRANK:0000014",
"subphylum": "TAXRANK:0000008",
"subsection": "TAXRANK:0000053",
"subspecies": "TAXRANK:0000023",
"subtribe": "TAXRANK:0000028",
"superclass": "TAXRANK:0000015",
"superfamily": "TAXRANK:0000018",
"superkingdom": "TAXRANK:0000022",
"superorder": "TAXRANK:0000020",
"superphylum": "TAXRANK:0000027",
"tribe": "TAXRANK:0000025",
"varietas": "TAXRANK:0000016",
# The entries below use identifiers in the TAXRANK:0001xxx range.
"cohort": "TAXRANK:0001010",
"subcohort": "TAXRANK:0001012",
"strain": "TAXRANK:0001001",
"serogroup": "TAXRANK:0001006",
"biotype": "TAXRANK:0001008",
"clade": "TAXRANK:0001004",
"forma specialis": "TAXRANK:0001005",
"isolate": "TAXRANK:0001002",
"serotype": "TAXRANK:0001003",
"genotype": "TAXRANK:0001007",
"morph": "TAXRANK:0001009",
"pathogroup": "TAXRANK:0001011",
# "no rank" is given an explicit entry, so it resolves like any other key.
"no rank": "TAXRANK:0000060",
}

nodes_fields = [
"tax_id", # node id in GenBank taxonomy database
Expand All @@ -108,6 +112,8 @@
]

# Tally of rank strings that could not be resolved against `ranks`,
# keyed by the rank string; printed as a summary at the end of a run.
UNRECOGNIZED_RANKS = Counter()
# Tally of ranks that were emitted; the visible code increments it by the
# rank's TAXRANK CURIE.
RECOGNIZED_RANKS = Counter()
# One example taxon per rank, stored as a (taxon CURIE, label) pair, used
# when printing the rank-usage table.
# NOTE(review): in the visible diff the membership test uses the rank
# string while the value is stored under the rank's CURIE, and the summary
# loop unpacks RECOGNIZED_RANKS keys as (rank, curie) pairs. The keying
# looks inconsistent; confirm before merging.
RANK_EXAMPLES = {}


def escape_literal(text):
Expand All @@ -120,7 +126,8 @@ def label_to_id(text):

def convert_synonyms(tax_id, synonyms):
"""Given a tax_id and list of synonyms,
return a Turtle string asserting triples and OWL annotations on them."""
return a Turtle string asserting triples and OWL annotations on them.
"""
output = []
for synonym, unique, name_class in synonyms:
if name_class in predicates:
Expand All @@ -143,7 +150,8 @@ def convert_synonyms(tax_id, synonyms):

def convert_node(node, label, merged, synonyms, citations):
"""Given a node dictionary, a label string, and lists for merged, synonyms, and citations,
return a Turtle string representing this tax_id."""
return a Turtle string representing this tax_id.
"""
tax_id = node["tax_id"]
output = [f"NCBITaxon:{tax_id} a owl:Class"]

Expand All @@ -155,19 +163,19 @@ def convert_node(node, label, merged, synonyms, citations):
output.append(f"; rdfs:subClassOf NCBITaxon:{parent_tax_id}")

rank = node["rank"]
if rank and rank != "" and rank != "no rank":
if rank not in ranks:
if rank:
rank_curie = ranks.get(rank)
if not rank_curie:
if rank not in UNRECOGNIZED_RANKS:
print(f"unrecognized rank: '{rank}'")
UNRECOGNIZED_RANKS[rank] += 1
rank = label_to_id(rank)
# WARN: This is a special case for backward compatibility
if rank in ["species_group", "species_subgroup"]:
output.append(
f"; ncbitaxon:has_rank <http://purl.obolibrary.org/obo/NCBITaxon#_{rank}>"
)
else:
output.append(f"; ncbitaxon:has_rank NCBITaxon:{rank}")
RECOGNIZED_RANKS[rank_curie] += 1
output.append(f"; TAXRANK:1000000 {rank_curie}")

# Keep track of examples of each rank for making tables later
if rank not in RANK_EXAMPLES:
RANK_EXAMPLES[rank_curie] = f"NCBITaxon:{tax_id}", label

gc_id = node["genetic_code_id"]
if gc_id:
Expand Down Expand Up @@ -197,7 +205,8 @@ def convert(taxdmp_path, output_path, taxa=None):
and an optional set of tax_id strings to extract,
read from the taxdmp.zip file, collect annotations,
convert nodes to Turtle strings,
and write to the output file."""
and write to the output file.
"""
scientific_names = defaultdict(list)
labels = {}
synonyms = defaultdict(list)
Expand All @@ -217,6 +226,7 @@ def convert(taxdmp_path, output_path, taxa=None):
@prefix terms: <http://purl.org/dc/terms/> .
@prefix ncbitaxon: <http://purl.obolibrary.org/obo/ncbitaxon#> .
@prefix NCBITaxon: <http://purl.obolibrary.org/obo/NCBITaxon_> .
@prefix TAXRANK: <http://purl.obolibrary.org/obo/TAXRANK_> .
@prefix : <http://purl.obolibrary.org/obo/ncbitaxon.owl#> .

<http://purl.obolibrary.org/obo/ncbitaxon.owl> a owl:Ontology
Expand All @@ -233,11 +243,11 @@ def convert(taxdmp_path, output_path, taxa=None):
; rdfs:label "definition"^^xsd:string
.

ncbitaxon:has_rank a owl:AnnotationProperty
TAXRANK:1000000 a owl:AnnotationProperty
; obo:IAO_0000115 "A metadata relation between a class and its taxonomic rank (eg species, family)"^^xsd:string
; rdfs:label "has_rank"^^xsd:string
; rdfs:comment "This is an abstract class for use with the NCBI taxonomy to name the depth of the node within the tree. The link between the node term and the rank is only visible if you are using an obo 1.3 aware browser/editor; otherwise this can be ignored"^^xsd:string
; oboInOwl:hasOBONamespace "ncbi_taxonomy"^^xsd:string
; oboInOwl:hasOBONamespace "taxonomic_rank"^^xsd:string
.
"""
)
Expand Down Expand Up @@ -285,7 +295,7 @@ def convert(taxdmp_path, output_path, taxa=None):
print("WARN: Duplicate unique names", tax_ids, uniques)
for tax_id, unique in values:
labels[tax_id] = unique
# Reason for the line below
# Reason for the line below
# issue #56: https://github.com/obophenotype/ncbitaxon/issues/56
if name != 'environmental samples':
synonyms[tax_id].append(
Expand Down Expand Up @@ -335,33 +345,38 @@ def convert(taxdmp_path, output_path, taxa=None):
)
output.write(result)

print("Summary of unrecognized ranks:")
print(UNRECOGNIZED_RANKS)
if UNRECOGNIZED_RANKS:
print("\nSummary of unrecognized ranks:\n")
print(tabulate(UNRECOGNIZED_RANKS.most_common(), tablefmt="github"))

print("\nSummary of rank usage:\n")
print(
tabulate(
[
(rank, curie, count, *RANK_EXAMPLES[rank])
for (rank, curie), count in RECOGNIZED_RANKS.most_common()
],
headers=["NCBI Rank", "CURIE", "Count", "Example CURIE", "Example Name"],
tablefmt="github",
)
)
# TODO: delnodes

output.write(
"""
<http://purl.obolibrary.org/obo/NCBITaxon#_taxonomic_rank> a owl:Class
; rdfs:label "taxonomic rank"^^xsd:string
; rdfs:comment "This is an abstract class for use with the NCBI taxonomy to name the depth of the node within the tree. The link between the node term and the rank is only visible if you are using an obo 1.3 aware browser/editor; otherwise this can be ignored."^^xsd:string
; oboInOwl:hasOBONamespace "ncbi_taxonomy"^^xsd:string
.
TAXRANK:0000000 a owl:Class ;
rdfs:label "taxonomic_rank"^^xsd:string
oboInOwl:hasOBONamespace "taxonomic_rank"^^xsd:string .
"""
)
for label in ranks:
rank = label_to_id(label)
if rank in ["species_group", "species_subgroup"]:
iri = f"<http://purl.obolibrary.org/obo/NCBITaxon#_{rank}>"
else:
iri = f"NCBITaxon:{rank}"
for label, rank_curie in ranks.items():
output.write(
f"""
{iri} a owl:Class
; rdfs:label "{label}"^^xsd:string
; rdfs:subClassOf <http://purl.obolibrary.org/obo/NCBITaxon#_taxonomic_rank>
; oboInOwl:hasOBONamespace "ncbi_taxonomy"^^xsd:string
.
"""
dedent(f"""\
{rank_curie} a owl:Class ;
rdfs:label "{label}"^^xsd:string ;
rdfs:subClassOf TAXRANK:0000000
oboInOwl:hasOBONamespace "taxonomic_rank"^^xsd:string .
""")
)


Expand Down