Skip to content

Commit

Permalink
Merge pull request #676 from biolink/go-site-2246-gorule-0000001-rnac…
Browse files Browse the repository at this point in the history
…-rna-types-are-getting-mangled

Go site 2246 gorule 0000001 rnac rna types are getting mangled
  • Loading branch information
mugitty authored Jun 11, 2024
2 parents 38611d1 + 0f91398 commit e574591
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 2 deletions.
1 change: 1 addition & 0 deletions ontobio/io/assocparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,7 @@ class Report(object):
VIOLATES_GO_RULE = "Violates GO Rule"
RULE_PASS = "Passes GO Rule"
INVALID_REFERENCES = "Only one reference per ID space allowed"
INVALID_SUBJECT_TYPE = "Invalid subject type"

def __init__(self, group="unknown", dataset="unknown", config=None):
self.messages = []
Expand Down
10 changes: 10 additions & 0 deletions ontobio/io/gafparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,9 @@ def to_association(gaf_line: List[str], report=None, group="unknown", dataset="u
DB_OBJECT_SYMBOL = 2
TAXON_INDEX = 12
REFERENCE_INDEX = 5
DEFAULT_SUBJECT_TYPE = 'gene_product'
STR_DEFAULT_SUBJECT_TYPE_CURIE = str(association.map_gp_type_label_to_curie(DEFAULT_SUBJECT_TYPE))

if gaf_line[DB_INDEX] == "":
report.error(source_line, Report.INVALID_IDSPACE, "EMPTY", "col1 is empty", taxon=gaf_line[TAXON_INDEX], rule=1)
return assocparser.ParseResult(source_line, [], True, report=report)
Expand Down Expand Up @@ -417,7 +420,14 @@ def to_association(gaf_line: List[str], report=None, group="unknown", dataset="u

interacting_taxon = parsed_taxons_result.parsed[1] if len(parsed_taxons_result.parsed) == 2 else None
subject_curie = association.Curie(gaf_line[0], gaf_line[1])
type_label = gaf_line[11]
subject = association.Subject(subject_curie, gaf_line[2], [gaf_line[9]], gaf_line[10].split("|"), [association.map_gp_type_label_to_curie(gaf_line[11])], taxon)
# Output a warning if the system is defaulting to gene_product
if DEFAULT_SUBJECT_TYPE != type_label and len(subject.type) == 1 and STR_DEFAULT_SUBJECT_TYPE_CURIE == str(subject.type[0]):
report.warning(source_line, Report.INVALID_SUBJECT_TYPE, type_label, "defaulting to 'gene_product'", taxon=gaf_line[TAXON_INDEX], rule=1)
if association.map_gp_type_label_to_repair_curie(type_label) is not None:
report.warning(source_line, Report.INVALID_SUBJECT_TYPE, type_label, "has been repaired", taxon=gaf_line[TAXON_INDEX], rule=1)

gpi_entity = bio_entities.get(subject_curie)
if gpi_entity is not None and subject != gpi_entity:
subject = gpi_entity
Expand Down
25 changes: 23 additions & 2 deletions ontobio/model/association.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,20 +209,24 @@ def fullname_field(self, max=None) -> str:

# ===============================================================================
__default_entity_type_to_curie_mapping = bidict.bidict({
"autocatalytically_spliced_intron": Curie.from_str("SO:0000588"),
"protein_coding_gene": Curie.from_str("SO:0001217"),
"snRNA": Curie.from_str("SO:0000274"),
"ncRNA": Curie.from_str("SO:0000655"),
"rRNA": Curie.from_str("SO:0000252"),
"mRNA": Curie.from_str("SO:0000234"),
"lnc_RNA": Curie.from_str("SO:0001877"),
"lincRNA": Curie.from_str("SO:0001463"),
"lncRNA": Curie.from_str("SO:0001877"),
"tRNA": Curie.from_str("SO:0000253"),
"snoRNA": Curie.from_str("SO:0000275"),
"miRNA": Curie.from_str("SO:0000276"),
"RNA": Curie.from_str("SO:0000356"),
"scRNA": Curie.from_str("SO:0000013"),
"piRNA": Curie.from_str("SO:0001035"),
"pre_miRNA": Curie.from_str("SO:0001244"),
"tmRNA": Curie.from_str("SO:0000584"),
"scaRNA": Curie.from_str("SO:0002095"),
"siRNA": Curie.from_str("SO:0000646"),
"SRP_RNA": Curie.from_str("SO:0000590"),
"primary_transcript": Curie.from_str("SO:0000185"),
"ribozyme": Curie.from_str("SO:0000374"),
Expand All @@ -234,12 +238,14 @@ def fullname_field(self, max=None) -> str:
"hammerhead_ribozyme": Curie.from_str("SO:0000380"),
"protein": Curie.from_str("PR:000000001"),
"pseudogene": Curie.from_str("SO:0000336"),
"pseudogenic_transcript": Curie.from_str("SO:0000516"),
"gene": Curie.from_str("SO:0000704"),
"biological region": Curie.from_str("SO:0001411"),
"protein_complex": Curie.from_str("GO:0032991"),
"transcript": Curie.from_str("SO:0000673"),
"gene_product": Curie.from_str("CHEBI:33695"),
"antisense_lncRNA": Curie.from_str("SO:0001904"),
"antisense_lncRNA_gene": Curie.from_str("SO:0002182"),
"transposable_element_gene": Curie.from_str("SO:0000111"),
"gene_segment": Curie.from_str("SO:3000000"),
"genetic_marker": Curie.from_str("SO:0001645"),
Expand All @@ -257,7 +263,14 @@ def fullname_field(self, max=None) -> str:
"snRNA_gene": Curie.from_str("SO:0001268"),
"SRP_RNA_gene": Curie.from_str("SO:0001269"),
"telomerase_RNA_gene": Curie.from_str("SO:0001643"),
"tRNA_gene": Curie.from_str("SO:0001272")
"tRNA_gene": Curie.from_str("SO:0001272"),
"vault_RNA": Curie.from_str("SO:0000404"),
"Y_RNA": Curie.from_str("SO:0000405")
})

# ===============================================================================
# Repair table for entity type labels. map_gp_type_label_to_curie() consults it
# only when a label is missing from the default mapping, and
# map_gp_type_label_to_repair_curie() looks labels up here directly.
__repair_entity_type_to_curie_mapping = bidict.bidict({
"lnc_RNA": Curie.from_str("SO:0001877")
})

def map_gp_type_label_to_curie(type_label: str) -> Curie:
Expand All @@ -267,9 +280,13 @@ def map_gp_type_label_to_curie(type_label: str) -> Curie:
This is a measure to upgrade the pseudo-labels into proper Curies. Present here are
the existing set of labels in current use, and how they should be mapped into CURIEs.
A label that is missing from the default mapping but present in the repair mapping is repaired to the corresponding Sequence Ontology (SO) Curie.
"""
# normalized_label = type_label.translate()
global __default_entity_type_to_curie_mapping
global __repair_entity_type_to_curie_mapping
if type_label not in __default_entity_type_to_curie_mapping and type_label in __repair_entity_type_to_curie_mapping:
return __repair_entity_type_to_curie_mapping.get(type_label)
return __default_entity_type_to_curie_mapping.get(type_label, __default_entity_type_to_curie_mapping["gene_product"])

def gp_type_label_to_curie(type: Curie) -> str:
Expand All @@ -279,6 +296,10 @@ def gp_type_label_to_curie(type: Curie) -> str:
global __default_entity_type_to_curie_mapping
return __default_entity_type_to_curie_mapping.inverse.get(type, "gene_product")

def map_gp_type_label_to_repair_curie(type_label: str) -> Curie:
    """
    Look up `type_label` in the repair mapping only.

    Returns the Curie registered for the label in the repair mapping, or None
    when the label has no entry there.
    """
    # The mapping is only read here, so no `global` declaration is required.
    repaired_curie = __repair_entity_type_to_curie_mapping.get(type_label)
    return repaired_curie

@dataclass(unsafe_hash=True)
class Term:
"""
Expand Down
29 changes: 29 additions & 0 deletions tests/test_gafparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,36 @@ def test_obsolete_replair_of_withfrom():
assoc_result = p.parse_line(obsolete_no_replacement_line)
assert assoc_result.associations == []
assert p.report.to_report_json()["messages"]["gorule-0000020"][0]["obj"] == "GO:0016458"


def test_invalid_db_type():
    """Check how the DB object type column of a GAF line is mapped onto the subject type."""

    def parse_with_type(type_label):
        # Every case uses the same GAF row; only the DB object type (index 11) varies.
        gaf_line = ["UniProtKB", "P0AFI2", "parC", "", "GO:0003916", "PMID:1334483", "IDA", "", "F", "", "", type_label, "taxon:83333", "20081208", "EcoliWiki"]
        return gafparser.to_association(gaf_line)

    # Known labels ('gene_product', 'protein') get mapped to themselves.
    for known_label in ("gene_product", "protein"):
        result = parse_with_type(known_label)
        subject_type = result.associations[0].subject.type
        assert subject_type == [association.map_gp_type_label_to_curie(known_label)]

    # Unhandled types get mapped to 'gene_product'; 'lnc_RNA' gets repaired to 'lncRNA'.
    # Both cases must also be reported under gorule-0000001 as an invalid subject type.
    flagged_cases = (
        ("invalid_gene_product", "gene_product"),
        ("lnc_RNA", "lncRNA"),
    )
    for given_label, expected_label in flagged_cases:
        result = parse_with_type(given_label)
        subject_type = result.associations[0].subject.type
        assert subject_type == [association.map_gp_type_label_to_curie(expected_label)]
        rule_messages = result.report.to_report_json()["messages"]["gorule-0000001"]
        assert rule_messages[0]["type"] == result.report.INVALID_SUBJECT_TYPE



def test_subject_extensions_bad_curie():
"""
Expand Down

0 comments on commit e574591

Please sign in to comment.