Skip to content

Commit 398e7f9

Browse files
authored
kwalify importer (linkml#136)
* Kwalify importer * test-files * Fixed test
1 parent c743fd0 commit 398e7f9

16 files changed

+2721
-1251
lines changed

docs/packages/importers.rst

+9
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,15 @@ The ``import-json-schema`` command can be used:
2323
2424
schemauto import-json-schema tests/resources/model_card.schema.json
2525
26+
Importing from Kwalify
27+
----------------------
28+
29+
The ``import-kwalify`` command can be used:
30+
31+
.. code-block::
32+
33+
schemauto import-kwalify tests/resources/test.kwalify.yaml
34+
2635
Importing from OWL
2736
------------------
2837

poetry.lock

+58-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ schemasheets = ">=0.1.24"
3131
xmltodict = "^0.13.0"
3232
click-default-group = "^1.2.4"
3333
linkml-runtime = "^1.7.2"
34+
duckdb = "^0.10.1"
3435

3536

3637
[tool.poetry.dev-dependencies]

schema_automator/cli.py

+26
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from schema_automator.importers.dosdp_import_engine import DOSDPImportEngine
2727
from schema_automator.generalizers.json_instance_generalizer import JsonDataGeneralizer
2828
from schema_automator.importers.jsonschema_import_engine import JsonSchemaImportEngine
29+
from schema_automator.importers.kwalify_import_engine import KwalifyImportEngine
2930
from schema_automator.importers.owl_import_engine import OwlImportEngine
3031
from schema_automator.generalizers.rdf_data_generalizer import RdfDataGeneralizer
3132
from schema_automator.importers.rdfs_import_engine import RdfsImportEngine
@@ -347,6 +348,12 @@ def generalize_toml(input, output, schema_name, omit_null, **kwargs):
347348
@output_option
348349
@schema_name_option
349350
@use_attributes_option
351+
@click.option(
352+
"--is-openapi/--no-is-openapi",
353+
default=False,
354+
show_default=True,
355+
help="If true, use OpenAPI schema style"
356+
)
350357
@click.option("--import-project/--no-import-project",
351358
help="If true, then the input path should be a directory with multiple schema files")
352359
@click.option('--format', '-f', default='json', help='JSON Schema format - yaml or json')
@@ -370,6 +377,25 @@ def import_json_schema(input, output, import_project: bool, schema_name, format,
370377
ie.import_project(input, output, name=schema_name, format=format)
371378

372379

380+
@main.command()
@click.argument('input')
@output_option
@schema_name_option
@use_attributes_option
def import_kwalify(input, output, schema_name, **kwargs):
    """
    Imports from Kwalify Schema to LinkML

    See :ref:`importers` for more on the importer framework

    Example:

        schemauto import-kwalify my/schema/personinfo.kwalify.yaml
    """
    # Remaining CLI options (e.g. the use-attributes flag) configure the engine.
    ie = KwalifyImportEngine(**kwargs)
    # This command declares no ``--format`` option, so the name ``format``
    # previously resolved to the builtin ``format()`` function and was passed
    # to the engine by accident; it is no longer forwarded.
    # NOTE(review): ``output`` is passed as the second positional argument of
    # ``convert`` -- confirm that this matches the engine's signature.
    schema = ie.convert(input, output, name=schema_name)
    write_schema(schema, output)
398+
373399
@main.command()
374400
@click.argument('input')
375401
@output_option

schema_automator/importers/cadsr_import_engine.py

+106-12
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
"""
66
import logging
77
import urllib
8-
from typing import Union, Dict, Tuple, List, Any, Optional, Iterable
8+
from typing import Union, Dict, Tuple, List, Any, Optional, Iterable, Iterator
99

1010
from dataclasses import dataclass
1111

@@ -19,6 +19,7 @@
1919
from schema_automator.importers.import_engine import ImportEngine
2020
import schema_automator.metamodels.cadsr as cadsr
2121

22+
ID_LABEL_PAIR = Tuple[str, str]
2223

2324
TMAP = {
2425
"DATE": "date",
@@ -38,6 +39,28 @@
3839
"Floating-point": "float",
3940
}
4041

42+
def extract_concepts(concepts: "List[cadsr.Concept]") -> "Tuple[ID_LABEL_PAIR, List[str]]":
    """
    Split caDSR concepts into one main NCIT concept and the remaining NCIT ids.

    Only concepts whose ``evsSource`` is ``NCI_CONCEPT_CODE`` are considered;
    all others are ignored. The concept flagged with ``primaryIndicator ==
    "Yes"`` becomes the main concept. If none is flagged, the first remaining
    concept is promoted (with a warning) so that the result always has the
    same shape.

    :param concepts: concepts to inspect
    :return: ``((curie, longName), other_curies)`` where the first element
        describes the main concept and the second lists the CURIEs of every
        other NCI concept, in input order
    :raises ValueError: if ``concepts`` is empty, if more than one concept is
        flagged primary, or if no concept has an NCI concept code
    """
    if not concepts:
        raise ValueError("No concepts")
    main = None
    others = []
    for concept in concepts:
        if concept.evsSource != "NCI_CONCEPT_CODE":
            continue
        pair = (f"NCIT:{concept.conceptCode.strip()}", concept.longName)
        if concept.primaryIndicator == "Yes":
            if main is not None:
                raise ValueError(f"Multiple primary for: {concepts}")
            main = pair
        else:
            # Keep the label too, so a non-primary concept can be promoted to
            # main without losing the (id, label) shape callers index into.
            others.append(pair)
    if main is None:
        if not others:
            raise ValueError(f"No NCI_CONCEPT_CODE concepts in: {concepts}")
        other_ids = [curie for curie, _ in others]
        logging.warning(f"No primary, using arbitrary from {other_ids}")
        main, *others = others
    return main, [curie for curie, _ in others]
63+
4164
@dataclass
4265
class CADSRImportEngine(ImportEngine):
4366
"""
@@ -94,19 +117,30 @@ def convert(self, paths: Iterable[str], id: str=None, name: str=None, **kwargs)
94117
title=cde.preferredName,
95118
description=cde.preferredDefinition,
96119
aliases=[cde.longName],
120+
conforms_to=f"cadsr:DataElement",
97121
source=source,
98122
)
123+
# each data element belongs to a concept
124+
# (may be reused across classes?)
99125
slots[slot.name] = slot
100126
concept = cde.DataElementConcept
101-
concept_name = urllib.parse.quote(camelcase(f"{ctxt} {concept.preferredName}"))
102-
parent_concept_name = urllib.parse.quote(camelcase(concept.longName))
127+
# a concept is linked to a class
128+
objectClass = concept.ObjectClass
129+
# NCIT concepts describing the class
130+
mainConcept, mappings = extract_concepts(objectClass.Concepts)
131+
class_name = objectClass.longName
132+
concept_name = urllib.parse.quote(camelcase(f"{ctxt} {class_name}"))
133+
parent_concept_name = urllib.parse.quote(class_name)
103134
if parent_concept_name not in classes:
104135
parent_cls = ClassDefinition(
105136
name=parent_concept_name,
106-
title=concept.preferredName,
107-
description=concept.preferredDefinition,
137+
title=objectClass.preferredName,
138+
description=objectClass.preferredDefinition,
108139
#aliases=[concept.longName],
109-
class_uri=f"cadsr:{concept.publicId}",
140+
class_uri=f"cadsr:{objectClass.publicId}",
141+
exact_mappings=[mainConcept[0]],
142+
broad_mappings=mappings,
143+
conforms_to=f"cadsr:ObjectClass",
110144
)
111145
classes[parent_concept_name] = parent_cls
112146
if concept_name not in classes:
@@ -117,14 +151,23 @@ def convert(self, paths: Iterable[str], id: str=None, name: str=None, **kwargs)
117151
aliases=[concept.longName],
118152
class_uri=f"cadsr:{concept.publicId}",
119153
is_a=parent_concept_name,
154+
conforms_to=f"cadsr:DataElementConcept",
120155
)
121156
classes[concept_name] = cls
122157
else:
123158
cls = classes[concept_name]
124159
cls.slots.append(slot.name)
125-
objectClass = concept.ObjectClass
126-
# TODO
160+
# In theory the ObjectClass should link to a general class of utility in NCIT.
161+
# In practice the actual concept may not be so useful. E.g. in 2724331
162+
# "Agent Adverse Event Attribution Name" the DataConcept is
163+
# Agent (C1708) defined as "An active power or cause (as principle,
164+
# substance, physical or biological factor, etc.) that produces a specific effect."
165+
# which is very upper-ontological
166+
#for ocConcept in objectClass.Concepts:
167+
# if ocConcept.evsSource == "NCI_CONCEPT_CODE":
168+
# cls.is_a = f"NCIT:{ocConcept.conceptCode}"
127169
valueDomain = cde.ValueDomain
170+
# TODO
128171
conceptualDomain = valueDomain.ConceptualDomain
129172
pvs = valueDomain.PermissibleValues
130173
if pvs:
@@ -140,7 +183,7 @@ def convert(self, paths: Iterable[str], id: str=None, name: str=None, **kwargs)
140183
rng = enum_name
141184
for pv in pvs:
142185
# url encode the value to escape symbols like <, >, etc.
143-
pv_value = urllib.parse.quote(pv.value)
186+
pv_value = urllib.parse.quote(pv.value).replace("%20", " ")
144187
tgt_pv = PermissibleValue(
145188
text=pv_value,
146189
title=pv.value,
@@ -151,9 +194,10 @@ def convert(self, paths: Iterable[str], id: str=None, name: str=None, **kwargs)
151194
tgt_pv.title = vm.preferredName
152195
if not tgt_pv.description:
153196
tgt_pv.description = vm.preferredDefinition
154-
for c in vm.Concepts:
155-
code = c.conceptCode.strip()
156-
tgt_pv.meaning = f"NCIT:{code}"
197+
if vm.Concepts:
198+
mainConcept, mappings = extract_concepts(vm.Concepts)
199+
tgt_pv.meaning = mainConcept[0]
200+
tgt_pv.broad_mappings = mappings
157201
else:
158202
datatype = valueDomain.dataType
159203
rng = TMAP.get(datatype, "string")
@@ -179,6 +223,56 @@ def convert(self, paths: Iterable[str], id: str=None, name: str=None, **kwargs)
179223
schema.enums = enums
180224
return schema
181225

226+
def as_rows(self, paths: Iterable[str], **kwargs) -> Iterator[Dict]:
    """
    Yield flat row dictionaries for the data element stored in each file.

    Each path is loaded as a ``cadsr.DataElementContainer``; its
    ``DataElement`` is then passed to ``_obj_as_rows`` with the file path as
    the parent id of the top-level row.

    :param paths: paths of the files to load
    :param kwargs: accepted but not used by this method
    :return: iterator over row dictionaries
    """
    for path in paths:
        logging.info(f"Loading {path}")
        # A plain ``with`` is used: the parenthesised form is only officially
        # supported syntax from Python 3.10 onwards.
        with open(path) as file:
            container: cadsr.DataElementContainer = json_loader.load(
                file, target_class=cadsr.DataElementContainer
            )
        # The file is closed before rows are yielded, so no handle stays open
        # while the consumer holds the generator suspended.
        cde = container.DataElement
        yield from self._obj_as_rows(cde, path)
234+
235+
def _obj_as_rows(
    self,
    e: Union[
        cadsr.DataElement,
        cadsr.DataElementConcept,
        cadsr.Concept,
        cadsr.Property,
        cadsr.ObjectClass,
        cadsr.ConceptualDomain,
        cadsr.ValueDomain,
        cadsr.PermissibleValue,
        cadsr.ValueMeaning,
    ],
    parent_id: str,
) -> Iterator[Dict]:
    """
    Yield one row for ``e``, then the rows of the objects nested under it.

    Every row carries ``id``, ``parentId`` and ``type`` plus a few fields
    that depend on the kind of object. Nested objects are visited
    depth-first, using the ``id`` of ``e`` as their parent id.

    :param e: caDSR object to flatten
    :param parent_id: value stored in the ``parentId`` field of the row for ``e``
    :return: iterator over row dictionaries
    """
    if isinstance(e, cadsr.Concept):
        row = dict(id=e.conceptCode, context=e.evsSource, longName=e.longName)
    elif isinstance(e, cadsr.CDEPermissibleValue):
        row = dict(id=e.publicId, value=e.value, valueDescription=e.valueDescription)
    else:
        row = dict(
            id=e.publicId,
            preferredName=e.preferredName,
            context=e.context,
            longName=e.longName,
        )
    row.update(parentId=parent_id, type=type(e).class_name)
    # Capture the id before handing the row to the consumer.
    row_id = row["id"]
    yield row

    # Work out which nested objects to descend into; attributes are only read
    # when their turn comes, keeping the traversal lazy.
    child_attrs = ()
    permissible_values = ()
    if isinstance(e, cadsr.DataElement):
        child_attrs = ("DataElementConcept", "ValueDomain")
    elif isinstance(e, cadsr.DataElementConcept):
        child_attrs = ("ObjectClass", "Property", "ConceptualDomain")
    elif isinstance(e, cadsr.ValueDomain):
        permissible_values = e.PermissibleValues
    for attr in child_attrs:
        yield from self._obj_as_rows(getattr(e, attr), row_id)
    for pv in permissible_values:
        # Only the value meaning is emitted, not the permissible value itself.
        yield from self._obj_as_rows(pv.ValueMeaning, row_id)

    concept_holders = (cadsr.ObjectClass, cadsr.Property, cadsr.PermissibleValue)
    if isinstance(e, concept_holders):
        for concept in e.Concepts:
            yield from self._obj_as_rows(concept, row_id)
273+
274+
275+
182276

183277

184278

0 commit comments

Comments
 (0)