Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Export to rdf/xml #53

Merged
merged 15 commits into from
Feb 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 79 additions & 1 deletion bam_masterdata/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,16 @@
import click
from decouple import config as environ
from openpyxl import Workbook
from rdflib import Graph

from bam_masterdata.cli.entities_to_excel import entities_to_excel
from bam_masterdata.cli.entities_to_json import entities_to_json
from bam_masterdata.cli.entities_to_rdf import entities_to_rdf
from bam_masterdata.cli.fill_masterdata import MasterdataCodeGenerator
from bam_masterdata.logger import logger
from bam_masterdata.utils import (
delete_and_create_dir,
duplicated_property_types,
import_module,
listdir_py_modules,
)
Expand Down Expand Up @@ -164,6 +167,12 @@ def export_to_json(force_delete, python_path):

# Process each module using the `model_to_json` method of each entity
for module_path in py_modules:
if module_path.endswith("property_types.py"):
if duplicated_property_types(module_path=module_path, logger=logger):
click.echo(
"Please fix the duplicated property types before exporting to RDF/XML."
)
return
entities_to_json(module_path=module_path, export_dir=export_dir, logger=logger)

click.echo(f"All entity artifacts have been generated and saved to {export_dir}")
Expand Down Expand Up @@ -211,9 +220,15 @@ def export_to_excel(force_delete, python_path):
definitions_module = import_module(module_path=str(definitions_path.resolve()))

# Process the modules and save the entities to the openBIS masterdata Excel file
masterdata_file = os.path.join(".", "artifacts", "masterdata.xlsx")
masterdata_file = os.path.join(export_dir, "masterdata.xlsx")
wb = Workbook()
for i, module_path in enumerate(py_modules):
if module_path.endswith("property_types.py"):
if duplicated_property_types(module_path=module_path, logger=logger):
click.echo(
"Please fix the duplicated property types before exporting to RDF/XML."
)
return
if i == 0:
ws = wb.active
else:
Expand All @@ -234,5 +249,68 @@ def export_to_excel(force_delete, python_path):
click.echo(f"All masterdata have been generated and saved to {masterdata_file}")


@cli.command(
    name="export_to_rdf",
    help="Export entities to a RDF/XML file in the path `./artifacts/masterdata.owl`.",
)
@click.option(
    "--force-delete",
    type=bool,
    required=False,
    default=False,
    help="""
    (Optional) If set to `True`, it will delete the current `./artifacts/` folder and create a new one. Default is `False`.
    """,
)
@click.option(
    "--python-path",
    type=str,
    required=False,
    default=DATAMODEL_DIR,
    help="""
    (Optional) The path to the individual Python module or the directory containing the Python modules to process the datamodel.
    Default is `./bam_masterdata/datamodel/`.
    """,
)
def export_to_rdf(force_delete, python_path):
    """
    Export the datamodel entities to an RDF/XML file saved as `./artifacts/masterdata.owl`.

    Every Python module listed for `python_path` is converted into RDF triples that are collected in a
    single graph, except modules whose path contains `vocabulary_types.py`. The export is aborted before
    the output file is written if the `property_types.py` module contains duplicated property types.

    Args:
        force_delete (bool): Forwarded to `delete_and_create_dir` as `force_delete` for the `./artifacts/` folder.
        python_path (str): Path to the Python module, or to the directory of modules, to process.
    """
    # Get the directories from the Python modules and the export directory for the static artifacts
    export_dir = os.path.join(".", "artifacts")

    # Delete and create the export directory
    delete_and_create_dir(
        directory_path=export_dir,
        logger=logger,
        force_delete=force_delete,
    )

    # Get the Python modules to process the datamodel
    py_modules = listdir_py_modules(directory_path=python_path, logger=logger)
    # ! Remove the module containing 'vocabulary_types.py'
    py_modules = [
        module for module in py_modules if "vocabulary_types.py" not in module
    ]

    # Process each module with `entities_to_rdf`; all modules share the same graph
    graph = Graph()
    for module_path in py_modules:
        if module_path.endswith("property_types.py"):
            # Duplicated property types abort the export before anything is written to disk
            if duplicated_property_types(module_path=module_path, logger=logger):
                click.echo(
                    "Please fix the duplicated property types before exporting to RDF/XML."
                )
                return
        entities_to_rdf(graph=graph, module_path=module_path, logger=logger)

    # Saving RDF/XML to file
    rdf_output = graph.serialize(format="pretty-xml")
    masterdata_file = os.path.join(export_dir, "masterdata.owl")
    with open(masterdata_file, "w", encoding="utf-8") as f:
        f.write(rdf_output)

    click.echo(
        f"All masterdata has been generated in RDF/XML format and saved to {masterdata_file}"
    )


if __name__ == "__main__":
cli()
236 changes: 236 additions & 0 deletions bam_masterdata/cli/entities_to_rdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
import inspect
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from rdflib import Graph
from structlog._config import BoundLoggerLazyProxy

import click
from rdflib import BNode, Literal, Namespace
from rdflib.namespace import DC, OWL, RDF, RDFS

from bam_masterdata.utils import code_to_class_name, import_module

BAM = Namespace("https://bamresearch.github.io/bam-masterdata/")
PROV = Namespace("http://www.w3.org/ns/prov#")


def rdf_graph_init(g: "Graph") -> None:
    """
    Prepare an RDF graph with the base content shared by every exported module.

    The following is added to `g`, in this order:

    1. prefix bindings for the `dc`, `owl`, `rdf`, `rdfs`, `bam` and `prov` namespaces;
    2. `owl:AnnotationProperty` declarations for `rdfs:label`, `rdfs:comment` and `dc:identifier`;
    3. the custom annotation properties `bam:dataType` and `bam:propertyLabel`, each with an English
       label and comment;
    4. the object properties `bam:hasMandatoryProperty`, `bam:hasOptionalProperty` and `bam:referenceTo`;
    5. placeholder resources for `PropertyType`, `ObjectType`, `CollectionType` and `DatasetType`.

    Args:
        g (Graph): The RDF graph to be initialized. It is modified in place.
    """
    # NOTE: the continuation lines of the multi-line literals below are part of the emitted text,
    # so they are deliberately not indented with the surrounding code.
    data_type_comment = """Represents the data type of a property as defined in the openBIS platform.
This annotation is used to ensure alignment with the native data types in openBIS,
facilitating seamless integration and data exchange.

The allowed values for this annotation correspond directly to the openBIS type system,
including BOOLEAN, CONTROLLEDVOCABULARY, DATE, HYPERLINK, INTEGER, MULTILINE_VARCHAR, OBJECT,
REAL, TIMESTAMP, VARCHAR, and XML.

While `bam:dataType` is primarily intended for internal usage with openBIS, mappings to
standard vocabularies such as `xsd` (e.g., `xsd:boolean`, `xsd:string`) are possible to use and documented to
enhance external interoperability. The full mapping is:
- BOOLEAN: xsd:boolean
- CONTROLLEDVOCABULARY: xsd:string
- DATE: xsd:date
- HYPERLINK: xsd:anyURI
- INTEGER: xsd:integer
- MULTILINE_VARCHAR: xsd:string
- OBJECT: bam:ObjectType
- REAL: xsd:decimal
- TIMESTAMP: xsd:dateTime
- VARCHAR: xsd:string
- XML: xsd:string"""
    property_label_comment = """A UI-specific annotation used in openBIS to provide an alternative label for a property
displayed in the frontend. Not intended for semantic reasoning or interoperability beyond openBIS."""
    prop_type_description = """A conceptual placeholder used to define and organize properties as first-class entities.
PropertyType is used to place properties and define their metadata, separating properties from the
entities they describe.

In integration scenarios:
- PropertyType can align with `BFO:Quality` for inherent attributes.
- PropertyType can represent `BFO:Role` if properties serve functional purposes.
- PropertyType can be treated as a `prov:Entity` when properties participate in provenance relationships."""

    # Register the prefixes of the base namespaces
    for prefix, namespace in (
        ("dc", DC),
        ("owl", OWL),
        ("rdf", RDF),
        ("rdfs", RDFS),
        ("bam", BAM),
        ("prov", PROV),
    ):
        g.bind(prefix, namespace)

    # Terms from the base namespaces that are declared as annotation properties
    for standard_term in (RDFS.label, RDFS.comment, DC.identifier):
        g.add((standard_term, RDF.type, OWL.AnnotationProperty))

    # Custom annotation properties from openBIS: `dataType` and `propertyLabel`
    for annotation_uri, annotation_comment in (
        (BAM["dataType"], data_type_comment),
        (BAM["propertyLabel"], property_label_comment),
    ):
        # The label is the last path segment of the URI, prefixed with `bam:`
        short_name = annotation_uri.rsplit("/", 1)[-1]
        g.add((annotation_uri, RDF.type, OWL.AnnotationProperty))
        g.add((annotation_uri, RDFS.label, Literal(f"bam:{short_name}", lang="en")))
        g.add((annotation_uri, RDFS.comment, Literal(annotation_comment, lang="en")))

    # Internal BAM object properties, described as (local name, constraint predicate, comment).
    # The constraint always points at `bam:PropertyType`: it is the range of the two `has...Property`
    # relations and the domain of `referenceTo`. No domain is declared for the former and no range
    # for the latter.
    # ? `section`, `ordinal`, `show_in_edit_views`?
    object_properties = (
        (
            "hasMandatoryProperty",
            RDFS.range,
            "The property must be mandatorily filled when creating the object in openBIS.",
        ),
        (
            "hasOptionalProperty",
            RDFS.range,
            "The property is optionally filled when creating the object in openBIS.",
        ),
        (
            "referenceTo",
            RDFS.domain,
            "The property is referencing an object existing in openBIS.",
        ),
    )
    for local_name, constraint, explanation in object_properties:
        subject = BAM[local_name]
        g.add((subject, RDF.type, OWL.ObjectProperty))
        g.add((subject, constraint, BAM.PropertyType))
        g.add((subject, RDFS.label, Literal(local_name, lang="en")))
        g.add((subject, RDFS.comment, Literal(explanation, lang="en")))

    # Base PropertyType and the other entity types as placeholders; only PropertyType gets a comment
    for placeholder in ("PropertyType", "ObjectType", "CollectionType", "DatasetType"):
        placeholder_uri = BAM[placeholder]
        g.add((placeholder_uri, RDF.type, OWL.Thing))
        g.add((placeholder_uri, RDFS.label, Literal(placeholder, lang="en")))
        if placeholder != "PropertyType":
            continue
        g.add((placeholder_uri, RDFS.comment, Literal(prop_type_description, lang="en")))


def entities_to_rdf(
graph: "Graph", module_path: str, logger: "BoundLoggerLazyProxy"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does the "Graph" class work? Does it insert everything into the file already nested?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Graph is the class used to describe the triples in ontologies, and hence, when it is printed to RDF/XML, the output already has the format we are looking for.

"Triples" are normally 2 nodes connected via a relationship, something like (node_1, relationship, node_2); you can see some examples of this wherever Graph.add() is used. Basically, it is a way of defining DAGs.

Here the complication is not printing to RDF/XML or creating the Graph, but actually mapping the openBIS info into the triples.

) -> None:
"""
Convert the entities defined in the specified module to RDF triples and add them to the graph. The function
uses the `model_to_rdf` method defined in each class to convert the class attributes to RDF triples. The
function also adds the PropertyType and other entity types as placeholders in the graph.

Args:
graph (Graph): The RDF graph to which the entities are added.
module_path (str): The path to the module containing the entities to be converted.
logger (BoundLoggerLazyProxy): The logger to log messages.
"""
rdf_graph_init(graph)

module = import_module(module_path=module_path)

# Special case of `PropertyTypeDef` in `property_types.py`
# PROPERTY TYPES
# skos:prefLabel used for class names
# skos:definition used for `description` (en, de)
# skos:altLabel used for `property_label`
# dc:identifier used for `code` # ! only defined for internal codes with $ symbol
# dc:type used for `data_type`
if "property_types.py" in module_path:
for name, obj in inspect.getmembers(module):
if name.startswith("_") or name == "PropertyTypeDef":
continue
prop_uri = BAM[obj.id]

# Define the property as an OWL class inheriting from PropertyType
graph.add((prop_uri, RDF.type, OWL.Thing))
graph.add((prop_uri, RDFS.subClassOf, BAM.PropertyType))

# Add attributes like id, code, description in English and Deutsch, property_label, data_type
graph.add((prop_uri, RDFS.label, Literal(obj.id, lang="en")))
graph.add((prop_uri, DC.identifier, Literal(obj.code)))
descriptions = obj.description.split("//")
if len(descriptions) > 1:
graph.add((prop_uri, RDFS.comment, Literal(descriptions[0], lang="en")))
graph.add((prop_uri, RDFS.comment, Literal(descriptions[1], lang="de")))
else:
graph.add((prop_uri, RDFS.comment, Literal(obj.description, lang="en")))
graph.add(
(prop_uri, BAM.propertyLabel, Literal(obj.property_label, lang="en"))
)
graph.add((prop_uri, BAM.dataType, Literal(obj.data_type.value)))
if obj.data_type.value == "OBJECT":
# entity_ref_uri = BAM[code_to_class_name(obj.object_code)]
# graph.add((prop_uri, BAM.referenceTo, entity_ref_uri))
if not code_to_class_name(obj.object_code, logger):
logger.error(
f"Failed to identify the `object_code` for the property {obj.id}"
)
continue
entity_ref_uri = BAM[code_to_class_name(obj.object_code, logger)]

# Create a restriction with referenceTo
restriction = BNode()
graph.add((restriction, RDF.type, OWL.Restriction))
graph.add((restriction, OWL.onProperty, BAM["referenceTo"]))
graph.add((restriction, OWL.someValuesFrom, entity_ref_uri))

# Add the restriction as a subclass of the property
graph.add((prop_uri, RDFS.subClassOf, restriction))
return None

# All other datamodel modules
# OBJECT/DATASET/COLLECTION TYPES
# skos:prefLabel used for class names
# skos:definition used for `description` (en, de)
# dc:identifier used for `code` # ! only defined for internal codes with $ symbol
# parents defined from `code`
# assigned properties can be Mandatory or Optional, can be PropertyType or ObjectType
# ? For OBJECT TYPES
# ? `generated_code_prefix`, `auto_generated_codes`?
for name, obj in inspect.getmembers(module, inspect.isclass):
# Ensure the class has the `model_to_rdf` method
if not hasattr(obj, "defs") or not callable(getattr(obj, "model_to_rdf")):
continue
try:
# Instantiate the class and call the method
entity = obj()
entity.model_to_rdf(namespace=BAM, graph=graph)
except Exception as err:
click.echo(f"Failed to process class {name} in {module_path}: {err}")
7 changes: 5 additions & 2 deletions bam_masterdata/cli/fill_masterdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ class will inherit from `parent_class`.
class_names (dict): A dictionary with the class names of the entities.
default (str): The default parent class if the parent class does not exist.
lines (list): A list of strings to be printed to the Python module.

logger (BoundLoggerLazyProxy): The logger to log messages.
Returns:
tuple: The parent code, parent class, and class name of the entity.
"""
Expand Down Expand Up @@ -138,6 +138,9 @@ def generate_property_types(self) -> str:
Generate Python code for the property types in the Openbis datamodel. The code is generated
as a string which is then printed out to the specific Python module in `bam_masterdata/datamodel/property_types.py`.

Args:
logger (BoundLoggerLazyProxy): The logger to log messages.

Returns:
str: Python code for the property types.
"""
Expand All @@ -154,7 +157,7 @@ def generate_property_types(self) -> str:
continue

# Format class name
class_name = code_to_class_name(code, entity_type="property")
class_name = code_to_class_name(code=code, entity_type="property")

# Add class definition
lines.append(f"{class_name} = PropertyTypeDef(")
Expand Down
Loading