make the schema in to_dict a callable
MBueschelberger committed Dec 19, 2024
1 parent 2bb6c79 commit e3cf43a
Showing 15 changed files with 118 additions and 79 deletions.
5 changes: 0 additions & 5 deletions data2rdf/config.py
@@ -79,11 +79,6 @@ class Config(BaseSettings):
description="In TBox mode, exclude the title of the ontology in the graph.",
)

dsms_schema_default: bool = Field(
True,
description="""Default value for the `dsms_schema` parameter of the `to_dict` method.""",
)

model_config = ConfigDict(extra="ignore")

@model_validator(mode="after")
51 changes: 29 additions & 22 deletions data2rdf/parsers/base.py
@@ -3,14 +3,14 @@
import json
import warnings
from abc import abstractmethod
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union

from rdflib import Graph

from data2rdf.config import Config
from data2rdf.modes import PipelineMode

from .utils import generate_id, load_mapping_file
from .utils import load_mapping_file

from pydantic import ( # isort:skip
BaseModel,
@@ -195,12 +195,32 @@ def plain_metadata(self) -> List[Dict[str, Any]]:
`plain_metadata` is deprecated and will be removed in a future version.
Use `to_dict()` instead."""
warnings.warn(message, DeprecationWarning)
return self.to_dict(dsms_schema=self.config.dsms_schema_default)
return self.to_dict()

def to_dict(
self, dsms_schema: bool = False
self, schema: Callable = None
) -> "Union[Dict[str, Any], List[Dict[str, Any]]]":
"""Return list of general metadata as DSMS custom properties"""
"""
Return general metadata as a list of dictionaries.
The list contains dictionaries, where the key is the label of the metadata,
and the value is a dictionary with the keys 'label' and 'value'. If the
metadata has a measurement unit associated with it, the dictionary will
also contain the key 'measurement_unit' with the value of the measurement
unit.
If the schema parameter is provided, it will be used to transform the
metadata list. The schema should be a callable which takes the list of
metadata dictionaries and returns the transformed metadata.
If no schema is provided, the function will return a dictionary where the
keys are the labels of the metadata, and the values are the dictionaries
from the list.
:param schema: A callable which takes a list of dictionaries and returns
the transformed metadata.
:return: A dictionary or list of dictionaries with the metadata.
"""
metadata = []
for metadatum in self.general_metadata:
prop = {
@@ -212,21 +232,8 @@ def to_dict(
"measurement_unit"
] = metadatum.measurement_unit.model_dump(exclude={"config"})
metadata.append(prop)
if dsms_schema:
if metadata:
for metadatum in metadata:
metadatum["id"] = generate_id()
metadata = {
"sections": [
{
"id": generate_id(),
"name": "General",
"entries": metadata,
}
]
}
else:
metadata = {}
if not isinstance(schema, type(None)):
metadata = schema(metadata)
else:
metadata = {datum.get("label"): datum for datum in metadata}
return metadata
@@ -313,10 +320,10 @@ def plain_metadata(cls) -> Dict[str, Any]:
"`plain_metadata` is not available in `tbox`-mode."
)

def to_dict(self, dsms_schema: bool = False) -> "List[Dict[str, Any]]":
def to_dict(self, schema: Callable = None) -> "List[Dict[str, Any]]":
"""Return list of general metadata as DSMS custom properties"""
if self.mode == PipelineMode.ABOX:
return self.abox.to_dict(dsms_schema=dsms_schema)
return self.abox.to_dict(schema=schema)
else:
raise NotImplementedError(
"`to_dict()` is not available in `tbox`-mode."
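
To see the new contract in isolation, here is a minimal, self-contained sketch of the control flow that `to_dict` now follows; the function name `to_dict_sketch` and the entry contents are illustrative, not part of the commit:

    from typing import Any, Callable, Dict, List, Union

    def to_dict_sketch(
        general_metadata: List[Dict[str, Any]],
        schema: Callable = None,
    ) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
        # Mirror the new control flow: hand the collected entry list to the
        # schema callable if one is given, otherwise key the entries by label.
        metadata = list(general_metadata)
        if schema is not None:  # the commit writes `not isinstance(schema, type(None))`
            return schema(metadata)
        return {datum.get("label"): datum for datum in metadata}

    # Illustrative entries; real ones come from the parser's general_metadata:
    entries = [
        {"label": "TestingRate", "value": 0.1,
         "measurement_unit": {"symbol": "mm/s"}},
        {"label": "SpecimenName", "value": "123-456"},
    ]

    print(to_dict_sketch(entries))  # dict keyed by label
    print(to_dict_sketch(entries, schema=lambda m: {"count": len(m), "entries": m}))
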
23 changes: 0 additions & 23 deletions data2rdf/parsers/utils.py
@@ -1,9 +1,6 @@
"""Data2RDF parser utilities"""

import json
import random
import string
import time
import warnings
from typing import TYPE_CHECKING

@@ -219,23 +216,3 @@ def _value_exists(value: "Any") -> bool:
bool: True if the value exists and is valid, otherwise False.
"""
return pd.notnull(value) and value != ""


def generate_id(prefix: str = "id") -> str:
# Generate a unique part using time and random characters
"""
Generates a unique id using a combination of the current time and 6 random characters.
Args:
prefix (str): The prefix to use for the generated id. Defaults to "id".
Returns:
str: The generated id.
"""
unique_part = f"{int(time.time() * 1000)}" # Milliseconds since epoch
random_part = "".join(
random.choices(string.ascii_lowercase + string.digits, k=6) # nosec
)
# Combine prefix, unique part, and random part
generated_id = f"{prefix}{unique_part}{random_part}"
return generated_id
6 changes: 3 additions & 3 deletions data2rdf/pipelines/main.py
@@ -2,7 +2,7 @@

import json
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union

from rdflib import Graph

@@ -210,10 +210,10 @@ def graph(cls) -> Graph:
graph += cls._validate_additional_triples(cls.additional_triples)
return graph

def to_dict(self, dsms_schema: bool = False) -> "List[Dict[str, Any]]":
def to_dict(self, schema: Callable = None) -> "List[Dict[str, Any]]":
"""Return list of general metadata as DSMS custom properties"""
if self.mode == PipelineMode.ABOX:
return self.parser.abox.to_dict(dsms_schema=dsms_schema)
return self.parser.abox.to_dict(schema=schema)
else:
raise NotImplementedError(
"`to_dict()` is not available in `tbox`-mode."
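
For callers, the migration is a one-line change: the removed `dsms_schema` boolean becomes a callable passed via `schema`. A hedged sketch, assuming `pipeline` is an already-constructed data2rdf pipeline in `abox` mode and using the `dsms_schema` test helper added further below:

    from tests.abox.utils import dsms_schema  # helper added in this commit

    # Before this commit (keyword now removed):
    #   result = pipeline.to_dict(dsms_schema=True)

    # After this commit, pass the transformation itself:
    result = pipeline.to_dict(schema=dsms_schema)

    # Without a schema, to_dict returns a dict keyed by metadata label:
    plain = pipeline.to_dict()
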
4 changes: 3 additions & 1 deletion tests/abox/csv_empty_rows/test_parser.py
@@ -2,6 +2,8 @@

import os

from ..utils import dsms_schema

test_folder = os.path.dirname(os.path.abspath(__file__))
working_folder = os.path.join(test_folder, "input")
output_folder = os.path.join(test_folder, "output")
@@ -59,5 +61,5 @@ def test_csv_nan_vals() -> None:
assert len(column) == 31

assert parser.graph.isomorphic(expected_graph)
assert parser.to_dict(dsms_schema=True) == {}
assert parser.to_dict(schema=dsms_schema) == {}
assert parser.to_dict() == {}
4 changes: 3 additions & 1 deletion tests/abox/csv_empty_rows/test_pipeline.py
@@ -2,6 +2,8 @@

import os

from ..utils import dsms_schema

test_folder = os.path.dirname(os.path.abspath(__file__))
working_folder = os.path.join(test_folder, "input")
output_folder = os.path.join(test_folder, "output")
@@ -70,5 +72,5 @@ def test_csv_na_values_pipeline() -> None:

assert pipeline.graph.isomorphic(expected_graph)

assert pipeline.to_dict(dsms_schema=True) == {}
assert pipeline.to_dict(schema=dsms_schema) == {}
assert pipeline.to_dict() == {}
6 changes: 3 additions & 3 deletions tests/abox/csv_pipeline_test/test_parser.py
@@ -5,7 +5,7 @@

import pytest

from ..utils import as_non_dsms_schema, remove_ids, sort_entries
from ..utils import as_non_dsms_schema, dsms_schema, remove_ids, sort_entries

test_folder = os.path.dirname(os.path.abspath(__file__))
working_folder = os.path.join(test_folder, "input")
@@ -305,7 +305,7 @@ def test_parser_csv(extension) -> None:

assert parser.graph.isomorphic(expected_graph)

assert remove_ids(parser.to_dict(dsms_schema=True)) == sort_entries(
assert remove_ids(parser.to_dict(schema=dsms_schema)) == sort_entries(
metadata
)
assert sort_entries(parser.to_dict()) == as_non_dsms_schema(metadata)
@@ -348,7 +348,7 @@ def test_parser_csv_input(input_kind) -> None:

assert parser.graph.isomorphic(expected_graph)

assert remove_ids(parser.to_dict(dsms_schema=True)) == sort_entries(
assert remove_ids(parser.to_dict(schema=dsms_schema)) == sort_entries(
metadata
)

6 changes: 3 additions & 3 deletions tests/abox/csv_pipeline_test/test_pipeline.py
@@ -3,7 +3,7 @@

import pytest

from ..utils import as_non_dsms_schema, remove_ids, sort_entries
from ..utils import as_non_dsms_schema, dsms_schema, remove_ids, sort_entries

test_folder = os.path.dirname(os.path.abspath(__file__))
working_folder = os.path.join(test_folder, "input")
@@ -294,7 +294,7 @@ def test_csv_pipeline(extension) -> None:

assert pipeline.graph.isomorphic(expected_graph)

assert remove_ids(pipeline.to_dict(dsms_schema=True)) == sort_entries(
assert remove_ids(pipeline.to_dict(schema=dsms_schema)) == sort_entries(
metadata
)
assert sort_entries(pipeline.to_dict()) == as_non_dsms_schema(metadata)
@@ -343,7 +343,7 @@ def test_csv_pipeline_inputs(input_kind) -> None:

assert pipeline.graph.isomorphic(expected_graph)

assert remove_ids(pipeline.to_dict(dsms_schema=True)) == sort_entries(
assert remove_ids(pipeline.to_dict(schema=dsms_schema)) == sort_entries(
metadata
)
assert sort_entries(pipeline.to_dict()) == as_non_dsms_schema(metadata)
4 changes: 3 additions & 1 deletion tests/abox/csv_without_header/test_parser.py
@@ -2,6 +2,8 @@

import os

from ..utils import dsms_schema

test_folder = os.path.dirname(os.path.abspath(__file__))
working_folder = os.path.join(test_folder, "input")
output_folder = os.path.join(test_folder, "output")
@@ -50,5 +52,5 @@ def test_csv_wo_header_parser_config() -> None:
assert len(column) == 4

assert parser.graph.isomorphic(expected_graph)
assert parser.to_dict(dsms_schema=True) == {}
assert parser.to_dict(schema=dsms_schema) == {}
assert parser.to_dict() == {}
4 changes: 3 additions & 1 deletion tests/abox/csv_without_header/test_pipeline.py
@@ -2,6 +2,8 @@

import os

from ..utils import dsms_schema

test_folder = os.path.dirname(os.path.abspath(__file__))
working_folder = os.path.join(test_folder, "input")
output_folder = os.path.join(test_folder, "output")
@@ -60,5 +62,5 @@ def test_csv_wo_header_pipeline() -> None:
expected_graph.parse(expected)

assert pipeline.graph.isomorphic(expected_graph)
assert pipeline.to_dict(dsms_schema=True) == {}
assert pipeline.to_dict(schema=dsms_schema) == {}
assert pipeline.to_dict() == {}
6 changes: 3 additions & 3 deletions tests/abox/json_pipeline_test/test_parser.py
@@ -5,7 +5,7 @@

import pytest

from ..utils import remove_ids, sort_entries
from ..utils import dsms_schema, remove_ids, sort_entries

test_folder = os.path.dirname(os.path.abspath(__file__))
working_folder = os.path.join(test_folder, "input")
@@ -85,7 +85,7 @@ def test_parser_json(mapping_format, data_format) -> None:

assert parser.graph.isomorphic(expected_graph)

assert remove_ids(parser.to_dict(dsms_schema=True)) == sort_entries(
assert remove_ids(parser.to_dict(schema=dsms_schema)) == sort_entries(
metadata
)

@@ -127,6 +127,6 @@ def test_json_parser_different_mapping_files(extension) -> None:

assert parser.graph.isomorphic(expected_graph)

assert remove_ids(parser.to_dict(dsms_schema=True)) == sort_entries(
assert remove_ids(parser.to_dict(schema=dsms_schema)) == sort_entries(
metadata
)
6 changes: 3 additions & 3 deletions tests/abox/json_pipeline_test/test_pipeline.py
@@ -5,7 +5,7 @@

import pytest

from ..utils import remove_ids, sort_entries
from ..utils import dsms_schema, remove_ids, sort_entries

test_folder = os.path.dirname(os.path.abspath(__file__))
working_folder = os.path.join(test_folder, "input")
@@ -90,7 +90,7 @@ def test_pipeline_json(mapping_format, data_format) -> None:

assert pipeline.graph.isomorphic(expected_graph)

assert remove_ids(pipeline.to_dict(dsms_schema=True)) == sort_entries(
assert remove_ids(pipeline.to_dict(schema=dsms_schema)) == sort_entries(
metadata
)

@@ -140,6 +140,6 @@ def test_json_pipeline_different_mapping_types(extension) -> None:

assert pipeline.graph.isomorphic(expected_graph)

assert remove_ids(pipeline.to_dict(dsms_schema=True)) == sort_entries(
assert remove_ids(pipeline.to_dict(schema=dsms_schema)) == sort_entries(
metadata
)
52 changes: 52 additions & 0 deletions tests/abox/utils.py
@@ -1,4 +1,7 @@
"""data2rdf pytest utilty"""
import random
import string
import time


def remove_ids(metadata: dict) -> dict:
@@ -47,3 +50,52 @@ def as_non_dsms_schema(metadata: dict) -> dict:
for entry in section.get("entries", []):
response[entry["label"]] = entry
return response


def dsms_schema(metadata: list) -> dict:
"""
Convert a flat dictionary to a DSMS schema.
The input should be a dictionary with each key-value pair representing
a metadata entry. The output is a dictionary in the DSMS schema, with
a single section named "General", containing the given metadata entries.
:param metadata: The metadata dictionary to convert.
:return: A dictionary in the DSMS schema.
"""
if metadata:
for metadatum in metadata:
metadatum["id"] = generate_id()
metadata = {
"sections": [
{
"id": generate_id(),
"name": "General",
"entries": metadata,
}
]
}
else:
metadata = {}

return metadata


def generate_id(prefix: str = "id") -> str:
"""
Generate a unique id from the current time and 6 random characters.

Args:
prefix (str): The prefix to use for the generated id. Defaults to "id".

Returns:
str: The generated id.
"""
unique_part = f"{int(time.time() * 1000)}" # Milliseconds since epoch
random_part = "".join(
random.choices(string.ascii_lowercase + string.digits, k=6) # nosec
)
# Combine prefix, unique part, and random part
generated_id = f"{prefix}{unique_part}{random_part}"
return generated_id
