make the schema in to_dict a callable
MBueschelberger committed Dec 19, 2024
1 parent 2bb6c79 commit e3cf43a
Showing 15 changed files with 118 additions and 79 deletions.
5 changes: 0 additions & 5 deletions data2rdf/config.py
@@ -79,11 +79,6 @@ class Config(BaseSettings):
description="In TBox mode, exclude the title of the ontology in the graph.",
)

dsms_schema_default: bool = Field(
True,
description="""Default value for the `dsms_schema` parameter of the `to_dict` method.""",
)

model_config = ConfigDict(extra="ignore")

@model_validator(mode="after")
51 changes: 29 additions & 22 deletions data2rdf/parsers/base.py
@@ -3,14 +3,14 @@
import json
import warnings
from abc import abstractmethod
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union

from rdflib import Graph

from data2rdf.config import Config
from data2rdf.modes import PipelineMode

from .utils import generate_id, load_mapping_file
from .utils import load_mapping_file

from pydantic import ( # isort:skip
BaseModel,
@@ -195,12 +195,32 @@ def plain_metadata(self) -> List[Dict[str, Any]]:
`plain_metadata` is deprecated and will be removed in a future version.
Use `to_dict()` instead."""
warnings.warn(message, DeprecationWarning)
return self.to_dict(dsms_schema=self.config.dsms_schema_default)
return self.to_dict()

def to_dict(
self, dsms_schema: bool = False
self, schema: Callable = None
) -> "Union[Dict[str, Any], List[Dict[str, Any]]]":
"""Return list of general metadata as DSMS custom properties"""
"""
Return general metadata as a list of dictionaries.
The list contains dictionaries, where the key is the label of the metadata,
and the value is a dictionary with the keys 'label' and 'value'. If the
metadata has a measurement unit associated with it, the dictionary will
also contain the key 'measurement_unit' with the value of the measurement
unit.
If the schema parameter is provided, it will be used to transform the
metadata list. The schema should be a callable which takes the list of
metadata dictionaries and returns the transformed metadata.
If no schema is provided, the function will return a dictionary where the
keys are the labels of the metadata, and the values are the dictionaries
from the list.
:param schema: A callable which takes a list of dictionaries and returns
the transformed metadata.
:return: A dictionary or list of dictionaries with the metadata.
"""
metadata = []
for metadatum in self.general_metadata:
prop = {
@@ -212,21 +232,8 @@ def to_dict(
"measurement_unit"
] = metadatum.measurement_unit.model_dump(exclude={"config"})
metadata.append(prop)
if dsms_schema:
if metadata:
for metadatum in metadata:
metadatum["id"] = generate_id()
metadata = {
"sections": [
{
"id": generate_id(),
"name": "General",
"entries": metadata,
}
]
}
else:
metadata = {}
if not isinstance(schema, type(None)):
metadata = schema(metadata)
else:
metadata = {datum.get("label"): datum for datum in metadata}
return metadata
@@ -313,10 +320,10 @@ def plain_metadata(cls) -> Dict[str, Any]:
"`plain_metadata` is not available in `tbox`-mode."
)

def to_dict(self, dsms_schema: bool = False) -> "List[Dict[str, Any]]":
def to_dict(self, schema: Callable = None) -> "List[Dict[str, Any]]":
"""Return list of general metadata as DSMS custom properties"""
if self.mode == PipelineMode.ABOX:
return self.abox.to_dict(dsms_schema=dsms_schema)
return self.abox.to_dict(schema=schema)
else:
raise NotImplementedError(
"`to_dict()` is not available in `tbox`-mode."
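
To see the new contract in isolation, here is a minimal, self-contained sketch of the control flow that `to_dict` now follows; the function name `to_dict_sketch` and the entry contents are illustrative, not part of the commit:

    from typing import Any, Callable, Dict, List, Union

    def to_dict_sketch(
        general_metadata: List[Dict[str, Any]],
        schema: Callable = None,
    ) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
        # Mirror the new control flow: hand the collected entry list to the
        # schema callable if one is given, otherwise key the entries by label.
        metadata = list(general_metadata)
        if schema is not None:  # the commit writes `not isinstance(schema, type(None))`
            return schema(metadata)
        return {datum.get("label"): datum for datum in metadata}

    # Illustrative entries; real ones come from the parser's general_metadata:
    entries = [
        {"label": "TestingRate", "value": 0.1,
         "measurement_unit": {"symbol": "mm/s"}},
        {"label": "SpecimenName", "value": "123-456"},
    ]

    print(to_dict_sketch(entries))  # dict keyed by label
    print(to_dict_sketch(entries, schema=lambda m: {"count": len(m), "entries": m}))
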
23 changes: 0 additions & 23 deletions data2rdf/parsers/utils.py
@@ -1,9 +1,6 @@
"""Data2RDF parser utilities"""

import json
import random
import string
import time
import warnings
from typing import TYPE_CHECKING

@@ -219,23 +216,3 @@ def _value_exists(value: "Any") -> bool:
bool: True if the value exists and is valid, otherwise False.
"""
return pd.notnull(value) and value != ""


def generate_id(prefix: str = "id") -> str:
# Generate a unique part using time and random characters
"""
Generates a unique id using a combination of the current time and 6 random characters.
Args:
prefix (str): The prefix to use for the generated id. Defaults to "id".
Returns:
str: The generated id.
"""
unique_part = f"{int(time.time() * 1000)}" # Milliseconds since epoch
random_part = "".join(
random.choices(string.ascii_lowercase + string.digits, k=6) # nosec
)
# Combine prefix, unique part, and random part
generated_id = f"{prefix}{unique_part}{random_part}"
return generated_id
6 changes: 3 additions & 3 deletions data2rdf/pipelines/main.py
@@ -2,7 +2,7 @@

import json
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union

from rdflib import Graph

@@ -210,10 +210,10 @@ def graph(cls) -> Graph:
graph += cls._validate_additional_triples(cls.additional_triples)
return graph

def to_dict(self, dsms_schema: bool = False) -> "List[Dict[str, Any]]":
def to_dict(self, schema: Callable = None) -> "List[Dict[str, Any]]":
"""Return list of general metadata as DSMS custom properties"""
if self.mode == PipelineMode.ABOX:
return self.parser.abox.to_dict(dsms_schema=dsms_schema)
return self.parser.abox.to_dict(schema=schema)
else:
raise NotImplementedError(
"`to_dict()` is not available in `tbox`-mode."
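
For callers, the migration is a one-line change: the removed `dsms_schema` boolean becomes a callable passed via `schema`. A hedged sketch, assuming `pipeline` is an already-constructed data2rdf pipeline in `abox` mode and using the `dsms_schema` test helper added further below:

    from tests.abox.utils import dsms_schema  # helper added in this commit

    # Before this commit (keyword now removed):
    #   result = pipeline.to_dict(dsms_schema=True)

    # After this commit, pass the transformation itself:
    result = pipeline.to_dict(schema=dsms_schema)

    # Without a schema, to_dict returns a dict keyed by metadata label:
    plain = pipeline.to_dict()
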
4 changes: 3 additions & 1 deletion tests/abox/csv_empty_rows/test_parser.py
@@ -2,6 +2,8 @@

import os

from ..utils import dsms_schema

test_folder = os.path.dirname(os.path.abspath(__file__))
working_folder = os.path.join(test_folder, "input")
output_folder = os.path.join(test_folder, "output")
@@ -59,5 +61,5 @@ def test_csv_nan_vals() -> None:
assert len(column) == 31

assert parser.graph.isomorphic(expected_graph)
assert parser.to_dict(dsms_schema=True) == {}
assert parser.to_dict(schema=dsms_schema) == {}
assert parser.to_dict() == {}
4 changes: 3 additions & 1 deletion tests/abox/csv_empty_rows/test_pipeline.py
@@ -2,6 +2,8 @@

import os

from ..utils import dsms_schema

test_folder = os.path.dirname(os.path.abspath(__file__))
working_folder = os.path.join(test_folder, "input")
output_folder = os.path.join(test_folder, "output")
@@ -70,5 +72,5 @@ def test_csv_na_values_pipeline() -> None:

assert pipeline.graph.isomorphic(expected_graph)

assert pipeline.to_dict(dsms_schema=True) == {}
assert pipeline.to_dict(schema=dsms_schema) == {}
assert pipeline.to_dict() == {}
6 changes: 3 additions & 3 deletions tests/abox/csv_pipeline_test/test_parser.py
@@ -5,7 +5,7 @@

import pytest

from ..utils import as_non_dsms_schema, remove_ids, sort_entries
from ..utils import as_non_dsms_schema, dsms_schema, remove_ids, sort_entries

test_folder = os.path.dirname(os.path.abspath(__file__))
working_folder = os.path.join(test_folder, "input")
@@ -305,7 +305,7 @@ def test_parser_csv(extension) -> None:

assert parser.graph.isomorphic(expected_graph)

assert remove_ids(parser.to_dict(dsms_schema=True)) == sort_entries(
assert remove_ids(parser.to_dict(schema=dsms_schema)) == sort_entries(
metadata
)
assert sort_entries(parser.to_dict()) == as_non_dsms_schema(metadata)
@@ -348,7 +348,7 @@ def test_parser_csv_input(input_kind) -> None:

assert parser.graph.isomorphic(expected_graph)

assert remove_ids(parser.to_dict(dsms_schema=True)) == sort_entries(
assert remove_ids(parser.to_dict(schema=dsms_schema)) == sort_entries(
metadata
)

6 changes: 3 additions & 3 deletions tests/abox/csv_pipeline_test/test_pipeline.py
@@ -3,7 +3,7 @@

import pytest

from ..utils import as_non_dsms_schema, remove_ids, sort_entries
from ..utils import as_non_dsms_schema, dsms_schema, remove_ids, sort_entries

test_folder = os.path.dirname(os.path.abspath(__file__))
working_folder = os.path.join(test_folder, "input")
@@ -294,7 +294,7 @@ def test_csv_pipeline(extension) -> None:

assert pipeline.graph.isomorphic(expected_graph)

assert remove_ids(pipeline.to_dict(dsms_schema=True)) == sort_entries(
assert remove_ids(pipeline.to_dict(schema=dsms_schema)) == sort_entries(
metadata
)
assert sort_entries(pipeline.to_dict()) == as_non_dsms_schema(metadata)
@@ -343,7 +343,7 @@ def test_csv_pipeline_inputs(input_kind) -> None:

assert pipeline.graph.isomorphic(expected_graph)

assert remove_ids(pipeline.to_dict(dsms_schema=True)) == sort_entries(
assert remove_ids(pipeline.to_dict(schema=dsms_schema)) == sort_entries(
metadata
)
assert sort_entries(pipeline.to_dict()) == as_non_dsms_schema(metadata)
4 changes: 3 additions & 1 deletion tests/abox/csv_without_header/test_parser.py
@@ -2,6 +2,8 @@

import os

from ..utils import dsms_schema

test_folder = os.path.dirname(os.path.abspath(__file__))
working_folder = os.path.join(test_folder, "input")
output_folder = os.path.join(test_folder, "output")
@@ -50,5 +52,5 @@ def test_csv_wo_header_parser_config() -> None:
assert len(column) == 4

assert parser.graph.isomorphic(expected_graph)
assert parser.to_dict(dsms_schema=True) == {}
assert parser.to_dict(schema=dsms_schema) == {}
assert parser.to_dict() == {}
4 changes: 3 additions & 1 deletion tests/abox/csv_without_header/test_pipeline.py
@@ -2,6 +2,8 @@

import os

from ..utils import dsms_schema

test_folder = os.path.dirname(os.path.abspath(__file__))
working_folder = os.path.join(test_folder, "input")
output_folder = os.path.join(test_folder, "output")
@@ -60,5 +62,5 @@ def test_csv_wo_header_pipeline() -> None:
expected_graph.parse(expected)

assert pipeline.graph.isomorphic(expected_graph)
assert pipeline.to_dict(dsms_schema=True) == {}
assert pipeline.to_dict(schema=dsms_schema) == {}
assert pipeline.to_dict() == {}
6 changes: 3 additions & 3 deletions tests/abox/json_pipeline_test/test_parser.py
@@ -5,7 +5,7 @@

import pytest

from ..utils import remove_ids, sort_entries
from ..utils import dsms_schema, remove_ids, sort_entries

test_folder = os.path.dirname(os.path.abspath(__file__))
working_folder = os.path.join(test_folder, "input")
@@ -85,7 +85,7 @@ def test_parser_json(mapping_format, data_format) -> None:

assert parser.graph.isomorphic(expected_graph)

assert remove_ids(parser.to_dict(dsms_schema=True)) == sort_entries(
assert remove_ids(parser.to_dict(schema=dsms_schema)) == sort_entries(
metadata
)

@@ -127,6 +127,6 @@ def test_json_parser_different_mapping_files(extension) -> None:

assert parser.graph.isomorphic(expected_graph)

assert remove_ids(parser.to_dict(dsms_schema=True)) == sort_entries(
assert remove_ids(parser.to_dict(schema=dsms_schema)) == sort_entries(
metadata
)
6 changes: 3 additions & 3 deletions tests/abox/json_pipeline_test/test_pipeline.py
@@ -5,7 +5,7 @@

import pytest

from ..utils import remove_ids, sort_entries
from ..utils import dsms_schema, remove_ids, sort_entries

test_folder = os.path.dirname(os.path.abspath(__file__))
working_folder = os.path.join(test_folder, "input")
@@ -90,7 +90,7 @@ def test_pipeline_json(mapping_format, data_format) -> None:

assert pipeline.graph.isomorphic(expected_graph)

assert remove_ids(pipeline.to_dict(dsms_schema=True)) == sort_entries(
assert remove_ids(pipeline.to_dict(schema=dsms_schema)) == sort_entries(
metadata
)

@@ -140,6 +140,6 @@ def test_json_pipeline_different_mapping_types(extension) -> None:

assert pipeline.graph.isomorphic(expected_graph)

assert remove_ids(pipeline.to_dict(dsms_schema=True)) == sort_entries(
assert remove_ids(pipeline.to_dict(schema=dsms_schema)) == sort_entries(
metadata
)
52 changes: 52 additions & 0 deletions tests/abox/utils.py
@@ -1,4 +1,7 @@
"""data2rdf pytest utilty"""
import random
import string
import time


def remove_ids(metadata: dict) -> dict:
@@ -47,3 +50,52 @@ def as_non_dsms_schema(metadata: dict) -> dict:
for entry in section.get("entries", []):
response[entry["label"]] = entry
return response


def dsms_schema(metadata: list) -> dict:
"""
Convert a flat dictionary to a DSMS schema.
The input should be a dictionary with each key-value pair representing
a metadata entry. The output is a dictionary in the DSMS schema, with
a single section named "General", containing the given metadata entries.
:param metadata: The metadata dictionary to convert.
:return: A dictionary in the DSMS schema.
"""
if metadata:
for metadatum in metadata:
metadatum["id"] = generate_id()
metadata = {
"sections": [
{
"id": generate_id(),
"name": "General",
"entries": metadata,
}
]
}
else:
metadata = {}

return metadata


def generate_id(prefix: str = "id") -> str:
"""
Generate a unique id from the current time and 6 random characters.

Args:
prefix (str): The prefix to use for the generated id. Defaults to "id".

Returns:
str: The generated id.
"""
unique_part = f"{int(time.time() * 1000)}" # Milliseconds since epoch
random_part = "".join(
random.choices(string.ascii_lowercase + string.digits, k=6) # nosec
)
# Combine prefix, unique part, and random part
generated_id = f"{prefix}{unique_part}{random_part}"
return generated_id
