Support for safetensors materializers #2539

Closed
wants to merge 30 commits
b84974d
Support for Safetensors
Dev-Khant Mar 18, 2024
3698210
add documentation and few fixes
Dev-Khant Mar 19, 2024
bd7dfa8
Merge branch 'develop' into support-for-safetensors
Dev-Khant Mar 20, 2024
dd1fac7
Merge branch 'develop' into support-for-safetensors
Dev-Khant Mar 21, 2024
c815a42
Merge branch 'develop' into support-for-safetensors
strickvl Mar 22, 2024
df476cb
lint and tests fix
Dev-Khant Mar 23, 2024
c610440
Merge branch 'develop' into support-for-safetensors
strickvl Mar 25, 2024
1a79019
Merge branch 'develop' into support-for-safetensors
Dev-Khant Mar 25, 2024
ab2a994
lint fix
Dev-Khant Mar 25, 2024
b99a465
Merge branch 'develop' into support-for-safetensors
strickvl Mar 25, 2024
22f6339
lint fix
Dev-Khant Mar 25, 2024
c7c13b4
Update src/zenml/integrations/pytorch/materializers/base_pytorch_st_m…
Dev-Khant Mar 28, 2024
ef1169c
Update src/zenml/integrations/huggingface/materializers/huggingface_p…
Dev-Khant Mar 28, 2024
ee8a40f
Update src/zenml/integrations/pytorch/materializers/pytorch_module_st…
Dev-Khant Mar 28, 2024
8e81f18
Update src/zenml/integrations/pytorch_lightning/materializers/pytorch…
Dev-Khant Mar 28, 2024
f141142
Update docs/book/user-guide/advanced-guide/data-management/handle-cus…
Dev-Khant Mar 28, 2024
059c6d0
Update docs/book/user-guide/advanced-guide/data-management/handle-cus…
Dev-Khant Mar 28, 2024
574a4d3
Merge branch 'develop' into support-for-safetensors
Dev-Khant Mar 28, 2024
0c161b7
remove alias
Dev-Khant Mar 28, 2024
6c74334
Merge branch 'develop' into support-for-safetensors
Dev-Khant Mar 30, 2024
8f9f06c
remove passing object as argument
Dev-Khant Mar 30, 2024
26f7188
Update docs/book/user-guide/advanced-guide/data-management/handle-cus…
Dev-Khant Apr 3, 2024
111aab1
Update docs/book/user-guide/advanced-guide/data-management/handle-cus…
Dev-Khant Apr 3, 2024
a0e9370
Update docs/book/user-guide/advanced-guide/data-management/handle-cus…
Dev-Khant Apr 3, 2024
8c71c36
numpy_st materializer, pytorch_lightning tests and few fixes
Dev-Khant Apr 3, 2024
8615da8
Merge branch 'develop' into support-for-safetensors
Dev-Khant Apr 3, 2024
66f6da5
use tempprary dir for HF
Dev-Khant Apr 3, 2024
083ad9f
Merge branch 'develop' into support-for-safetensors
avishniakov Apr 8, 2024
e937f05
poetry fix
Dev-Khant Apr 8, 2024
e423484
Merge branch 'develop' into support-for-safetensors
Dev-Khant Apr 18, 2024
@@ -12,7 +12,7 @@ A materializer dictates how a given artifact can be written to and retrieved fro

ZenML already includes built-in materializers for many common data types. These are always enabled and are used in the background without requiring any user interaction / activation:

<table data-full-width="true"><thead><tr><th>Materializer</th><th>Handled Data Types</th><th>Storage Format</th></tr></thead><tbody><tr><td><a href="https://sdkdocs.zenml.io/latest/core_code_docs/core-materializers/#zenml.materializers.built_in_materializer.BuiltInMaterializer">BuiltInMaterializer</a></td><td><code>bool</code>, <code>float</code>, <code>int</code>, <code>str</code>, <code>None</code></td><td><code>.json</code></td></tr><tr><td><a href="https://sdkdocs.zenml.io/latest/core_code_docs/core-materializers/#zenml.materializers.built_in_materializer.BytesMaterializer">BytesInMaterializer</a></td><td><code>bytes</code></td><td><code>.txt</code></td></tr><tr><td><a href="https://sdkdocs.zenml.io/latest/core_code_docs/core-materializers/#zenml.materializers.built_in_materializer.BuiltInContainerMaterializer">BuiltInContainerMaterializer</a></td><td><code>dict</code>, <code>list</code>, <code>set</code>, <code>tuple</code></td><td>Directory</td></tr><tr><td><a href="https://sdkdocs.zenml.io/latest/core_code_docs/core-materializers/#zenml.materializers.numpy_materializer.NumpyMaterializer">NumpyMaterializer</a></td><td><code>np.ndarray</code></td><td><code>.npy</code></td></tr><tr><td><a href="https://sdkdocs.zenml.io/latest/core_code_docs/core-materializers/#zenml.materializers.pandas_materializer.PandasMaterializer">PandasMaterializer</a></td><td><code>pd.DataFrame</code>, <code>pd.Series</code></td><td><code>.csv</code> (or <code>.gzip</code> if <code>parquet</code> is installed)</td></tr><tr><td><a href="https://sdkdocs.zenml.io/latest/core_code_docs/core-materializers/#zenml.materializers.pydantic_materializer.PydanticMaterializer">PydanticMaterializer</a></td><td><code>pydantic.BaseModel</code></td><td><code>.json</code></td></tr><tr><td><a 
href="https://sdkdocs.zenml.io/latest/core_code_docs/core-materializers/#zenml.materializers.service_materializer.ServiceMaterializer">ServiceMaterializer</a></td><td><code>zenml.services.service.BaseService</code></td><td><code>.json</code></td></tr><tr><td><a href="https://sdkdocs.zenml.io/latest/core_code_docs/core-materializers/#zenml.materializers.structured_string_materializer.StructuredStringMaterializer">StructuredStringMaterializer</a></td><td><code>zenml.types.CSVString</code>, <code>zenml.types.HTMLString</code>, <code>zenml.types.MarkdownString</code></td><td><code>.csv</code> / <code>.html</code> / <code>.md</code> (depending on type)</td></tr></tbody></table>
<table data-full-width="true"><thead><tr><th>Materializer</th><th>Handled Data Types</th><th>Storage Format</th></tr></thead><tbody><tr><td><a href="https://sdkdocs.zenml.io/latest/core_code_docs/core-materializers/#zenml.materializers.built_in_materializer.BuiltInMaterializer">BuiltInMaterializer</a></td><td><code>bool</code>, <code>float</code>, <code>int</code>, <code>str</code>, <code>None</code></td><td><code>.json</code></td></tr><tr><td><a href="https://sdkdocs.zenml.io/latest/core_code_docs/core-materializers/#zenml.materializers.built_in_materializer.BytesMaterializer">BytesInMaterializer</a></td><td><code>bytes</code></td><td><code>.txt</code></td></tr><tr><td><a href="https://sdkdocs.zenml.io/latest/core_code_docs/core-materializers/#zenml.materializers.built_in_materializer.BuiltInContainerMaterializer">BuiltInContainerMaterializer</a></td><td><code>dict</code>, <code>list</code>, <code>set</code>, <code>tuple</code></td><td>Directory</td></tr><tr><td><a href="https://sdkdocs.zenml.io/latest/core_code_docs/core-materializers/#zenml.materializers.numpy_materializer.NumpyMaterializer">NumpyMaterializer</a></td><td><code>np.ndarray</code></td><td><code>.npy</code></td></tr><tr><td><a href="https://sdkdocs.zenml.io/latest/core_code_docs/core-materializers/#zenml.materializers.numpy_st_materializer.NumpySTMaterializer">NumpySTMaterializer</a></td><td><code>bool</code>, <code>float</code>, <code>int</code></td><td><code>.safetensors</code></td></tr><tr><td><a href="https://sdkdocs.zenml.io/latest/core_code_docs/core-materializers/#zenml.materializers.pandas_materializer.PandasMaterializer">PandasMaterializer</a></td><td><code>pd.DataFrame</code>, <code>pd.Series</code></td><td><code>.csv</code> (or <code>.gzip</code> if <code>parquet</code> is installed)</td></tr><tr><td><a 
href="https://sdkdocs.zenml.io/latest/core_code_docs/core-materializers/#zenml.materializers.pydantic_materializer.PydanticMaterializer">PydanticMaterializer</a></td><td><code>pydantic.BaseModel</code></td><td><code>.json</code></td></tr><tr><td><a href="https://sdkdocs.zenml.io/latest/core_code_docs/core-materializers/#zenml.materializers.service_materializer.ServiceMaterializer">ServiceMaterializer</a></td><td><code>zenml.services.service.BaseService</code></td><td><code>.json</code></td></tr><tr><td><a href="https://sdkdocs.zenml.io/latest/core_code_docs/core-materializers/#zenml.materializers.structured_string_materializer.StructuredStringMaterializer">StructuredStringMaterializer</a></td><td><code>zenml.types.CSVString</code>, <code>zenml.types.HTMLString</code>, <code>zenml.types.MarkdownString</code></td><td><code>.csv</code> / <code>.html</code> / <code>.md</code> (depending on type)</td></tr></tbody></table>

{% hint style="warning" %}
ZenML provides a built-in [CloudpickleMaterializer](https://sdkdocs.zenml.io/latest/core\_code\_docs/core-materializers/#zenml.materializers.cloudpickle\_materializer.CloudpickleMaterializer) that can handle any object by saving it with [cloudpickle](https://github.com/cloudpipe/cloudpickle). However, this is not production-ready because the resulting artifacts cannot be loaded when running with a different Python version. In such cases, you should consider building a [custom Materializer](handle-custom-data-types.md#custom-materializers) to save your objects in a more robust and efficient format.
@@ -30,7 +30,80 @@ In addition to the built-in materializers, ZenML also provides several integrati
If you are running pipelines with a Docker-based [orchestrator](../../component-guide/orchestrators/orchestrators.md), you need to specify the corresponding integration as `required_integrations` in the `DockerSettings` of your pipeline in order to have the integration materializer available inside your Docker container. See the [pipeline configuration documentation](../pipelining-features/pipeline-settings.md) for more information.
{% endhint %}

## Custom materializers

## Safetensors Materializers

In addition to the standard integration-specific materializers, which rely on `pickle` for serialization, `Safetensors` offers a faster and more secure approach to model serialization. Further details on `Safetensors` can be found [here](https://huggingface.co/docs/safetensors/en/index).
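The on-disk layout behind that speed is simple. As a rough illustration (a toy reader/writer written against the published Safetensors file layout; real code should use the `safetensors` library itself), a file is an 8-byte little-endian header length, a JSON header mapping tensor names to dtype, shape, and byte offsets, and a flat byte buffer:

```python
import json
import struct


def write_safetensors(path, tensors):
    """Toy writer: `tensors` maps name -> (dtype_str, shape, raw_bytes)."""
    header, buffers, offset = {}, [], 0
    for name, (dtype, shape, data) in tensors.items():
        header[name] = {
            "dtype": dtype,
            "shape": shape,
            "data_offsets": [offset, offset + len(data)],
        }
        buffers.append(data)
        offset += len(data)
    header_bytes = json.dumps(header).encode("utf-8")
    with open(path, "wb") as f:
        f.write(struct.pack("<Q", len(header_bytes)))  # 8-byte LE header size
        f.write(header_bytes)
        f.write(b"".join(buffers))


def read_safetensors(path):
    """Toy reader: returns name -> raw_bytes, sliced via the JSON header."""
    with open(path, "rb") as f:
        (header_len,) = struct.unpack("<Q", f.read(8))
        header = json.loads(f.read(header_len))
        buf = f.read()
    return {
        name: buf[info["data_offsets"][0]:info["data_offsets"][1]]
        for name, info in header.items()
    }
```

Because the header records exact byte offsets, a real reader can memory-map the file and load individual tensors lazily, which is part of why Safetensors loads faster than unpickling.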

<table data-full-width="true"><thead><tr><th width="199.5">Integration</th><th width="271">Materializer</th><th width="390">Handled Data Types</th><th width="200">Storage Format</th></tr></thead><tbody><tr><td>huggingface</td><td><a href="https://sdkdocs.zenml.io/latest/integration_code_docs/integrations-huggingface/#zenml.integrations.huggingface.materializers.huggingface_pt_model_st_materializer.HFPTModelSTMaterializer">HFPTModelSTMaterializer</a></td><td><code>transformers.PreTrainedModel</code></td><td><code>.safetensors</code></td></tr><tr><td>pytorch</td><td><a href="https://sdkdocs.zenml.io/latest/integration_code_docs/integrations-pytorch/#zenml.integrations.pytorch.materializers.pytorch_module_st_materializer.PyTorchModuleSTMaterializer">PyTorchModuleSTMaterializer</a></td><td><code>torch.Module</code></td><td><code>.safetensors</code></td></tr><tr><td>pytorch_lightning</td><td><a href="https://sdkdocs.zenml.io/latest/integration_code_docs/integrations-pytorch_lightning/#zenml.integrations.pytorch_lightning.materializers.pytorch_lightning_st_materializer.PyTorchLightningSTMaterializer">PyTorchLightningSTMaterializer</a></td><td><code>torch.Module</code></td><td><code>.safetensors</code></td></tr></tbody></table>

### Example: using `PyTorchModuleSTMaterializer`

Let's see how materialization with Safetensors works in a basic example, using `resnet50` from `torchvision`. Create a pipeline with steps that save and load the model:


```python
import logging

from torch.nn import Module

from zenml import step, pipeline
from zenml.integrations.pytorch.materializers import PyTorchModuleSTMaterializer


@step(enable_cache=False, output_materializers=PyTorchModuleSTMaterializer)
def my_first_step() -> Module:
    """Step that saves a PyTorch model."""
    from torchvision.models import resnet50

    pretrained_model = resnet50()

    return pretrained_model


@step(enable_cache=False)
def my_second_step(model: Module) -> None:
    """Step that loads the model."""
    logging.info("Model loaded correctly.")


@pipeline
def first_pipeline():
    model = my_first_step()
    my_second_step(model)


if __name__ == "__main__":
    first_pipeline()
```

Running the pipeline yields output like the following:

```
Initiating a new run for the pipeline: first_pipeline.
Reusing registered pipeline version: (version: 3).
Executing a new run.
Using user: default
Using stack: default
orchestrator: default
artifact_store: default
You can visualize your pipeline runs in the ZenML Dashboard. In order to try it locally, please run zenml up.
Caching disabled explicitly for my_first_step.
Step my_first_step has started.
Step my_first_step has finished in 1.159s.
Caching disabled explicitly for my_second_step.
Step my_second_step has started.
Model loaded correctly.
Step my_second_step has finished in 0.061s.
Pipeline run has finished in 1.266s.
```

## Custom Materializers

### Configuring a step/pipeline to use a custom materializer

5 changes: 5 additions & 0 deletions pyproject.toml
@@ -116,6 +116,9 @@ azure-mgmt-resource = { version = ">=21.0.0", optional = true }
# Optional dependencies for the S3 artifact store
s3fs = { version = ">=2022.11.0", optional = true }

# Optional dependencies for materializers using safetensors
safetensors = { version = "^0.4.2", optional = true }

# Optional dependencies for the GCS artifact store
gcsfs = { version = ">=2022.11.0", optional = true }

@@ -186,6 +189,7 @@ server = [
templates = ["copier", "jinja2-time", "ruff", "pyyaml-include"]
terraform = ["python-terraform"]
secrets-aws = ["boto3"]
safetensors = ["safetensors"]
secrets-gcp = ["google-cloud-secret-manager"]
secrets-azure = ["azure-identity", "azure-keyvault-secrets"]
secrets-hashicorp = ["hvac"]
@@ -451,6 +455,7 @@ module = [
"fastapi_utils.*",
"sqlalchemy_utils.*",
"sky.*",
"safetensors.*",
"copier.*",
"datasets.*",
"pyngrok.*",
3 changes: 3 additions & 0 deletions src/zenml/integrations/huggingface/materializers/__init__.py
@@ -25,3 +25,6 @@
from zenml.integrations.huggingface.materializers.huggingface_tokenizer_materializer import (
    HFTokenizerMaterializer,
)
from zenml.integrations.huggingface.materializers.huggingface_pt_model_st_materializer import (
    HFPTModelSTMaterializer,
)
@@ -0,0 +1,112 @@
# Copyright (c) ZenML GmbH 2024. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing
# permissions and limitations under the License.
"""Implementation of the Huggingface PyTorch model materializer using Safetensors."""

import os
from typing import Any, ClassVar, Dict, Tuple, Type

import torch
from transformers import (  # type: ignore [import-untyped]
    PreTrainedModel,
)

try:
    from safetensors.torch import load_file, save_file
except ImportError:
    raise ImportError(
        "`HFPTModelSTMaterializer` requires `safetensors`. "
        "Install it by running `pip install safetensors`."
    )

from tempfile import TemporaryDirectory

from zenml.enums import ArtifactType
from zenml.materializers.base_materializer import BaseMaterializer
from zenml.metadata.metadata_types import DType, MetadataType
from zenml.utils import io_utils

DEFAULT_FILENAME = "model.safetensors"
DEFAULT_MODEL_FILENAME = "model_architecture.json"


class HFPTModelSTMaterializer(BaseMaterializer):
    """Materializer to read and write Hugging Face `PreTrainedModel` objects using Safetensors."""

    ASSOCIATED_TYPES: ClassVar[Tuple[Type[Any], ...]] = (PreTrainedModel,)
    ASSOCIATED_ARTIFACT_TYPE: ClassVar[ArtifactType] = ArtifactType.MODEL

    def load(self, data_type: Type[PreTrainedModel]) -> PreTrainedModel:
        """Reads an HF model.

        Args:
            data_type: The type of the model to read.

        Returns:
            The model read from the specified dir.
        """
        temp_dir = TemporaryDirectory()
        io_utils.copy_dir(self.uri, temp_dir.name)

        # Load the model architecture
        model_filename = os.path.join(temp_dir.name, DEFAULT_MODEL_FILENAME)
        model_arch = torch.load(model_filename)
        _model = model_arch["model"]

        # Load the model weights
        obj_filename = os.path.join(temp_dir.name, DEFAULT_FILENAME)
        weights = load_file(obj_filename)
        _model.load_state_dict(weights)

        return _model

    def save(self, model: PreTrainedModel) -> None:
        """Writes a model to the specified dir.

        Args:
            model: The Torch model to write.
        """
        temp_dir = TemporaryDirectory()

        # Save the model weights
        obj_filename = os.path.join(temp_dir.name, DEFAULT_FILENAME)
        save_file(model.state_dict(), obj_filename)

        # Save the model architecture
        model_arch = {"model": model}
        model_filename = os.path.join(temp_dir.name, DEFAULT_MODEL_FILENAME)
        torch.save(model_arch, model_filename)
bcdurak (Contributor):
I think you are on the right path here; however, there is an issue:

This is a pattern I see in all of the new materializers: AFAIK, if you do `torch.save(...)`, it saves not only the model architecture but also the weights.

You can see this in play in the example mentioned above. If you check your artifacts in your local artifact store manually, both `entire_model.safetensors` and `model_architecture.json` are present, and both are roughly 100 MB. Basically, it is saving the model twice in two different ways. We need to modify the `torch.save` and `torch.load` calls to handle only the architecture, without the weights.

Dev-Khant (Author):

@bcdurak I could not find a method in PyTorch to store just the architecture; there does not seem to be one. What would you recommend here?

bcdurak (Contributor):

This is a tough question, but the current approach is really inefficient.

It feels like we need to go back to the version where you used the `save_model` and `load_model` calls, and we somehow need to figure out how to save the model type in the `save` method. If I can think of anything, I will share it here.

Dev-Khant (Author):

Sure @bcdurak. Let me know when I should switch back to the previous method.
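One stdlib sketch of the direction discussed here (the helper names are illustrative, not part of this PR): persist only an import path plus constructor kwargs as the "architecture", so the weights live solely in the `.safetensors` file and the full module is never serialized twice:

```python
import importlib
import json


def dump_architecture(obj, init_kwargs):
    """Record how to rebuild an object: import path plus constructor kwargs."""
    return json.dumps({
        "module": type(obj).__module__,
        "name": type(obj).__qualname__,  # assumes a top-level class
        "kwargs": init_kwargs,
    })


def load_architecture(spec):
    """Re-import the class and instantiate it; weights are loaded separately."""
    info = json.loads(spec)
    cls = getattr(importlib.import_module(info["module"]), info["name"])
    return cls(**info["kwargs"])
```

For `transformers` models specifically, `save_pretrained`/`from_pretrained` already implement a richer version of this split (a config JSON plus weight files).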


        io_utils.copy_dir(
            temp_dir.name,
            self.uri,
        )

    def extract_metadata(
        self, model: PreTrainedModel
    ) -> Dict[str, "MetadataType"]:
        """Extract metadata from the given `PreTrainedModel` object.

        Args:
            model: The `PreTrainedModel` object to extract metadata from.

        Returns:
            The extracted metadata as a dictionary.
        """
        from zenml.integrations.pytorch.utils import count_module_params

        module_param_metadata = count_module_params(model)
        return {
            **module_param_metadata,
            "dtype": DType(str(model.dtype)),
            "device": str(model.device),
        }
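A side note on the `TemporaryDirectory()` usage in this materializer: created without a `with` block, the directory is only removed when the object is finalized. The context-manager form guarantees cleanup as soon as the work is done, as this stdlib sketch shows:

```python
import os
from tempfile import TemporaryDirectory

with TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "model.safetensors")
    with open(path, "wb") as f:
        f.write(b"\x00" * 8)  # stand-in for a real serialized model
    assert os.path.exists(path)

# The directory and everything in it are removed when the block exits.
assert not os.path.exists(tmp)
```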
4 changes: 4 additions & 0 deletions src/zenml/integrations/pytorch/materializers/__init__.py
@@ -19,3 +19,7 @@
from zenml.integrations.pytorch.materializers.pytorch_module_materializer import (  # noqa
    PyTorchModuleMaterializer,
)

from zenml.integrations.pytorch.materializers.pytorch_module_st_materializer import (  # noqa
    PyTorchModuleSTMaterializer,
)
@@ -0,0 +1,94 @@
# Copyright (c) ZenML GmbH 2024. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing
# permissions and limitations under the License.
"""Implementation of the base PyTorch materializer using Safetensors."""

import os
from typing import Any, ClassVar, Optional, Type

import torch

try:
    from safetensors.torch import load_file, save_file
except ImportError:
    raise ImportError(
        "`PyTorchModuleSTMaterializer` requires `safetensors`. "
        "Install it by running `pip install safetensors`."
    )

from zenml.materializers.base_materializer import BaseMaterializer

DEFAULT_FILENAME = "obj.safetensors"
DEFAULT_MODEL_FILENAME = "model_architecture.json"


class BasePyTorchSTMaterializer(BaseMaterializer):
    """Base class for PyTorch materializers using Safetensors."""

    FILENAME: ClassVar[str] = DEFAULT_FILENAME
    SKIP_REGISTRATION: ClassVar[bool] = True

    def load(self, data_type: Optional[Type[Any]]) -> Any:
        """Uses `safetensors` to load a PyTorch object.

        Args:
            data_type: The type of the object to load.

        Raises:
            ValueError: If the object is neither an `nn.Module` nor a
                `Dict[str, torch.Tensor]`.

        Returns:
            The loaded PyTorch object.
        """
        obj_filename = os.path.join(self.uri, self.FILENAME)
        try:
            # `data_type` is a type object (possibly a `Dict[...]` generic
            # alias), so unwrap any alias and compare with `issubclass`;
            # `isinstance(data_type, dict)` would always be False here.
            origin = getattr(data_type, "__origin__", data_type)
            if isinstance(origin, type) and issubclass(origin, dict):
                return load_file(obj_filename)

            # Load the model architecture
            model_filename = os.path.join(self.uri, DEFAULT_MODEL_FILENAME)
            model_arch = torch.load(model_filename)
            _model = model_arch["model"]

            # Load the model weights
            weights = load_file(obj_filename)
            _model.load_state_dict(weights)

            return _model
        except Exception as e:
            raise ValueError(f"Invalid data_type received: {e}")

    def save(self, obj: Any) -> None:
        """Uses `safetensors` to save a PyTorch object.

        Args:
            obj: The PyTorch object to save.

        Raises:
            ValueError: If the object is neither an `nn.Module` nor a
                `Dict[str, torch.Tensor]`.
        """
        obj_filename = os.path.join(self.uri, self.FILENAME)
        try:
            if isinstance(obj, dict):
                save_file(obj, obj_filename)
            else:
                # Save the model weights
                save_file(obj.state_dict(), obj_filename)

                # Save the model architecture
                model_arch = {"model": obj}
                model_filename = os.path.join(self.uri, DEFAULT_MODEL_FILENAME)
                torch.save(model_arch, model_filename)
        except Exception as e:
            raise ValueError(f"Invalid data_type received: {e}")
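A note on the type dispatch in `load` above: `data_type` arrives as a type object, not an instance, and for a type object `isinstance(data_type, dict)` is always `False`. A minimal illustration of the distinction (the helper name is illustrative):

```python
def requests_dict(data_type):
    """True when the requested type is dict or a Dict[...] generic alias."""
    origin = getattr(data_type, "__origin__", data_type)  # unwrap Dict[...]
    return isinstance(origin, type) and issubclass(origin, dict)


# A class object is not an *instance* of dict...
assert not isinstance(dict, dict)
# ...so dispatching on a requested type needs issubclass, not isinstance.
assert issubclass(dict, dict)

assert requests_dict(dict)
assert not requests_dict(list)
assert not requests_dict({})  # guards against being handed an instance
```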