From 1dfe2916b8c63e9b1d5a1053f5ce1707a391f79d Mon Sep 17 00:00:00 2001 From: Ralf Date: Wed, 23 Oct 2024 15:58:11 +0300 Subject: [PATCH 01/30] [FSTORE-1537][APPEND] External feature group fix for getting features from dataset (#369) --- python/hsfs/engine/spark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index 2ff6bc39d..69b17915a 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -197,7 +197,7 @@ def register_external_temporary_table(self, external_fg, alias): external_fg.query, external_fg.data_format, external_fg.options, - external_fg.prepare_spark_location(), + external_fg.storage_connector._get_path(external_fg.path), # cant rely on location since this method can be used before FG is saved ) else: external_dataset = external_fg.dataframe From 5c596c9a21edffbeaaadc811f37e0e2c88d5a12a Mon Sep 17 00:00:00 2001 From: manu-sj <152865565+manu-sj@users.noreply.github.com> Date: Thu, 24 Oct 2024 11:15:06 +0200 Subject: [PATCH 02/30] [FSTORE-1576] Fix built-in transformation functions workflow test and Parallelize transformation function tests --- python/hsfs/builtin_transformations.py | 4 ++-- python/hsfs/engine/python.py | 21 ++++++++++++++------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/python/hsfs/builtin_transformations.py b/python/hsfs/builtin_transformations.py index 1fc2ce670..4426268cc 100644 --- a/python/hsfs/builtin_transformations.py +++ b/python/hsfs/builtin_transformations.py @@ -43,7 +43,7 @@ def robust_scaler(feature: pd.Series, statistics=feature_statistics) -> pd.Serie ) -@udf(int, drop=["feature"]) +@udf(int, drop=["feature"], mode="pandas") def label_encoder(feature: pd.Series, statistics=feature_statistics) -> pd.Series: unique_data = sorted([value for value in statistics.feature.unique_values]) value_to_index = {value: index for index, value in enumerate(unique_data)} @@ -56,7 +56,7 @@ def label_encoder(feature: pd.Series, statistics=feature_statistics) -> pd.Serie ) -@udf(bool, drop=["feature"]) +@udf(bool, drop=["feature"], mode="pandas") def one_hot_encoder(feature: pd.Series, statistics=feature_statistics) -> pd.Series: unique_data = [value for value in statistics.feature.unique_values] diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index b2fb1968d..34fc1ffd5 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -1213,7 +1213,9 @@ def save_stream_dataframe( ) def save_empty_dataframe( - self, feature_group: Union[FeatureGroup, ExternalFeatureGroup], new_features=None + self, + feature_group: Union[FeatureGroup, ExternalFeatureGroup], + new_features=None, ) -> None: """Wrapper around save_dataframe in order to provide no-op.""" pass @@ -1406,7 +1408,9 @@ def _apply_pandas_udf( for feature in hopsworks_udf.transformation_features ] ) - ) + ).set_index( + dataframe.index + ) # Index is set to the input dataframe index so that pandas would merge the new columns without reordering them. else: dataframe[hopsworks_udf.output_column_names[0]] = hopsworks_udf.get_udf( online=False @@ -1417,9 +1421,11 @@ def _apply_pandas_udf( for feature in hopsworks_udf.transformation_features ] ) - ) + ).set_axis( + dataframe.index + ) # Index is set to the input dataframe index so that pandas would merge the new column without reordering it. 
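# Illustration of the index handling above (a minimal sketch, not taken from this module):
# pandas aligns on index labels rather than position when assigning a Series to a DataFrame
# column, so without resetting the UDF result's index the new column could be reordered or
# filled with NaN:
#   df = pd.DataFrame({"f": [10, 20, 30]}, index=[7, 8, 9])
#   out = pd.Series([1, 2, 3])                 # default RangeIndex 0..2
#   df["t"] = out                              # label alignment -> all NaN
#   df["t"] = out.set_axis(df.index)           # keeps positional order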
if hopsworks_udf.output_column_names[0] in dataframe.columns: - # Overwriting features so reordering dataframe to move overwritten column to the end of the dataframe + # Overwriting features also reordering dataframe to move overwritten column to the end of the dataframe cols = dataframe.columns.tolist() cols.append(cols.pop(cols.index(hopsworks_udf.output_column_names[0]))) dataframe = dataframe[cols] @@ -1582,9 +1588,10 @@ def _start_offline_materialization(offline_write_options: Dict[str, Any]) -> boo def _convert_feature_log_to_df(feature_log, cols) -> pd.DataFrame: if feature_log is None and cols: return pd.DataFrame(columns=cols) - if not (isinstance(feature_log, (list, pd.DataFrame, pl.DataFrame)) or ( - HAS_NUMPY and isinstance(feature_log, np.ndarray) - )): + if not ( + isinstance(feature_log, (list, pd.DataFrame, pl.DataFrame)) + or (HAS_NUMPY and isinstance(feature_log, np.ndarray)) + ): raise ValueError(f"Type '{type(feature_log)}' not accepted") if isinstance(feature_log, list) or ( HAS_NUMPY and isinstance(feature_log, np.ndarray) From 07daf8969d361c3951c9472b08edb4b3fa7faf42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20de=20la=20R=C3=BAa=20Mart=C3=ADnez?= Date: Tue, 10 Sep 2024 08:37:38 +0200 Subject: [PATCH 03/30] [HWORKS-1221] Bump protobuf==^4.25.4 --- python/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index a66d15115..6ff1e6a0c 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -54,7 +54,7 @@ dependencies = [ "opensearch-py>=1.1.0,<=2.4.2", "tqdm", "grpcio>=1.49.1,<2.0.0", # ^1.49.1 - "protobuf>=3.19.0,<4.0.0", # ^3.19.0 + "protobuf>=4.25.4,<5.0.0", # ^4.25.4 ] [project.optional-dependencies] From e8fdae26a2b2b8c3f02b8951b48797dcadd503e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20de=20la=20R=C3=BAa=20Mart=C3=ADnez?= Date: Fri, 13 Sep 2024 17:29:42 +0200 Subject: [PATCH 04/30] [HWORKS-1224] Add llm signature and openai endpoint --- python/hopsworks_common/constants.py | 2 + python/hsml/core/serving_api.py | 3 + python/hsml/engine/serving_engine.py | 5 +- python/hsml/llm/__init__.py | 15 +++++ python/hsml/llm/model.py | 75 ++++++++++++++++++++++ python/hsml/llm/predictor.py | 28 +++++++++ python/hsml/llm/signature.py | 77 +++++++++++++++++++++++ python/hsml/model_registry.py | 9 +++ python/hsml/predictor.py | 18 +++--- python/hsml/util.py | 7 +++ python/tests/fixtures/model_fixtures.json | 27 ++++++++ python/tests/fixtures/model_fixtures.py | 7 +++ python/tests/test_constants.py | 2 + python/tests/test_model.py | 13 ++++ python/tests/test_predictor.py | 19 ++++++ python/tests/test_util.py | 48 ++++++++++++++ 16 files changed, 347 insertions(+), 8 deletions(-) create mode 100644 python/hsml/llm/__init__.py create mode 100644 python/hsml/llm/model.py create mode 100644 python/hsml/llm/predictor.py create mode 100644 python/hsml/llm/signature.py diff --git a/python/hopsworks_common/constants.py b/python/hopsworks_common/constants.py index 72672dae8..b9f90f22b 100644 --- a/python/hopsworks_common/constants.py +++ b/python/hopsworks_common/constants.py @@ -158,6 +158,7 @@ class MODEL: FRAMEWORK_TORCH = "TORCH" FRAMEWORK_PYTHON = "PYTHON" FRAMEWORK_SKLEARN = "SKLEARN" + FRAMEWORK_LLM = "LLM" class MODEL_REGISTRY: @@ -210,6 +211,7 @@ class PREDICTOR: # model server MODEL_SERVER_PYTHON = "PYTHON" MODEL_SERVER_TF_SERVING = "TENSORFLOW_SERVING" + MODEL_SERVER_VLLM = "VLLM" # serving tool SERVING_TOOL_DEFAULT = "DEFAULT" SERVING_TOOL_KSERVE = "KSERVE" diff --git 
a/python/hsml/core/serving_api.py b/python/hsml/core/serving_api.py index 92d947728..9a124465d 100644 --- a/python/hsml/core/serving_api.py +++ b/python/hsml/core/serving_api.py @@ -419,4 +419,7 @@ def _get_hopsworks_inference_path(self, project_id: int, deployment_instance): ] def _get_istio_inference_path(self, deployment_instance): + if deployment_instance.model_server == "VLLM": + return ["openai", "v1", "completions"] + return ["v1", "models", deployment_instance.name + ":predict"] diff --git a/python/hsml/engine/serving_engine.py b/python/hsml/engine/serving_engine.py index 15e2b3fa6..12f311d17 100644 --- a/python/hsml/engine/serving_engine.py +++ b/python/hsml/engine/serving_engine.py @@ -493,7 +493,10 @@ def predict( inputs: Union[Dict, List[Dict]], ): # validate user-provided payload - self._validate_inference_payload(deployment_instance.api_protocol, data, inputs) + if deployment_instance.model_server != "VLLM": + self._validate_inference_payload( + deployment_instance.api_protocol, data, inputs + ) # build inference payload based on API protocol payload = self._build_inference_payload( diff --git a/python/hsml/llm/__init__.py b/python/hsml/llm/__init__.py new file mode 100644 index 000000000..ff8055b9b --- /dev/null +++ b/python/hsml/llm/__init__.py @@ -0,0 +1,15 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/python/hsml/llm/model.py b/python/hsml/llm/model.py new file mode 100644 index 000000000..b52cf6398 --- /dev/null +++ b/python/hsml/llm/model.py @@ -0,0 +1,75 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import humps +from hsml.constants import MODEL +from hsml.model import Model + + +class Model(Model): + """Metadata object representing a LLM model in the Model Registry.""" + + def __init__( + self, + id, + name, + version=None, + created=None, + creator=None, + environment=None, + description=None, + project_name=None, + metrics=None, + program=None, + user_full_name=None, + model_schema=None, + training_dataset=None, + input_example=None, + model_registry_id=None, + tags=None, + href=None, + feature_view=None, + training_dataset_version=None, + **kwargs, + ): + super().__init__( + id, + name, + version=version, + created=created, + creator=creator, + environment=environment, + description=description, + project_name=project_name, + metrics=metrics, + program=program, + user_full_name=user_full_name, + model_schema=model_schema, + training_dataset=training_dataset, + input_example=input_example, + framework=MODEL.FRAMEWORK_LLM, + model_registry_id=model_registry_id, + feature_view=feature_view, + training_dataset_version=training_dataset_version, + ) + + def update_from_response_json(self, json_dict): + json_decamelized = humps.decamelize(json_dict) + json_decamelized.pop("framework") + if "type" in json_decamelized: # backwards compatibility + _ = json_decamelized.pop("type") + self.__init__(**json_decamelized) + return self diff --git a/python/hsml/llm/predictor.py b/python/hsml/llm/predictor.py new file mode 100644 index 000000000..814edc522 --- /dev/null +++ b/python/hsml/llm/predictor.py @@ -0,0 +1,28 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from hsml.constants import MODEL, PREDICTOR +from hsml.predictor import Predictor + + +class Predictor(Predictor): + """Configuration for a predictor running with the vLLM backend""" + + def __init__(self, **kwargs): + kwargs["model_framework"] = MODEL.FRAMEWORK_LLM + kwargs["model_server"] = PREDICTOR.MODEL_SERVER_VLLM + + super().__init__(**kwargs) diff --git a/python/hsml/llm/signature.py b/python/hsml/llm/signature.py new file mode 100644 index 000000000..9ac7db9ff --- /dev/null +++ b/python/hsml/llm/signature.py @@ -0,0 +1,77 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from typing import Optional, Union + +import numpy +import pandas +from hopsworks_common import usage +from hsml.llm.model import Model +from hsml.model_schema import ModelSchema + + +_mr = None + + +@usage.method_logger +def create_model( + name: str, + version: Optional[int] = None, + metrics: Optional[dict] = None, + description: Optional[str] = None, + input_example: Optional[ + Union[pandas.DataFrame, pandas.Series, numpy.ndarray, list] + ] = None, + model_schema: Optional[ModelSchema] = None, + feature_view=None, + training_dataset_version: Optional[int] = None, +): + """Create an LLM model metadata object. + + !!! note "Lazy" + This method is lazy and does not persist any metadata or uploads model artifacts in the + model registry on its own. To save the model object and the model artifacts, call the `save()` method with a + local file path to the directory containing the model artifacts. + + # Arguments + name: Name of the model to create. + version: Optionally version of the model to create, defaults to `None` and + will create the model with incremented version from the last + version in the model registry. + metrics: Optionally a dictionary with model evaluation metrics (e.g., accuracy, MAE) + description: Optionally a string describing the model, defaults to empty string + `""`. + input_example: Optionally an input example that represents a single input for the model, defaults to `None`. + model_schema: Optionally a model schema for the model inputs and/or outputs. + + # Returns + `Model`. The model metadata object. + """ + model = Model( + id=None, + name=name, + version=version, + description=description, + metrics=metrics, + input_example=input_example, + model_schema=model_schema, + feature_view=feature_view, + training_dataset_version=training_dataset_version, + ) + model._shared_registry_project_name = _mr.shared_registry_project_name + model._model_registry_id = _mr.model_registry_id + + return model diff --git a/python/hsml/model_registry.py b/python/hsml/model_registry.py index 8968e6d16..70b90b989 100644 --- a/python/hsml/model_registry.py +++ b/python/hsml/model_registry.py @@ -20,6 +20,7 @@ from hopsworks_common import usage from hsml import util from hsml.core import model_api +from hsml.llm import signature as llm_signature # noqa: F401 from hsml.python import signature as python_signature # noqa: F401 from hsml.sklearn import signature as sklearn_signature # noqa: F401 from hsml.tensorflow import signature as tensorflow_signature # noqa: F401 @@ -49,11 +50,13 @@ def __init__( self._python = python_signature self._sklearn = sklearn_signature self._torch = torch_signature + self._llm = llm_signature tensorflow_signature._mr = self python_signature._mr = self sklearn_signature._mr = self torch_signature._mr = self + llm_signature._mr = self @classmethod def from_response_json(cls, json_dict): @@ -191,6 +194,12 @@ def python(self): return python_signature + @property + def llm(self): + """Module for exporting a Large Language Model.""" + + return llm_signature + def __repr__(self): project_name = ( self._shared_registry_project_name diff --git a/python/hsml/predictor.py b/python/hsml/predictor.py index 87f00c9aa..f1d458a3f 100644 --- a/python/hsml/predictor.py +++ b/python/hsml/predictor.py @@ -167,18 +167,22 @@ def _validate_serving_tool(cls, serving_tool): @classmethod def _validate_script_file(cls, model_framework, script_file): - if model_framework == MODEL.FRAMEWORK_PYTHON and script_file is None: + if script_file is None and ( + model_framework == 
MODEL.FRAMEWORK_PYTHON + or model_framework == MODEL.FRAMEWORK_LLM + ): raise ValueError( - "Predictor scripts are required in deployments for custom Python models" + "Predictor scripts are required in deployments for custom Python models and LLMs." ) @classmethod def _infer_model_server(cls, model_framework): - return ( - PREDICTOR.MODEL_SERVER_TF_SERVING - if model_framework == MODEL.FRAMEWORK_TENSORFLOW - else PREDICTOR.MODEL_SERVER_PYTHON - ) + if model_framework == MODEL.FRAMEWORK_TENSORFLOW: + return PREDICTOR.MODEL_SERVER_TF_SERVING + elif model_framework == MODEL.FRAMEWORK_LLM: + return PREDICTOR.MODEL_SERVER_VLLM + else: + return PREDICTOR.MODEL_SERVER_PYTHON @classmethod def _get_default_serving_tool(cls): diff --git a/python/hsml/util.py b/python/hsml/util.py index 3fb243566..461793ebf 100644 --- a/python/hsml/util.py +++ b/python/hsml/util.py @@ -95,6 +95,7 @@ def default(self, obj): # pylint: disable=E0202 def set_model_class(model): + from hsml.llm.model import Model as LLMModel from hsml.model import Model as BaseModel from hsml.python.model import Model as PyModel from hsml.sklearn.model import Model as SkLearnModel @@ -120,6 +121,8 @@ def set_model_class(model): return SkLearnModel(**model) elif framework == MODEL.FRAMEWORK_PYTHON: return PyModel(**model) + elif framework == MODEL.FRAMEWORK_LLM: + return LLMModel(**model) else: raise ValueError( "framework {} is not a supported framework".format(str(framework)) @@ -232,6 +235,8 @@ def validate_metrics(metrics): def get_predictor_for_model(model, **kwargs): + from hsml.llm.model import Model as LLMModel + from hsml.llm.predictor import Predictor as vLLMPredictor from hsml.model import Model as BaseModel from hsml.predictor import Predictor as BasePredictor from hsml.python.model import Model as PyModel @@ -258,6 +263,8 @@ def get_predictor_for_model(model, **kwargs): return SkLearnPredictor(**kwargs) if type(model) is PyModel: return PyPredictor(**kwargs) + if type(model) is LLMModel: + return vLLMPredictor(**kwargs) if type(model) is BaseModel: return BasePredictor( # python as default framework and model server model_framework=MODEL.FRAMEWORK_PYTHON, diff --git a/python/tests/fixtures/model_fixtures.json b/python/tests/fixtures/model_fixtures.json index 40c0b8002..cf44c3111 100644 --- a/python/tests/fixtures/model_fixtures.json +++ b/python/tests/fixtures/model_fixtures.json @@ -133,6 +133,33 @@ ] } }, + "get_llm": { + "response": { + "count": 1, + "items": [ + { + "id": "5", + "name": "llmmodel", + "version": 0, + "created": "created", + "creator": "creator", + "environment": "environment.yml", + "description": "description", + "project_name": "myproject", + "metrics": { "acc": 0.7 }, + "program": "program", + "user_full_name": "Full Name", + "model_schema": "model_schema.json", + "training_dataset": "training_dataset", + "input_example": "input_example.json", + "model_registry_id": 1, + "tags": [], + "framework": "LLM", + "href": "test_href" + } + ] + } + }, "get_list": { "response": { "count": 2, diff --git a/python/tests/fixtures/model_fixtures.py b/python/tests/fixtures/model_fixtures.py index 32fe396de..9b3796d05 100644 --- a/python/tests/fixtures/model_fixtures.py +++ b/python/tests/fixtures/model_fixtures.py @@ -17,6 +17,7 @@ import numpy as np import pandas as pd import pytest +from hsml.llm.model import Model as LLMModel from hsml.model import Model as BaseModel from hsml.python.model import Model as PythonModel from hsml.sklearn.model import Model as SklearnModel @@ -29,12 +30,14 @@ MODEL_SKLEARN_ID = 2 
MODEL_TENSORFLOW_ID = 3 MODEL_TORCH_ID = 4 +MODEL_LLM_ID = 5 MODEL_BASE_NAME = "basemodel" MODEL_PYTHON_NAME = "pythonmodel" MODEL_SKLEARN_NAME = "sklearnmodel" MODEL_TENSORFLOW_NAME = "tensorflowmodel" MODEL_TORCH_NAME = "torchmodel" +MODEL_LLM_NAME = "llmmodel" # models @@ -63,6 +66,10 @@ def model_tensorflow(): def model_torch(): return TorchModel(MODEL_TORCH_ID, MODEL_TORCH_NAME) +@pytest.fixture +def model_llm(): + return LLMModel(MODEL_LLM_ID, MODEL_LLM_NAME) + # input example diff --git a/python/tests/test_constants.py b/python/tests/test_constants.py index 7a923d8d8..783770b14 100644 --- a/python/tests/test_constants.py +++ b/python/tests/test_constants.py @@ -38,6 +38,7 @@ def test_model_framework_constants(self): "FRAMEWORK_TORCH": "TORCH", "FRAMEWORK_PYTHON": "PYTHON", "FRAMEWORK_SKLEARN": "SKLEARN", + "FRAMEWORK_LLM": "LLM", } # Assert @@ -193,6 +194,7 @@ def test_predictor_model_server_constants(self): model_servers = { "MODEL_SERVER_PYTHON": "PYTHON", "MODEL_SERVER_TF_SERVING": "TENSORFLOW_SERVING", + "MODEL_SERVER_VLLM": "VLLM" } # Assert diff --git a/python/tests/test_model.py b/python/tests/test_model.py index 1f706a845..2442ac7fb 100644 --- a/python/tests/test_model.py +++ b/python/tests/test_model.py @@ -138,6 +138,19 @@ def test_constructor_torch(self, mocker, backend_fixtures): # Assert self.assert_model(mocker, m, json, MODEL.FRAMEWORK_TORCH) + def test_constructor_llm(self, mocker, backend_fixtures): + # Arrange + json = backend_fixtures["model"]["get_llm"]["response"]["items"][0] + m_json = copy.deepcopy(json) + id = m_json.pop("id") + name = m_json.pop("name") + + # Act + m = model.Model(id=id, name=name, **m_json) + + # Assert + self.assert_model(mocker, m, json, MODEL.FRAMEWORK_LLM) + # save def test_save(self, mocker, backend_fixtures): diff --git a/python/tests/test_predictor.py b/python/tests/test_predictor.py index e2e5485fc..166666baf 100644 --- a/python/tests/test_predictor.py +++ b/python/tests/test_predictor.py @@ -340,6 +340,14 @@ def test_validate_script_file_py_none(self): # Assert assert "Predictor scripts are required" in str(e_info.value) + def test_validate_script_file_llm_none(self): + # Act + with pytest.raises(ValueError) as e_info: + _ = predictor.Predictor._validate_script_file(MODEL.FRAMEWORK_LLM, None) + + # Assert + assert "Predictor scripts are required" in str(e_info.value) + def test_validate_script_file_tf_script_file(self): # Act predictor.Predictor._validate_script_file( @@ -360,6 +368,10 @@ def test_validate_script_file_py_script_file(self): # Act predictor.Predictor._validate_script_file(MODEL.FRAMEWORK_PYTHON, "script_file") + def test_validate_script_file_llm_script_file(self): + # Act + predictor.Predictor._validate_script_file(MODEL.FRAMEWORK_LLM, "script_file") + # infer model server def test_infer_model_server_tf(self): @@ -390,6 +402,13 @@ def test_infer_model_server_py(self): # Assert assert ms == PREDICTOR.MODEL_SERVER_PYTHON + def test_infer_model_server_llm(self): + # Act + ms = predictor.Predictor._infer_model_server(MODEL.FRAMEWORK_LLM) + + # Assert + assert ms == PREDICTOR.MODEL_SERVER_VLLM + # default serving tool def test_get_default_serving_tool_kserve_installed(self, mocker): diff --git a/python/tests/test_util.py b/python/tests/test_util.py index 21b411a71..076b2aea7 100644 --- a/python/tests/test_util.py +++ b/python/tests/test_util.py @@ -28,6 +28,8 @@ from hsfs.feature import Feature from hsml import util from hsml.constants import MODEL +from hsml.llm.model import Model as LLMModel +from hsml.llm.predictor 
import Predictor as LLMPredictor from hsml.model import Model as BaseModel from hsml.predictor import Predictor as BasePredictor from hsml.python.model import Model as PythonModel @@ -105,6 +107,17 @@ def test_set_model_class_torch(self, backend_fixtures): assert isinstance(model, TorchModel) assert model.framework == MODEL.FRAMEWORK_TORCH + def test_set_model_class_llm(self, backend_fixtures): + # Arrange + json = backend_fixtures["model"]["get_llm"]["response"]["items"][0] + + # Act + model = util.set_model_class(json) + + # Assert + assert isinstance(model, LLMModel) + assert model.framework == MODEL.FRAMEWORK_LLM + def test_set_model_class_unsupported(self, backend_fixtures): # Arrange json = backend_fixtures["model"]["get_base"]["response"]["items"][0] @@ -361,6 +374,7 @@ def pred_base_spec(model_framework, model_server): pred_sklearn = mocker.patch("hsml.sklearn.predictor.Predictor.__init__") pred_tensorflow = mocker.patch("hsml.tensorflow.predictor.Predictor.__init__") pred_torch = mocker.patch("hsml.torch.predictor.Predictor.__init__") + pred_llm = mocker.patch("hsml.llm.predictor.Predictor.__init__") # Act predictor = util.get_predictor_for_model(model_base) @@ -374,6 +388,7 @@ def pred_base_spec(model_framework, model_server): pred_sklearn.assert_not_called() pred_tensorflow.assert_not_called() pred_torch.assert_not_called() + pred_llm.assert_not_called() def test_get_predictor_for_model_python(self, mocker, model_python): # Arrange @@ -384,6 +399,7 @@ def test_get_predictor_for_model_python(self, mocker, model_python): pred_sklearn = mocker.patch("hsml.sklearn.predictor.Predictor.__init__") pred_tensorflow = mocker.patch("hsml.tensorflow.predictor.Predictor.__init__") pred_torch = mocker.patch("hsml.torch.predictor.Predictor.__init__") + pred_llm = mocker.patch("hsml.llm.predictor.Predictor.__init__") # Act predictor = util.get_predictor_for_model(model_python) @@ -395,6 +411,7 @@ def test_get_predictor_for_model_python(self, mocker, model_python): pred_sklearn.assert_not_called() pred_tensorflow.assert_not_called() pred_torch.assert_not_called() + pred_llm.assert_not_called() def test_get_predictor_for_model_sklearn(self, mocker, model_sklearn): # Arrange @@ -405,6 +422,7 @@ def test_get_predictor_for_model_sklearn(self, mocker, model_sklearn): ) pred_tensorflow = mocker.patch("hsml.tensorflow.predictor.Predictor.__init__") pred_torch = mocker.patch("hsml.torch.predictor.Predictor.__init__") + pred_llm = mocker.patch("hsml.llm.predictor.Predictor.__init__") # Act predictor = util.get_predictor_for_model(model_sklearn) @@ -416,6 +434,7 @@ def test_get_predictor_for_model_sklearn(self, mocker, model_sklearn): pred_sklearn.assert_called_once() pred_tensorflow.assert_not_called() pred_torch.assert_not_called() + pred_llm.assert_not_called() def test_get_predictor_for_model_tensorflow(self, mocker, model_tensorflow): # Arrange @@ -426,6 +445,7 @@ def test_get_predictor_for_model_tensorflow(self, mocker, model_tensorflow): "hsml.tensorflow.predictor.Predictor.__init__", return_value=None ) pred_torch = mocker.patch("hsml.torch.predictor.Predictor.__init__") + pred_llm = mocker.patch("hsml.llm.predictor.Predictor.__init__") # Act predictor = util.get_predictor_for_model(model_tensorflow) @@ -437,6 +457,7 @@ def test_get_predictor_for_model_tensorflow(self, mocker, model_tensorflow): pred_sklearn.assert_not_called() pred_tensorflow.assert_called_once() pred_torch.assert_not_called() + pred_llm.assert_not_called() def test_get_predictor_for_model_torch(self, mocker, model_torch): # 
Arrange @@ -447,6 +468,7 @@ def test_get_predictor_for_model_torch(self, mocker, model_torch): pred_torch = mocker.patch( "hsml.torch.predictor.Predictor.__init__", return_value=None ) + pred_llm = mocker.patch("hsml.llm.predictor.Predictor.__init__") # Act predictor = util.get_predictor_for_model(model_torch) @@ -458,6 +480,30 @@ def test_get_predictor_for_model_torch(self, mocker, model_torch): pred_sklearn.assert_not_called() pred_tensorflow.assert_not_called() pred_torch.assert_called_once() + pred_llm.assert_not_called() + + def test_get_predictor_for_model_llm(self, mocker, model_llm): + # Arrange + pred_base = mocker.patch("hsml.predictor.Predictor.__init__") + pred_python = mocker.patch("hsml.python.predictor.Predictor.__init__") + pred_sklearn = mocker.patch("hsml.sklearn.predictor.Predictor.__init__") + pred_tensorflow = mocker.patch("hsml.tensorflow.predictor.Predictor.__init__") + pred_torch = mocker.patch("hsml.torch.predictor.Predictor.__init__") + pred_llm = mocker.patch( + "hsml.llm.predictor.Predictor.__init__", return_value=None + ) + + # Act + predictor = util.get_predictor_for_model(model_llm) + + # Assert + assert isinstance(predictor, LLMPredictor) + pred_base.assert_not_called() + pred_python.assert_not_called() + pred_sklearn.assert_not_called() + pred_tensorflow.assert_not_called() + pred_torch.assert_not_called() + pred_llm.assert_called_once() def test_get_predictor_for_model_non_base(self, mocker): # Arrange @@ -466,6 +512,7 @@ def test_get_predictor_for_model_non_base(self, mocker): pred_sklearn = mocker.patch("hsml.sklearn.predictor.Predictor.__init__") pred_tensorflow = mocker.patch("hsml.tensorflow.predictor.Predictor.__init__") pred_torch = mocker.patch("hsml.torch.predictor.Predictor.__init__") + pred_llm = mocker.patch("hsml.llm.predictor.Predictor.__init__") class NonBaseModel: pass @@ -482,6 +529,7 @@ class NonBaseModel: pred_sklearn.assert_not_called() pred_tensorflow.assert_not_called() pred_torch.assert_not_called() + pred_llm.assert_not_called() def test_get_hostname_replaced_url(self, mocker): # Arrange From 0b3843a79e27ae99de60fb72aa8f8d3c234ce103 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20de=20la=20R=C3=BAa=20Mart=C3=ADnez?= Date: Thu, 24 Oct 2024 12:12:49 +0200 Subject: [PATCH 05/30] [HWORKS-1737] Use pyarrow-hdfs to upload and download a model if available --- python/hopsworks_common/constants.py | 2 + python/hsml/core/hdfs_api.py | 93 +++++++++++++++++++++ python/hsml/deployment.py | 16 +++- python/hsml/engine/local_engine.py | 70 +++++++++++----- python/hsml/engine/model_engine.py | 50 ++++++------ python/hsml/engine/serving_engine.py | 116 +++++++++++++++++++++------ python/hsml/model.py | 16 +++- python/hsml/predictor.py | 12 ++- python/tests/test_constants.py | 17 ++-- python/tests/test_deployment.py | 10 ++- python/tests/test_model.py | 4 +- 11 files changed, 323 insertions(+), 83 deletions(-) create mode 100644 python/hsml/core/hdfs_api.py diff --git a/python/hopsworks_common/constants.py b/python/hopsworks_common/constants.py index b9f90f22b..b98ed8497 100644 --- a/python/hopsworks_common/constants.py +++ b/python/hopsworks_common/constants.py @@ -163,10 +163,12 @@ class MODEL: class MODEL_REGISTRY: HOPSFS_MOUNT_PREFIX = "/hopsfs/" + MODEL_FILES_DIR_NAME = "Files" class MODEL_SERVING: MODELS_DATASET = "Models" + ARTIFACTS_DIR_NAME = "Artifacts" class ARTIFACT_VERSION: diff --git a/python/hsml/core/hdfs_api.py b/python/hsml/core/hdfs_api.py new file mode 100644 index 000000000..d786bce37 --- /dev/null +++ 
b/python/hsml/core/hdfs_api.py @@ -0,0 +1,93 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import annotations + +import os + + +class HdfsApi: + def __init__(self): + + import fsspec.implementations.arrow as pfs + + host, port = os.environ["LIBHDFS_DEFAULT_FS"].split(":") + + self._hopsfs = pfs.HadoopFileSystem( + host=host, + port=int(port), + user=os.environ["LIBHDFS_DEFAULT_USER"], + ) + + DEFAULT_BUFFER_SIZE = 0 + + def upload( + self, + local_path: str, + upload_path: str, + overwrite: bool = False, + buffer_size: int = DEFAULT_BUFFER_SIZE, + ): + """Upload file/directory to the Hopsworks filesystem. + :param local_path: local path to file to upload + :type local_path: str + :param upload_path: path to directory where to upload the file in Hopsworks filesystem + :type upload_path: str + :param overwrite: overwrite file if exists + :type overwrite: bool + :param buffer_size: size of the temporary read and write buffer. Defaults to 0. + :type buffer_size: int + """ + # local path could be absolute or relative, + if not os.path.isabs(local_path) and os.path.exists( + os.path.join(os.getcwd(), local_path) + ): + local_path = os.path.join(os.getcwd(), local_path) + + _, file_name = os.path.split(local_path) + + destination_path = upload_path + "/" + file_name + + if self._hopsfs.exists(destination_path): + if overwrite: + self._hopsfs.rm(destination_path, recursive=True) + else: + raise Exception( + "{} already exists, set overwrite=True to overwrite it".format( + local_path + ) + ) + + self._hopsfs.upload( + lpath=local_path, + rpath=destination_path, + recursive=True, + buffer_size=buffer_size, + ) + + return upload_path + "/" + os.path.basename(local_path) + + def download(self, path, local_path, buffer_size=DEFAULT_BUFFER_SIZE): + """Download file/directory on a path in datasets. + :param path: path to download + :type path: str + :param local_path: path to download in datasets + :type local_path: str + :param buffer_size: size of the temporary read and write buffer. Defaults to 0. + :type buffer_size: int + """ + + self._hopsfs.download(path, local_path, recursive=True, buffer_size=buffer_size) diff --git a/python/hsml/deployment.py b/python/hsml/deployment.py index 8891b149f..f6c064759 100644 --- a/python/hsml/deployment.py +++ b/python/hsml/deployment.py @@ -219,10 +219,14 @@ def get_model(self): ) @usage.method_logger - def download_artifact(self): - """Download the model artifact served by the deployment""" + def download_artifact_files(self, local_path=None): + """Download the artifact files served by the deployment - return self._serving_engine.download_artifact(self) + # Arguments + local_path: path where to download the artifact files in the local filesystem + """ + + return self._serving_engine.download_artifact_files(self, local_path=local_path) def get_logs(self, component="predictor", tail=10): """Prints the deployment logs of the predictor or transformer. 
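A minimal usage sketch of the renamed method in the hunk above (the `deployment` handle and the target path are assumed for illustration, not taken from this patch):

    local_dir = deployment.download_artifact_files(local_path="/tmp/artifact_files")
    # returns the local folder containing the downloaded artifact files
    print(local_dir)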
@@ -373,9 +377,15 @@ def artifact_version(self): def artifact_version(self, artifact_version: Union[int, str]): self._predictor.artifact_version = artifact_version + @property + def artifact_files_path(self): + """Path of the artifact files deployed by the predictor.""" + return self._predictor.artifact_files_path + @property def artifact_path(self): """Path of the model artifact deployed by the predictor.""" + # TODO: deprecated return self._predictor.artifact_path @property diff --git a/python/hsml/engine/local_engine.py b/python/hsml/engine/local_engine.py index 7b669a249..d703002da 100644 --- a/python/hsml/engine/local_engine.py +++ b/python/hsml/engine/local_engine.py @@ -17,7 +17,7 @@ import os from hsml import client -from hsml.core import dataset_api, model_api +from hsml.core import dataset_api, hdfs_api, model_api class LocalEngine: @@ -25,6 +25,11 @@ def __init__(self): self._dataset_api = dataset_api.DatasetApi() self._model_api = model_api.ModelApi() + try: + self._hdfs_api = hdfs_api.HdfsApi() + except Exception: + self._hdfs_api = None + def mkdir(self, remote_path: str): remote_path = self._prepend_project_path(remote_path) self._dataset_api.mkdir(remote_path) @@ -38,26 +43,55 @@ def upload(self, local_path: str, remote_path: str, upload_configuration=None): # Initialize the upload configuration to empty dictionary if is None upload_configuration = upload_configuration if upload_configuration else {} - self._dataset_api.upload( - local_path, - remote_path, - chunk_size=upload_configuration.get( - "chunk_size", self._dataset_api.DEFAULT_UPLOAD_FLOW_CHUNK_SIZE - ), - simultaneous_uploads=upload_configuration.get( - "simultaneous_uploads", - self._dataset_api.DEFAULT_UPLOAD_SIMULTANEOUS_UPLOADS, - ), - max_chunk_retries=upload_configuration.get( - "max_chunk_retries", - self._dataset_api.DEFAULT_UPLOAD_MAX_CHUNK_RETRIES, - ), - ) - def download(self, remote_path: str, local_path: str): + if self._hdfs_api is not None: + # use the hdfs client if available + self._hdfs_api.upload( + local_path=local_path, + upload_path=remote_path, + buffer_size=upload_configuration.get( + "buffer_size", self._hdfs_api.DEFAULT_BUFFER_SIZE + ), + ) + else: + # otherwise, use the REST API + self._dataset_api.upload( + local_path, + remote_path, + chunk_size=upload_configuration.get( + "chunk_size", self._dataset_api.DEFAULT_UPLOAD_FLOW_CHUNK_SIZE + ), + simultaneous_uploads=upload_configuration.get( + "simultaneous_uploads", + self._dataset_api.DEFAULT_UPLOAD_SIMULTANEOUS_UPLOADS, + ), + max_chunk_retries=upload_configuration.get( + "max_chunk_retries", + self._dataset_api.DEFAULT_UPLOAD_MAX_CHUNK_RETRIES, + ), + ) + + def download(self, remote_path: str, local_path: str, download_configuration=None): local_path = self._get_abs_path(local_path) remote_path = self._prepend_project_path(remote_path) - self._dataset_api.download(remote_path, local_path) + + # Initialize the download configuration to empty dictionary if is None + download_configuration = ( + download_configuration if download_configuration else {} + ) + + if self._hdfs_api is not None: + # use the hdfs client if available + self._hdfs_api.download( + path=remote_path, + local_path=local_path, + buffer_size=download_configuration.get( + "buffer_size", self._hdfs_api.DEFAULT_BUFFER_SIZE + ), + ) + else: + # otherwise, use the REST API + self._dataset_api.download(remote_path, local_path) def copy(self, source_path, destination_path): source_path = self._prepend_project_path(source_path) diff --git a/python/hsml/engine/model_engine.py 
b/python/hsml/engine/model_engine.py index b8ae974f8..d2e8a85af 100644 --- a/python/hsml/engine/model_engine.py +++ b/python/hsml/engine/model_engine.py @@ -80,11 +80,11 @@ def _upload_additional_resources(self, model_instance): return model_instance def _copy_or_move_hopsfs_model_item( - self, item_attr, to_model_version_path, keep_original_files + self, item_attr, to_model_files_path, keep_original_files ): """Copy or move model item from a hdfs path to the model version folder in the Models dataset. It works with files and folders.""" path = item_attr["path"] - to_hdfs_path = os.path.join(to_model_version_path, os.path.basename(path)) + to_hdfs_path = os.path.join(to_model_files_path, os.path.basename(path)) if keep_original_files: self._engine.copy(path, to_hdfs_path) else: @@ -93,7 +93,7 @@ def _copy_or_move_hopsfs_model_item( def _copy_or_move_hopsfs_model( self, from_hdfs_model_path, - to_model_version_path, + to_model_files_path, keep_original_files, update_upload_progress, ): @@ -122,7 +122,7 @@ def _copy_or_move_hopsfs_model( )["items"]: path_attr = entry["attributes"] self._copy_or_move_hopsfs_model_item( - path_attr, to_model_version_path, keep_original_files + path_attr, to_model_files_path, keep_original_files ) if path_attr.get("dir", False): n_dirs += 1 @@ -132,7 +132,7 @@ def _copy_or_move_hopsfs_model( else: # if path is a file, copy/move it self._copy_or_move_hopsfs_model_item( - model_path_attr, to_model_version_path, keep_original_files + model_path_attr, to_model_files_path, keep_original_files ) n_files += 1 update_upload_progress(n_dirs=n_dirs, n_files=n_files) @@ -156,7 +156,9 @@ def _download_model_from_hopsfs_recursive( if path_attr.get("dir", False): # otherwise, make a recursive call for the folder - if basename == "Artifacts": + if ( + basename == constants.MODEL_SERVING.ARTIFACTS_DIR_NAME + ): # TODO: Not needed anymore continue # skip Artifacts subfolder local_folder_path = os.path.join(to_local_path, basename) os.mkdir(local_folder_path) @@ -195,11 +197,11 @@ def _download_model_from_hopsfs( def _upload_local_model( self, from_local_model_path, - to_model_version_path, + to_model_files_path, update_upload_progress, upload_configuration=None, ): - """Copy or upload model files from a local path to the model version folder in the Models dataset.""" + """Copy or upload model files from a local path to the model files folder in the Models dataset.""" n_dirs, n_files = 0, 0 if os.path.isdir(from_local_model_path): # if path is a dir, upload files and folders iteratively @@ -210,7 +212,7 @@ def _upload_local_model( # - files is the list of file names present in the root dir # we need to replace the local path prefix with the hdfs path prefix (i.e., /srv/hops/....../root with /Projects/.../) remote_base_path = root.replace( - from_local_model_path, to_model_version_path + from_local_model_path, to_model_files_path ) for d_name in dirs: self._engine.mkdir(remote_base_path + "/" + d_name) @@ -228,7 +230,7 @@ def _upload_local_model( # if path is a file, upload file self._engine.upload( from_local_model_path, - to_model_version_path, + to_model_files_path, upload_configuration=upload_configuration, ) n_files += 1 @@ -249,14 +251,14 @@ def _save_model_from_local_or_hopsfs_mount( from_hdfs_model_path=model_path.replace( constants.MODEL_REGISTRY.HOPSFS_MOUNT_PREFIX, "" ), - to_model_version_path=model_instance.version_path, + to_model_files_path=model_instance.model_files_path, keep_original_files=keep_original_files, update_upload_progress=update_upload_progress, ) 
else: self._upload_local_model( from_local_model_path=model_path, - to_model_version_path=model_instance.version_path, + to_model_files_path=model_instance.model_files_path, update_upload_progress=update_upload_progress, upload_configuration=upload_configuration, ) @@ -365,6 +367,7 @@ def save( if step["id"] == 0: # Create folders self._engine.mkdir(model_instance.version_path) + self._engine.mkdir(model_instance.model_files_path) if step["id"] == 1: def update_upload_progress(n_dirs=0, n_files=0, step=step): @@ -374,7 +377,7 @@ def update_upload_progress(n_dirs=0, n_files=0, step=step): update_upload_progress(n_dirs=0, n_files=0) - # Upload Model files from local path to /Models/{model_instance._name}/{model_instance._version} + # Upload Model files from local path to /Models/{model_instance._name}/{model_instance._version}/Files # check local absolute if os.path.isabs(model_path) and os.path.exists(model_path): self._save_model_from_local_or_hopsfs_mount( @@ -401,7 +404,7 @@ def update_upload_progress(n_dirs=0, n_files=0, step=step): ): # check hdfs relative and absolute self._copy_or_move_hopsfs_model( from_hdfs_model_path=model_path, - to_model_version_path=model_instance.version_path, + to_model_files_path=model_instance.model_files_path, keep_original_files=keep_original_files, update_upload_progress=update_upload_progress, ) @@ -431,12 +434,13 @@ def update_upload_progress(n_dirs=0, n_files=0, step=step): return model_instance - def download(self, model_instance): - model_name_path = os.path.join( - tempfile.gettempdir(), str(uuid.uuid4()), model_instance._name - ) - model_version_path = model_name_path + "/" + str(model_instance._version) - os.makedirs(model_version_path) + def download(self, model_instance, local_path=None): + if local_path is None: + local_path = os.path.join( + tempfile.gettempdir(), str(uuid.uuid4()), model_instance._name + ) + local_path = local_path + "/" + str(model_instance._version) + os.makedirs(local_path, exist_ok=True) def update_download_progress(n_dirs, n_files, done=False): print( @@ -446,20 +450,20 @@ def update_download_progress(n_dirs, n_files, done=False): ) try: - from_hdfs_model_path = model_instance.version_path + from_hdfs_model_path = model_instance.model_files_path if from_hdfs_model_path.startswith("hdfs:/"): projects_index = from_hdfs_model_path.find("/Projects", 0) from_hdfs_model_path = from_hdfs_model_path[projects_index:] self._download_model_from_hopsfs( from_hdfs_model_path=from_hdfs_model_path, - to_local_path=model_version_path, + to_local_path=local_path, update_download_progress=update_download_progress, ) except BaseException as be: raise be - return model_version_path + return local_path def read_file(self, model_instance, resource): hdfs_resource_path = self._build_resource_path( diff --git a/python/hsml/engine/serving_engine.py b/python/hsml/engine/serving_engine.py index 12f311d17..164ac7504 100644 --- a/python/hsml/engine/serving_engine.py +++ b/python/hsml/engine/serving_engine.py @@ -15,11 +15,12 @@ # import os +import tempfile import time import uuid from typing import Dict, List, Union -from hsml import util +from hsml import constants from hsml.client.exceptions import ModelServingException, RestAPIError from hsml.client.istio.utils.infer_type import InferInput from hsml.constants import ( @@ -31,6 +32,7 @@ INFERENCE_ENDPOINTS as IE, ) from hsml.core import dataset_api, serving_api +from hsml.engine import local_engine from tqdm.auto import tqdm @@ -51,6 +53,8 @@ def __init__(self): self._serving_api = 
serving_api.ServingApi() self._dataset_api = dataset_api.DatasetApi() + self._engine = local_engine.LocalEngine() + def _poll_deployment_status( self, deployment_instance, status: str, await_status: int, update_progress=None ): @@ -304,7 +308,64 @@ def _get_stopped_instances(self, available_instances, requested_instances): num_instances = requested_instances - available_instances return num_instances if num_instances >= 0 else 0 - def download_artifact(self, deployment_instance): + def _download_files_from_hopsfs_recursive( + self, + from_hdfs_path: str, + to_local_path: str, + update_download_progress, + n_dirs, + n_files, + ): + """Download model files from a model path in hdfs, recursively""" + + for entry in self._dataset_api.list(from_hdfs_path, sort_by="NAME:desc")[ + "items" + ]: + path_attr = entry["attributes"] + path = path_attr["path"] + basename = os.path.basename(path) + + if path_attr.get("dir", False): + # otherwise, make a recursive call for the folder + if ( + basename == constants.MODEL_SERVING.ARTIFACTS_DIR_NAME + ): # TODO: Not needed anymore + continue # skip Artifacts subfolder + local_folder_path = os.path.join(to_local_path, basename) + os.mkdir(local_folder_path) + n_dirs, n_files = self._download_files_from_hopsfs_recursive( + from_hdfs_path=path, + to_local_path=local_folder_path, + update_download_progress=update_download_progress, + n_dirs=n_dirs, + n_files=n_files, + ) + n_dirs += 1 + update_download_progress(n_dirs=n_dirs, n_files=n_files) + else: + # if it's a file, download it + local_file_path = os.path.join(to_local_path, basename) + self._engine.download(path, local_file_path) + n_files += 1 + update_download_progress(n_dirs=n_dirs, n_files=n_files) + + return n_dirs, n_files + + def _download_files_from_hopsfs( + self, from_hdfs_path: str, to_local_path: str, update_download_progress + ): + """Download files from a model path in hdfs.""" + + n_dirs, n_files = self._download_files_from_hopsfs_recursive( + from_hdfs_path=from_hdfs_path, + to_local_path=to_local_path, + update_download_progress=update_download_progress, + n_dirs=0, + n_files=0, + ) + update_download_progress(n_dirs=n_dirs, n_files=n_files, done=True) + + def download_artifact_files(self, deployment_instance, local_path=None): if deployment_instance.id is None: raise ModelServingException( "Deployment is not created yet. To create the deployment use `.save()`" @@ -316,30 +377,39 @@ def download_artifact(self, deployment_instance): Download the model files by using `model.download()`" ) - from_artifact_zip_path = deployment_instance.artifact_path - to_artifacts_path = os.path.join( - os.getcwd(), - str(uuid.uuid4()), - deployment_instance.model_name, - str(deployment_instance.model_version), - "Artifacts", - ) - to_artifact_version_path = ( - to_artifacts_path + "/" + str(deployment_instance.artifact_version) - ) - to_artifact_zip_path = to_artifact_version_path + ".zip" + if local_path is None: + local_path = os.path.join( + tempfile.gettempdir(), + str(uuid.uuid4()), + deployment_instance.model_name, + str(deployment_instance.model_version), + constants.MODEL_SERVING.ARTIFACTS_DIR_NAME, + str(deployment_instance.artifact_version), + ) + os.makedirs(local_path, exist_ok=True) - os.makedirs(to_artifacts_path) + def update_download_progress(n_dirs, n_files, done=False): + print( + "Downloading artifact files (%s dirs, %s files)... 
%s" + % (n_dirs, n_files, "DONE" if done else ""), + end="\r", + ) try: - self._dataset_api.download(from_artifact_zip_path, to_artifact_zip_path) - util.decompress(to_artifact_zip_path, extract_dir=to_artifacts_path) - os.remove(to_artifact_zip_path) - finally: - if os.path.exists(to_artifact_zip_path): - os.remove(to_artifact_zip_path) - - return to_artifact_version_path + from_hdfs_path = deployment_instance.artifact_files_path + if from_hdfs_path.startswith("hdfs:/"): + projects_index = from_hdfs_path.find("/Projects", 0) + from_hdfs_path = from_hdfs_path[projects_index:] + + self._download_files_from_hopsfs( + from_hdfs_path=from_hdfs_path, + to_local_path=local_path, + update_download_progress=update_download_progress, + ) + except BaseException as be: + raise be + + return local_path def create(self, deployment_instance): try: diff --git a/python/hsml/model.py b/python/hsml/model.py index dbf4a4e37..a545fa5ea 100644 --- a/python/hsml/model.py +++ b/python/hsml/model.py @@ -22,7 +22,7 @@ import humps from hopsworks_common import usage -from hsml import client, util +from hsml import client, constants, util from hsml.constants import ARTIFACT_VERSION from hsml.constants import INFERENCE_ENDPOINTS as IE from hsml.core import explicit_provenance @@ -140,13 +140,15 @@ def save( ) @usage.method_logger - def download(self): + def download(self, local_path=None): """Download the model files. + # Arguments + local_path: path where to download the model files in the local filesystem # Returns `str`: Absolute path to local folder containing the model files. """ - return self._model_engine.download(model_instance=self) + return self._model_engine.download(model_instance=self, local_path=local_path) @usage.method_logger def delete(self): @@ -545,6 +547,14 @@ def version_path(self): """path of the model including version folder. Resolves to /Projects/{project_name}/Models/{name}/{version}""" return "{}/{}".format(self.model_path, str(self.version)) + @property + def model_files_path(self): + """path of the model files including version and files folder. Resolves to /Projects/{project_name}/Models/{name}/{version}/Files""" + return "{}/{}".format( + self.version_path, + constants.MODEL_REGISTRY.MODEL_FILES_DIR_NAME, + ) + @property def shared_registry_project_name(self): """shared_registry_project_name of the model.""" diff --git a/python/hsml/predictor.py b/python/hsml/predictor.py index f1d458a3f..236b7cb20 100644 --- a/python/hsml/predictor.py +++ b/python/hsml/predictor.py @@ -17,7 +17,7 @@ from typing import Optional, Union import humps -from hsml import client, deployment, util +from hsml import client, constants, deployment, util from hsml.constants import ( ARTIFACT_VERSION, INFERENCE_ENDPOINTS, @@ -395,9 +395,19 @@ def artifact_version(self): def artifact_version(self, artifact_version: Union[int, str]): self._artifact_version = artifact_version + @property + def artifact_files_path(self): + return "{}/{}/{}/{}".format( + self._model_path, + str(self._model_version), + constants.MODEL_SERVING.ARTIFACTS_DIR_NAME, + str(self._artifact_version), + ) + @property def artifact_path(self): """Path of the model artifact deployed by the predictor. 
Resolves to /Projects/{project_name}/Models/{name}/{version}/Artifacts/{artifact_version}/{name}_{version}_{artifact_version}.zip""" + # TODO: Deprecated artifact_name = "{}_{}_{}.zip".format( self._model_name, str(self._model_version), str(self._artifact_version) ) diff --git a/python/tests/test_constants.py b/python/tests/test_constants.py index 783770b14..3c03263bf 100644 --- a/python/tests/test_constants.py +++ b/python/tests/test_constants.py @@ -53,26 +53,29 @@ def test_model_framework_constants(self): def test_model_registry_constants(self): # Arrange - hopsfs_mount_prefix = {"HOPSFS_MOUNT_PREFIX": "/hopsfs/"} + model_registry = { + "HOPSFS_MOUNT_PREFIX": "/hopsfs/", + "MODEL_FILES_DIR_NAME": "Files", + } # Assert self._check_added_modified_or_removed_values( constants.MODEL_REGISTRY, - num_values=len(hopsfs_mount_prefix), - expected_constants=hopsfs_mount_prefix, + num_values=len(model_registry), + expected_constants=model_registry, ) # MODEL_SERVING def test_model_serving_constants(self): # Arrange - models_dataset = {"MODELS_DATASET": "Models"} + model_serving = {"MODELS_DATASET": "Models", "ARTIFACTS_DIR_NAME": "Artifacts"} # Assert self._check_added_modified_or_removed_values( constants.MODEL_SERVING, - num_values=len(models_dataset), - expected_constants=models_dataset, + num_values=len(model_serving), + expected_constants=model_serving, ) # ARTIFACT_VERSION @@ -194,7 +197,7 @@ def test_predictor_model_server_constants(self): model_servers = { "MODEL_SERVER_PYTHON": "PYTHON", "MODEL_SERVER_TF_SERVING": "TENSORFLOW_SERVING", - "MODEL_SERVER_VLLM": "VLLM" + "MODEL_SERVER_VLLM": "VLLM", } # Assert diff --git a/python/tests/test_deployment.py b/python/tests/test_deployment.py index 7e3d7e4a5..63e791126 100644 --- a/python/tests/test_deployment.py +++ b/python/tests/test_deployment.py @@ -617,15 +617,17 @@ def test_download_artifact(self, mocker, backend_fixtures): # Arrange p = self._get_dummy_predictor(mocker, backend_fixtures) d = deployment.Deployment(predictor=p) - mock_serving_engine_download_artifact = mocker.patch( - "hsml.engine.serving_engine.ServingEngine.download_artifact" + mock_serving_engine_download_artifact_files = mocker.patch( + "hsml.engine.serving_engine.ServingEngine.download_artifact_files" ) # Act - d.download_artifact() + d.download_artifact_files() # Assert - mock_serving_engine_download_artifact.assert_called_once_with(d) + mock_serving_engine_download_artifact_files.assert_called_once_with( + d, local_path=None + ) # get logs diff --git a/python/tests/test_model.py b/python/tests/test_model.py index 2442ac7fb..b153b5742 100644 --- a/python/tests/test_model.py +++ b/python/tests/test_model.py @@ -266,7 +266,9 @@ def test_download(self, mocker, backend_fixtures): m.download() # Assert - mock_model_engine_download.assert_called_once_with(model_instance=m) + mock_model_engine_download.assert_called_once_with( + model_instance=m, local_path=None + ) # tags From 71898c59f7eca00ba550c2ffbbf2674ccd2a93ff Mon Sep 17 00:00:00 2001 From: Alex Ormenisan Date: Thu, 24 Oct 2024 17:19:01 +0200 Subject: [PATCH 06/30] Prepare for 4.2.0-SNAPSHOT development (#375) --- java/beam/pom.xml | 2 +- java/flink/pom.xml | 2 +- java/hsfs/pom.xml | 2 +- java/pom.xml | 2 +- java/spark/pom.xml | 2 +- python/hopsworks_common/version.py | 2 +- utils/java/pom.xml | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/java/beam/pom.xml b/java/beam/pom.xml index b240612d9..c90394fa5 100644 --- a/java/beam/pom.xml +++ b/java/beam/pom.xml @@ -5,7 +5,7 @@ hsfs-parent 
com.logicalclocks - 4.1.0-SNAPSHOT + 4.2.0-SNAPSHOT 4.0.0 diff --git a/java/flink/pom.xml b/java/flink/pom.xml index 7e39ece2a..11564004f 100644 --- a/java/flink/pom.xml +++ b/java/flink/pom.xml @@ -5,7 +5,7 @@ hsfs-parent com.logicalclocks - 4.1.0-SNAPSHOT + 4.2.0-SNAPSHOT 4.0.0 diff --git a/java/hsfs/pom.xml b/java/hsfs/pom.xml index c56061427..b7bd606c2 100644 --- a/java/hsfs/pom.xml +++ b/java/hsfs/pom.xml @@ -5,7 +5,7 @@ hsfs-parent com.logicalclocks - 4.1.0-SNAPSHOT + 4.2.0-SNAPSHOT 4.0.0 diff --git a/java/pom.xml b/java/pom.xml index cc3dd776c..0a5cc707f 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -7,7 +7,7 @@ com.logicalclocks hsfs-parent pom - 4.1.0-SNAPSHOT + 4.2.0-SNAPSHOT hsfs spark diff --git a/java/spark/pom.xml b/java/spark/pom.xml index 185da5d20..4c2d188fb 100644 --- a/java/spark/pom.xml +++ b/java/spark/pom.xml @@ -22,7 +22,7 @@ hsfs-parent com.logicalclocks - 4.1.0-SNAPSHOT + 4.2.0-SNAPSHOT 4.0.0 diff --git a/python/hopsworks_common/version.py b/python/hopsworks_common/version.py index 52cd363fc..82beef4ab 100644 --- a/python/hopsworks_common/version.py +++ b/python/hopsworks_common/version.py @@ -14,4 +14,4 @@ # limitations under the License. # -__version__ = "4.1.0.dev1" +__version__ = "4.2.0.dev1" diff --git a/utils/java/pom.xml b/utils/java/pom.xml index 196978d6c..a3a3026b4 100644 --- a/utils/java/pom.xml +++ b/utils/java/pom.xml @@ -5,7 +5,7 @@ com.logicalclocks hsfs-utils - 4.1.0-SNAPSHOT + 4.2.0-SNAPSHOT 3.2.0.0-SNAPSHOT From 4bc3b37e67ffc34307fb993af45fa5e8b7d2c443 Mon Sep 17 00:00:00 2001 From: manu-sj <152865565+manu-sj@users.noreply.github.com> Date: Fri, 25 Oct 2024 18:17:02 +0200 Subject: [PATCH 07/30] [FSTORE-1574] Transformation functions bug fixes (#376) * peforming data validation after on-demand transformations are applied * correcting error message for feature group transformations * removing validations for on-deamnd features in Vector server as they are computed while retriving the featur evector * adding tests for error messages * adding warnings for transformation functions in external feature group and updating logger message * correcting logging debug message --- python/hsfs/core/feature_group_engine.py | 43 ++++- python/hsfs/core/vector_server.py | 9 + python/hsfs/engine/python.py | 25 +-- python/hsfs/engine/spark.py | 60 +++--- .../tests/core/test_feature_group_engine.py | 146 +++++++++++++++ python/tests/engine/test_python.py | 128 ++++++++----- python/tests/engine/test_spark.py | 177 ------------------ 7 files changed, 320 insertions(+), 268 deletions(-) diff --git a/python/hsfs/core/feature_group_engine.py b/python/hsfs/core/feature_group_engine.py index f00a044e1..30d1cbe4b 100644 --- a/python/hsfs/core/feature_group_engine.py +++ b/python/hsfs/core/feature_group_engine.py @@ -15,7 +15,7 @@ from __future__ import annotations import warnings -from typing import List +from typing import List, Union from hsfs import engine, feature, util from hsfs import feature_group as fg @@ -67,7 +67,7 @@ def _update_feature_group_schema_on_demand_transformations( def save( self, - feature_group, + feature_group: Union[fg.FeatureGroup, fg.ExternalFeatureGroup], feature_dataframe, write_options, validation_options: dict = None, @@ -80,6 +80,21 @@ def save( feature_group=feature_group, features=dataframe_features ) ) + + # Currently on-demand transformation functions not supported in external feature groups. 
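# A minimal sketch of the kind of on-demand transformation handled below (the decorator
# import path is assumed; see the `mode="pandas"` usage in builtin_transformations.py
# earlier in this series):
#   @udf(float, mode="pandas")
#   def amount_plus_vat(amount: pd.Series) -> pd.Series:
#       return amount * 1.25
# For a regular feature group the branch below computes such features before insertion;
# for an external feature group they are skipped and a warning is emitted.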
+ if feature_group.transformation_functions: + if not isinstance(feature_group, fg.ExternalFeatureGroup): + feature_dataframe = ( + engine.get_instance()._apply_transformation_function( + feature_group.transformation_functions, feature_dataframe + ) + ) + else: + warnings.warn( + "On-Demand features were not created because On-Demand Transformations are not supported for External Feature Groups.", + stacklevel=1, + ) + util.validate_embedding_feature_type( feature_group.embedding_index, dataframe_features ) @@ -119,7 +134,7 @@ def save( def insert( self, - feature_group, + feature_group: Union[fg.FeatureGroup, fg.ExternalFeatureGroup], feature_dataframe, overwrite, operation, @@ -132,6 +147,16 @@ def insert( feature_group.time_travel_format, features=feature_group.features, ) + + # Currently on-demand transformation functions not supported in external feature groups. + if ( + not isinstance(feature_group, fg.ExternalFeatureGroup) + and feature_group.transformation_functions + ): + feature_dataframe = engine.get_instance()._apply_transformation_function( + feature_group.transformation_functions, feature_dataframe + ) + dataframe_features = ( self._update_feature_group_schema_on_demand_transformations( feature_group=feature_group, features=dataframe_features @@ -299,7 +324,9 @@ def append_features(self, feature_group, new_features): if feature_group.time_travel_format == "DELTA": engine.get_instance().add_cols_to_delta_table(feature_group, new_features) else: - engine.get_instance().save_empty_dataframe(feature_group, new_features=new_features) + engine.get_instance().save_empty_dataframe( + feature_group, new_features=new_features + ) def update_description(self, feature_group, description): """Updates the description of a feature group.""" @@ -326,7 +353,7 @@ def update_deprecated(self, feature_group, deprecate): def insert_stream( self, - feature_group, + feature_group: Union[fg.FeatureGroup, fg.ExternalFeatureGroup], dataframe, query_name, output_mode, @@ -349,6 +376,12 @@ def insert_stream( feature_group=feature_group, features=dataframe_features ) ) + + if feature_group.transformation_functions: + dataframe = engine.get_instance()._apply_transformation_function( + feature_group.transformation_functions, dataframe + ) + util.validate_embedding_feature_type( feature_group.embedding_index, dataframe_features ) diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py index 0e785dde5..277b25051 100755 --- a/python/hsfs/core/vector_server.py +++ b/python/hsfs/core/vector_server.py @@ -1323,6 +1323,15 @@ def identify_missing_features_pre_fetch( passed_feature_names = passed_feature_names.union( vector_db_features.keys() ) + if self._on_demand_feature_names and len(self._on_demand_feature_names) > 0: + # Remove on-demand features from validation check as they would be computed. 
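# Minimal usage sketch of the behaviour added by this patch: transformation functions
# attached to a regular feature group are applied by the engine when data is written,
# while the save path for an external feature group only emits the warning shown a few
# lines above. The `udf` import path and the exact `create_feature_group()` keyword
# arguments are assumptions for illustration, not taken verbatim from this patch.
import hopsworks
import pandas as pd

from hsfs.hopsworks_udf import udf  # assumed location of the @udf decorator

project = hopsworks.login()
fs = project.get_feature_store()


@udf(int)
def plus_one(feature):
    # simple on-demand transformation: shift the input feature by one
    return feature + 1


fg = fs.create_feature_group(
    name="demo_fg",  # hypothetical feature group
    version=1,
    primary_key=["id"],
    transformation_functions=[plus_one("feature")],  # bind the UDF to a source column
)
# The engine applies `plus_one` before writing; the same call on an external feature
# group would skip the on-demand transformation and only emit the warning.
fg.insert(pd.DataFrame({"id": [1, 2], "feature": [10, 20]}))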
+ _logger.debug( + "Appending on_demand_feature_names : %s, to passed_feature_names for pre-fetch missing", + self._on_demand_feature_names, + ) + passed_feature_names = passed_feature_names.union( + self._on_demand_feature_names + ) neither_fetched_nor_passed = fetched_features.difference( passed_feature_names ) diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index 34fc1ffd5..1c001f63a 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -808,15 +808,6 @@ def save_dataframe( online_write_options: Dict[str, Any], validation_id: Optional[int] = None, ) -> Optional[job.Job]: - # Currently on-demand transformation functions not supported in external feature groups. - if ( - not isinstance(feature_group, ExternalFeatureGroup) - and feature_group.transformation_functions - ): - dataframe = self._apply_transformation_function( - feature_group.transformation_functions, dataframe - ) - if ( hasattr(feature_group, "EXTERNAL_FEATURE_GROUP") and feature_group.online_enabled @@ -1298,9 +1289,19 @@ def _apply_transformation_function( dataset.columns ) if missing_features: - raise FeatureStoreException( - f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly." - ) + if ( + tf.transformation_type + == transformation_function.TransformationType.ON_DEMAND + ): + # On-demand transformation are applied using the python/spark engine during insertion, the transformation while retrieving feature vectors are performed in the vector_server. + raise FeatureStoreException( + f"The following feature(s): `{'`, '.join(missing_features)}`, specified in the on-demand transformation function '{hopsworks_udf.function_name}' are not present in the dataframe being inserted into the feature group. " + + "Please verify that the correct feature names are used in the transformation function and that these features exist in the dataframe being inserted." + ) + else: + raise FeatureStoreException( + f"The following feature(s): `{'`, '.join(missing_features)}`, specified in the model-dependent transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please verify that the correct features are specified in the transformation function." + ) if tf.hopsworks_udf.dropped_features: dropped_features.update(tf.hopsworks_udf.dropped_features) diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index 69b17915a..10d3a9cb1 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -415,14 +415,6 @@ def save_dataframe( validation_id=None, ): try: - # Currently on-demand transformation functions not supported in external feature groups. 
- if ( - not isinstance(feature_group, fg_mod.ExternalFeatureGroup) - and feature_group.transformation_functions - ): - dataframe = self._apply_transformation_function( - feature_group.transformation_functions, dataframe - ) if ( isinstance(feature_group, fg_mod.ExternalFeatureGroup) and feature_group.online_enabled @@ -467,11 +459,6 @@ def save_stream_dataframe( checkpoint_dir: Optional[str], write_options: Optional[Dict[str, Any]], ): - if feature_group.transformation_functions: - dataframe = self._apply_transformation_function( - feature_group.transformation_functions, dataframe - ) - write_options = kafka_engine.get_kafka_config( feature_group.feature_store_id, write_options, engine="spark" ) @@ -1314,13 +1301,16 @@ def save_empty_dataframe(self, feature_group, new_features=None): dataframe = self._spark_session.read.format("hudi").load(location) - if (new_features is not None): + if new_features is not None: if isinstance(new_features, list): for new_feature in new_features: - dataframe = dataframe.withColumn(new_feature.name, lit(None).cast(new_feature.type)) + dataframe = dataframe.withColumn( + new_feature.name, lit(None).cast(new_feature.type) + ) else: - dataframe = dataframe.withColumn(new_features.name, lit(None).cast(new_features.type)) - + dataframe = dataframe.withColumn( + new_features.name, lit(None).cast(new_features.type) + ) self.save_dataframe( feature_group, @@ -1337,18 +1327,22 @@ def add_cols_to_delta_table(self, feature_group, new_features): dataframe = self._spark_session.read.format("delta").load(location) - if (new_features is not None): + if new_features is not None: if isinstance(new_features, list): for new_feature in new_features: - dataframe = dataframe.withColumn(new_feature.name, lit("").cast(new_feature.type)) + dataframe = dataframe.withColumn( + new_feature.name, lit("").cast(new_feature.type) + ) else: - dataframe = dataframe.withColumn(new_features.name, lit("").cast(new_features.type)) + dataframe = dataframe.withColumn( + new_features.name, lit("").cast(new_features.type) + ) - dataframe.limit(0).write.format("delta").mode( - "append" - ).option("mergeSchema", "true").option( - "spark.databricks.delta.schema.autoMerge.enabled", "true" - ).save(location) + dataframe.limit(0).write.format("delta").mode("append").option( + "mergeSchema", "true" + ).option("spark.databricks.delta.schema.autoMerge.enabled", "true").save( + location + ) def _apply_transformation_function( self, @@ -1378,9 +1372,19 @@ def _apply_transformation_function( ) if missing_features: - raise FeatureStoreException( - f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly." - ) + if ( + tf.transformation_type + == transformation_function.TransformationType.ON_DEMAND + ): + # On-demand transformation are applied using the python/spark engine during insertion, the transformation while retrieving feature vectors are performed in the vector_server. + raise FeatureStoreException( + f"The following feature(s): `{'`, '.join(missing_features)}`, specified in the on-demand transformation function '{hopsworks_udf.function_name}' are not present in the dataframe being inserted into the feature group. " + + "Please verify that the correct feature names are used in the transformation function and that these features exist in the dataframe being inserted." 
+ ) + else: + raise FeatureStoreException( + f"The following feature(s): `{'`, '.join(missing_features)}`, specified in the model-dependent transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please verify that the correct features are specified in the transformation function." + ) if tf.hopsworks_udf.dropped_features: dropped_features.update(hopsworks_udf.dropped_features) diff --git a/python/tests/core/test_feature_group_engine.py b/python/tests/core/test_feature_group_engine.py index 91f1086ed..e5cc55c05 100644 --- a/python/tests/core/test_feature_group_engine.py +++ b/python/tests/core/test_feature_group_engine.py @@ -56,6 +56,49 @@ def test_save(self, mocker): # Assert assert mock_engine_get_instance.return_value.save_dataframe.call_count == 1 + def test_save_dataframe_transformation_functions(self, mocker): + # Arrange + feature_store_id = 99 + + mocker.patch("hsfs.engine.get_type") + mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") + mocker.patch( + "hsfs.core.feature_group_engine.FeatureGroupEngine.save_feature_group_metadata" + ) + mocker.patch("hsfs.core.great_expectation_engine.GreatExpectationEngine") + + fg_engine = feature_group_engine.FeatureGroupEngine( + feature_store_id=feature_store_id + ) + + @udf(int) + def test(feature): + return feature + 1 + + fg = feature_group.FeatureGroup( + name="test", + version=1, + featurestore_id=feature_store_id, + primary_key=[], + partition_key=[], + transformation_functions=[test], + id=10, + ) + + # Act + fg_engine.save( + feature_group=fg, + feature_dataframe=None, + write_options=None, + ) + + # Assert + assert mock_engine_get_instance.return_value.save_dataframe.call_count == 1 + assert ( + mock_engine_get_instance.return_value._apply_transformation_function.call_count + == 1 + ) + def test_save_ge_report(self, mocker): # Arrange feature_store_id = 99 @@ -143,6 +186,56 @@ def test_insert(self, mocker): assert mock_fg_api.return_value.delete_content.call_count == 0 assert mock_engine_get_instance.return_value.save_dataframe.call_count == 1 + def test_insert_transformation_functions(self, mocker): + # Arrange + feature_store_id = 99 + + mocker.patch("hsfs.engine.get_type") + mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") + mocker.patch( + "hsfs.core.feature_group_engine.FeatureGroupEngine.save_feature_group_metadata" + ) + mocker.patch( + "hsfs.core.feature_group_engine.FeatureGroupEngine._verify_schema_compatibility" + ) + mocker.patch("hsfs.core.great_expectation_engine.GreatExpectationEngine") + mock_fg_api = mocker.patch("hsfs.core.feature_group_api.FeatureGroupApi") + + fg_engine = feature_group_engine.FeatureGroupEngine( + feature_store_id=feature_store_id + ) + + @udf(int) + def test(feature): + return feature + 1 + + fg = feature_group.FeatureGroup( + name="test", + version=1, + featurestore_id=feature_store_id, + transformation_functions=[test], + primary_key=[], + partition_key=[], + ) + + # Act + fg_engine.insert( + feature_group=fg, + feature_dataframe=None, + overwrite=None, + operation=None, + storage=None, + write_options=None, + ) + + # Assert + assert mock_fg_api.return_value.delete_content.call_count == 0 + assert mock_engine_get_instance.return_value.save_dataframe.call_count == 1 + assert ( + mock_engine_get_instance.return_value._apply_transformation_function.call_count + == 1 + ) + def test_insert_id(self, mocker): # Arrange feature_store_id = 99 @@ -909,6 +1002,59 @@ def test_insert_stream_stream(self, mocker): 
mock_engine_get_instance.return_value.save_stream_dataframe.call_count == 1 ) + def test_insert_stream_stream_transformation_functions(self, mocker): + # Arrange + feature_store_id = 99 + + mocker.patch("hsfs.engine.get_type") + mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") + mocker.patch( + "hsfs.core.feature_group_engine.FeatureGroupEngine.save_feature_group_metadata" + ) + mocker.patch( + "hsfs.core.feature_group_engine.FeatureGroupEngine._verify_schema_compatibility" + ) + + @udf(int) + def test(feature): + return feature + 1 + + fg_engine = feature_group_engine.FeatureGroupEngine( + feature_store_id=feature_store_id + ) + + fg = feature_group.FeatureGroup( + name="test", + version=1, + featurestore_id=feature_store_id, + primary_key=[], + partition_key=[], + transformation_functions=[test], + stream=True, + ) + + # Act + fg_engine.insert_stream( + feature_group=fg, + dataframe=None, + query_name=None, + output_mode=None, + await_termination=None, + timeout=None, + checkpoint_dir=None, + write_options=None, + ) + + # Assert + assert mock_engine_get_instance.return_value.save_dataframe.call_count == 0 + assert ( + mock_engine_get_instance.return_value.save_stream_dataframe.call_count == 1 + ) + assert ( + mock_engine_get_instance.return_value._apply_transformation_function.call_count + == 1 + ) + def test_insert_stream_online_enabled_id(self, mocker): # Arrange feature_store_id = 99 diff --git a/python/tests/engine/test_python.py b/python/tests/engine/test_python.py index e921787be..84e2ca10a 100644 --- a/python/tests/engine/test_python.py +++ b/python/tests/engine/test_python.py @@ -1450,52 +1450,6 @@ def test_save_dataframe(self, mocker): assert mock_python_engine_write_dataframe_kafka.call_count == 0 assert mock_python_engine_legacy_save_dataframe.call_count == 1 - def test_save_dataframe_transformation_functions(self, mocker): - # Arrange - mock_python_engine_write_dataframe_kafka = mocker.patch( - "hsfs.engine.python.Engine._write_dataframe_kafka" - ) - mock_python_engine_legacy_save_dataframe = mocker.patch( - "hsfs.engine.python.Engine.legacy_save_dataframe" - ) - mock_python_engine_apply_transformations = mocker.patch( - "hsfs.engine.python.Engine._apply_transformation_function" - ) - - python_engine = python.Engine() - - @udf(int) - def test(feature): - return feature + 1 - - fg = feature_group.FeatureGroup( - name="test", - version=1, - featurestore_id=99, - primary_key=[], - partition_key=[], - id=10, - stream=False, - transformation_functions=[test], - ) - - # Act - python_engine.save_dataframe( - feature_group=fg, - dataframe=None, - operation=None, - online_enabled=None, - storage=None, - offline_write_options=None, - online_write_options=None, - validation_id=None, - ) - - # Assert - assert mock_python_engine_write_dataframe_kafka.call_count == 0 - assert mock_python_engine_legacy_save_dataframe.call_count == 1 - assert mock_python_engine_apply_transformations.call_count == 1 - def test_save_dataframe_stream(self, mocker): # Arrange mock_python_engine_write_dataframe_kafka = mocker.patch( @@ -3456,6 +3410,88 @@ def test_get_unique_values(self): assert 2 in result assert 3 in result + def test_apply_transformation_function_missing_feature_on_demand_transformations( + self, mocker + ): + # Arrange + mocker.patch("hopsworks_common.client.get_instance") + hopsworks_common.connection._hsfs_engine_type = "python" + python_engine = python.Engine() + + @udf(int) + def add_one(col1): + return col1 + 1 + + fg = feature_group.FeatureGroup( + name="test1", + 
version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + transformation_functions=[add_one("missing_col1")], + id=11, + stream=False, + ) + + df = pd.DataFrame(data={"tf_name": [1, 2]}) + + # Act + with pytest.raises(exceptions.FeatureStoreException) as exception: + python_engine._apply_transformation_function( + transformation_functions=fg.transformation_functions, dataset=df + ) + print(str(exception.value)) + assert ( + str(exception.value) + == "The following feature(s): `missing_col1`, specified in the on-demand transformation function 'add_one' are not present in the dataframe being inserted into the feature group. " + "Please verify that the correct feature names are used in the transformation function and that these features exist in the dataframe being inserted." + ) + + def test_apply_transformation_function_missing_feature_model_dependent_transformations( + self, mocker + ): + # Arrange + mocker.patch("hopsworks_common.client.get_instance") + hopsworks_common.connection._hsfs_engine_type = "python" + python_engine = python.Engine() + + @udf(int) + def add_one(col1): + return col1 + 1 + + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[add_one("missing_col1")], + ) + + df = pd.DataFrame(data={"tf_name": [1, 2]}) + + # Act + with pytest.raises(exceptions.FeatureStoreException) as exception: + python_engine._apply_transformation_function( + transformation_functions=fv.transformation_functions, dataset=df + ) + print(str(exception.value)) + assert ( + str(exception.value) + == "The following feature(s): `missing_col1`, specified in the model-dependent transformation function 'add_one' are not present in the feature view. " + "Please verify that the correct features are specified in the transformation function." 
+ ) + def test_materialization_kafka(self, mocker): # Arrange mocker.patch("hsfs.core.kafka_engine.get_kafka_config", return_value={}) diff --git a/python/tests/engine/test_spark.py b/python/tests/engine/test_spark.py index fb3f6e08f..f74aaf36f 100644 --- a/python/tests/engine/test_spark.py +++ b/python/tests/engine/test_spark.py @@ -605,51 +605,6 @@ def test_save_dataframe(self, mocker): assert mock_spark_engine_save_online_dataframe.call_count == 0 assert mock_spark_engine_save_offline_dataframe.call_count == 1 - def test_save_dataframe_transformations(self, mocker): - # Arrange - mock_spark_engine_save_online_dataframe = mocker.patch( - "hsfs.engine.spark.Engine._save_online_dataframe" - ) - mock_spark_engine_save_offline_dataframe = mocker.patch( - "hsfs.engine.spark.Engine._save_offline_dataframe" - ) - mock_spark_engine_apply_transformations = mocker.patch( - "hsfs.engine.spark.Engine._apply_transformation_function" - ) - - spark_engine = spark.Engine() - - @udf(int) - def test(feature): - return feature + 1 - - fg = feature_group.FeatureGroup( - name="test", - version=1, - featurestore_id=99, - primary_key=[], - partition_key=[], - id=10, - transformation_functions=[test], - ) - - # Act - spark_engine.save_dataframe( - feature_group=fg, - dataframe=None, - operation=None, - online_enabled=None, - storage=None, - offline_write_options=None, - online_write_options=None, - validation_id=None, - ) - - # Assert - assert mock_spark_engine_save_online_dataframe.call_count == 0 - assert mock_spark_engine_save_offline_dataframe.call_count == 1 - assert mock_spark_engine_apply_transformations.call_count == 1 - def test_save_dataframe_storage_offline(self, mocker): # Arrange mock_spark_engine_save_online_dataframe = mocker.patch( @@ -988,138 +943,6 @@ def test_save_stream_dataframe(self, mocker, backend_fixtures): == 0 ) - def test_save_stream_dataframe_transformations(self, mocker, backend_fixtures): - # Arrange - mock_common_client_get_instance = mocker.patch( - "hopsworks_common.client.get_instance" - ) - mocker.patch("hopsworks_common.client._is_external", return_value=False) - mocker.patch("hsfs.engine.spark.Engine._encode_complex_features") - mock_spark_engine_online_fg_to_avro = mocker.patch( - "hsfs.engine.spark.Engine._online_fg_to_avro" - ) - - mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") - mock_engine_get_instance.return_value.add_file.return_value = ( - "result_from_add_file" - ) - - mock_storage_connector_api = mocker.patch( - "hsfs.core.storage_connector_api.StorageConnectorApi" - ) - - mock_spark_engine_apply_transformations = mocker.patch( - "hsfs.engine.spark.Engine._apply_transformation_function" - ) - - json = backend_fixtures["storage_connector"]["get_kafka_external"]["response"] - sc = storage_connector.StorageConnector.from_response_json(json) - mock_storage_connector_api.return_value.get_kafka_connector.return_value = sc - - spark_engine = spark.Engine() - - @udf(int) - def test(feature): - return feature + 1 - - fg = feature_group.FeatureGroup( - name="test", - version=1, - featurestore_id=99, - primary_key=[], - partition_key=[], - id=10, - online_topic_name="test_online_topic_name", - transformation_functions=[test], - ) - fg.feature_store = mocker.Mock() - project_id = 1 - fg.feature_store.project_id = project_id - - mock_common_client_get_instance.return_value._project_name = "test_project_name" - - # Act - spark_engine.save_stream_dataframe( - feature_group=fg, - dataframe=None, - query_name=None, - output_mode="test_mode", - 
await_termination=None, - timeout=None, - checkpoint_dir=None, - write_options={"test_name": "test_value"}, - ) - - # Assert - assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.call_args[0][0] - == "headers" - ) - assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.call_args[ - 0 - ][0] - == "test_mode" - ) - assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.call_args[ - 0 - ][0] - == "kafka" - ) - assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ - 0 - ][0] - == "checkpointLocation" - ) - assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ - 0 - ][1] - == f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}-checkpoint" - ) - assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ - 1 - ] - == { - "kafka.bootstrap.servers": "test_bootstrap_servers", - "kafka.security.protocol": "test_security_protocol", - "kafka.ssl.endpoint.identification.algorithm": "test_ssl_endpoint_identification_algorithm", - "kafka.ssl.key.password": "test_ssl_key_password", - "kafka.ssl.keystore.location": "result_from_add_file", - "kafka.ssl.keystore.password": "test_ssl_keystore_password", - "kafka.ssl.truststore.location": "result_from_add_file", - "kafka.ssl.truststore.password": "test_ssl_truststore_password", - "kafka.test_option_name": "test_option_value", - "test_name": "test_value", - } - ) - assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ - 0 - ][0] - == "topic" - ) - assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ - 0 - ][1] - == "test_online_topic_name" - ) - assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.call_args[ - 0 - ][0] - == self._get_spark_query_name(project_id, fg) - ) - assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.return_value.start.return_value.awaitTermination.call_count - == 0 - ) - assert mock_spark_engine_apply_transformations.call_count == 1 - def test_save_stream_dataframe_query_name(self, mocker, backend_fixtures): # Arrange mock_common_client_get_instance = mocker.patch( From f2b9d55e84250cbc537c91e1e7ff0584f6b7d2c1 Mon Sep 17 00:00:00 2001 From: Aleksey Veresov Date: Mon, 28 Oct 2024 13:28:14 +0100 Subject: [PATCH 08/30] Replace hsfs with hopsworks where it is possible in docs (#374) --- .../client/online_store_rest_client.py | 2 +- python/hopsworks_common/connection.py | 69 ++++++++++++++++++- python/hopsworks_common/project.py | 2 +- python/hsfs/feature_store.py | 2 +- python/hsfs/feature_view.py | 14 ++-- python/hsfs/training_dataset.py | 6 +- 
python/hsml/core/dataset_api.py | 5 +- 7 files changed, 84 insertions(+), 16 deletions(-) diff --git a/python/hopsworks_common/client/online_store_rest_client.py b/python/hopsworks_common/client/online_store_rest_client.py index 9ad05e9a3..b66897b09 100644 --- a/python/hopsworks_common/client/online_store_rest_client.py +++ b/python/hopsworks_common/client/online_store_rest_client.py @@ -305,7 +305,7 @@ def _check_hopsworks_connection(self) -> None: assert ( client.get_instance() is not None and client.get_instance()._connected ), """Hopsworks Client is not connected. Please connect to Hopsworks cluster - via hopsworks.login or hsfs.connection before initialising the Online Store REST Client. + via hopsworks.login before initialising the Online Store REST Client. """ _logger.debug("Hopsworks connection is active.") diff --git a/python/hopsworks_common/connection.py b/python/hopsworks_common/connection.py index 6972b6a85..49f504932 100644 --- a/python/hopsworks_common/connection.py +++ b/python/hopsworks_common/connection.py @@ -484,7 +484,74 @@ def connection( api_key_file: Optional[str] = None, api_key_value: Optional[str] = None, ) -> Connection: - """Connection factory method, accessible through `hopsworks.connection()`.""" + """Connection factory method, accessible through `hopsworks.connection()`. + + This class provides convenience classmethods accessible from the `hopsworks`-module: + + !!! example "Connection factory" + For convenience, `hopsworks` provides a factory method, accessible from the top level + module, so you don't have to import the `Connection` class manually: + + ```python + import hopsworks + conn = hopsworks.connection() + ``` + + !!! hint "Save API Key as File" + To get started quickly, you can simply create a file with the previously + created Hopsworks API Key and place it on the environment from which you + wish to connect to Hopsworks. + + You can then connect by simply passing the path to the key file when + instantiating a connection: + + ```python hl_lines="6" + import hopsworks + conn = hopsworks.connection( + 'my_instance', # DNS of your Hopsworks instance + 443, # Port to reach your Hopsworks instance, defaults to 443 + api_key_file='hopsworks.key', # The file containing the API key generated above + hostname_verification=True) # Disable for self-signed certificates + ) + project = conn.get_project("my_project") + ``` + + Clients in external clusters need to connect to the Hopsworks using an + API key. The API key is generated inside the Hopsworks platform, and requires at + least the "project" scope to be able to access a project. + For more information, see the [integration guides](../setup.md). + + # Arguments + host: The hostname of the Hopsworks instance in the form of `[UUID].cloud.hopsworks.ai`, + defaults to `None`. Do **not** use the url including `https://` when connecting + programatically. + port: The port on which the Hopsworks instance can be reached, + defaults to `443`. + project: The name of the project to connect to. When running on Hopsworks, this + defaults to the project from where the client is run from. + Defaults to `None`. + engine: Which engine to use, `"spark"`, `"python"` or `"training"`. Defaults to `None`, + which initializes the engine to Spark if the environment provides Spark, for + example on Hopsworks and Databricks, or falls back on Hive in Python if Spark is not + available, e.g. on local Python environments or AWS SageMaker. This option + allows you to override this behaviour. 
`"training"` engine is useful when only + feature store metadata is needed, for example training dataset location and label + information when Hopsworks training experiment is conducted. + hostname_verification: Whether or not to verify Hopsworks' certificate, defaults + to `True`. + trust_store_path: Path on the file system containing the Hopsworks certificates, + defaults to `None`. + cert_folder: The directory to store retrieved HopsFS certificates, defaults to + `"/tmp"`. Only required when running without a Spark environment. + api_key_file: Path to a file containing the API Key, defaults to `None`. + api_key_value: API Key as string, if provided, `api_key_file` will be ignored, + however, this should be used with care, especially if the used notebook or + job script is accessible by multiple parties. Defaults to `None`. + + # Returns + `Connection`. Connection handle to perform operations on a + Hopsworks project. + """ return cls( host, port, diff --git a/python/hopsworks_common/project.py b/python/hopsworks_common/project.py index df82b3f79..7705b603b 100644 --- a/python/hopsworks_common/project.py +++ b/python/hopsworks_common/project.py @@ -129,7 +129,7 @@ def get_feature_store( name: Project name of the feature store. engine: Which engine to use, `"spark"`, `"python"` or `"training"`. Defaults to `"python"` when connected to [Serverless Hopsworks](https://app.hopsworks.ai). - See hsfs.Connection.connection documentation for more information. + See [`hopsworks.connection`](connection.md#connection) documentation for more information. # Returns `hsfs.feature_store.FeatureStore`: The Feature Store API # Raises diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py index c1ef352f9..f7ba9044d 100644 --- a/python/hsfs/feature_store.py +++ b/python/hsfs/feature_store.py @@ -458,7 +458,7 @@ def sql( For spark engine: Dictionary of read options for Spark. For python engine: If running queries on the online feature store, users can provide an entry `{'external': True}`, - this instructs the library to use the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) to establish the connection to the online feature store. + this instructs the library to use the `host` parameter in the [`hopsworks.login()`](login.md#login) to establish the connection to the online feature store. If not set, or set to False, the online feature store storage connector is used which relies on the private ip. Defaults to `{}`. diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index 6dbe7a585..5d3151b18 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -337,7 +337,7 @@ def init_serving( Transformation statistics are fetched from training dataset and applied to the feature vector. external: boolean, optional. If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. @@ -592,7 +592,7 @@ def get_feature_vector( providing feature values which are not available in the feature store. external: boolean, optional. 
If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. @@ -705,7 +705,7 @@ def get_feature_vectors( providing feature values which are not available in the feature store. external: boolean, optional. If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. @@ -777,7 +777,7 @@ def get_inference_helper( Set of required primary keys is [`feature_view.primary_keys`](#primary_keys) external: boolean, optional. If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. @@ -835,7 +835,7 @@ def get_inference_helpers( Set of required primary keys is [`feature_view.primary_keys`](#primary_keys) external: boolean, optional. If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. @@ -912,7 +912,7 @@ def find_neighbors( filter: A filter expression to restrict the search space (optional). external: boolean, optional. If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. @@ -3567,7 +3567,7 @@ def transform( feature_vector: `Union[List[Any], List[List[Any]], pd.DataFrame, pl.DataFrame]`. The feature vector to be transformed. external: boolean, optional. If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. 
+ for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. diff --git a/python/hsfs/training_dataset.py b/python/hsfs/training_dataset.py index 94688b692..7d9e89ec8 100644 --- a/python/hsfs/training_dataset.py +++ b/python/hsfs/training_dataset.py @@ -1007,7 +1007,7 @@ def init_prepared_statement( initialised for retrieving serving vectors as a batch. external: boolean, optional. If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. @@ -1024,7 +1024,7 @@ def get_serving_vector( serving application. external: boolean, optional. If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. @@ -1046,7 +1046,7 @@ def get_serving_vectors( serving application. external: boolean, optional. If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. diff --git a/python/hsml/core/dataset_api.py b/python/hsml/core/dataset_api.py index 06df9fba4..681fe3442 100644 --- a/python/hsml/core/dataset_api.py +++ b/python/hsml/core/dataset_api.py @@ -61,10 +61,11 @@ def upload( """Upload a file to the Hopsworks filesystem. 
```python + import hopsworks - conn = hsml.connection(project="my-project") + project = hopsworks.login(project="my-project") - dataset_api = conn.get_dataset_api() + dataset_api = project.get_dataset_api() uploaded_file_path = dataset_api.upload("my_local_file.txt", "Resources") From 7af7098556e682e8bb37789db97f67a9a4b28e33 Mon Sep 17 00:00:00 2001 From: Ralf Date: Wed, 30 Oct 2024 11:21:19 +0200 Subject: [PATCH 09/30] [FSTORE-1564] Managed feature group delta deltastreamer (#359) --- .../logicalclocks/hsfs/beam/FeatureStore.java | 2 +- .../hsfs/beam/StreamFeatureGroup.java | 25 +- .../hsfs/flink/FeatureStore.java | 5 +- .../hsfs/flink/StreamFeatureGroup.java | 28 +- .../logicalclocks/hsfs/FeatureStoreBase.java | 4 +- .../logicalclocks/hsfs/TimeTravelFormat.java | 3 +- .../hsfs/spark/FeatureStore.java | 16 +- .../hsfs/spark/StreamFeatureGroup.java | 44 +-- .../hsfs/spark/engine/FeatureGroupEngine.java | 4 +- .../hsfs/spark/TestFeatureGroup.java | 9 +- python/hsfs/core/feature_group_api.py | 37 ++- python/hsfs/core/feature_group_engine.py | 9 +- python/hsfs/core/hudi_engine.py | 19 -- python/hsfs/core/kafka_engine.py | 2 +- python/hsfs/engine/python.py | 16 +- python/hsfs/engine/spark.py | 108 +++--- python/hsfs/feature_group.py | 17 +- .../tests/core/test_feature_group_engine.py | 2 +- python/tests/core/test_kafka_engine.py | 4 +- python/tests/engine/test_python.py | 23 +- python/tests/engine/test_spark.py | 314 +++++++++++------- utils/python/hsfs_utils.py | 106 +++++- 22 files changed, 517 insertions(+), 280 deletions(-) diff --git a/java/beam/src/main/java/com/logicalclocks/hsfs/beam/FeatureStore.java b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/FeatureStore.java index c059520f7..db01f295a 100644 --- a/java/beam/src/main/java/com/logicalclocks/hsfs/beam/FeatureStore.java +++ b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/FeatureStore.java @@ -160,7 +160,7 @@ public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer ver @Override public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer version, String description, List primaryKeys, List partitionKeys, String hudiPrecombineKey, boolean onlineEnabled, - StatisticsConfig statisticsConfig, String eventTime, OnlineConfig onlineConfig) + TimeTravelFormat timeTravelFormat, StatisticsConfig statisticsConfig, String eventTime, OnlineConfig onlineConfig) throws IOException, FeatureStoreException { throw new UnsupportedOperationException("Not supported for Beam"); } diff --git a/java/beam/src/main/java/com/logicalclocks/hsfs/beam/StreamFeatureGroup.java b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/StreamFeatureGroup.java index 9d3c41ee6..e74b51ade 100644 --- a/java/beam/src/main/java/com/logicalclocks/hsfs/beam/StreamFeatureGroup.java +++ b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/StreamFeatureGroup.java @@ -17,6 +17,14 @@ package com.logicalclocks.hsfs.beam; +import java.io.IOException; +import java.text.ParseException; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apache.beam.sdk.values.PCollection; + import com.logicalclocks.hsfs.Feature; import com.logicalclocks.hsfs.FeatureGroupBase; import com.logicalclocks.hsfs.FeatureStoreException; @@ -26,19 +34,14 @@ import com.logicalclocks.hsfs.StatisticsConfig; import com.logicalclocks.hsfs.Storage; import com.logicalclocks.hsfs.StorageConnector; -import com.logicalclocks.hsfs.beam.engine.FeatureGroupEngine; +import com.logicalclocks.hsfs.TimeTravelFormat; import 
com.logicalclocks.hsfs.beam.engine.BeamProducer; +import com.logicalclocks.hsfs.beam.engine.FeatureGroupEngine; import com.logicalclocks.hsfs.constructor.QueryBase; import com.logicalclocks.hsfs.metadata.Statistics; + import lombok.Builder; import lombok.NonNull; -import org.apache.beam.sdk.values.PCollection; - -import java.io.IOException; -import java.text.ParseException; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; public class StreamFeatureGroup extends FeatureGroupBase> { @@ -48,8 +51,9 @@ public class StreamFeatureGroup extends FeatureGroupBase> { @Builder public StreamFeatureGroup(FeatureStore featureStore, @NonNull String name, Integer version, String description, List primaryKeys, List partitionKeys, String hudiPrecombineKey, - boolean onlineEnabled, List features, StatisticsConfig statisticsConfig, String onlineTopicName, - String eventTime, OnlineConfig onlineConfig, StorageConnector storageConnector, String path) { + boolean onlineEnabled, TimeTravelFormat timeTravelFormat, List features, + StatisticsConfig statisticsConfig, String onlineTopicName, String eventTime, + OnlineConfig onlineConfig, StorageConnector storageConnector, String path) { this(); this.featureStore = featureStore; this.name = name; @@ -61,6 +65,7 @@ public StreamFeatureGroup(FeatureStore featureStore, @NonNull String name, Integ ? partitionKeys.stream().map(String::toLowerCase).collect(Collectors.toList()) : null; this.hudiPrecombineKey = hudiPrecombineKey != null ? hudiPrecombineKey.toLowerCase() : null; this.onlineEnabled = onlineEnabled; + this.timeTravelFormat = timeTravelFormat != null ? timeTravelFormat : TimeTravelFormat.HUDI; this.features = features; this.statisticsConfig = statisticsConfig != null ? statisticsConfig : new StatisticsConfig(); this.onlineTopicName = onlineTopicName; diff --git a/java/flink/src/main/java/com/logicalclocks/hsfs/flink/FeatureStore.java b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/FeatureStore.java index b6314bad4..60dcbaeb6 100644 --- a/java/flink/src/main/java/com/logicalclocks/hsfs/flink/FeatureStore.java +++ b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/FeatureStore.java @@ -165,8 +165,9 @@ public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer ver public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer version, String description, List primaryKeys, List partitionKeys, String hudiPrecombineKey, boolean onlineEnabled, - StatisticsConfig statisticsConfig, String eventTime, - OnlineConfig onlineConfig) + TimeTravelFormat timeTravelFormat, + StatisticsConfig statisticsConfig, + String eventTime, OnlineConfig onlineConfig) throws IOException, FeatureStoreException { throw new UnsupportedOperationException("Not supported for Flink"); } diff --git a/java/flink/src/main/java/com/logicalclocks/hsfs/flink/StreamFeatureGroup.java b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/StreamFeatureGroup.java index c3cd6cbd0..0fa821fb3 100644 --- a/java/flink/src/main/java/com/logicalclocks/hsfs/flink/StreamFeatureGroup.java +++ b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/StreamFeatureGroup.java @@ -17,6 +17,15 @@ package com.logicalclocks.hsfs.flink; +import java.io.IOException; +import java.text.ParseException; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.DataStreamSink; + import 
com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.logicalclocks.hsfs.Feature; import com.logicalclocks.hsfs.FeatureGroupBase; @@ -27,22 +36,14 @@ import com.logicalclocks.hsfs.StatisticsConfig; import com.logicalclocks.hsfs.Storage; import com.logicalclocks.hsfs.StorageConnector; +import com.logicalclocks.hsfs.TimeTravelFormat; import com.logicalclocks.hsfs.constructor.QueryBase; - +import com.logicalclocks.hsfs.flink.engine.FeatureGroupEngine; import com.logicalclocks.hsfs.metadata.Statistics; -import com.logicalclocks.hsfs.flink.engine.FeatureGroupEngine; import lombok.AllArgsConstructor; import lombok.Builder; import lombok.NonNull; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.datastream.DataStreamSink; - -import java.io.IOException; -import java.text.ParseException; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; @AllArgsConstructor @JsonIgnoreProperties(ignoreUnknown = true) @@ -53,9 +54,9 @@ public class StreamFeatureGroup extends FeatureGroupBase> { @Builder public StreamFeatureGroup(FeatureStore featureStore, @NonNull String name, Integer version, String description, List primaryKeys, List partitionKeys, String hudiPrecombineKey, - boolean onlineEnabled, List features, StatisticsConfig statisticsConfig, - String onlineTopicName, String topicName, String notificationTopicName, String eventTime, - OnlineConfig onlineConfig, StorageConnector storageConnector, String path) { + boolean onlineEnabled, TimeTravelFormat timeTravelFormat, List features, + StatisticsConfig statisticsConfig, String onlineTopicName, String topicName, String notificationTopicName, + String eventTime, OnlineConfig onlineConfig, StorageConnector storageConnector, String path) { this(); this.featureStore = featureStore; this.name = name; @@ -67,6 +68,7 @@ public StreamFeatureGroup(FeatureStore featureStore, @NonNull String name, Integ ? partitionKeys.stream().map(String::toLowerCase).collect(Collectors.toList()) : null; this.hudiPrecombineKey = hudiPrecombineKey != null ? hudiPrecombineKey.toLowerCase() : null; this.onlineEnabled = onlineEnabled; + this.timeTravelFormat = timeTravelFormat != null ? timeTravelFormat : TimeTravelFormat.HUDI; this.features = features; this.statisticsConfig = statisticsConfig != null ? 
statisticsConfig : new StatisticsConfig(); this.onlineTopicName = onlineTopicName; diff --git a/java/hsfs/src/main/java/com/logicalclocks/hsfs/FeatureStoreBase.java b/java/hsfs/src/main/java/com/logicalclocks/hsfs/FeatureStoreBase.java index ad391ef90..057838cad 100644 --- a/java/hsfs/src/main/java/com/logicalclocks/hsfs/FeatureStoreBase.java +++ b/java/hsfs/src/main/java/com/logicalclocks/hsfs/FeatureStoreBase.java @@ -122,8 +122,8 @@ public abstract Object getOrCreateStreamFeatureGroup(String name, Integer versio public abstract Object getOrCreateStreamFeatureGroup(String name, Integer version, String description, List primaryKeys, List partitionKeys, String hudiPrecombineKey, boolean onlineEnabled, - StatisticsConfig statisticsConfig, String eventTime, - OnlineConfig onlineConfig) + TimeTravelFormat timeTravelFormat, StatisticsConfig statisticsConfig, + String eventTime, OnlineConfig onlineConfig) throws IOException, FeatureStoreException; public abstract Object createExternalFeatureGroup(); diff --git a/java/hsfs/src/main/java/com/logicalclocks/hsfs/TimeTravelFormat.java b/java/hsfs/src/main/java/com/logicalclocks/hsfs/TimeTravelFormat.java index 4e0fb0419..d6c3d0b2e 100644 --- a/java/hsfs/src/main/java/com/logicalclocks/hsfs/TimeTravelFormat.java +++ b/java/hsfs/src/main/java/com/logicalclocks/hsfs/TimeTravelFormat.java @@ -19,5 +19,6 @@ public enum TimeTravelFormat { NONE, - HUDI + HUDI, + DELTA } diff --git a/java/spark/src/main/java/com/logicalclocks/hsfs/spark/FeatureStore.java b/java/spark/src/main/java/com/logicalclocks/hsfs/spark/FeatureStore.java index 33e3b6058..65dbc66d7 100644 --- a/java/spark/src/main/java/com/logicalclocks/hsfs/spark/FeatureStore.java +++ b/java/spark/src/main/java/com/logicalclocks/hsfs/spark/FeatureStore.java @@ -404,7 +404,7 @@ public StreamFeatureGroup.StreamFeatureGroupBuilder createStreamFeatureGroup() { public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer version) throws IOException, FeatureStoreException { return featureGroupEngine.getOrCreateStreamFeatureGroup(this, name, version, null, - null, null, null, false, null, null, null); + null, null, null, false, TimeTravelFormat.HUDI, null, null, null); } /** @@ -438,7 +438,7 @@ public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer ver boolean onlineEnabled, String eventTime) throws IOException, FeatureStoreException { return featureGroupEngine.getOrCreateStreamFeatureGroup(this, name, version, null, - primaryKeys, null, null, onlineEnabled, null, eventTime, null); + primaryKeys, null, null, onlineEnabled, TimeTravelFormat.HUDI, null, eventTime, null); } /** @@ -477,7 +477,7 @@ public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer ver return featureGroupEngine.getOrCreateStreamFeatureGroup(this, name, version, null, - primaryKeys, partitionKeys, null, onlineEnabled, null, eventTime, null); + primaryKeys, partitionKeys, null, onlineEnabled, TimeTravelFormat.HUDI, null, eventTime, null); } /** @@ -506,6 +506,7 @@ public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer ver * the first primary key of the feature group will be used as hudi precombine key. * @param onlineEnabled Define whether the feature group should be made available also in the online feature store * for low latency access. + * @param timeTravelFormat Format used for time travel, defaults to `"HUDI"`. 
* @param statisticsConfig A configuration object, to generally enable descriptive statistics computation for * this feature group, `"correlations`" to turn on feature correlation computation, * `"histograms"` to compute feature value frequencies and `"exact_uniqueness"` to compute @@ -523,13 +524,14 @@ public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer ver public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer version, String description, List primaryKeys, List partitionKeys, String hudiPrecombineKey, boolean onlineEnabled, - StatisticsConfig statisticsConfig, String eventTime, - OnlineConfig onlineConfig) + TimeTravelFormat timeTravelFormat, + StatisticsConfig statisticsConfig, + String eventTime, OnlineConfig onlineConfig) throws IOException, FeatureStoreException { return featureGroupEngine.getOrCreateStreamFeatureGroup(this, name, version, description, - primaryKeys, partitionKeys, hudiPrecombineKey, onlineEnabled, statisticsConfig, eventTime, - onlineConfig); + primaryKeys, partitionKeys, hudiPrecombineKey, onlineEnabled, timeTravelFormat, + statisticsConfig, eventTime, onlineConfig); } /** diff --git a/java/spark/src/main/java/com/logicalclocks/hsfs/spark/StreamFeatureGroup.java b/java/spark/src/main/java/com/logicalclocks/hsfs/spark/StreamFeatureGroup.java index 0c8b9bae3..4f423e8f3 100644 --- a/java/spark/src/main/java/com/logicalclocks/hsfs/spark/StreamFeatureGroup.java +++ b/java/spark/src/main/java/com/logicalclocks/hsfs/spark/StreamFeatureGroup.java @@ -17,13 +17,23 @@ package com.logicalclocks.hsfs.spark; -import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import java.io.IOException; +import java.text.ParseException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; -import com.logicalclocks.hsfs.spark.constructor.Query; -import com.logicalclocks.hsfs.spark.engine.FeatureGroupEngine; -import com.logicalclocks.hsfs.spark.engine.StatisticsEngine; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.streaming.StreamingQuery; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.logicalclocks.hsfs.EntityEndpointType; import com.logicalclocks.hsfs.Feature; +import com.logicalclocks.hsfs.FeatureGroupBase; import com.logicalclocks.hsfs.FeatureStoreException; import com.logicalclocks.hsfs.HudiOperationType; import com.logicalclocks.hsfs.JobConfiguration; @@ -31,26 +41,16 @@ import com.logicalclocks.hsfs.StatisticsConfig; import com.logicalclocks.hsfs.Storage; import com.logicalclocks.hsfs.StorageConnector; -import com.logicalclocks.hsfs.FeatureGroupBase; +import com.logicalclocks.hsfs.TimeTravelFormat; import com.logicalclocks.hsfs.metadata.Statistics; +import com.logicalclocks.hsfs.spark.constructor.Query; +import com.logicalclocks.hsfs.spark.engine.FeatureGroupEngine; +import com.logicalclocks.hsfs.spark.engine.StatisticsEngine; import lombok.AllArgsConstructor; import lombok.Builder; import lombok.NonNull; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.streaming.StreamingQuery; - -import java.io.IOException; -import java.text.ParseException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - @AllArgsConstructor 
@JsonIgnoreProperties(ignoreUnknown = true) public class StreamFeatureGroup extends FeatureGroupBase> { @@ -61,9 +61,10 @@ public class StreamFeatureGroup extends FeatureGroupBase> { @Builder public StreamFeatureGroup(FeatureStore featureStore, @NonNull String name, Integer version, String description, List primaryKeys, List partitionKeys, String hudiPrecombineKey, - boolean onlineEnabled, List features, StatisticsConfig statisticsConfig, - String onlineTopicName, String topicName, String notificationTopicName, String eventTime, - OnlineConfig onlineConfig, StorageConnector storageConnector, String path) { + boolean onlineEnabled, TimeTravelFormat timeTravelFormat, List features, + StatisticsConfig statisticsConfig, String onlineTopicName, String topicName, + String notificationTopicName, String eventTime, OnlineConfig onlineConfig, + StorageConnector storageConnector, String path) { this(); this.featureStore = featureStore; this.name = name; @@ -75,6 +76,7 @@ public StreamFeatureGroup(FeatureStore featureStore, @NonNull String name, Integ ? partitionKeys.stream().map(String::toLowerCase).collect(Collectors.toList()) : null; this.hudiPrecombineKey = hudiPrecombineKey != null ? hudiPrecombineKey.toLowerCase() : null; this.onlineEnabled = onlineEnabled; + this.timeTravelFormat = timeTravelFormat != null ? timeTravelFormat : TimeTravelFormat.HUDI; this.features = features; this.statisticsConfig = statisticsConfig != null ? statisticsConfig : new StatisticsConfig(); this.onlineTopicName = onlineTopicName; diff --git a/java/spark/src/main/java/com/logicalclocks/hsfs/spark/engine/FeatureGroupEngine.java b/java/spark/src/main/java/com/logicalclocks/hsfs/spark/engine/FeatureGroupEngine.java index 96ddfd5f2..f791d8bcd 100644 --- a/java/spark/src/main/java/com/logicalclocks/hsfs/spark/engine/FeatureGroupEngine.java +++ b/java/spark/src/main/java/com/logicalclocks/hsfs/spark/engine/FeatureGroupEngine.java @@ -364,7 +364,8 @@ public List getFeatureGroups(FeatureStore featureStore, String fgN public StreamFeatureGroup getOrCreateStreamFeatureGroup(FeatureStore featureStore, String name, Integer version, String description, List primaryKeys, List partitionKeys, String hudiPrecombineKey, - boolean onlineEnabled, StatisticsConfig statisticsConfig, + boolean onlineEnabled, TimeTravelFormat timeTravelFormat, + StatisticsConfig statisticsConfig, String eventTime, OnlineConfig onlineConfig) throws IOException, FeatureStoreException { StreamFeatureGroup featureGroup; @@ -381,6 +382,7 @@ public StreamFeatureGroup getOrCreateStreamFeatureGroup(FeatureStore featureStor .partitionKeys(partitionKeys) .hudiPrecombineKey(hudiPrecombineKey) .onlineEnabled(onlineEnabled) + .timeTravelFormat(timeTravelFormat) .statisticsConfig(statisticsConfig) .eventTime(eventTime) .onlineConfig(onlineConfig) diff --git a/java/spark/src/test/java/com/logicalclocks/hsfs/spark/TestFeatureGroup.java b/java/spark/src/test/java/com/logicalclocks/hsfs/spark/TestFeatureGroup.java index bedd9716e..86a85bbdc 100644 --- a/java/spark/src/test/java/com/logicalclocks/hsfs/spark/TestFeatureGroup.java +++ b/java/spark/src/test/java/com/logicalclocks/hsfs/spark/TestFeatureGroup.java @@ -20,6 +20,7 @@ import com.logicalclocks.hsfs.Feature; import com.logicalclocks.hsfs.FeatureStoreException; import com.logicalclocks.hsfs.Project; +import com.logicalclocks.hsfs.TimeTravelFormat; import com.logicalclocks.hsfs.metadata.FeatureGroupApi; import com.logicalclocks.hsfs.FeatureGroupBase; import com.logicalclocks.hsfs.metadata.HopsworksClient; @@ -67,7 +68,7 @@ 
public void testFeatureGroupPrimaryKey() { StreamFeatureGroup featureGroup = new StreamFeatureGroup(featureStore, "fgName", 1, "description", Collections.singletonList("primaryKey"), Collections.singletonList("partitionKey"), "hudiPrecombineKey", - true, features, null, "onlineTopicName", null, null, null, null, null, null); + true, TimeTravelFormat.HUDI, features, null, "onlineTopicName", null, null, null, null, null, null); Exception pkException = assertThrows(FeatureStoreException.class, () -> { featureGroupEngine.saveFeatureGroupMetaData(featureGroup, @@ -93,7 +94,7 @@ public void testFeatureGroupEventTimeFeature() { StreamFeatureGroup featureGroup = new StreamFeatureGroup(featureStore, "fgName", 1, "description", Collections.singletonList("featureA"), null, null, - true, features, null, "onlineTopicName", null, null, "eventTime", null, null, null); + true, TimeTravelFormat.HUDI, features, null, "onlineTopicName", null, null, "eventTime", null, null, null); Exception eventTimeException = assertThrows(FeatureStoreException.class, () -> { streamFeatureGroupEngine.saveFeatureGroupMetaData(featureGroup, @@ -119,7 +120,7 @@ public void testFeatureGroupPartitionPrecombineKeys() { StreamFeatureGroup featureGroup = new StreamFeatureGroup(featureStore, "fgName", 1, "description", Collections.singletonList("featureA"), Collections.singletonList("partitionKey"), "hudiPrecombineKey", - true, features, null, "onlineTopicName", null, null, null, null, null, null); + true, TimeTravelFormat.HUDI, features, null, "onlineTopicName", null, null, null, null, null, null); Exception partitionException = assertThrows(FeatureStoreException.class, () -> { streamFeatureGroupEngine.saveFeatureGroupMetaData(featureGroup, @@ -164,7 +165,7 @@ public void testFeatureGroupAppendFeaturesResetSubject() throws FeatureStoreExce StreamFeatureGroup featureGroup = new StreamFeatureGroup(featureStore, "fgName", 1, "description", Collections.singletonList("featureA"), null, null, - true, features, null, "onlineTopicName", null, null, "eventTime", null, null, null); + true, TimeTravelFormat.HUDI, features, null, "onlineTopicName", null, null, "eventTime", null, null, null); featureGroup.featureGroupEngine = featureGroupEngine; // Act diff --git a/python/hsfs/core/feature_group_api.py b/python/hsfs/core/feature_group_api.py index ab05fb9b5..037228c73 100644 --- a/python/hsfs/core/feature_group_api.py +++ b/python/hsfs/core/feature_group_api.py @@ -21,7 +21,12 @@ from hopsworks_common import client from hsfs import feature_group as fg_mod from hsfs import feature_group_commit, util -from hsfs.core import explicit_provenance, ingestion_job, ingestion_job_conf +from hsfs.core import ( + explicit_provenance, + ingestion_job, + ingestion_job_conf, + job, +) class FeatureGroupApi: @@ -416,6 +421,36 @@ def ingestion( ), ) + def update_table_schema( + self, + feature_group_instance: fg_mod.FeatureGroup, + ) -> job.Job: + """ + Setup a Hopsworks job to update table schema + Args: + feature_group_instance: FeatureGroup, required + metadata object of feature group. 
+ job_conf: the configuration for the job application + """ + + _client = client.get_instance() + path_params = [ + "project", + _client._project_id, + "featurestores", + feature_group_instance.feature_store_id, + "featuregroups", + feature_group_instance.id, + "updatetableschema", + ] + + headers = {"content-type": "application/json"} + return job.Job.from_response_json( + _client._send_request( + "POST", path_params, headers=headers + ), + ) + def get_parent_feature_groups( self, feature_group_instance: Union[ diff --git a/python/hsfs/core/feature_group_engine.py b/python/hsfs/core/feature_group_engine.py index 30d1cbe4b..0eb5c441a 100644 --- a/python/hsfs/core/feature_group_engine.py +++ b/python/hsfs/core/feature_group_engine.py @@ -274,6 +274,8 @@ def commit_delete(feature_group, delete_df, write_options): @staticmethod def delta_vacuum(feature_group, retention_hours): if feature_group.time_travel_format == "DELTA": + # TODO: This should change, DeltaEngine and HudiEngine always assumes spark client! + # Cannot properly manage what should happen when using python. delta_engine_instance = delta_engine.DeltaEngine( feature_group.feature_store_id, feature_group.feature_store_name, @@ -321,12 +323,7 @@ def append_features(self, feature_group, new_features): ) # write empty dataframe to update parquet schema - if feature_group.time_travel_format == "DELTA": - engine.get_instance().add_cols_to_delta_table(feature_group, new_features) - else: - engine.get_instance().save_empty_dataframe( - feature_group, new_features=new_features - ) + engine.get_instance().update_table_schema(feature_group) def update_description(self, feature_group, description): """Updates the description of a feature group.""" diff --git a/python/hsfs/core/hudi_engine.py b/python/hsfs/core/hudi_engine.py index 4492f0a19..e96b8ea56 100644 --- a/python/hsfs/core/hudi_engine.py +++ b/python/hsfs/core/hudi_engine.py @@ -234,25 +234,6 @@ def _setup_hudi_read_opts(self, hudi_fg_alias, read_options): return hudi_options - def reconcile_hudi_schema( - self, save_empty_dataframe_callback, hudi_fg_alias, read_options - ): - if sorted(self._spark_session.table(hudi_fg_alias.alias).columns) != sorted( - [feature.name for feature in hudi_fg_alias.feature_group._features] + self.HUDI_SPEC_FEATURE_NAMES - ): - full_fg = self._feature_group_api.get( - feature_store_id=hudi_fg_alias.feature_group._feature_store_id, - name=hudi_fg_alias.feature_group.name, - version=hudi_fg_alias.feature_group.version, - ) - - save_empty_dataframe_callback(full_fg) - - self.register_temporary_table( - hudi_fg_alias, - read_options, - ) - @staticmethod def _get_last_commit_metadata(spark_context, base_path): hopsfs_conf = spark_context._jvm.org.apache.hadoop.fs.FileSystem.get( diff --git a/python/hsfs/core/kafka_engine.py b/python/hsfs/core/kafka_engine.py index d21b6ec22..ee9e892be 100644 --- a/python/hsfs/core/kafka_engine.py +++ b/python/hsfs/core/kafka_engine.py @@ -141,7 +141,7 @@ def kafka_get_offsets( offsets += f",{partition_metadata.id}:{consumer.get_watermark_offsets(partition)[tuple_value]}" consumer.close() - return f" -initialCheckPointString {topic_name + offsets}" + return f"{topic_name + offsets}" return "" diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index 1c001f63a..c0218e847 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -1203,13 +1203,11 @@ def save_stream_dataframe( "Stream ingestion is not available on Python environments, because it requires Spark as engine." 
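The new endpoint is what the engines call when features are appended to an existing feature group. A hedged sketch of the resulting flow, with an assumed feature store handle `fs` and an illustrative feature name:

# Illustrative names only; `fs` is an assumed feature store handle.
from hsfs.feature import Feature

fg = fs.get_feature_group("transactions", version=1)
fg.append_features([Feature("discount", type="double")])
# With this patch the Python engine no longer writes an empty dataframe here;
# it POSTs to .../featuregroups/<id>/updatetableschema and waits for the
# returned job to finish before the new column becomes usable.
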
) - def save_empty_dataframe( - self, - feature_group: Union[FeatureGroup, ExternalFeatureGroup], - new_features=None, - ) -> None: - """Wrapper around save_dataframe in order to provide no-op.""" - pass + def update_table_schema(self, feature_group: Union[FeatureGroup, ExternalFeatureGroup]) -> None: + _job = self._feature_group_api.update_table_schema(feature_group) + _job._wait_for_job( + await_termination=True + ) def _get_app_options( self, user_write_options: Optional[Dict[str, Any]] = None @@ -1517,7 +1515,7 @@ def _write_dataframe_kafka( now = datetime.now(timezone.utc) feature_group.materialization_job.run( args=feature_group.materialization_job.config.get("defaultArgs", "") - + initial_check_point, + + (f" -initialCheckPointString {initial_check_point}" if initial_check_point else ""), await_termination=offline_write_options.get("wait_for_job", False), ) offline_backfill_every_hr = offline_write_options.pop( @@ -1547,7 +1545,7 @@ def _write_dataframe_kafka( # provide the initial_check_point as it will reduce the read amplification of materialization job feature_group.materialization_job.run( args=feature_group.materialization_job.config.get("defaultArgs", "") - + initial_check_point, + + (f" -initialCheckPointString {initial_check_point}" if initial_check_point else ""), await_termination=offline_write_options.get("wait_for_job", False), ) return feature_group.materialization_job diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index 10d3a9cb1..0c68226d5 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -35,6 +35,7 @@ import tzlocal from hopsworks_common.core.constants import HAS_NUMPY, HAS_PANDAS from hsfs.constructor import query +from hsfs.core import feature_group_api # in case importing in %%local from hsfs.core.vector_db_client import VectorDbClient @@ -221,8 +222,8 @@ def register_hudi_temporary_table( read_options, ) - hudi_engine_instance.reconcile_hudi_schema( - self.save_empty_dataframe, hudi_fg_alias, read_options + self.reconcile_schema( + hudi_fg_alias, read_options, hudi_engine_instance ) def register_delta_temporary_table( @@ -241,6 +242,30 @@ def register_delta_temporary_table( read_options, ) + self.reconcile_schema( + delta_fg_alias, read_options, delta_engine_instance + ) + + def reconcile_schema( + self, fg_alias, read_options, engine_instance + ): + if sorted(self._spark_session.table(fg_alias.alias).columns) != sorted( + [feature.name for feature in fg_alias.feature_group._features] + + hudi_engine.HudiEngine.HUDI_SPEC_FEATURE_NAMES if fg_alias.feature_group.time_travel_format == "HUDI" else [] + ): + full_fg = feature_group_api.FeatureGroupApi().get( + feature_store_id=fg_alias.feature_group._feature_store_id, + name=fg_alias.feature_group.name, + version=fg_alias.feature_group.version, + ) + + self.update_table_schema(full_fg) + + engine_instance.register_temporary_table( + fg_alias, + read_options, + ) + def _return_dataframe_type(self, dataframe, dataframe_type): if dataframe_type.lower() in ["default", "spark"]: return dataframe @@ -462,9 +487,7 @@ def save_stream_dataframe( write_options = kafka_engine.get_kafka_config( feature_group.feature_store_id, write_options, engine="spark" ) - serialized_df = self._online_fg_to_avro( - feature_group, self._encode_complex_features(feature_group, dataframe) - ) + serialized_df = self._serialize_to_avro(feature_group, dataframe) project_id = str(feature_group.feature_store.project_id) feature_group_id = str(feature_group._id) @@ -557,9 +580,7 @@ def 
_save_online_dataframe(self, feature_group, dataframe, write_options): feature_group.feature_store_id, write_options, engine="spark" ) - serialized_df = self._online_fg_to_avro( - feature_group, self._encode_complex_features(feature_group, dataframe) - ) + serialized_df = self._serialize_to_avro(feature_group, dataframe) project_id = str(feature_group.feature_store.project_id).encode("utf8") feature_group_id = str(feature_group._id).encode("utf8") @@ -579,13 +600,13 @@ def _save_online_dataframe(self, feature_group, dataframe, write_options): "topic", feature_group._online_topic_name ).save() - def _encode_complex_features( + def _serialize_to_avro( self, feature_group: Union[fg_mod.FeatureGroup, fg_mod.ExternalFeatureGroup], dataframe: Union[RDD, DataFrame], ): """Encodes all complex type features to binary using their avro type as schema.""" - return dataframe.select( + encoded_dataframe = dataframe.select( [ field["name"] if field["name"] not in feature_group.get_complex_features() @@ -596,15 +617,10 @@ def _encode_complex_features( ] ) - def _online_fg_to_avro( - self, - feature_group: Union[fg_mod.FeatureGroup, fg_mod.ExternalFeatureGroup], - dataframe: Union[DataFrame, RDD], - ): """Packs all features into named struct to be serialized to single avro/binary column. And packs primary key into arry to be serialized for partitioning. """ - return dataframe.select( + return encoded_dataframe.select( [ # be aware: primary_key array should always be sorted to_avro( @@ -627,6 +643,30 @@ def _online_fg_to_avro( ] ) + def _deserialize_from_avro( + self, + feature_group: Union[fg_mod.FeatureGroup, fg_mod.ExternalFeatureGroup], + dataframe: Union[RDD, DataFrame], + ): + """ + Deserializes 'value' column from binary using avro schema and unpacks it into columns. 
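The renamed helpers wrap Spark's Avro functions. A standalone sketch of the same round trip with an ad hoc schema instead of the feature group subject; it assumes the spark-avro package is on the classpath:

# Self-contained sketch; schema, column and value are illustrative.
from pyspark.sql import SparkSession
from pyspark.sql.avro.functions import from_avro, to_avro
from pyspark.sql.functions import struct

spark = SparkSession.builder.getOrCreate()
schema = (
    '{"type":"record","name":"fg","fields":'
    '[{"name":"account_id","type":["null","string"]}]}'
)

df = spark.createDataFrame([("ekarson",)], ["account_id"])
encoded = df.select(to_avro(struct("account_id"), schema).alias("value"))  # binary payload
decoded = encoded.select(from_avro("value", schema).alias("value")).select("value.*")
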
+ """ + decoded_dataframe = dataframe.select( + from_avro("value", feature_group._get_encoded_avro_schema()).alias("value") + ).select(col("value.*")) + + """Decodes all complex type features from binary using their avro type as schema.""" + return decoded_dataframe.select( + [ + field["name"] + if field["name"] not in feature_group.get_complex_features() + else from_avro( + field["name"], feature_group._get_feature_avro_schema(field["name"]) + ).alias(field["name"]) + for field in json.loads(feature_group.avro_schema)["fields"] + ] + ) + def get_training_data( self, training_dataset: training_dataset.TrainingDataset, @@ -1296,21 +1336,20 @@ def is_spark_dataframe(self, dataframe): return True return False - def save_empty_dataframe(self, feature_group, new_features=None): + def update_table_schema(self, feature_group): + if feature_group.time_travel_format == "DELTA": + self._add_cols_to_delta_table(feature_group) + else: + self._save_empty_dataframe(feature_group) + + def _save_empty_dataframe(self, feature_group): location = feature_group.prepare_spark_location() dataframe = self._spark_session.read.format("hudi").load(location) - if new_features is not None: - if isinstance(new_features, list): - for new_feature in new_features: - dataframe = dataframe.withColumn( - new_feature.name, lit(None).cast(new_feature.type) - ) - else: - dataframe = dataframe.withColumn( - new_features.name, lit(None).cast(new_features.type) - ) + for _feature in feature_group.features: + if _feature.name not in dataframe.columns: + dataframe = dataframe.withColumn(_feature.name, lit(None).cast(_feature.type)) self.save_dataframe( feature_group, @@ -1322,21 +1361,14 @@ def save_empty_dataframe(self, feature_group, new_features=None): {}, ) - def add_cols_to_delta_table(self, feature_group, new_features): + def _add_cols_to_delta_table(self, feature_group): location = feature_group.prepare_spark_location() dataframe = self._spark_session.read.format("delta").load(location) - if new_features is not None: - if isinstance(new_features, list): - for new_feature in new_features: - dataframe = dataframe.withColumn( - new_feature.name, lit("").cast(new_feature.type) - ) - else: - dataframe = dataframe.withColumn( - new_features.name, lit("").cast(new_features.type) - ) + for _feature in feature_group.features: + if _feature.name not in dataframe.columns: + dataframe = dataframe.withColumn(_feature.name, lit(None).cast(_feature.type)) dataframe.limit(0).write.format("delta").mode("append").option( "mergeSchema", "true" diff --git a/python/hsfs/feature_group.py b/python/hsfs/feature_group.py index a3385afda..9d286cc29 100644 --- a/python/hsfs/feature_group.py +++ b/python/hsfs/feature_group.py @@ -2327,27 +2327,14 @@ def __init__( # for python engine we always use stream feature group if engine.get_type() == "python": self._stream = True - # for stream feature group time travel format is always HUDI - if self._stream: - expected_format = "HUDI" - if self._time_travel_format != expected_format: - warnings.warn( - ( - "The provided time travel format `{}` has been overwritten " - "because Stream enabled feature groups only support `{}`" - ).format(self._time_travel_format, expected_format), - util.FeatureGroupWarning, - stacklevel=1, - ) - self._time_travel_format = expected_format self.primary_key = primary_key self.partition_key = partition_key self._hudi_precombine_key = ( util.autofix_feature_name(hudi_precombine_key) if hudi_precombine_key is not None - and self._time_travel_format is not None - and 
self._time_travel_format == "HUDI" + and (self._time_travel_format is None + or self._time_travel_format == "HUDI") else None ) self.statistics_config = statistics_config diff --git a/python/tests/core/test_feature_group_engine.py b/python/tests/core/test_feature_group_engine.py index e5cc55c05..e57f2c0c3 100644 --- a/python/tests/core/test_feature_group_engine.py +++ b/python/tests/core/test_feature_group_engine.py @@ -802,7 +802,7 @@ def test_append_features(self, mocker): # Assert assert ( - mock_engine_get_instance.return_value.save_empty_dataframe.call_count == 1 + mock_engine_get_instance.return_value.update_table_schema.call_count == 1 ) assert len(mock_fg_engine_update_features_metadata.call_args[0][1]) == 4 diff --git a/python/tests/core/test_kafka_engine.py b/python/tests/core/test_kafka_engine.py index e6bb48297..88085689e 100644 --- a/python/tests/core/test_kafka_engine.py +++ b/python/tests/core/test_kafka_engine.py @@ -340,7 +340,7 @@ def test_kafka_get_offsets_high(self, mocker): ) # Assert - assert result == f" -initialCheckPointString {topic_name},0:11" + assert result == f"{topic_name},0:11" def test_kafka_get_offsets_low(self, mocker): # Arrange @@ -372,7 +372,7 @@ def test_kafka_get_offsets_low(self, mocker): ) # Assert - assert result == f" -initialCheckPointString {topic_name},0:0" + assert result == f"{topic_name},0:0" def test_kafka_get_offsets_no_topic(self, mocker): # Arrange diff --git a/python/tests/engine/test_python.py b/python/tests/engine/test_python.py index 84e2ca10a..ea83f618f 100644 --- a/python/tests/engine/test_python.py +++ b/python/tests/engine/test_python.py @@ -2519,15 +2519,22 @@ def test_save_stream_dataframe(self): == "Stream ingestion is not available on Python environments, because it requires Spark as engine." 
) - def test_save_empty_dataframe(self): + def test_update_table_schema(self, mocker): # Arrange + mock_fg_api = mocker.patch("hsfs.core.feature_group_api.FeatureGroupApi") + python_engine = python.Engine() + mock_fg_api.return_value.update_table_schema.return_value.job = job.Job( + 1, "test_job", None, None, None, None + ) + # Act - result = python_engine.save_empty_dataframe(feature_group=None) + result = python_engine.update_table_schema(feature_group=None) # Assert assert result is None + assert mock_fg_api.return_value.update_table_schema.call_count == 1 def test_get_app_options(self, mocker): # Arrange @@ -3562,7 +3569,7 @@ def test_materialization_kafka_first_job_execution(self, mocker): mocker.patch("hsfs.util.get_job_url") mocker.patch( "hsfs.core.kafka_engine.kafka_get_offsets", - return_value=" tests_offsets", + return_value="tests_offsets", ) mocker.patch( "hsfs.core.job_api.JobApi.last_execution", @@ -3604,7 +3611,7 @@ def test_materialization_kafka_first_job_execution(self, mocker): # Assert assert mock_python_engine_kafka_produce.call_count == 4 job_mock.run.assert_called_once_with( - args="defaults tests_offsets", + args="defaults -initialCheckPointString tests_offsets", await_termination=False, ) @@ -3620,7 +3627,7 @@ def test_materialization_kafka_skip_offsets(self, mocker): mocker.patch("hsfs.util.get_job_url") mocker.patch( "hsfs.core.kafka_engine.kafka_get_offsets", - return_value=" tests_offsets", + return_value="tests_offsets", ) mocker.patch("hopsworks_common.client.get_instance") @@ -3661,7 +3668,7 @@ def test_materialization_kafka_skip_offsets(self, mocker): # Assert assert mock_python_engine_kafka_produce.call_count == 4 job_mock.run.assert_called_once_with( - args="defaults tests_offsets", + args="defaults -initialCheckPointString tests_offsets", await_termination=False, ) @@ -3677,7 +3684,7 @@ def test_materialization_kafka_topic_doesnt_exist(self, mocker): mocker.patch("hsfs.util.get_job_url") mocker.patch( "hsfs.core.kafka_engine.kafka_get_offsets", - side_effect=["", " tests_offsets"], + side_effect=["", "tests_offsets"], ) mocker.patch("hopsworks_common.client.get_instance") @@ -3715,7 +3722,7 @@ def test_materialization_kafka_topic_doesnt_exist(self, mocker): # Assert assert mock_python_engine_kafka_produce.call_count == 4 job_mock.run.assert_called_once_with( - args="defaults tests_offsets", + args="defaults -initialCheckPointString tests_offsets", await_termination=False, ) diff --git a/python/tests/engine/test_spark.py b/python/tests/engine/test_spark.py index f74aaf36f..05bb33180 100644 --- a/python/tests/engine/test_spark.py +++ b/python/tests/engine/test_spark.py @@ -39,6 +39,7 @@ from hsfs.training_dataset_feature import TrainingDatasetFeature from hsfs.transformation_function import TransformationType from pyspark.sql import DataFrame +from pyspark.sql.functions import lit from pyspark.sql.types import ( ArrayType, BinaryType, @@ -202,6 +203,7 @@ def test_register_hudi_temporary_table(self, mocker): # Arrange mock_hudi_engine = mocker.patch("hsfs.core.hudi_engine.HudiEngine") mocker.patch("hsfs.feature_group.FeatureGroup.from_response_json") + mock_reconcile_schema = mocker.patch("hsfs.engine.spark.Engine.reconcile_schema") spark_engine = spark.Engine() @@ -219,6 +221,31 @@ def test_register_hudi_temporary_table(self, mocker): # Assert assert mock_hudi_engine.return_value.register_temporary_table.call_count == 1 + assert mock_reconcile_schema.call_count == 1 + + def test_register_delta_temporary_table(self, mocker): + # Arrange + mock_delta_engine 
= mocker.patch("hsfs.core.delta_engine.DeltaEngine") + mocker.patch("hsfs.feature_group.FeatureGroup.from_response_json") + mock_reconcile_schema = mocker.patch("hsfs.engine.spark.Engine.reconcile_schema") + + spark_engine = spark.Engine() + + hudi_fg_alias = hudi_feature_group_alias.HudiFeatureGroupAlias( + feature_group=None, alias=None + ) + + # Act + spark_engine.register_delta_temporary_table( + delta_fg_alias=hudi_fg_alias, + feature_store_id=None, + feature_store_name=None, + read_options=None, + ) + + # Assert + assert mock_delta_engine.return_value.register_temporary_table.call_count == 1 + assert mock_reconcile_schema.call_count == 1 def test_return_dataframe_type_default(self, mocker): # Arrange @@ -828,9 +855,8 @@ def test_save_stream_dataframe(self, mocker, backend_fixtures): "hopsworks_common.client.get_instance" ) mocker.patch("hopsworks_common.client._is_external", return_value=False) - mocker.patch("hsfs.engine.spark.Engine._encode_complex_features") - mock_spark_engine_online_fg_to_avro = mocker.patch( - "hsfs.engine.spark.Engine._online_fg_to_avro" + mock_spark_engine_serialize_to_avro = mocker.patch( + "hsfs.engine.spark.Engine._serialize_to_avro" ) mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") @@ -876,35 +902,35 @@ def test_save_stream_dataframe(self, mocker, backend_fixtures): # Assert assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.call_args[0][0] + mock_spark_engine_serialize_to_avro.return_value.withColumn.call_args[0][0] == "headers" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.call_args[ 0 ][0] == "test_mode" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.call_args[ 0 ][0] == "kafka" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ 0 ][0] == "checkpointLocation" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ 0 ][1] == f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}-checkpoint" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ 1 ] == { @@ -921,25 +947,25 @@ def test_save_stream_dataframe(self, mocker, backend_fixtures): } ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ + 
mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ 0 ][0] == "topic" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ 0 ][1] == "test_online_topic_name" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.call_args[ 0 ][0] == self._get_spark_query_name(project_id, fg) ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.return_value.start.return_value.awaitTermination.call_count + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.return_value.start.return_value.awaitTermination.call_count == 0 ) @@ -949,9 +975,8 @@ def test_save_stream_dataframe_query_name(self, mocker, backend_fixtures): "hopsworks_common.client.get_instance" ) mocker.patch("hopsworks_common.client._is_external", return_value=False) - mocker.patch("hsfs.engine.spark.Engine._encode_complex_features") - mock_spark_engine_online_fg_to_avro = mocker.patch( - "hsfs.engine.spark.Engine._online_fg_to_avro" + mock_spark_engine_serialize_to_avro = mocker.patch( + "hsfs.engine.spark.Engine._serialize_to_avro" ) mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") @@ -995,35 +1020,35 @@ def test_save_stream_dataframe_query_name(self, mocker, backend_fixtures): # Assert assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.call_args[0][0] + mock_spark_engine_serialize_to_avro.return_value.withColumn.call_args[0][0] == "headers" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.call_args[ 0 ][0] == "test_mode" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.call_args[ 0 ][0] == "kafka" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ 0 ][0] == "checkpointLocation" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ + 
mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ 0 ][1] == "/Projects/test_project_name/Resources/test_query_name-checkpoint" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ 1 ] == { @@ -1040,25 +1065,25 @@ def test_save_stream_dataframe_query_name(self, mocker, backend_fixtures): } ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ 0 ][0] == "topic" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ 0 ][1] == "test_online_topic_name" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.call_args[ 0 ][0] == "test_query_name" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.return_value.start.return_value.awaitTermination.call_count + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.return_value.start.return_value.awaitTermination.call_count == 0 ) @@ -1074,9 +1099,8 @@ def test_save_stream_dataframe_checkpoint_dir(self, mocker, backend_fixtures): "hopsworks_common.client.get_instance" ) mocker.patch("hopsworks_common.client._is_external", return_value=False) - mocker.patch("hsfs.engine.spark.Engine._encode_complex_features") - mock_spark_engine_online_fg_to_avro = mocker.patch( - "hsfs.engine.spark.Engine._online_fg_to_avro" + mock_spark_engine_serialize_to_avro = mocker.patch( + "hsfs.engine.spark.Engine._serialize_to_avro" ) mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") @@ -1122,35 +1146,35 @@ def test_save_stream_dataframe_checkpoint_dir(self, mocker, backend_fixtures): # Assert assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.call_args[0][0] + mock_spark_engine_serialize_to_avro.return_value.withColumn.call_args[0][0] == "headers" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.call_args[ + 
mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.call_args[ 0 ][0] == "test_mode" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.call_args[ 0 ][0] == "kafka" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ 0 ][0] == "checkpointLocation" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ 0 ][1] == "test_checkpoint_dir" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ 1 ] == { @@ -1167,25 +1191,25 @@ def test_save_stream_dataframe_checkpoint_dir(self, mocker, backend_fixtures): } ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ 0 ][0] == "topic" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ 0 ][1] == "test_online_topic_name" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.call_args[ 0 ][0] == self._get_spark_query_name(project_id, fg) ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.return_value.start.return_value.awaitTermination.call_count + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.return_value.start.return_value.awaitTermination.call_count == 0 ) @@ -1195,9 +1219,8 @@ def test_save_stream_dataframe_await_termination(self, mocker, backend_fixtures) 
"hopsworks_common.client.get_instance" ) mocker.patch("hopsworks_common.client._is_external", return_value=False) - mocker.patch("hsfs.engine.spark.Engine._encode_complex_features") - mock_spark_engine_online_fg_to_avro = mocker.patch( - "hsfs.engine.spark.Engine._online_fg_to_avro" + mock_spark_engine_serialize_to_avro = mocker.patch( + "hsfs.engine.spark.Engine._serialize_to_avro" ) mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") @@ -1243,35 +1266,35 @@ def test_save_stream_dataframe_await_termination(self, mocker, backend_fixtures) # Assert assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.call_args[0][0] + mock_spark_engine_serialize_to_avro.return_value.withColumn.call_args[0][0] == "headers" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.call_args[ 0 ][0] == "test_mode" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.call_args[ 0 ][0] == "kafka" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ 0 ][0] == "checkpointLocation" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ 0 ][1] == f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}-checkpoint" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ 1 ] == { @@ -1288,29 +1311,29 @@ def test_save_stream_dataframe_await_termination(self, mocker, backend_fixtures) } ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ 0 ][0] == "topic" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ 0 ][1] == "test_online_topic_name" ) assert ( - 
mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.call_args[ 0 ][0] == self._get_spark_query_name(project_id, fg) ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.return_value.start.return_value.awaitTermination.call_count + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.return_value.start.return_value.awaitTermination.call_count == 1 ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.return_value.start.return_value.awaitTermination.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.return_value.start.return_value.awaitTermination.call_args[ 0 ][0] == 123 @@ -1453,9 +1476,8 @@ def test_save_online_dataframe(self, mocker, backend_fixtures): # Arrange mocker.patch("hopsworks_common.client.get_instance") mocker.patch("hopsworks_common.client._is_external", return_value=False) - mocker.patch("hsfs.engine.spark.Engine._encode_complex_features") - mock_spark_engine_online_fg_to_avro = mocker.patch( - "hsfs.engine.spark.Engine._online_fg_to_avro" + mock_spark_engine_serialize_to_avro = mocker.patch( + "hsfs.engine.spark.Engine._serialize_to_avro" ) mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") @@ -1491,19 +1513,19 @@ def test_save_online_dataframe(self, mocker, backend_fixtures): ) # Assert - assert mock_spark_engine_online_fg_to_avro.call_count == 1 + assert mock_spark_engine_serialize_to_avro.call_count == 1 assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.call_args[0][0] + mock_spark_engine_serialize_to_avro.return_value.withColumn.call_args[0][0] == "headers" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.write.format.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.write.format.call_args[ 0 ][0] == "kafka" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.write.format.return_value.options.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.write.format.return_value.options.call_args[ 1 ] == { @@ -1520,37 +1542,40 @@ def test_save_online_dataframe(self, mocker, backend_fixtures): } ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.write.format.return_value.options.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.write.format.return_value.options.return_value.option.call_args[ 0 ][0] == "topic" ) assert ( - 
mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.write.format.return_value.options.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.write.format.return_value.options.return_value.option.call_args[ 0 ][1] == "test_online_topic_name" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.write.format.return_value.options.return_value.option.return_value.save.call_count + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.write.format.return_value.options.return_value.option.return_value.save.call_count == 1 ) - def test_encode_complex_features(self, mocker): + def test_serialize_to_avro(self, mocker): # Arrange - mocker.patch("hopsworks_common.client.get_instance") - mocker.patch( - "hsfs.feature_group.FeatureGroup.get_complex_features", - return_value=["col_1"], - ) - mocker.patch("hsfs.feature_group.FeatureGroup._get_feature_avro_schema") - spark_engine = spark.Engine() - d = {"col_0": ["test_1", "test_2"], "col_1": ["test_1", "test_2"]} - df = pd.DataFrame(data=d) + mock_to_avro = mocker.patch('hsfs.engine.spark.to_avro') + mock_to_avro.return_value = lit(b'111') - spark_df = spark_engine._spark_session.createDataFrame(df) + fg_data = [] + fg_data.append(("ekarson", ["GRAVITY RUSH 2", "KING'S QUEST"])) + fg_data.append(("ratmilkdrinker", ["NBA 2K", "CALL OF DUTY"])) + pandas_df = pd.DataFrame(fg_data, columns =["account_id", "last_played_games"]) + + df = spark_engine._spark_session.createDataFrame(pandas_df) + + features = [ + feature.Feature(name="account_id", type="str"), + feature.Feature(name="last_played_games", type="array"), + ] fg = feature_group.FeatureGroup( name="test", @@ -1559,37 +1584,41 @@ def test_encode_complex_features(self, mocker): primary_key=[], partition_key=[], id=10, + features=features, ) - fg._subject = {"schema": '{"fields": [{"name": "col_0"}]}'} - - expected = pd.DataFrame(data={"col_0": ["test_1", "test_2"]}) + fg._subject = { + 'id': 1025, + 'subject': 'fg_1', + 'version': 1, + 'schema': '{"type":"record","name":"fg_1","namespace":"test_featurestore.db","fields":[{"name":"account_id","type":["null","string"]},{"name":"last_played_games","type":["null",{"type":"array","items":["null","string"]}]}]}' + } # Act - result = spark_engine._encode_complex_features( + serialized_df = spark_engine._serialize_to_avro( feature_group=fg, - dataframe=spark_df, + dataframe=df, ) # Assert - result_df = result.toPandas() - assert list(result_df) == list(expected) - for column in list(result_df): - assert result_df[column].equals(expected[column]) + assert serialized_df.schema.json() == '{"fields":[{"metadata":{},"name":"key","nullable":false,"type":"binary"},{"metadata":{},"name":"value","nullable":false,"type":"binary"}],"type":"struct"}' - def test_encode_complex_features_col_in_complex_features(self, mocker): + ''' Need spark to run these tests properly + def test_deserialize_from_avro(self, mocker): # Arrange - mocker.patch( - "hsfs.feature_group.FeatureGroup.get_complex_features", - return_value=["col_0"], - ) - mocker.patch("hsfs.feature_group.FeatureGroup._get_feature_avro_schema") - spark_engine = spark.Engine() - d = {"col_0": ["test_1", "test_2"], "col_1": ["test_1", "test_2"]} - df = pd.DataFrame(data=d) + data = [] + data.append((b"2121", b"21212121")) + data.append((b"1212", b"12121212")) + pandas_df = pd.DataFrame(data, columns =["key", "value"]) - spark_df = spark_engine._spark_session.createDataFrame(df) + df = 
spark_engine._spark_session.createDataFrame(pandas_df) + + features = [ + feature.Feature(name="account_id", type="str"), + feature.Feature(name="last_played_games", type="array"), + feature.Feature(name="event_time", type="timestamp"), + ] fg = feature_group.FeatureGroup( name="test", @@ -1598,29 +1627,42 @@ def test_encode_complex_features_col_in_complex_features(self, mocker): primary_key=[], partition_key=[], id=10, + features=features, ) - fg._subject = {"schema": '{"fields": [{"name": "col_0"}]}'} + fg._subject = { + 'id': 1025, + 'subject': 'fg_1', + 'version': 1, + 'schema': '{"type":"record","name":"fg_1","namespace":"test_featurestore.db","fields":[{"name":"account_id","type":["null","string"]},{"name":"last_played_games","type":["null",{"type":"array","items":["null","string"]}]},{"name":"event_time","type":["null",{"type":"long","logicalType":"timestamp-micros"}]}]}' + } # Act - with pytest.raises( - TypeError - ) as e_info: # todo look into this (to_avro has to be mocked) - spark_engine._encode_complex_features( - feature_group=fg, - dataframe=spark_df, - ) + deserialized_df = spark_engine._deserialize_from_avro( + feature_group=fg, + dataframe=df, + ) # Assert - assert str(e_info.value) == "'JavaPackage' object is not callable" + assert deserialized_df.schema.json() == '{"fields":[{"metadata":{},"name":"account_id","nullable":true,"type":"string"},{"metadata":{},"name":"last_played_games","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"event_time","nullable":true,"type":"timestamp"}],"type":"struct"}' - def test_online_fg_to_avro(self): + def test_serialize_deserialize_avro(self, mocker): # Arrange spark_engine = spark.Engine() - d = {"col_0": ["test_1", "test_2"], "col_1": ["test_1", "test_2"]} - df = pd.DataFrame(data=d) + now = datetime.datetime.now() - spark_df = spark_engine._spark_session.createDataFrame(df) + fg_data = [] + fg_data.append(("ekarson", ["GRAVITY RUSH 2", "KING'S QUEST"], pd.Timestamp(now.timestamp()))) + fg_data.append(("ratmilkdrinker", ["NBA 2K", "CALL OF DUTY"], pd.Timestamp(now.timestamp()))) + pandas_df = pd.DataFrame(fg_data, columns =["account_id", "last_played_games", "event_time"]) + + df = spark_engine._spark_session.createDataFrame(pandas_df) + + features = [ + feature.Feature(name="account_id", type="str"), + feature.Feature(name="last_played_games", type="array"), + feature.Feature(name="event_time", type="timestamp"), + ] fg = feature_group.FeatureGroup( name="test", @@ -1629,20 +1671,31 @@ def test_online_fg_to_avro(self): primary_key=[], partition_key=[], id=10, + features=features, ) - fg._avro_schema = '{"fields": [{"name": "col_0"}]}' + fg._subject = { + 'id': 1025, + 'subject': 'fg_1', + 'version': 1, + 'schema': '{"type":"record","name":"fg_1","namespace":"test_featurestore.db","fields":[{"name":"account_id","type":["null","string"]},{"name":"last_played_games","type":["null",{"type":"array","items":["null","string"]}]},{"name":"event_time","type":["null",{"type":"long","logicalType":"timestamp-micros"}]}]}' + } # Act - with pytest.raises( - TypeError - ) as e_info: # todo look into this (to_avro has to be mocked) - spark_engine._online_fg_to_avro( - feature_group=fg, - dataframe=spark_df, - ) + serialized_df = spark_engine._serialize_to_avro( + feature_group=fg, + dataframe=df, + ) + + deserialized_df = spark_engine._deserialize_from_avro( + feature_group=fg, + dataframe=serialized_df, + ) # Assert - assert str(e_info.value) == "'JavaPackage' object is not callable" + assert 
serialized_df.schema.json() == '{"fields":[{"metadata":{},"name":"key","nullable":false,"type":"binary"},{"metadata":{},"name":"value","nullable":false,"type":"binary"}],"type":"struct"}' + assert df.schema == deserialized_df.schema + assert df.collect() == deserialized_df.collect() + ''' def test_get_training_data(self, mocker): # Arrange @@ -4337,7 +4390,7 @@ def test_is_spark_dataframe_spark_dataframe(self): # Assert assert result is True - def test_save_empty_dataframe(self, mocker): + def test_update_table_schema_hudi(self, mocker): # Arrange mock_spark_engine_save_dataframe = mocker.patch( "hsfs.engine.spark.Engine.save_dataframe" @@ -4357,15 +4410,42 @@ def test_save_empty_dataframe(self, mocker): partition_key=[], id=10, featurestore_name="test_featurestore", + time_travel_format="HUDI", ) # Act - spark_engine.save_empty_dataframe(feature_group=fg) + spark_engine.update_table_schema(feature_group=fg) # Assert assert mock_spark_engine_save_dataframe.call_count == 1 assert mock_spark_read.format.call_count == 1 + def test_update_table_schema_delta(self, mocker): + # Arrange + mock_spark_read = mocker.patch("pyspark.sql.SparkSession.read") + mock_format = mocker.Mock() + mock_spark_read.format.return_value = mock_format + + # Arrange + spark_engine = spark.Engine() + + fg = feature_group.FeatureGroup( + name="test", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + id=10, + featurestore_name="test_featurestore", + time_travel_format="DELTA", + ) + + # Act + spark_engine.update_table_schema(feature_group=fg) + + # Assert + assert mock_spark_read.format.call_count == 1 + def test_apply_transformation_function_single_output_udf_default_mode(self, mocker): # Arrange mocker.patch("hopsworks_common.client.get_instance") diff --git a/utils/python/hsfs_utils.py b/utils/python/hsfs_utils.py index 6b8c49311..15d7c5d9e 100644 --- a/utils/python/hsfs_utils.py +++ b/utils/python/hsfs_utils.py @@ -13,12 +13,14 @@ hopsfs = pfs.HadoopFileSystem("default", user=os.environ["HADOOP_USER_NAME"]) from pyspark.sql import SparkSession from pyspark.sql.types import StructField, StructType, _parse_datatype_string +from pyspark.sql.functions import max, expr import hopsworks +from hsfs import engine from hsfs.constructor import query from hsfs.statistics_config import StatisticsConfig -from hsfs.core import feature_monitoring_config_engine, feature_view_engine +from hsfs.core import feature_monitoring_config_engine, feature_view_engine, kafka_engine def read_job_conf(path: str) -> Dict[Any, Any]: @@ -258,6 +260,96 @@ def delta_vacuum_fg(spark: SparkSession, job_conf: Dict[Any, Any]) -> None: entity.delta_vacuum() +def offline_fg_materialization(spark: SparkSession, job_conf: Dict[Any, Any], initial_check_point_string: str) -> None: + """ + Run materialization job on a feature group. 
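The two update_table_schema branches exercised by these tests share one idea: read the existing table, add any missing feature columns as typed nulls, and append zero rows so the table schema is extended. A minimal sketch, assuming a Delta table at `location` and a feature group handle `fg`; the Hudi branch reads the table the same way but writes through save_dataframe:

# Sketch of the shared schema-padding idea; spark, fg and location are assumed handles.
from pyspark.sql.functions import lit

existing = spark.read.format("delta").load(location)
for f in fg.features:
    if f.name not in existing.columns:
        existing = existing.withColumn(f.name, lit(None).cast(f.type))
existing.limit(0).write.format("delta").mode("append").option(
    "mergeSchema", "true"
).save(location)
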
+    """
+    feature_store = job_conf.pop("feature_store")
+    fs = get_feature_store_handle(feature_store)
+
+    entity = fs.get_feature_group(name=job_conf["name"], version=job_conf["version"])
+
+    read_options = kafka_engine.get_kafka_config(
+        entity.feature_store_id, {}, engine="spark"
+    )
+
+    # get offsets
+    offset_location = entity.prepare_spark_location() + "/kafka_offsets"
+    try:
+        if initial_check_point_string:
+            offset_string = json.dumps(_build_starting_offsets(initial_check_point_string))
+        else:
+            offset_string = spark.read.json(offset_location).toJSON().first()
+    except Exception as e:
+        print(f"An unexpected error occurred: {e}")
+        # if all else fails, read from the beginning
+        initial_check_point_string = kafka_engine.kafka_get_offsets(
+            topic_name=entity._online_topic_name,
+            feature_store_id=entity.feature_store_id,
+            offline_write_options={},
+            high=False,
+        )
+        offset_string = json.dumps(_build_starting_offsets(initial_check_point_string))
+    print(f"startingOffsets: {offset_string}")
+
+    # read kafka topic
+    df = (
+        spark.read.format("kafka")
+        .options(**read_options)
+        .option("subscribe", entity._online_topic_name)
+        .option("startingOffsets", offset_string)
+        .option("includeHeaders", "true")
+        .load()
+        .limit(5000000)
+    )
+
+    # filter only the necessary entries
+    df = df.filter(expr("CAST(filter(headers, header -> header.key = 'featureGroupId')[0].value AS STRING)") == str(entity._id))
+    df = df.filter(expr("CAST(filter(headers, header -> header.key = 'subjectId')[0].value AS STRING)") == str(entity.subject["id"]))
+
+    # deserialize dataframe so that it can be properly saved
+    deserialized_df = engine.get_instance()._deserialize_from_avro(entity, df)
+
+    # insert data
+    entity.stream = False  # to make sure we dont write to kafka
+    entity.insert(deserialized_df)
+
+    # update offsets
+    df_offsets = df.groupBy('partition').agg(max('offset').alias('offset')).collect()
+    offset_dict = json.loads(offset_string)
+    for offset_row in df_offsets:
+        offset_dict[f"{entity._online_topic_name}"][f"{offset_row.partition}"] = offset_row.offset + 1
+
+    # save offsets
+    offset_df = spark.createDataFrame([offset_dict])
+    offset_df.coalesce(1).write.mode("overwrite").json(offset_location)
+
+def update_table_schema_fg(spark: SparkSession, job_conf: Dict[Any, Any]) -> None:
+    """
+    Run table schema update job on a feature group.
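A worked example of the checkpoint handoff, with made-up topic, partition and offset values:

# Made-up values for illustration.
initial_check_point_string = "119_fg_1,0:42,1:7"  # passed via -initialCheckPointString
starting_offsets = _build_starting_offsets(initial_check_point_string)
# -> {"119_fg_1": {"0": 42, "1": 7}}
# json.dumps(starting_offsets) feeds .option("startingOffsets", ...); after the batch
# is written, the per-partition maxima plus one are persisted back to
# <feature group location>/kafka_offsets for the next run.
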
+ """ + feature_store = job_conf.pop("feature_store") + fs = get_feature_store_handle(feature_store) + + entity = fs.get_feature_group(name=job_conf["name"], version=job_conf["version"]) + + entity.stream = False + engine.get_instance().update_table_schema(entity) + +def _build_starting_offsets(initial_check_point_string: str): + if not initial_check_point_string: + return "" + + # Split the input string into the topic and partition-offset pairs + topic, offsets = initial_check_point_string.split(',', 1) + + # Split the offsets and build a dictionary from them + offsets_dict = {partition: int(offset) for partition, offset in (pair.split(':') for pair in offsets.split(','))} + + # Create the final dictionary structure + result = {topic: offsets_dict} + + return result if __name__ == "__main__": # Setup spark first so it fails faster in case of args errors @@ -278,6 +370,8 @@ def delta_vacuum_fg(spark: SparkSession, job_conf: Dict[Any, Any]) -> None: "import_fg", "run_feature_monitoring", "delta_vacuum_fg", + "offline_fg_materialization", + "update_table_schema_fg", ], help="Operation type", ) @@ -297,6 +391,12 @@ def parse_isoformat_date(da: str) -> datetime: help="Job start time", ) + parser.add_argument( + "-initialCheckPointString", + type=str, + help="Kafka offset to start consuming from", + ) + args = parser.parse_args() job_conf = read_job_conf(args.path) @@ -318,6 +418,10 @@ def parse_isoformat_date(da: str) -> datetime: run_feature_monitoring(job_conf) elif args.op == "delta_vacuum_fg": delta_vacuum_fg(spark, job_conf) + elif args.op == "offline_fg_materialization": + offline_fg_materialization(spark, job_conf, args.initialCheckPointString) + elif args.op == "update_table_schema_fg": + update_table_schema_fg(spark, job_conf) success = True except Exception: From ebae3d8f1af7c9c720636fd08f7adb941f5539c5 Mon Sep 17 00:00:00 2001 From: Ralf Date: Thu, 31 Oct 2024 10:54:39 +0200 Subject: [PATCH 10/30] [FSTORE-1564][APPEND] Delta deltastreamer dont write to online store (#381) --- utils/python/hsfs_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/python/hsfs_utils.py b/utils/python/hsfs_utils.py index 15d7c5d9e..3cc1eb615 100644 --- a/utils/python/hsfs_utils.py +++ b/utils/python/hsfs_utils.py @@ -312,7 +312,7 @@ def offline_fg_materialization(spark: SparkSession, job_conf: Dict[Any, Any], in # insert data entity.stream = False # to make sure we dont write to kafka - entity.insert(deserialized_df) + entity.insert(deserialized_df, storage="offline") # update offsets df_offsets = df.groupBy('partition').agg(max('offset').alias('offset')).collect() From 0221954b5ec7bc72eb88c6a11a37c99d475467ea Mon Sep 17 00:00:00 2001 From: Aleksey Veresov Date: Thu, 31 Oct 2024 13:58:19 +0100 Subject: [PATCH 11/30] Add engine parameter to hopsworks.login (#385) * Add engine parameter to hopsworks.login * Fix the docstring for the engine parameter * Fix typing * Remove engine param from get_feature_store * Remove redundant code from get_feature_store --- python/hopsworks/__init__.py | 13 ++++++++++++- python/hopsworks_common/connection.py | 20 ++------------------ python/hopsworks_common/project.py | 7 ++----- 3 files changed, 16 insertions(+), 24 deletions(-) diff --git a/python/hopsworks/__init__.py b/python/hopsworks/__init__.py index 79d500769..220dcadb8 100644 --- a/python/hopsworks/__init__.py +++ b/python/hopsworks/__init__.py @@ -22,6 +22,7 @@ import tempfile import warnings from pathlib import Path +from typing import Literal, Union from hopsworks import client, 
constants, project, version from hopsworks.client.exceptions import ( @@ -83,6 +84,7 @@ def login( api_key_file: str = None, hostname_verification: bool = False, trust_store_path: str = None, + engine: Union[None, Literal["spark"], Literal["python"], Literal["training"]] = None, ) -> project.Project: """Connect to [Serverless Hopsworks](https://app.hopsworks.ai) by calling the `hopsworks.login()` function with no arguments. @@ -122,6 +124,13 @@ def login( api_key_file: Path to file wih Api Key hostname_verification: Whether to verify Hopsworks' certificate trust_store_path: Path on the file system containing the Hopsworks certificates + engine: Which engine to use, `"spark"`, `"python"` or `"training"`. Defaults to `None`, + which initializes the engine to Spark if the environment provides Spark, for + example on Hopsworks and Databricks, or falls back to Python if Spark is not + available, e.g. on local Python environments or AWS SageMaker. This option + allows you to override this behaviour. `"training"` engine is useful when only + feature store metadata is needed, for example training dataset location and label + information when Hopsworks training experiment is conducted. # Returns `Project`: The Project object to perform operations on # Raises @@ -138,7 +147,7 @@ def login( # If inside hopsworks, just return the current project for now if "REST_ENDPOINT" in os.environ: - _hw_connection = _hw_connection(hostname_verification=hostname_verification) + _hw_connection = _hw_connection(hostname_verification=hostname_verification, engine=engine) _connected_project = _hw_connection.get_project() _initialize_module_apis() print("\nLogged in to project, explore it here " + _connected_project.get_url()) @@ -207,6 +216,7 @@ def login( _hw_connection = _hw_connection( host=host, port=port, + engine=engine, api_key_file=api_key_path, hostname_verification=hostname_verification, trust_store_path=trust_store_path, @@ -246,6 +256,7 @@ def login( _hw_connection = _hw_connection( host=host, port=port, + engine=engine, api_key_value=api_key, hostname_verification=hostname_verification, trust_store_path=trust_store_path, diff --git a/python/hopsworks_common/connection.py b/python/hopsworks_common/connection.py index 49f504932..43a64bc76 100644 --- a/python/hopsworks_common/connection.py +++ b/python/hopsworks_common/connection.py @@ -100,7 +100,7 @@ class Connection: Defaults to `None`. engine: Which engine to use, `"spark"`, `"python"` or `"training"`. Defaults to `None`, which initializes the engine to Spark if the environment provides Spark, for - example on Hopsworks and Databricks, or falls back on Hive in Python if Spark is not + example on Hopsworks and Databricks, or falls back to Python if Spark is not available, e.g. on local Python environments or AWS SageMaker. This option allows you to override this behaviour. `"training"` engine is useful when only feature store metadata is needed, for example training dataset location and label @@ -151,7 +151,6 @@ def __init__( def get_feature_store( self, name: Optional[str] = None, - engine: Optional[str] = None, ): # -> feature_store.FeatureStore # the typing is commented out due to circular dependency, it breaks auto_doc.py """Get a reference to a feature store to perform operations on. @@ -161,25 +160,10 @@ def get_feature_store( # Arguments name: The name of the feature store, defaults to `None`. - engine: Which engine to use, `"spark"`, `"python"` or `"training"`. 
Defaults to `None`, - which initializes the engine to Spark if the environment provides Spark, for - example on Hopsworks and Databricks, or falls back on Hive in Python if Spark is not - available, e.g. on local Python environments or AWS SageMaker. This option - allows you to override this behaviour. `"training"` engine is useful when only - feature store metadata is needed, for example training dataset location and label - information when Hopsworks training experiment is conducted. # Returns `FeatureStore`. A feature store handle object to perform operations on. """ - # Ensure the engine is initialized and of right type - from hsfs import engine as hsfs_engine - - if engine: - global _hsfs_engine_type - _hsfs_engine_type = engine - hsfs_engine.get_instance() - if not name: name = client.get_instance()._project_name return self._feature_store_api.get(util.append_feature_store_suffix(name)) @@ -532,7 +516,7 @@ def connection( Defaults to `None`. engine: Which engine to use, `"spark"`, `"python"` or `"training"`. Defaults to `None`, which initializes the engine to Spark if the environment provides Spark, for - example on Hopsworks and Databricks, or falls back on Hive in Python if Spark is not + example on Hopsworks and Databricks, or falls back to Python if Spark is not available, e.g. on local Python environments or AWS SageMaker. This option allows you to override this behaviour. `"training"` engine is useful when only feature store metadata is needed, for example training dataset location and label diff --git a/python/hopsworks_common/project.py b/python/hopsworks_common/project.py index 7705b603b..b35cac288 100644 --- a/python/hopsworks_common/project.py +++ b/python/hopsworks_common/project.py @@ -109,7 +109,7 @@ def project_namespace(self): return self._project_namespace def get_feature_store( - self, name: Optional[str] = None, engine: Optional[str] = None + self, name: Optional[str] = None ): # -> hsfs.feature_store.FeatureStore """Connect to Project's Feature Store. @@ -127,15 +127,12 @@ def get_feature_store( # Arguments name: Project name of the feature store. - engine: Which engine to use, `"spark"`, `"python"` or `"training"`. - Defaults to `"python"` when connected to [Serverless Hopsworks](https://app.hopsworks.ai). - See [`hopsworks.connection`](connection.md#connection) documentation for more information. # Returns `hsfs.feature_store.FeatureStore`: The Feature Store API # Raises `RestAPIError`: If unable to connect """ - return client.get_connection().get_feature_store(name, engine) + return client.get_connection().get_feature_store(name) def get_model_registry(self): """Connect to Project's Model Registry API. 
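A minimal usage sketch of the `engine` argument introduced by the patch above (the hostname and API key file path are placeholders, not values from this patch series):

```python
import hopsworks

# Force the Python engine instead of letting login() auto-detect Spark.
project = hopsworks.login(
    host="my-cluster.hopsworks.ai",  # placeholder hostname
    api_key_file=".api_key",         # placeholder path to an API key file
    engine="python",
)

# The engine is fixed at login time; get_feature_store() no longer takes an engine argument.
fs = project.get_feature_store()
```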
From 8eb5c79b6aae70d417e1e62af06ed0b1f5c4f976 Mon Sep 17 00:00:00 2001 From: Javier Cabrera Date: Fri, 1 Nov 2024 09:35:28 +0100 Subject: [PATCH 12/30] Workflow for kube locust benchmark image (#386) --- locust_benchmark/Jenkinsfile | 20 ++++++++++++++++++++ locust_benchmark/KUBE_IMAGE_VERSION | 1 + locust_benchmark/build-manifest.json | 11 +++++++++++ 3 files changed, 32 insertions(+) create mode 100644 locust_benchmark/Jenkinsfile create mode 100644 locust_benchmark/KUBE_IMAGE_VERSION create mode 100644 locust_benchmark/build-manifest.json diff --git a/locust_benchmark/Jenkinsfile b/locust_benchmark/Jenkinsfile new file mode 100644 index 000000000..9d4465e97 --- /dev/null +++ b/locust_benchmark/Jenkinsfile @@ -0,0 +1,20 @@ +@Library("jenkins-library@main") + +import com.logicalclocks.jenkins.k8s.ImageBuilder + + +node("local") { + stage('Clone repository') { + checkout scm + } + + stage('Build and push image(s)') { + version = readFile "${env.WORKSPACE}/locust_benchmark/KUBE_IMAGE_VERSION" + withEnv(["VERSION=${version.trim()}"]) { + + def builder = new ImageBuilder(this) + m = readFile "${env.WORKSPACE}/locust_benchmark/build-manifest.json" + builder.run(m) + } + } +} \ No newline at end of file diff --git a/locust_benchmark/KUBE_IMAGE_VERSION b/locust_benchmark/KUBE_IMAGE_VERSION new file mode 100644 index 000000000..8b25206ff --- /dev/null +++ b/locust_benchmark/KUBE_IMAGE_VERSION @@ -0,0 +1 @@ +master \ No newline at end of file diff --git a/locust_benchmark/build-manifest.json b/locust_benchmark/build-manifest.json new file mode 100644 index 000000000..b09a9e8b2 --- /dev/null +++ b/locust_benchmark/build-manifest.json @@ -0,0 +1,11 @@ +[ + { + "name": "hopsworks/locust-hsfs", + "version": "env:VERSION", + "dockerFile": "locust_benchmark/Dockerfile", + "platforms": { + "append": ["linux/arm64"] + }, + "canUseCache": "true" + } +] \ No newline at end of file From cd14180c43540b0f0176c309fac0a2e0c51c927d Mon Sep 17 00:00:00 2001 From: Javier Cabrera Date: Fri, 1 Nov 2024 11:29:39 +0100 Subject: [PATCH 13/30] [HWORKS-1779][Append] Disable arm64 from build-manifest.json (#387) --- locust_benchmark/build-manifest.json | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/locust_benchmark/build-manifest.json b/locust_benchmark/build-manifest.json index b09a9e8b2..48599c6dc 100644 --- a/locust_benchmark/build-manifest.json +++ b/locust_benchmark/build-manifest.json @@ -3,9 +3,6 @@ "name": "hopsworks/locust-hsfs", "version": "env:VERSION", "dockerFile": "locust_benchmark/Dockerfile", - "platforms": { - "append": ["linux/arm64"] - }, "canUseCache": "true" } -] \ No newline at end of file +] From 90b1e386137e9be71488d70381a5a6ced1696918 Mon Sep 17 00:00:00 2001 From: Fabio Buso Date: Fri, 1 Nov 2024 12:03:26 +0100 Subject: [PATCH 14/30] [FSTORE-1593] S3 Connector doesn't refetch credentials when preparing Spark write (#389) --- python/hsfs/storage_connector.py | 1 + python/tests/test_feature_group.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/python/hsfs/storage_connector.py b/python/hsfs/storage_connector.py index 7ed887cd9..15ccdc8d6 100644 --- a/python/hsfs/storage_connector.py +++ b/python/hsfs/storage_connector.py @@ -369,6 +369,7 @@ def prepare_spark(self, path: Optional[str] = None) -> Optional[str]: # Arguments path: Path to prepare for reading from cloud storage. Defaults to `None`. 
""" + self.refetch() return engine.get_instance().setup_storage_connector(self, path) def connector_options(self) -> Dict[str, Any]: diff --git a/python/tests/test_feature_group.py b/python/tests/test_feature_group.py index 5e01b5a10..ea25bbff3 100644 --- a/python/tests/test_feature_group.py +++ b/python/tests/test_feature_group.py @@ -928,6 +928,7 @@ def test_prepare_spark_location_with_s3_connector(self, mocker, backend_fixtures # Arrange engine = spark.Engine() engine_instance = mocker.patch("hsfs.engine.get_instance", return_value=engine) + refetch_api = mocker.patch("hsfs.storage_connector.S3Connector.refetch") json = backend_fixtures["feature_group"]["get_basic_info"]["response"] fg = feature_group.FeatureGroup.from_response_json(json) fg._location = f"{fg.name}_{fg.version}" @@ -939,11 +940,13 @@ def test_prepare_spark_location_with_s3_connector(self, mocker, backend_fixtures # Assert assert fg.location == path engine_instance.assert_called_once() + refetch_api.assert_called_once() def test_prepare_spark_location_with_s3_connector_python(self, mocker, backend_fixtures): # Arrange engine = python.Engine() engine_instance = mocker.patch("hsfs.engine.get_instance", return_value=engine) + mocker.patch("hsfs.storage_connector.S3Connector.refetch") json = backend_fixtures["feature_group"]["get_basic_info"]["response"] fg = feature_group.FeatureGroup.from_response_json(json) fg._location = f"{fg.name}_{fg.version}" From bbf9326843e366c22098e982e7ba76336a8a86fb Mon Sep 17 00:00:00 2001 From: Dhananjay Mukhedkar <55157590+dhananjay-mk@users.noreply.github.com> Date: Fri, 1 Nov 2024 14:13:21 +0100 Subject: [PATCH 15/30] [FSTORE-1589] refactor locust client as per 4.x hopsworks login (#379) --- locust_benchmark/Dockerfile | 2 +- locust_benchmark/common/hopsworks_client.py | 25 ++++++-------- locust_benchmark/locustfile.py | 37 ++++++--------------- locust_benchmark/requirements.txt | 3 +- 4 files changed, 24 insertions(+), 43 deletions(-) diff --git a/locust_benchmark/Dockerfile b/locust_benchmark/Dockerfile index e437ab9b2..47ef44106 100644 --- a/locust_benchmark/Dockerfile +++ b/locust_benchmark/Dockerfile @@ -1,4 +1,4 @@ -FROM locustio/locust:2.17.0 +FROM locustio/locust:2.23.1 USER root diff --git a/locust_benchmark/common/hopsworks_client.py b/locust_benchmark/common/hopsworks_client.py index b9fbcae04..83963d3ff 100644 --- a/locust_benchmark/common/hopsworks_client.py +++ b/locust_benchmark/common/hopsworks_client.py @@ -7,10 +7,8 @@ import pandas as pd from locust.runners import MasterRunner, LocalRunner -import hsfs -from hsfs import client -from hsfs.client.exceptions import RestAPIError +import hopsworks class HopsworksClient: @@ -21,14 +19,14 @@ def __init__(self, environment=None): environment.runner, (MasterRunner, LocalRunner) ): print(self.hopsworks_config) - self.connection = hsfs.connection( + self.project = hopsworks.login( project=self.hopsworks_config.get("project", "test"), host=self.hopsworks_config.get("host", "localhost"), port=self.hopsworks_config.get("port", 443), api_key_file=".api_key", engine="python", ) - self.fs = self.connection.get_feature_store() + self.fs = self.project.get_feature_store() # test settings self.external = self.hopsworks_config.get("external", False) @@ -59,18 +57,15 @@ def insert_data(self, locust_fg): return locust_fg def get_or_create_fv(self, fg=None): - try: - return self.fs.get_feature_view("locust_fv", version=1) - except RestAPIError: - return self.fs.create_feature_view( - name="locust_fv", - query=fg.select_all(), - version=1, - ) + 
if fg is None: + fg = self.get_or_create_fg() + return self.fs.get_or_create_feature_view( + name="locust_fv", version=1, query=fg.select_all() + ) def close(self): - if client._client is not None: - self.connection.close() + if self.project is not None: + hopsworks.logout() def generate_insert_df(self, rows, schema_repetitions): data = {"ip": range(0, rows)} diff --git a/locust_benchmark/locustfile.py b/locust_benchmark/locustfile.py index d2d3ff933..105d80abd 100644 --- a/locust_benchmark/locustfile.py +++ b/locust_benchmark/locustfile.py @@ -3,7 +3,7 @@ from common.hopsworks_client import HopsworksClient from common.stop_watch import stopwatch from locust import HttpUser, User, task, constant, events -from locust.runners import MasterRunner, LocalRunner +from locust.runners import MasterRunner from urllib3 import PoolManager import nest_asyncio @@ -11,12 +11,8 @@ @events.init.add_listener def on_locust_init(environment, **kwargs): print("Locust process init") - - if isinstance(environment.runner, (MasterRunner, LocalRunner)): - # create feature view - environment.hopsworks_client = HopsworksClient(environment) - fg = environment.hopsworks_client.get_or_create_fg() - environment.hopsworks_client.get_or_create_fv(fg) + environment.hopsworks_client = HopsworksClient(environment) + environment.hopsworks_client.get_or_create_fg() @events.quitting.add_listener @@ -61,27 +57,21 @@ def get_feature_vector(self): class MySQLFeatureVectorLookup(User): - wait_time = constant(0) - weight = 5 - # fixed_count = 1 + wait_time = constant(0.001) + weight = 2 def __init__(self, environment): super().__init__(environment) - self.env = environment - self.client = HopsworksClient(environment) - self.fv = self.client.get_or_create_fv() + self.client = environment.hopsworks_client def on_start(self): - print("Init user") + self.fv = self.client.get_or_create_fv() self.fv.init_serving(external=self.client.external) nest_asyncio.apply() - def on_stop(self): - print("Closing user") - @task def get_feature_vector(self): - self._get_feature_vector({"ip": random.randint(0, self.client.rows - 1)}) + return self._get_feature_vector({"ip": random.randint(0, self.client.rows - 1)}) @stopwatch def _get_feature_vector(self, pk): @@ -89,14 +79,12 @@ def _get_feature_vector(self, pk): class MySQLFeatureVectorBatchLookup(User): - wait_time = constant(0) + wait_time = constant(0.001) weight = 1 - # fixed_count = 1 def __init__(self, environment): super().__init__(environment) - self.env = environment - self.client = HopsworksClient(environment) + self.client = environment.hopsworks_client self.fv = self.client.get_or_create_fv() def on_start(self): @@ -104,16 +92,13 @@ def on_start(self): self.fv.init_serving(external=self.client.external) nest_asyncio.apply() - def on_stop(self): - print("Closing user") - @task def get_feature_vector_batch(self): pks = [ {"ip": random.randint(0, self.client.rows - 1)} for i in range(self.client.batch_size) ] - self._get_feature_vectors(pks) + return self._get_feature_vectors(pks) @stopwatch def _get_feature_vectors(self, pk): diff --git a/locust_benchmark/requirements.txt b/locust_benchmark/requirements.txt index 2eef53a7f..d992f8066 100644 --- a/locust_benchmark/requirements.txt +++ b/locust_benchmark/requirements.txt @@ -1,3 +1,4 @@ markupsafe==2.0.1 -locust==2.17.0 +locust==2.23.1 +nest_asyncio==1.6.0 git+https://github.com/logicalclocks/hopsworks-api@main#egg=hopsworks[python]&subdirectory=python \ No newline at end of file From 550bdb4e665791b676b868f29e230ad99323c1b2 Mon Sep 17 
00:00:00 2001 From: Fabio Buso Date: Fri, 1 Nov 2024 14:34:12 +0100 Subject: [PATCH 16/30] Add tablespace support to locust (#390) Co-authored-by: Javier Cabrera --- locust_benchmark/README.md | 1 + locust_benchmark/common/hopsworks_client.py | 2 ++ locust_benchmark/hopsworks_config.json | 9 +++++---- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/locust_benchmark/README.md b/locust_benchmark/README.md index c390b39db..eda8b440e 100644 --- a/locust_benchmark/README.md +++ b/locust_benchmark/README.md @@ -87,6 +87,7 @@ echo "[YOUR KEY]" > .api_key - `schema_repetitions`: This controls the number of features for the lookup. One schema repetition will result in 10 features plus primary key. Five repetitions will result in 50 features plus primary key. - `recreate_feature_group`: This controls if the previous feature group should be dropped and recreated. Set this to true when rerunning the benchmark with different size of rows or schema repetitions. - `batch_size`: This is relevant for the actual benchmark and controls how many feature vectors are looked up in the batch benchmark. +- `tablespace`: (Optional) If set creates a feature group using on-disk data. 3. Create the feature group diff --git a/locust_benchmark/common/hopsworks_client.py b/locust_benchmark/common/hopsworks_client.py index 83963d3ff..d82409892 100644 --- a/locust_benchmark/common/hopsworks_client.py +++ b/locust_benchmark/common/hopsworks_client.py @@ -36,6 +36,7 @@ def __init__(self, environment=None): "recreate_feature_group", False ) self.batch_size = self.hopsworks_config.get("batch_size", 100) + self.tablespace = self.hopsworks_config.get("tablespace", None) def get_or_create_fg(self): locust_fg = self.fs.get_or_create_feature_group( @@ -44,6 +45,7 @@ def get_or_create_fg(self): primary_key=["ip"], online_enabled=True, stream=True, + online_config={'table_space': self.tablespace} if self.tablespace else None ) return locust_fg diff --git a/locust_benchmark/hopsworks_config.json b/locust_benchmark/hopsworks_config.json index 6a8e60862..6e92b6739 100644 --- a/locust_benchmark/hopsworks_config.json +++ b/locust_benchmark/hopsworks_config.json @@ -1,10 +1,11 @@ { - "host": "localhost", + "host": "mercury.hops.works", "port": 443, - "project": "test", + "project": "fabio_demo", "external": true, - "rows": 100000, + "rows": 1000, "schema_repetitions": 1, "recreate_feature_group": true, - "batch_size": 100 + "batch_size": 100, + "tablespace": "ts1" } From e278ac0a84bcdafaf1a59633b3fc7ed03cb9d6c9 Mon Sep 17 00:00:00 2001 From: rcnnnghm Date: Fri, 1 Nov 2024 16:23:21 +0000 Subject: [PATCH 17/30] Close ``` to fix delta vacuum scroll in docs. (#391) --- python/hsfs/feature_group.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/hsfs/feature_group.py b/python/hsfs/feature_group.py index 9d286cc29..e2c42f1a3 100644 --- a/python/hsfs/feature_group.py +++ b/python/hsfs/feature_group.py @@ -3276,7 +3276,7 @@ def delta_vacuum( fg = fs.get_or_create_feature_group(...) commit_details = fg.delta_vacuum(retention_hours = 168) - + ``` # Arguments retention_hours: User provided retention period. The default retention threshold for the files is 7 days. 
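A short sketch of how the optional `tablespace` setting documented above reaches the online store, mirroring the `hopsworks_client.py` change; the config dict and feature group name are illustrative:

```python
# Sketch only: "hopsworks_config" stands for the parsed hopsworks_config.json shown above.
tablespace = hopsworks_config.get("tablespace", None)  # e.g. "ts1"; leave unset for in-memory tables

fg = fs.get_or_create_feature_group(
    name="locust_fg",  # illustrative name
    version=1,
    primary_key=["ip"],
    online_enabled=True,
    stream=True,
    # Only pass an online_config when an on-disk tablespace is requested.
    online_config={"table_space": tablespace} if tablespace else None,
)
```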
From b828b0775e8ef376b41ffc7a3f805a7e156fc81e Mon Sep 17 00:00:00 2001 From: Aleksey Veresov Date: Fri, 1 Nov 2024 18:22:13 +0100 Subject: [PATCH 18/30] Sort API documentation pages alphabetically (#393) --- docs/js/inject-api-links.js | 4 --- mkdocs.yml | 70 ++++++++++++++++++------------------- 2 files changed, 35 insertions(+), 39 deletions(-) diff --git a/docs/js/inject-api-links.js b/docs/js/inject-api-links.js index 6c8a4a3b3..b6aca0755 100644 --- a/docs/js/inject-api-links.js +++ b/docs/js/inject-api-links.js @@ -5,8 +5,6 @@ window.addEventListener("DOMContentLoaded", function () { if (majorVersionRegex.test(windowPathNameSplits[1])) { // On landing page docs.hopsworks.api/3.0 - URL contains major version // Version API dropdown document.getElementById("hopsworks_api_link").href = "https://docs.hopsworks.ai/hopsworks-api/" + windowPathNameSplits[1] + "/generated/api/login/"; - document.getElementById("hsfs_api_link").href = "https://docs.hopsworks.ai/feature-store-api/" + windowPathNameSplits[1] + "/generated/api/connection_api/"; - document.getElementById("hsml_api_link").href = "https://docs.hopsworks.ai/machine-learning-api/" + windowPathNameSplits[1] + "/generated/connection_api/"; } else { // on docs.hopsworks.api/feature-store-api/3.0 / docs.hopsworks.api/hopsworks-api/3.0 / docs.hopsworks.api/machine-learning-api/3.0 if (latestRegex.test(windowPathNameSplits[2]) || latestRegex.test(windowPathNameSplits[1])) { var majorVersion = "latest"; @@ -24,8 +22,6 @@ window.addEventListener("DOMContentLoaded", function () { document.getElementsByClassName("md-tabs__link")[6].href = "https://docs.hopsworks.ai/" + majorVersion + "/admin/"; // Version API dropdown document.getElementById("hopsworks_api_link").href = "https://docs.hopsworks.ai/hopsworks-api/" + majorVersion + "/generated/api/login/"; - document.getElementById("hsfs_api_link").href = "https://docs.hopsworks.ai/feature-store-api/" + majorVersion + "/generated/api/connection_api/"; document.getElementById("hsfs_javadoc_link").href = "https://docs.hopsworks.ai/feature-store-api/" + majorVersion + "/javadoc"; - document.getElementById("hsml_api_link").href = "https://docs.hopsworks.ai/machine-learning-api/" + majorVersion + "/generated/connection_api/"; } }); diff --git a/mkdocs.yml b/mkdocs.yml index 823e3c8f2..2341c5ae1 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -17,69 +17,69 @@ nav: - Setup and Installation: https://docs.hopsworks.ai/ - Administration: https://docs.hopsworks.ai/ - API: + - Login: generated/api/login.md - Platform API: - - Login: generated/api/login.md - Connection: generated/api/connection.md - - Projects: generated/api/projects.md - - Jobs: generated/api/jobs.md + - Datasets: generated/api/datasets.md + - Environment: generated/api/environment.md - Executions: generated/api/executions.md - FlinkCluster: generated/api/flink_cluster.md - - Environment: generated/api/environment.md - - GitRepo: generated/api/git_repo.md - GitProvider: generated/api/git_provider.md - GitRemote: generated/api/git_remote.md - - Datasets: generated/api/datasets.md - - KafkaTopic: generated/api/kafka_topic.md + - GitRepo: generated/api/git_repo.md + - Jobs: generated/api/jobs.md - KafkaSchema: generated/api/kafka_schema.md - - Secrets: generated/api/secrets.md + - KafkaTopic: generated/api/kafka_topic.md - OpenSearch: generated/api/opensearch.md + - Projects: generated/api/projects.md + - Secrets: generated/api/secrets.md - Feature Store API: + - Embedding: + - EmbeddingFeature: generated/api/embedding_feature_api.md + - 
EmbeddingIndex: generated/api/embedding_index_api.md + - SimilarityFunctionType: generated/api/similarity_function_type_api.md - ExpectationSuite: generated/api/expectation_suite_api.md - - FeatureStore: generated/api/feature_store_api.md - - FeatureGroup: generated/api/feature_group_api.md - ExternalFeatureGroup: generated/api/external_feature_group_api.md - - SpineGroup: generated/api/spine_group_api.md - - FeatureView: generated/api/feature_view_api.md - - TrainingDataset: generated/api/training_dataset_api.md - - Storage Connector: generated/api/storage_connector_api.md - Feature: generated/api/feature_api.md + - Feature Monitoring: + - Configuration: generated/api/feature_monitoring_config_api.md + - Result: generated/api/feature_monitoring_result_api.md + - Window: generated/api/feature_monitoring_window_config_api.md + - FeatureGroup: generated/api/feature_group_api.md + - FeatureStore: generated/api/feature_store_api.md + - FeatureView: generated/api/feature_view_api.md + - Provenance Links: generated/api/links.md - Query: generated/api/query_api.md + - SpineGroup: generated/api/spine_group_api.md + - Statistics: + - Feature descriptive statistics: generated/api/feature_descriptive_statistics_api.md + - Split Statistics: generated/api/split_statistics_api.md + - Statistics: generated/api/statistics_api.md + - Storage Connector: generated/api/storage_connector_api.md + - TrainingDataset: generated/api/training_dataset_api.md - Transformation Functions: - - UDF: generated/api/udf.md - HopsworksUDF: generated/api/hopsworks_udf.md - - TransformationFunction: generated/api/transformation_functions_api.md - Transformation Statistics: - - TransformationStatistics: generated/api/transformation_statistics.md - FeatureTransformationStatistics: generated/api/feature_transformation_statistics.md + - TransformationStatistics: generated/api/transformation_statistics.md + - TransformationFunction: generated/api/transformation_functions_api.md + - UDF: generated/api/udf.md - ValidationReport: generated/api/validation_report_api.md - - Provenance Links: generated/api/links.md - - Statistics: - - Statistics: generated/api/statistics_api.md - - Split Statistics: generated/api/split_statistics_api.md - - Feature descriptive statistics: generated/api/feature_descriptive_statistics_api.md - - Feature Monitoring: - - Configuration: generated/api/feature_monitoring_config_api.md - - Result: generated/api/feature_monitoring_result_api.md - - Window: generated/api/feature_monitoring_window_config_api.md - - Embedding: - - EmbeddingIndex: generated/api/embedding_index_api.md - - EmbeddingFeature: generated/api/embedding_feature_api.md - - SimilarityFunctionType: generated/api/similarity_function_type_api.md - Machine Learning API: - Model Registry: - - Model Registry: generated/model-registry/model_registry_api.md - Model: generated/model-registry/model_api.md + - Model Registry: generated/model-registry/model_registry_api.md - Model Schema: generated/model-registry/model_schema_api.md - Model Serving: - - Model Serving: generated/model-serving/model_serving_api.md - Deployment: generated/model-serving/deployment_api.md - Deployment state: generated/model-serving/predictor_state_api.md - Deployment state condition: generated/model-serving/predictor_state_condition_api.md - - Predictor: generated/model-serving/predictor_api.md - - Transformer: generated/model-serving/transformer_api.md - - Inference Logger: generated/model-serving/inference_logger_api.md - Inference Batcher: 
generated/model-serving/inference_batcher_api.md + - Inference Logger: generated/model-serving/inference_logger_api.md + - Model Serving: generated/model-serving/model_serving_api.md + - Predictor: generated/model-serving/predictor_api.md - Resources: generated/model-serving/resources_api.md + - Transformer: generated/model-serving/transformer_api.md # Added to allow navigation using the side drawer - Feature Store JavaDoc: https://docs.hopsworks.ai/feature-store-javadoc/latest/ - Contributing: CONTRIBUTING.md From 3b8fed0573ddb50c9e9f93afaa810ad2c6921157 Mon Sep 17 00:00:00 2001 From: manu-sj <152865565+manu-sj@users.noreply.github.com> Date: Wed, 6 Nov 2024 13:53:44 +0100 Subject: [PATCH 19/30] [HWORKS-1535] Infer model schema from feature view (#380) * [HWORKS-1535] Infer model schema from feature view * using feature view schema function to get model schema * removing training dataset since it is not being used anywhere * importing training dataset inside the init function to prevent circular dependencies * fixing unit tests --------- Co-authored-by: Alexandru Ormenisan --- python/hsml/model.py | 58 +++++++++++-------- python/hsml/utils/schema/columnar_schema.py | 13 ++--- python/tests/fixtures/model_fixtures.json | 8 --- python/tests/test_model.py | 1 - .../utils/schema/test_columnar_schema.py | 4 +- 5 files changed, 42 insertions(+), 42 deletions(-) diff --git a/python/hsml/model.py b/python/hsml/model.py index a545fa5ea..838d84f68 100644 --- a/python/hsml/model.py +++ b/python/hsml/model.py @@ -29,8 +29,10 @@ from hsml.engine import model_engine from hsml.inference_batcher import InferenceBatcher from hsml.inference_logger import InferenceLogger +from hsml.model_schema import ModelSchema from hsml.predictor import Predictor from hsml.resources import PredictorResources +from hsml.schema import Schema from hsml.transformer import Transformer @@ -54,7 +56,6 @@ def __init__( program=None, user_full_name=None, model_schema=None, - training_dataset=None, input_example=None, framework=None, model_registry_id=None, @@ -84,7 +85,6 @@ def __init__( self._input_example = input_example self._framework = framework self._model_schema = model_schema - self._training_dataset = training_dataset # This is needed for update_from_response_json function to not overwrite name of the shared registry this model originates from if not hasattr(self, "_shared_registry_project_name"): @@ -95,17 +95,6 @@ def __init__( self._model_engine = model_engine.ModelEngine() self._feature_view = feature_view self._training_dataset_version = training_dataset_version - if training_dataset_version is None and feature_view is not None: - if feature_view.get_last_accessed_training_dataset() is not None: - self._training_dataset_version = ( - feature_view.get_last_accessed_training_dataset() - ) - else: - warnings.warn( - "Provenance cached data - feature view provided, but training dataset version is missing", - util.ProvenanceWarning, - stacklevel=1, - ) @usage.method_logger def save( @@ -131,6 +120,39 @@ def save( # Returns `Model`: The model metadata object. 
""" + if self._training_dataset_version is None and self._feature_view is not None: + if self._feature_view.get_last_accessed_training_dataset() is not None: + self._training_dataset_version = ( + self._feature_view.get_last_accessed_training_dataset() + ) + else: + warnings.warn( + "Provenance cached data - feature view provided, but training dataset version is missing", + util.ProvenanceWarning, + stacklevel=1, + ) + if self._model_schema is None: + if ( + self._feature_view is not None + and self._training_dataset_version is not None + ): + all_features = self._feature_view.get_training_dataset_schema( + self._training_dataset_version + ) + features, labels = [], [] + for feature in all_features: + (labels if feature.label else features).append(feature.to_dict()) + self._model_schema = ModelSchema( + input_schema=Schema(features) if features else None, + output_schema=Schema(labels) if labels else None, + ) + else: + warnings.warn( + "Model schema cannot not be inferred without both the feature view and the training dataset version.", + util.ProvenanceWarning, + stacklevel=1, + ) + return self._model_engine.save( model_instance=self, model_path=model_path, @@ -375,7 +397,6 @@ def to_dict(self): "inputExample": self._input_example, "framework": self._framework, "metrics": self._training_metrics, - "trainingDataset": self._training_dataset, "environment": self._environment, "program": self._program, "featureView": util.feature_view_to_json(self._feature_view), @@ -510,15 +531,6 @@ def model_schema(self): def model_schema(self, model_schema): self._model_schema = model_schema - @property - def training_dataset(self): - """training_dataset of the model.""" - return self._training_dataset - - @training_dataset.setter - def training_dataset(self, training_dataset): - self._training_dataset = training_dataset - @property def project_name(self): """project_name of the model.""" diff --git a/python/hsml/utils/schema/columnar_schema.py b/python/hsml/utils/schema/columnar_schema.py index 3aa5fde0e..a7468401f 100644 --- a/python/hsml/utils/schema/columnar_schema.py +++ b/python/hsml/utils/schema/columnar_schema.py @@ -20,11 +20,6 @@ from hsml.utils.schema.column import Column -try: - import hsfs -except ImportError: - pass - try: import pyspark except ImportError: @@ -35,6 +30,10 @@ class ColumnarSchema: """Metadata object representing a columnar schema for a model.""" def __init__(self, columnar_obj=None): + from hsfs.training_dataset import ( + TrainingDataset, # import performed here to prevent circular dependencies when importing ModelSchema + ) + if isinstance(columnar_obj, list): self.columns = self._convert_list_to_schema(columnar_obj) elif isinstance(columnar_obj, pandas.DataFrame): @@ -45,9 +44,7 @@ def __init__(self, columnar_obj=None): columnar_obj, pyspark.sql.dataframe.DataFrame ): self.columns = self._convert_spark_to_schema(columnar_obj) - elif importlib.util.find_spec("hsfs") is not None and isinstance( - columnar_obj, hsfs.training_dataset.TrainingDataset - ): + elif isinstance(columnar_obj, TrainingDataset): self.columns = self._convert_td_to_schema(columnar_obj) else: raise TypeError( diff --git a/python/tests/fixtures/model_fixtures.json b/python/tests/fixtures/model_fixtures.json index cf44c3111..a937eab40 100644 --- a/python/tests/fixtures/model_fixtures.json +++ b/python/tests/fixtures/model_fixtures.json @@ -16,7 +16,6 @@ "program": "program", "user_full_name": "Full Name", "model_schema": "model_schema.json", - "training_dataset": "training_dataset", "input_example": 
"input_example.json", "model_registry_id": 1, "tags": [], @@ -42,7 +41,6 @@ "program": "program", "user_full_name": "Full Name", "model_schema": "model_schema.json", - "training_dataset": "training_dataset", "input_example": "input_example.json", "model_registry_id": 1, "tags": [], @@ -69,7 +67,6 @@ "program": "program", "user_full_name": "Full Name", "model_schema": "model_schema.json", - "training_dataset": "training_dataset", "input_example": "input_example.json", "model_registry_id": 1, "tags": [], @@ -96,7 +93,6 @@ "program": "program", "user_full_name": "Full Name", "model_schema": "model_schema.json", - "training_dataset": "training_dataset", "input_example": "input_example.json", "model_registry_id": 1, "tags": [], @@ -123,7 +119,6 @@ "program": "program", "user_full_name": "Full Name", "model_schema": "model_schema.json", - "training_dataset": "training_dataset", "input_example": "input_example.json", "model_registry_id": 1, "tags": [], @@ -150,7 +145,6 @@ "program": "program", "user_full_name": "Full Name", "model_schema": "model_schema.json", - "training_dataset": "training_dataset", "input_example": "input_example.json", "model_registry_id": 1, "tags": [], @@ -177,7 +171,6 @@ "program": "program", "user_full_name": "Full Name", "model_schema": "model_schema.json", - "training_dataset": "training_dataset", "input_example": "input_example.json", "model_registry_id": 1, "tags": [], @@ -197,7 +190,6 @@ "program": "program", "user_full_name": "Full Name", "model_schema": "model_schema.json", - "training_dataset": "training_dataset", "input_example": "input_example.json", "model_registry_id": 1, "tags": [], diff --git a/python/tests/test_model.py b/python/tests/test_model.py index b153b5742..841f3ce6e 100644 --- a/python/tests/test_model.py +++ b/python/tests/test_model.py @@ -372,7 +372,6 @@ def assert_model(self, mocker, m, m_json, model_framework): assert m.project_name == m_json["project_name"] assert m.training_metrics == m_json["metrics"] assert m._user_full_name == m_json["user_full_name"] - assert m.training_dataset == m_json["training_dataset"] assert m.model_registry_id == m_json["model_registry_id"] if model_framework is None: diff --git a/python/tests/utils/schema/test_columnar_schema.py b/python/tests/utils/schema/test_columnar_schema.py index c01c3c33d..6ddffea5d 100644 --- a/python/tests/utils/schema/test_columnar_schema.py +++ b/python/tests/utils/schema/test_columnar_schema.py @@ -57,7 +57,7 @@ def test_constructor_default(self, mocker): mock_convert_pandas_series_to_schema.assert_not_called() mock_convert_spark_to_schema.assert_not_called() mock_convert_td_to_schema.assert_not_called() - assert mock_find_spec.call_count == 2 + assert mock_find_spec.call_count == 1 def test_constructor_list(self, mocker): # Arrange @@ -257,7 +257,7 @@ def test_constructor_hsfs_td(self, mocker): mock_convert_pandas_series_to_schema.assert_not_called() mock_convert_spark_to_schema.assert_not_called() mock_convert_td_to_schema.assert_called_once_with(columnar_obj) - assert mock_find_spec.call_count == 2 + assert mock_find_spec.call_count == 1 # convert list to schema From 63dbad805e5774a368848836c4d01f802f3e6cef Mon Sep 17 00:00:00 2001 From: Aleksey Veresov Date: Fri, 8 Nov 2024 11:39:41 +0100 Subject: [PATCH 20/30] Sync inject-api-links with logicalclocks.github.io (#395) --- docs/js/inject-api-links.js | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/js/inject-api-links.js b/docs/js/inject-api-links.js index b6aca0755..89082c67d 100644 --- 
a/docs/js/inject-api-links.js +++ b/docs/js/inject-api-links.js @@ -1,16 +1,17 @@ window.addEventListener("DOMContentLoaded", function () { var windowPathNameSplits = window.location.pathname.split("/"); - var majorVersionRegex = new RegExp("(\\d+[.]\\d+)") + var majorVersionRegex = new RegExp("(\\d+[.]\\d+)"); var latestRegex = new RegExp("latest"); - if (majorVersionRegex.test(windowPathNameSplits[1])) { // On landing page docs.hopsworks.api/3.0 - URL contains major version + if (majorVersionRegex.test(windowPathNameSplits[1])) { // On landing page docs.hopsworks.api/4.0 - URL contains major version // Version API dropdown document.getElementById("hopsworks_api_link").href = "https://docs.hopsworks.ai/hopsworks-api/" + windowPathNameSplits[1] + "/generated/api/login/"; - } else { // on docs.hopsworks.api/feature-store-api/3.0 / docs.hopsworks.api/hopsworks-api/3.0 / docs.hopsworks.api/machine-learning-api/3.0 + document.getElementById("hsfs_javadoc_link").href = "https://docs.hopsworks.ai/hopsworks-api/" + windowPathNameSplits[1] + "/javadoc"; + } else { // on / docs.hopsworks.api/hopsworks-api/4.0 if (latestRegex.test(windowPathNameSplits[2]) || latestRegex.test(windowPathNameSplits[1])) { - var majorVersion = "latest"; + var majorVersion = "latest"; } else { - var apiVersion = windowPathNameSplits[2]; - var majorVersion = apiVersion.match(majorVersionRegex)[0]; + var apiVersion = windowPathNameSplits[2]; + var majorVersion = apiVersion.match(majorVersionRegex)[0]; } // Version main navigation document.getElementsByClassName("md-tabs__link")[0].href = "https://docs.hopsworks.ai/" + majorVersion; @@ -22,6 +23,6 @@ window.addEventListener("DOMContentLoaded", function () { document.getElementsByClassName("md-tabs__link")[6].href = "https://docs.hopsworks.ai/" + majorVersion + "/admin/"; // Version API dropdown document.getElementById("hopsworks_api_link").href = "https://docs.hopsworks.ai/hopsworks-api/" + majorVersion + "/generated/api/login/"; - document.getElementById("hsfs_javadoc_link").href = "https://docs.hopsworks.ai/feature-store-api/" + majorVersion + "/javadoc"; + document.getElementById("hsfs_javadoc_link").href = "https://docs.hopsworks.ai/hopsworks-api/" + majorVersion + "/javadoc"; } }); From bcc435bb501c40d8e23690b429b1620efb4ec0ff Mon Sep 17 00:00:00 2001 From: Ralf Date: Sun, 10 Nov 2024 06:11:18 +0200 Subject: [PATCH 21/30] [FSTORE-1591] Example on how to create a table with table space (#388) --- python/hsfs/feature_store.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py index f7ba9044d..4b45c9c77 100644 --- a/python/hsfs/feature_store.py +++ b/python/hsfs/feature_store.py @@ -556,7 +556,7 @@ def plus_two(value): online_enabled=True, event_time='date', transformation_functions=transformation_functions, - online_config={'online_comments': ['NDB_TABLE=READ_BACKUP=1']} + online_config={'table_space': 'ts_1', 'online_comments': ['NDB_TABLE=READ_BACKUP=1']} ) ``` @@ -721,7 +721,7 @@ def get_or_create_feature_group( online_enabled=True, event_time="timestamp", transformation_functions=transformation_functions, - online_config={'online_comments': ['NDB_TABLE=READ_BACKUP=1']} + online_config={'table_space': 'ts_1', 'online_comments': ['NDB_TABLE=READ_BACKUP=1']} ) ``` @@ -1023,7 +1023,7 @@ def create_external_feature_group( primary_key=['ss_store_sk'], event_time='sale_date', online_enabled=True, - online_config={'online_comments': ['NDB_TABLE=READ_BACKUP=1']} + online_config={'table_space': 
'ts_1', 'online_comments': ['NDB_TABLE=READ_BACKUP=1']} ) external_fg.save() From 0ed069fc6272f525e78d834338de406274ab6967 Mon Sep 17 00:00:00 2001 From: Alex Ormenisan Date: Sun, 10 Nov 2024 23:45:49 +0100 Subject: [PATCH 22/30] Model provenance - argument description (#377) --- python/hsml/llm/signature.py | 2 ++ python/hsml/python/signature.py | 2 ++ python/hsml/sklearn/signature.py | 2 ++ python/hsml/tensorflow/signature.py | 2 ++ python/hsml/torch/signature.py | 2 ++ 5 files changed, 10 insertions(+) diff --git a/python/hsml/llm/signature.py b/python/hsml/llm/signature.py index 9ac7db9ff..05ff003eb 100644 --- a/python/hsml/llm/signature.py +++ b/python/hsml/llm/signature.py @@ -56,6 +56,8 @@ def create_model( `""`. input_example: Optionally an input example that represents a single input for the model, defaults to `None`. model_schema: Optionally a model schema for the model inputs and/or outputs. + feature_view: Optionally a feature view object returned by querying the feature store. If the feature view is not provided, the model will not have access to provenance. + training_dataset_version: Optionally a training dataset version. If training dataset version is not provided, but the feature view is provided, the training dataset version used will be the last accessed training dataset of the feature view, within the code/notebook that reads the feature view and training dataset and then creates the model. # Returns `Model`. The model metadata object. diff --git a/python/hsml/python/signature.py b/python/hsml/python/signature.py index 1bb5fa8f7..fa704aaab 100644 --- a/python/hsml/python/signature.py +++ b/python/hsml/python/signature.py @@ -56,6 +56,8 @@ def create_model( `""`. input_example: Optionally an input example that represents a single input for the model, defaults to `None`. model_schema: Optionally a model schema for the model inputs and/or outputs. + feature_view: Optionally a feature view object returned by querying the feature store. If the feature view is not provided, the model will not have access to provenance. + training_dataset_version: Optionally a training dataset version. If training dataset version is not provided, but the feature view is provided, the training dataset version used will be the last accessed training dataset of the feature view, within the code/notebook that reads the feature view and training dataset and then creates the model. # Returns `Model`. The model metadata object. diff --git a/python/hsml/sklearn/signature.py b/python/hsml/sklearn/signature.py index f8816febb..4c145a96a 100644 --- a/python/hsml/sklearn/signature.py +++ b/python/hsml/sklearn/signature.py @@ -56,6 +56,8 @@ def create_model( `""`. input_example: Optionally an input example that represents a single input for the model, defaults to `None`. model_schema: Optionally a model schema for the model inputs and/or outputs. + feature_view: Optionally a feature view object returned by querying the feature store. If the feature view is not provided, the model will not have access to provenance. + training_dataset_version: Optionally a training dataset version. If training dataset version is not provided, but the feature view is provided, the training dataset version used will be the last accessed training dataset of the feature view, within the code/notebook that reads the feature view and training dataset and then creates the model. # Returns `Model`. The model metadata object. 
diff --git a/python/hsml/tensorflow/signature.py b/python/hsml/tensorflow/signature.py index 1f83c5496..e24d20e65 100644 --- a/python/hsml/tensorflow/signature.py +++ b/python/hsml/tensorflow/signature.py @@ -56,6 +56,8 @@ def create_model( `""`. input_example: Optionally an input example that represents a single input for the model, defaults to `None`. model_schema: Optionally a model schema for the model inputs and/or outputs. + feature_view: Optionally a feature view object returned by querying the feature store. If the feature view is not provided, the model will not have access to provenance. + training_dataset_version: Optionally a training dataset version. If training dataset version is not provided, but the feature view is provided, the training dataset version used will be the last accessed training dataset of the feature view, within the code/notebook that reads the feature view and training dataset and then creates the model. # Returns `Model`. The model metadata object. diff --git a/python/hsml/torch/signature.py b/python/hsml/torch/signature.py index 5234d110a..bab488974 100644 --- a/python/hsml/torch/signature.py +++ b/python/hsml/torch/signature.py @@ -56,6 +56,8 @@ def create_model( `""`. input_example: Optionally an input example that represents a single input for the model, defaults to `None`. model_schema: Optionally a model schema for the model inputs and/or outputs. + feature_view: Optionally a feature view object returned by querying the feature store. If the feature view is not provided, the model will not have access to provenance. + training_dataset_version: Optionally a training dataset version. If training dataset version is not provided, but the feature view is provided, the training dataset version used will be the last accessed training dataset of the feature view, within the code/notebook that reads the feature view and training dataset and then creates the model. # Returns `Model`. The model metadata object. 
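To make the provenance arguments documented above concrete, a hedged end-to-end sketch; the feature view name, model name, metrics, and versions are placeholders, and `mr` is the project's model registry handle:

```python
# Hedged sketch: names, versions and metrics below are illustrative, not values from this patch series.
fs = project.get_feature_store()
mr = project.get_model_registry()

feature_view = fs.get_feature_view("transactions_view", version=1)
X_train, X_test, y_train, y_test = feature_view.train_test_split(test_size=0.2)
# ... train a model on X_train / y_train ...

model = mr.python.create_model(
    name="fraud_detector",
    metrics={"accuracy": 0.92},
    feature_view=feature_view,    # links the model to its source features for provenance
    training_dataset_version=1,   # optional: defaults to the last accessed training dataset
)
model.save("model_dir")           # the model schema can now be inferred from the feature view
```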
From e4af9053219362bc058b64f8f4484b07efdd153f Mon Sep 17 00:00:00 2001 From: Robin Andersson Date: Tue, 12 Nov 2024 12:21:12 +0100 Subject: [PATCH 23/30] [HWORKS-1802] Exporting model on windows with directories in the model directory creates invalid upload path (#398) --- python/hsml/engine/model_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/hsml/engine/model_engine.py b/python/hsml/engine/model_engine.py index d2e8a85af..d1f21365f 100644 --- a/python/hsml/engine/model_engine.py +++ b/python/hsml/engine/model_engine.py @@ -213,7 +213,7 @@ def _upload_local_model( # we need to replace the local path prefix with the hdfs path prefix (i.e., /srv/hops/....../root with /Projects/.../) remote_base_path = root.replace( from_local_model_path, to_model_files_path - ) + ).replace(os.sep, "/") for d_name in dirs: self._engine.mkdir(remote_base_path + "/" + d_name) n_dirs += 1 From 243c5ef26f5cf05ae297639e523305eb35cf974c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20de=20la=20R=C3=BAa=20Mart=C3=ADnez?= Date: Thu, 14 Nov 2024 10:36:29 -0500 Subject: [PATCH 24/30] [HWORKS-1805] Increase timeouts for starting/stopping deployments (#399) --- python/hsml/deployment.py | 6 +++--- python/tests/test_deployment.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/python/hsml/deployment.py b/python/hsml/deployment.py index f6c064759..f23eec2ed 100644 --- a/python/hsml/deployment.py +++ b/python/hsml/deployment.py @@ -67,7 +67,7 @@ def __init__( self._model_registry_id = None @usage.method_logger - def save(self, await_update: Optional[int] = 60): + def save(self, await_update: Optional[int] = 120): """Persist this deployment including the predictor and metadata to Model Serving. # Arguments @@ -79,7 +79,7 @@ def save(self, await_update: Optional[int] = 60): self._serving_engine.save(self, await_update) @usage.method_logger - def start(self, await_running: Optional[int] = 60): + def start(self, await_running: Optional[int] = 120): """Start the deployment # Arguments @@ -91,7 +91,7 @@ def start(self, await_running: Optional[int] = 60): self._serving_engine.start(self, await_status=await_running) @usage.method_logger - def stop(self, await_stopped: Optional[int] = 60): + def stop(self, await_stopped: Optional[int] = 120): """Stop the deployment # Arguments diff --git a/python/tests/test_deployment.py b/python/tests/test_deployment.py index 63e791126..88cffe9fb 100644 --- a/python/tests/test_deployment.py +++ b/python/tests/test_deployment.py @@ -145,7 +145,7 @@ def test_save_default(self, mocker, backend_fixtures): d.save() # Assert - mock_serving_engine_save.assert_called_once_with(d, 60) + mock_serving_engine_save.assert_called_once_with(d, 120) def test_save(self, mocker, backend_fixtures): # Arrange @@ -176,7 +176,7 @@ def test_start_default(self, mocker, backend_fixtures): d.start() # Assert - mock_serving_engine_start.assert_called_once_with(d, await_status=60) + mock_serving_engine_start.assert_called_once_with(d, await_status=120) def test_start(self, mocker, backend_fixtures): # Arrange @@ -207,7 +207,7 @@ def test_stop_default(self, mocker, backend_fixtures): d.stop() # Assert - mock_serving_engine_stop.assert_called_once_with(d, await_status=60) + mock_serving_engine_stop.assert_called_once_with(d, await_status=120) def test_stop(self, mocker, backend_fixtures): # Arrange From 53961cd2019962a5b1b8d2b07a25a8109ab66ac7 Mon Sep 17 00:00:00 2001 From: Fabio Buso Date: Thu, 14 Nov 2024 14:53:35 -0800 Subject: [PATCH 25/30] [FSTORE-1604] Add 
option to avoid setting s3a global options (#401) --- python/hsfs/engine/spark.py | 8 +- python/tests/engine/test_spark.py | 136 +++++++++++++++++++++++++++--- 2 files changed, 130 insertions(+), 14 deletions(-) diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index 0c68226d5..67e15468b 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -1277,8 +1277,12 @@ def setup_storage_connector(self, storage_connector, path=None): return path def _setup_s3_hadoop_conf(self, storage_connector, path): - # For legacy behaviour set the S3 values at global level - self._set_s3_hadoop_conf(storage_connector, "fs.s3a") + FS_S3_GLOBAL_CONF = "fs.s3a.global-conf" + + # The argument arrive here as strings + if storage_connector.arguments.get(FS_S3_GLOBAL_CONF, "True").lower() == "true": + # For legacy behaviour set the S3 values at global level + self._set_s3_hadoop_conf(storage_connector, "fs.s3a") # Set credentials at bucket level as well to allow users to use multiple # storage connector in the same application. diff --git a/python/tests/engine/test_spark.py b/python/tests/engine/test_spark.py index 05bb33180..da3449270 100644 --- a/python/tests/engine/test_spark.py +++ b/python/tests/engine/test_spark.py @@ -15,6 +15,8 @@ # from __future__ import annotations +from unittest.mock import call + import hopsworks_common import numpy import pandas as pd @@ -203,7 +205,9 @@ def test_register_hudi_temporary_table(self, mocker): # Arrange mock_hudi_engine = mocker.patch("hsfs.core.hudi_engine.HudiEngine") mocker.patch("hsfs.feature_group.FeatureGroup.from_response_json") - mock_reconcile_schema = mocker.patch("hsfs.engine.spark.Engine.reconcile_schema") + mock_reconcile_schema = mocker.patch( + "hsfs.engine.spark.Engine.reconcile_schema" + ) spark_engine = spark.Engine() @@ -227,7 +231,9 @@ def test_register_delta_temporary_table(self, mocker): # Arrange mock_delta_engine = mocker.patch("hsfs.core.delta_engine.DeltaEngine") mocker.patch("hsfs.feature_group.FeatureGroup.from_response_json") - mock_reconcile_schema = mocker.patch("hsfs.engine.spark.Engine.reconcile_schema") + mock_reconcile_schema = mocker.patch( + "hsfs.engine.spark.Engine.reconcile_schema" + ) spark_engine = spark.Engine() @@ -1562,13 +1568,13 @@ def test_serialize_to_avro(self, mocker): # Arrange spark_engine = spark.Engine() - mock_to_avro = mocker.patch('hsfs.engine.spark.to_avro') - mock_to_avro.return_value = lit(b'111') + mock_to_avro = mocker.patch("hsfs.engine.spark.to_avro") + mock_to_avro.return_value = lit(b"111") fg_data = [] fg_data.append(("ekarson", ["GRAVITY RUSH 2", "KING'S QUEST"])) fg_data.append(("ratmilkdrinker", ["NBA 2K", "CALL OF DUTY"])) - pandas_df = pd.DataFrame(fg_data, columns =["account_id", "last_played_games"]) + pandas_df = pd.DataFrame(fg_data, columns=["account_id", "last_played_games"]) df = spark_engine._spark_session.createDataFrame(pandas_df) @@ -1587,10 +1593,10 @@ def test_serialize_to_avro(self, mocker): features=features, ) fg._subject = { - 'id': 1025, - 'subject': 'fg_1', - 'version': 1, - 'schema': '{"type":"record","name":"fg_1","namespace":"test_featurestore.db","fields":[{"name":"account_id","type":["null","string"]},{"name":"last_played_games","type":["null",{"type":"array","items":["null","string"]}]}]}' + "id": 1025, + "subject": "fg_1", + "version": 1, + "schema": 
'{"type":"record","name":"fg_1","namespace":"test_featurestore.db","fields":[{"name":"account_id","type":["null","string"]},{"name":"last_played_games","type":["null",{"type":"array","items":["null","string"]}]}]}', } # Act @@ -1600,9 +1606,12 @@ def test_serialize_to_avro(self, mocker): ) # Assert - assert serialized_df.schema.json() == '{"fields":[{"metadata":{},"name":"key","nullable":false,"type":"binary"},{"metadata":{},"name":"value","nullable":false,"type":"binary"}],"type":"struct"}' + assert ( + serialized_df.schema.json() + == '{"fields":[{"metadata":{},"name":"key","nullable":false,"type":"binary"},{"metadata":{},"name":"value","nullable":false,"type":"binary"}],"type":"struct"}' + ) - ''' Need spark to run these tests properly + """ Need spark to run these tests properly def test_deserialize_from_avro(self, mocker): # Arrange spark_engine = spark.Engine() @@ -1695,7 +1704,7 @@ def test_serialize_deserialize_avro(self, mocker): assert serialized_df.schema.json() == '{"fields":[{"metadata":{},"name":"key","nullable":false,"type":"binary"},{"metadata":{},"name":"value","nullable":false,"type":"binary"}],"type":"struct"}' assert df.schema == deserialized_df.schema assert df.collect() == deserialized_df.collect() - ''' + """ def test_get_training_data(self, mocker): # Arrange @@ -4265,6 +4274,109 @@ def test_setup_s3_hadoop_conf_legacy(self, mocker): "fs.s3a.endpoint", s3_connector.arguments.get("fs.s3a.endpoint") ) + def test_setup_s3_hadoop_conf_disable_legacy(self, mocker): + # Arrange + mock_pyspark_getOrCreate = mocker.patch( + "pyspark.sql.session.SparkSession.builder.getOrCreate" + ) + + spark_engine = spark.Engine() + + s3_connector = storage_connector.S3Connector( + id=1, + name="test_connector", + featurestore_id=99, + bucket="bucket-name", + access_key="1", + secret_key="2", + server_encryption_algorithm="3", + server_encryption_key="4", + session_token="5", + arguments=[ + {"name": "fs.s3a.endpoint", "value": "testEndpoint"}, + {"name": "fs.s3a.global-conf", "value": "False"}, + ], + ) + + # Act + result = spark_engine._setup_s3_hadoop_conf( + storage_connector=s3_connector, + path="s3://_test_path", + ) + + # Assert + assert result == "s3a://_test_path" + assert ( + mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.call_count + == 7 # Options should only be set at bucket level + ) + assert ( + call("fs.s3a.access.key", s3_connector.access_key) + not in mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.mock_calls + ) + assert ( + call("fs.s3a.secret.key", s3_connector.secret_key) + not in mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.mock_calls + ) + assert ( + call( + "fs.s3a.server-side-encryption-algorithm", + s3_connector.server_encryption_algorithm, + ) + not in mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.mock_calls + ) + + assert ( + call( + "fs.s3a.server-side-encryption-key", s3_connector.server_encryption_key + ) + not in mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.mock_calls + ) + + assert ( + call( + "fs.s3a.aws.credentials.provider", + "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider", + ) + not in mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.mock_calls + ) + + assert ( + call("fs.s3a.session.token", s3_connector.session_token) + not in 
mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.mock_calls + ) + + assert ( + call("fs.s3a.endpoint", s3_connector.arguments.get("fs.s3a.endpoint")) + not in mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.mock_calls + ) + + mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( + "fs.s3a.bucket.bucket-name.access.key", s3_connector.access_key + ) + mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( + "fs.s3a.bucket.bucket-name.secret.key", s3_connector.secret_key + ) + mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( + "fs.s3a.bucket.bucket-name.server-side-encryption-algorithm", + s3_connector.server_encryption_algorithm, + ) + mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( + "fs.s3a.bucket.bucket-name.server-side-encryption-key", + s3_connector.server_encryption_key, + ) + mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( + "fs.s3a.bucket.bucket-name.aws.credentials.provider", + "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider", + ) + mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( + "fs.s3a.bucket.bucket-name.session.token", s3_connector.session_token + ) + mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( + "fs.s3a.bucket.bucket-name.endpoint", + s3_connector.arguments.get("fs.s3a.endpoint"), + ) + def test_setup_s3_hadoop_conf_bucket_scope(self, mocker): # Arrange mock_pyspark_getOrCreate = mocker.patch( From ce5cf9fe4c014129f83e160601840350cb441492 Mon Sep 17 00:00:00 2001 From: Robin Andersson Date: Tue, 19 Nov 2024 15:53:48 +0100 Subject: [PATCH 26/30] [HWORKS-1442] Extend the dataset upload method to be able to upload a directory (#402) --- python/hopsworks_common/core/dataset_api.py | 89 ++++++++++++++++----- 1 file changed, 71 insertions(+), 18 deletions(-) diff --git a/python/hopsworks_common/core/dataset_api.py b/python/hopsworks_common/core/dataset_api.py index 4c9672cc8..8dac9a421 100644 --- a/python/hopsworks_common/core/dataset_api.py +++ b/python/hopsworks_common/core/dataset_api.py @@ -145,10 +145,11 @@ def upload( overwrite: bool = False, chunk_size=DEFAULT_FLOW_CHUNK_SIZE, simultaneous_uploads=3, + simultaneous_chunks=3, max_chunk_retries=1, chunk_retry_interval=1, ): - """Upload a file to the Hopsworks filesystem. + """Upload a file or directory to the Hopsworks filesystem. ```python @@ -158,44 +159,93 @@ def upload( dataset_api = project.get_dataset_api() + # upload a file to Resources dataset uploaded_file_path = dataset_api.upload("my_local_file.txt", "Resources") + # upload a directory to Resources dataset + uploaded_file_path = dataset_api.upload("my_dir", "Resources") + ``` # Arguments - local_path: local path to file to upload + local_path: local path to file or directory to upload, can be relative or absolute upload_path: path to directory where to upload the file in Hopsworks Filesystem - overwrite: overwrite file if exists + overwrite: overwrite file or directory if exists chunk_size: upload chunk size in bytes. Default 1048576 bytes - simultaneous_uploads: number of simultaneous chunks to upload. 
Default 3 + simultaneous_chunks: number of simultaneous chunks to upload for each file upload. Default 3 + simultaneous_uploads: number of simultaneous files to be uploaded for directories. Default 3 max_chunk_retries: maximum retry for a chunk. Default is 1 chunk_retry_interval: chunk retry interval in seconds. Default is 1sec # Returns - `str`: Path to uploaded file + `str`: Path to uploaded file or directory # Raises - `RestAPIError`: If unable to upload the file + `RestAPIError`: If unable to upload the file or directory """ + # local path could be absolute or relative, if not os.path.isabs(local_path) and os.path.exists( os.path.join(os.getcwd(), local_path) ): local_path = os.path.join(os.getcwd(), local_path) - file_size = os.path.getsize(local_path) - _, file_name = os.path.split(local_path) destination_path = upload_path + "/" + file_name if self.exists(destination_path): if overwrite: - self.remove(destination_path) + if 'datasetType' in self._get(destination_path): + raise DatasetException("overwrite=True not supported on a top-level dataset") + else: + self.remove(destination_path) else: raise DatasetException( "{} already exists, set overwrite=True to overwrite it".format( - local_path + destination_path ) ) + if os.path.isdir(local_path): + self.mkdir(destination_path) + + if os.path.isdir(local_path): + with ThreadPoolExecutor(simultaneous_uploads) as executor: + # if path is a dir, upload files and folders iteratively + for root, dirs, files in os.walk(local_path): + # os.walk(local_model_path), where local_model_path is expected to be an absolute path + # - root is the absolute path of the directory being walked + # - dirs is the list of directory names present in the root dir + # - files is the list of file names present in the root dir + # we need to replace the local path prefix with the hdfs path prefix (i.e., /srv/hops/....../root with /Projects/.../) + remote_base_path = root.replace( + local_path, destination_path + ).replace(os.sep, "/") + for d_name in dirs: + self.mkdir(remote_base_path + "/" + d_name) + + # uploading files in the same folder is done concurrently + futures = [ + executor.submit( + self._upload_file, f_name, root + os.sep + f_name, remote_base_path, chunk_size, simultaneous_chunks, max_chunk_retries, chunk_retry_interval + ) + for f_name in files + ] + + # wait for all upload tasks to complete + _, _ = wait(futures) + try: + _ = [future.result() for future in futures] + except Exception as e: + raise e + else: + self._upload_file(file_name, local_path, upload_path, chunk_size, simultaneous_chunks, max_chunk_retries, chunk_retry_interval) + + return upload_path + "/" + os.path.basename(local_path) + + + def _upload_file(self, file_name, local_path, upload_path, chunk_size, simultaneous_chunks, max_chunk_retries, chunk_retry_interval): + + file_size = os.path.getsize(local_path) + num_chunks = math.ceil(file_size / chunk_size) base_params = self._get_flow_base_params( @@ -209,15 +259,15 @@ def upload( pbar = tqdm( total=file_size, bar_format="{desc}: {percentage:.3f}%|{bar}| {n_fmt}/{total_fmt} elapsed<{elapsed} remaining<{remaining}", - desc="Uploading", + desc="Uploading {}".format(local_path), ) except Exception: self._log.exception("Failed to initialize progress bar.") self._log.info("Starting upload") - with ThreadPoolExecutor(simultaneous_uploads) as executor: + with ThreadPoolExecutor(simultaneous_chunks) as executor: while True: chunks = [] - for _ in range(simultaneous_uploads): + for _ in range(simultaneous_chunks): chunk = 
f.read(chunk_size) if not chunk: break @@ -255,8 +305,6 @@ def upload( else: self._log.info("Upload finished") - return upload_path + "/" + os.path.basename(local_path) - def _upload_chunk( self, base_params, @@ -415,7 +463,10 @@ def copy(self, source_path: str, destination_path: str, overwrite: bool = False) """ if self.exists(destination_path): if overwrite: - self.remove(destination_path) + if 'datasetType' in self._get(destination_path): + raise DatasetException("overwrite=True not supported on a top-level dataset") + else: + self.remove(destination_path) else: raise DatasetException( "{} already exists, set overwrite=True to overwrite it".format( @@ -453,10 +504,12 @@ def move(self, source_path: str, destination_path: str, overwrite: bool = False) # Raises `RestAPIError`: If unable to perform the move """ - if self.exists(destination_path): if overwrite: - self.remove(destination_path) + if 'datasetType' in self._get(destination_path): + raise DatasetException("overwrite=True not supported on a top-level dataset") + else: + self.remove(destination_path) else: raise DatasetException( "{} already exists, set overwrite=True to overwrite it".format( From ef383bce465ce9adf6457724865a3d81387d536c Mon Sep 17 00:00:00 2001 From: manu-sj <152865565+manu-sj@users.noreply.github.com> Date: Wed, 20 Nov 2024 15:57:39 +0530 Subject: [PATCH 27/30] =?UTF-8?q?[FSTORE-1606]=20Allow=20=C2=B4entries?= =?UTF-8?q?=C2=B4=20to=20be=20None=20while=20retrieving=20feature=20vector?= =?UTF-8?q?s=20from=20a=20feature=20view=20with=20only=20on-demand=20featu?= =?UTF-8?q?res=20(#405)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * allow entries to be None when feature view contains only on-demand features * adddressing review comments --- python/hsfs/core/vector_server.py | 38 +++++++++++++++++++++++++++++-- python/hsfs/feature_view.py | 4 ++-- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py index 277b25051..d354a5400 100755 --- a/python/hsfs/core/vector_server.py +++ b/python/hsfs/core/vector_server.py @@ -149,6 +149,7 @@ def __init__( self._feature_to_handle_if_sql: Optional[Set[str]] = None self._valid_serving_keys: Set[str] = set() self._serving_initialized: bool = False + self.__all_features_on_demand: Optional[bool] = None def init_serving( self, @@ -415,14 +416,23 @@ def get_feature_vectors( request_parameters is None or len(request_parameters) == 0 or isinstance(request_parameters, dict) + or not entries or len(request_parameters) == len(entries) - ), "Request Parameters should be a Dictionary, None, empty or have the same length as the entries" + ), "Request Parameters should be a Dictionary, None, empty or have the same length as the entries if they are not None or empty." 
online_client_choice = self.which_client_and_ensure_initialised( force_rest_client=force_rest_client, force_sql_client=force_sql_client ) rondb_entries = [] skipped_empty_entries = [] + + if not entries: + entries = ( + [[] * len(request_parameters)] + if isinstance(request_parameters, list) + else [[]] + ) + for (idx, entry), passed, vector_features in itertools.zip_longest( enumerate(entries), passed_features, @@ -547,7 +557,11 @@ def assemble_feature_vector( # for backward compatibility, before 3.4, if result is empty, # instead of throwing error, it skips the result # Maybe we drop this behaviour for 4.0 - if len(result_dict) == 0 and not allow_missing: + if ( + len(result_dict) == 0 + and not allow_missing + and not self._all_features_on_demand + ): return None if not allow_missing and len(missing_features) > 0: @@ -1255,6 +1269,17 @@ def validate_entry( Keys relevant to vector_db are filtered out. """ + _logger.debug( + "Checking if entry is None and all features in the feature view are on-demand." + ) + if not entry: + if self._all_features_on_demand: + return {} + else: + raise exceptions.FeatureStoreException( + "The required argument `entries` is missing. If the feature view includes only on-demand features, entries may be left empty or set to None." + ) + _logger.debug("Checking keys in entry are valid serving keys.") for key in entry.keys(): if key not in self.valid_serving_keys: @@ -1584,3 +1609,12 @@ def transformed_feature_vector_col_name(self): ] self._transformed_feature_vector_col_name.extend(output_column_names) return self._transformed_feature_vector_col_name + + @property + def _all_features_on_demand(self) -> bool: + """True if all features in the feature view are on-demand.""" + if self.__all_features_on_demand is None: + self.__all_features_on_demand = all( + feature.on_demand_transformation_function for feature in self._features + ) + return self.__all_features_on_demand diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index 5d3151b18..b61b3e09a 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -520,7 +520,7 @@ def get_batch_query( def get_feature_vector( self, - entry: Dict[str, Any], + entry: Optional[Dict[str, Any]] = None, passed_features: Optional[Dict[str, Any]] = None, external: Optional[bool] = None, return_type: Literal["list", "polars", "numpy", "pandas"] = "list", @@ -635,7 +635,7 @@ def get_feature_vector( def get_feature_vectors( self, - entry: List[Dict[str, Any]], + entry: Optional[List[Dict[str, Any]]] = None, passed_features: Optional[List[Dict[str, Any]]] = None, external: Optional[bool] = None, return_type: Literal["list", "polars", "numpy", "pandas"] = "list", From aca9a602cad3ec0b98b77f9c037ec7c3d7c04757 Mon Sep 17 00:00:00 2001 From: Ralf Date: Thu, 21 Nov 2024 11:07:10 +0200 Subject: [PATCH 28/30] [FSTORE-1615] Inserting into streaming FG fails after cluster upgrade (#407) --- python/hsfs/engine/python.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index c0218e847..eeacf8e27 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -1510,7 +1510,7 @@ def _write_dataframe_kafka( topic_name=feature_group._online_topic_name, feature_store_id=feature_group.feature_store_id, offline_write_options=offline_write_options, - high=True, + high=False, ) now = datetime.now(timezone.utc) feature_group.materialization_job.run( From 96a67fec7394f6d13864abd42bf2650332d98341 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Javier=20de=20la=20R=C3=BAa=20Mart=C3=ADnez?= Date: Thu, 21 Nov 2024 17:30:40 +0100 Subject: [PATCH 29/30] [HWORKS-1822] Infer valid serving name based on the model name --- python/hsml/model.py | 6 +++++- python/hsml/model_serving.py | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/python/hsml/model.py b/python/hsml/model.py index 838d84f68..2dfa7d7ab 100644 --- a/python/hsml/model.py +++ b/python/hsml/model.py @@ -17,6 +17,7 @@ import json import logging import os +import re import warnings from typing import Any, Dict, Optional, Union @@ -236,7 +237,7 @@ def deploy( """ if name is None: - name = self._name + name = self._get_default_serving_name() predictor = Predictor.for_model( self, @@ -366,6 +367,9 @@ def get_training_dataset_provenance(self): """ return self._model_engine.get_training_dataset_provenance(model_instance=self) + def _get_default_serving_name(self): + return re.sub(r"[^a-zA-Z0-9]", "", self._name) + @classmethod def from_response_json(cls, json_dict): json_decamelized = humps.decamelize(json_dict) diff --git a/python/hsml/model_serving.py b/python/hsml/model_serving.py index d448193aa..9b3c34ba5 100644 --- a/python/hsml/model_serving.py +++ b/python/hsml/model_serving.py @@ -126,7 +126,7 @@ def get_deployments(self, model: Model = None, status: str = None): `RestAPIError`: If unable to retrieve deployments from model serving. """ - model_name = model.name if model is not None else None + model_name = model._get_default_serving_name() if model is not None else None if status is not None: self._validate_deployment_status(status) @@ -208,7 +208,7 @@ def create_predictor( """ if name is None: - name = model.name + name = model._get_default_serving_name() return Predictor.for_model( model, From baf922f73b1925c1eab7e5a16c8ec8d326cada67 Mon Sep 17 00:00:00 2001 From: Javier Cabrera Date: Tue, 26 Nov 2024 10:08:14 +0100 Subject: [PATCH 30/30] Create fv in master code (#411) --- locust_benchmark/create_feature_group.py | 1 + 1 file changed, 1 insertion(+) diff --git a/locust_benchmark/create_feature_group.py b/locust_benchmark/create_feature_group.py index 2ac6cf568..dbc237e27 100644 --- a/locust_benchmark/create_feature_group.py +++ b/locust_benchmark/create_feature_group.py @@ -4,4 +4,5 @@ hopsworks_client = HopsworksClient() fg = hopsworks_client.get_or_create_fg() hopsworks_client.insert_data(fg) + hopsworks_client.get_or_create_fv() hopsworks_client.close()
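
Editor's illustration for PATCH 26 (directory upload): the least obvious line in the new `dataset_api.upload` directory branch is the re-rooting of each `os.walk` directory onto the Hopsworks destination. The snippet below is a standalone sketch of that single expression, copied from the patch; the paths (`/home/user/my_dir`, `/Projects/demo/Resources/my_dir`) are made-up examples, not values produced by the library.

    import os

    # Sketch of the path mapping used in the new directory upload:
    # every directory yielded by os.walk is re-rooted from the local
    # prefix onto the remote destination, with os-specific separators
    # normalized to "/".
    local_path = "/home/user/my_dir"                       # assumed absolute local dir
    destination_path = "/Projects/demo/Resources/my_dir"   # hypothetical remote target
    root = "/home/user/my_dir/sub/images"                  # one dir visited by os.walk
    remote_base_path = root.replace(local_path, destination_path).replace(os.sep, "/")
    print(remote_base_path)  # /Projects/demo/Resources/my_dir/sub/images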
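
Editor's illustration for PATCH 29 (inferred serving name): the patch derives a default deployment name by stripping every non-alphanumeric character from the model name. Below is a minimal, self-contained sketch of that behaviour; the regex is taken verbatim from `Model._get_default_serving_name`, while the helper name `default_serving_name` and the example model name are only illustrative.

    import re

    def default_serving_name(model_name: str) -> str:
        # Mirrors the regex in Model._get_default_serving_name: keep only
        # letters and digits so the serving name derived from the model
        # name stays valid.
        return re.sub(r"[^a-zA-Z0-9]", "", model_name)

    print(default_serving_name("fraud_detection-model_v2"))  # frauddetectionmodelv2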