Merge branch 'main' into sync-inject-api-links

logicalclocks · Nov 5, 2024 · c0e5885 · c0e5885
2 parents 280bd39 + b828b07
commit c0e5885
Show file tree

Hide file tree

Showing 16 changed files with 117 additions and 107 deletions.
diff --git a/locust_benchmark/Dockerfile b/locust_benchmark/Dockerfile
@@ -1,4 +1,4 @@
-FROM locustio/locust:2.17.0
+FROM locustio/locust:2.23.1
 
 USER root
 

diff --git a/locust_benchmark/Jenkinsfile b/locust_benchmark/Jenkinsfile
@@ -0,0 +1,20 @@
+@Library("jenkins-library@main")
+
+import com.logicalclocks.jenkins.k8s.ImageBuilder
+
+
+node("local") {
+    stage('Clone repository') {
+      checkout scm
+    }
+
+    stage('Build and push image(s)') {
+    version = readFile "${env.WORKSPACE}/locust_benchmark/KUBE_IMAGE_VERSION"
+      withEnv(["VERSION=${version.trim()}"]) {
+
+        def builder = new ImageBuilder(this)
+        m = readFile "${env.WORKSPACE}/locust_benchmark/build-manifest.json"
+        builder.run(m)
+     }
+    }
+}
diff --git a/locust_benchmark/KUBE_IMAGE_VERSION b/locust_benchmark/KUBE_IMAGE_VERSION
@@ -0,0 +1 @@
+master
diff --git a/locust_benchmark/README.md b/locust_benchmark/README.md
@@ -87,6 +87,7 @@ echo "[YOUR KEY]" > .api_key
 - `schema_repetitions`: This controls the number of features for the lookup. One schema repetition will result in 10 features plus primary key. Five repetitions will result in 50 features plus primary key.
 - `recreate_feature_group`: This controls if the previous feature group should be dropped and recreated. Set this to true when rerunning the benchmark with different size of rows or schema repetitions.
 - `batch_size`: This is relevant for the actual benchmark and controls how many feature vectors are looked up in the batch benchmark.
+- `tablespace`: (Optional) If set creates a feature group using on-disk data.
 
 3. Create the feature group
 

diff --git a/locust_benchmark/build-manifest.json b/locust_benchmark/build-manifest.json
@@ -0,0 +1,8 @@
+[
+    {
+        "name": "hopsworks/locust-hsfs",
+        "version": "env:VERSION",
+        "dockerFile": "locust_benchmark/Dockerfile",
+        "canUseCache": "true"
+    }
+]
diff --git a/locust_benchmark/common/hopsworks_client.py b/locust_benchmark/common/hopsworks_client.py
@@ -7,10 +7,8 @@
 import pandas as pd
 
 from locust.runners import MasterRunner, LocalRunner
-import hsfs
 
-from hsfs import client
-from hsfs.client.exceptions import RestAPIError
+import hopsworks
 
 
 class HopsworksClient:
@@ -21,14 +19,14 @@ def __init__(self, environment=None):
             environment.runner, (MasterRunner, LocalRunner)
         ):
             print(self.hopsworks_config)
-        self.connection = hsfs.connection(
+        self.project = hopsworks.login(
             project=self.hopsworks_config.get("project", "test"),
             host=self.hopsworks_config.get("host", "localhost"),
             port=self.hopsworks_config.get("port", 443),
             api_key_file=".api_key",
             engine="python",
         )
-        self.fs = self.connection.get_feature_store()
+        self.fs = self.project.get_feature_store()
 
         # test settings
         self.external = self.hopsworks_config.get("external", False)
@@ -38,6 +36,7 @@ def __init__(self, environment=None):
             "recreate_feature_group", False
         )
         self.batch_size = self.hopsworks_config.get("batch_size", 100)
+        self.tablespace = self.hopsworks_config.get("tablespace", None)
 
     def get_or_create_fg(self):
         locust_fg = self.fs.get_or_create_feature_group(
@@ -46,6 +45,7 @@ def get_or_create_fg(self):
             primary_key=["ip"],
             online_enabled=True,
             stream=True,
+            online_config={'table_space': self.tablespace} if self.tablespace else None
         )
         return locust_fg
 
@@ -59,18 +59,15 @@ def insert_data(self, locust_fg):
         return locust_fg
 
     def get_or_create_fv(self, fg=None):
-        try:
-            return self.fs.get_feature_view("locust_fv", version=1)
-        except RestAPIError:
-            return self.fs.create_feature_view(
-                name="locust_fv",
-                query=fg.select_all(),
-                version=1,
-            )
+        if fg is None:
+            fg = self.get_or_create_fg()
+        return self.fs.get_or_create_feature_view(
+            name="locust_fv", version=1, query=fg.select_all()
+        )
 
     def close(self):
-        if client._client is not None:
-            self.connection.close()
+        if self.project is not None:
+            hopsworks.logout()
 
     def generate_insert_df(self, rows, schema_repetitions):
         data = {"ip": range(0, rows)}

diff --git a/locust_benchmark/hopsworks_config.json b/locust_benchmark/hopsworks_config.json
@@ -1,10 +1,11 @@
 {
-    "host": "localhost",
+    "host": "mercury.hops.works",
     "port": 443,
-    "project": "test",
+    "project": "fabio_demo",
     "external": true,
-    "rows": 100000,
+    "rows": 1000,
     "schema_repetitions": 1,
     "recreate_feature_group": true,
-    "batch_size": 100
+    "batch_size": 100,
+    "tablespace": "ts1"
 }
diff --git a/locust_benchmark/locustfile.py b/locust_benchmark/locustfile.py
@@ -3,20 +3,16 @@
 from common.hopsworks_client import HopsworksClient
 from common.stop_watch import stopwatch
 from locust import HttpUser, User, task, constant, events
-from locust.runners import MasterRunner, LocalRunner
+from locust.runners import MasterRunner
 from urllib3 import PoolManager
 import nest_asyncio
 
 
 @events.init.add_listener
 def on_locust_init(environment, **kwargs):
     print("Locust process init")
-
-    if isinstance(environment.runner, (MasterRunner, LocalRunner)):
-        # create feature view
-        environment.hopsworks_client = HopsworksClient(environment)
-        fg = environment.hopsworks_client.get_or_create_fg()
-        environment.hopsworks_client.get_or_create_fv(fg)
+    environment.hopsworks_client = HopsworksClient(environment)
+    environment.hopsworks_client.get_or_create_fg()
 
 
 @events.quitting.add_listener
@@ -61,59 +57,48 @@ def get_feature_vector(self):
 
 
 class MySQLFeatureVectorLookup(User):
-    wait_time = constant(0)
-    weight = 5
-    # fixed_count = 1
+    wait_time = constant(0.001)
+    weight = 2
 
     def __init__(self, environment):
         super().__init__(environment)
-        self.env = environment
-        self.client = HopsworksClient(environment)
-        self.fv = self.client.get_or_create_fv()
+        self.client = environment.hopsworks_client
 
     def on_start(self):
-        print("Init user")
+        self.fv = self.client.get_or_create_fv()
         self.fv.init_serving(external=self.client.external)
         nest_asyncio.apply()
 
-    def on_stop(self):
-        print("Closing user")
-
     @task
     def get_feature_vector(self):
-        self._get_feature_vector({"ip": random.randint(0, self.client.rows - 1)})
+        return self._get_feature_vector({"ip": random.randint(0, self.client.rows - 1)})
 
     @stopwatch
     def _get_feature_vector(self, pk):
         self.fv.get_feature_vector(pk)
 
 
 class MySQLFeatureVectorBatchLookup(User):
-    wait_time = constant(0)
+    wait_time = constant(0.001)
     weight = 1
-    # fixed_count = 1
 
     def __init__(self, environment):
         super().__init__(environment)
-        self.env = environment
-        self.client = HopsworksClient(environment)
+        self.client = environment.hopsworks_client
         self.fv = self.client.get_or_create_fv()
 
     def on_start(self):
         print("Init user")
         self.fv.init_serving(external=self.client.external)
         nest_asyncio.apply()
 
-    def on_stop(self):
-        print("Closing user")
-
     @task
     def get_feature_vector_batch(self):
         pks = [
             {"ip": random.randint(0, self.client.rows - 1)}
             for i in range(self.client.batch_size)
         ]
-        self._get_feature_vectors(pks)
+        return self._get_feature_vectors(pks)
 
     @stopwatch
     def _get_feature_vectors(self, pk):

diff --git a/locust_benchmark/requirements.txt b/locust_benchmark/requirements.txt
@@ -1,3 +1,4 @@
 markupsafe==2.0.1
-locust==2.17.0
+locust==2.23.1
+nest_asyncio==1.6.0
 git+https://github.com/logicalclocks/hopsworks-api@main#egg=hopsworks[python]&subdirectory=python
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -17,69 +17,69 @@ nav:
   - Setup and Installation: https://docs.hopsworks.ai/
   - Administration: https://docs.hopsworks.ai/
   - API<div class="dropdown"><button class="dropbtn"> API </button> <div id="myDropdown" class="dropdown-content"> <a id="hopsworks_api_link" href="https://docs.hopsworks.ai/hopsworks-api/latest">Python API</a> <a id="hsfs_javadoc_link" href="https://docs.hopsworks.ai/hopsworks-api/latest/javadoc">Feature Store JavaDoc</a> </div></div>:
+    - Login: generated/api/login.md
     - Platform API:
-      - Login: generated/api/login.md
       - Connection: generated/api/connection.md
-      - Projects: generated/api/projects.md
-      - Jobs: generated/api/jobs.md
+      - Datasets: generated/api/datasets.md
+      - Environment: generated/api/environment.md
       - Executions: generated/api/executions.md
       - FlinkCluster: generated/api/flink_cluster.md
-      - Environment: generated/api/environment.md
-      - GitRepo: generated/api/git_repo.md
       - GitProvider: generated/api/git_provider.md
       - GitRemote: generated/api/git_remote.md
-      - Datasets: generated/api/datasets.md
-      - KafkaTopic: generated/api/kafka_topic.md
+      - GitRepo: generated/api/git_repo.md
+      - Jobs: generated/api/jobs.md
       - KafkaSchema: generated/api/kafka_schema.md
-      - Secrets: generated/api/secrets.md
+      - KafkaTopic: generated/api/kafka_topic.md
       - OpenSearch: generated/api/opensearch.md
+      - Projects: generated/api/projects.md
+      - Secrets: generated/api/secrets.md
     - Feature Store API:
+      - Embedding:
+        - EmbeddingFeature: generated/api/embedding_feature_api.md
+        - EmbeddingIndex: generated/api/embedding_index_api.md
+        - SimilarityFunctionType: generated/api/similarity_function_type_api.md
       - ExpectationSuite: generated/api/expectation_suite_api.md
-      - FeatureStore: generated/api/feature_store_api.md
-      - FeatureGroup: generated/api/feature_group_api.md
       - ExternalFeatureGroup: generated/api/external_feature_group_api.md
-      - SpineGroup: generated/api/spine_group_api.md
-      - FeatureView: generated/api/feature_view_api.md
-      - TrainingDataset: generated/api/training_dataset_api.md
-      - Storage Connector: generated/api/storage_connector_api.md
       - Feature: generated/api/feature_api.md
+      - Feature Monitoring:
+        - Configuration: generated/api/feature_monitoring_config_api.md
+        - Result: generated/api/feature_monitoring_result_api.md
+        - Window: generated/api/feature_monitoring_window_config_api.md
+      - FeatureGroup: generated/api/feature_group_api.md
+      - FeatureStore: generated/api/feature_store_api.md
+      - FeatureView: generated/api/feature_view_api.md
+      - Provenance Links: generated/api/links.md
       - Query: generated/api/query_api.md
+      - SpineGroup: generated/api/spine_group_api.md
+      - Statistics:
+        - Feature descriptive statistics: generated/api/feature_descriptive_statistics_api.md
+        - Split Statistics: generated/api/split_statistics_api.md
+        - Statistics: generated/api/statistics_api.md
+      - Storage Connector: generated/api/storage_connector_api.md
+      - TrainingDataset: generated/api/training_dataset_api.md
       - Transformation Functions:
-        - UDF: generated/api/udf.md
         - HopsworksUDF: generated/api/hopsworks_udf.md
-        - TransformationFunction: generated/api/transformation_functions_api.md
         - Transformation Statistics:
-          - TransformationStatistics: generated/api/transformation_statistics.md
           - FeatureTransformationStatistics: generated/api/feature_transformation_statistics.md
+          - TransformationStatistics: generated/api/transformation_statistics.md
+        - TransformationFunction: generated/api/transformation_functions_api.md
+        - UDF: generated/api/udf.md
       - ValidationReport: generated/api/validation_report_api.md
-      - Provenance Links: generated/api/links.md
-      - Statistics:
-          - Statistics: generated/api/statistics_api.md
-          - Split Statistics: generated/api/split_statistics_api.md
-          - Feature descriptive statistics: generated/api/feature_descriptive_statistics_api.md
-      - Feature Monitoring:
-          - Configuration: generated/api/feature_monitoring_config_api.md
-          - Result: generated/api/feature_monitoring_result_api.md
-          - Window: generated/api/feature_monitoring_window_config_api.md
-      - Embedding:
-          - EmbeddingIndex: generated/api/embedding_index_api.md
-          - EmbeddingFeature: generated/api/embedding_feature_api.md
-          - SimilarityFunctionType: generated/api/similarity_function_type_api.md
     - Machine Learning API:
       - Model Registry:
-        - Model Registry: generated/model-registry/model_registry_api.md
         - Model: generated/model-registry/model_api.md
+        - Model Registry: generated/model-registry/model_registry_api.md
         - Model Schema: generated/model-registry/model_schema_api.md
       - Model Serving:
-        - Model Serving: generated/model-serving/model_serving_api.md
         - Deployment: generated/model-serving/deployment_api.md
         - Deployment state: generated/model-serving/predictor_state_api.md
         - Deployment state condition: generated/model-serving/predictor_state_condition_api.md
-        - Predictor: generated/model-serving/predictor_api.md
-        - Transformer: generated/model-serving/transformer_api.md
-        - Inference Logger: generated/model-serving/inference_logger_api.md
         - Inference Batcher: generated/model-serving/inference_batcher_api.md
+        - Inference Logger: generated/model-serving/inference_logger_api.md
+        - Model Serving: generated/model-serving/model_serving_api.md
+        - Predictor: generated/model-serving/predictor_api.md
         - Resources: generated/model-serving/resources_api.md
+        - Transformer: generated/model-serving/transformer_api.md
     # Added to allow navigation using the side drawer
     - Feature Store JavaDoc: https://docs.hopsworks.ai/feature-store-javadoc/latest/
     - Contributing: CONTRIBUTING.md

diff --git a/python/hopsworks/__init__.py b/python/hopsworks/__init__.py
@@ -22,6 +22,7 @@
 import tempfile
 import warnings
 from pathlib import Path
+from typing import Literal, Union
 
 from hopsworks import client, constants, project, version
 from hopsworks.client.exceptions import (
@@ -83,6 +84,7 @@ def login(
     api_key_file: str = None,
     hostname_verification: bool = False,
     trust_store_path: str = None,
+    engine: Union[None, Literal["spark"], Literal["python"], Literal["training"]] = None,
 ) -> project.Project:
     """Connect to [Serverless Hopsworks](https://app.hopsworks.ai) by calling the `hopsworks.login()` function with no arguments.
 
@@ -122,6 +124,13 @@ def login(
         api_key_file: Path to file wih Api Key
         hostname_verification: Whether to verify Hopsworks' certificate
         trust_store_path: Path on the file system containing the Hopsworks certificates
+        engine: Which engine to use, `"spark"`, `"python"` or `"training"`. Defaults to `None`,
+            which initializes the engine to Spark if the environment provides Spark, for
+            example on Hopsworks and Databricks, or falls back to Python if Spark is not
+            available, e.g. on local Python environments or AWS SageMaker. This option
+            allows you to override this behaviour. `"training"` engine is useful when only
+            feature store metadata is needed, for example training dataset location and label
+            information when Hopsworks training experiment is conducted.
     # Returns
         `Project`: The Project object to perform operations on
     # Raises
@@ -138,7 +147,7 @@ def login(
 
     # If inside hopsworks, just return the current project for now
     if "REST_ENDPOINT" in os.environ:
-        _hw_connection = _hw_connection(hostname_verification=hostname_verification)
+        _hw_connection = _hw_connection(hostname_verification=hostname_verification, engine=engine)
         _connected_project = _hw_connection.get_project()
         _initialize_module_apis()
         print("\nLogged in to project, explore it here " + _connected_project.get_url())
@@ -207,6 +216,7 @@ def login(
             _hw_connection = _hw_connection(
                 host=host,
                 port=port,
+                engine=engine,
                 api_key_file=api_key_path,
                 hostname_verification=hostname_verification,
                 trust_store_path=trust_store_path,
@@ -246,6 +256,7 @@ def login(
         _hw_connection = _hw_connection(
             host=host,
             port=port,
+            engine=engine,
             api_key_value=api_key,
             hostname_verification=hostname_verification,
             trust_store_path=trust_store_path,