Skip to content

Commit

Permalink
Merge branch 'main' into sync-inject-api-links
Browse files Browse the repository at this point in the history
  • Loading branch information
aversey authored Nov 5, 2024
2 parents 280bd39 + b828b07 commit c0e5885
Show file tree
Hide file tree
Showing 16 changed files with 117 additions and 107 deletions.
2 changes: 1 addition & 1 deletion locust_benchmark/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM locustio/locust:2.17.0
FROM locustio/locust:2.23.1

USER root

Expand Down
20 changes: 20 additions & 0 deletions locust_benchmark/Jenkinsfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
@Library("jenkins-library@main")

import com.logicalclocks.jenkins.k8s.ImageBuilder


node("local") {
stage('Clone repository') {
checkout scm
}

stage('Build and push image(s)') {
version = readFile "${env.WORKSPACE}/locust_benchmark/KUBE_IMAGE_VERSION"
withEnv(["VERSION=${version.trim()}"]) {

def builder = new ImageBuilder(this)
m = readFile "${env.WORKSPACE}/locust_benchmark/build-manifest.json"
builder.run(m)
}
}
}
1 change: 1 addition & 0 deletions locust_benchmark/KUBE_IMAGE_VERSION
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
master
1 change: 1 addition & 0 deletions locust_benchmark/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ echo "[YOUR KEY]" > .api_key
- `schema_repetitions`: This controls the number of features for the lookup. One schema repetition will result in 10 features plus primary key. Five repetitions will result in 50 features plus primary key.
- `recreate_feature_group`: This controls if the previous feature group should be dropped and recreated. Set this to true when rerunning the benchmark with different size of rows or schema repetitions.
- `batch_size`: This is relevant for the actual benchmark and controls how many feature vectors are looked up in the batch benchmark.
- `tablespace`: (Optional) If set creates a feature group using on-disk data.

3. Create the feature group

Expand Down
8 changes: 8 additions & 0 deletions locust_benchmark/build-manifest.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
[
{
"name": "hopsworks/locust-hsfs",
"version": "env:VERSION",
"dockerFile": "locust_benchmark/Dockerfile",
"canUseCache": "true"
}
]
27 changes: 12 additions & 15 deletions locust_benchmark/common/hopsworks_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,8 @@
import pandas as pd

from locust.runners import MasterRunner, LocalRunner
import hsfs

from hsfs import client
from hsfs.client.exceptions import RestAPIError
import hopsworks


class HopsworksClient:
Expand All @@ -21,14 +19,14 @@ def __init__(self, environment=None):
environment.runner, (MasterRunner, LocalRunner)
):
print(self.hopsworks_config)
self.connection = hsfs.connection(
self.project = hopsworks.login(
project=self.hopsworks_config.get("project", "test"),
host=self.hopsworks_config.get("host", "localhost"),
port=self.hopsworks_config.get("port", 443),
api_key_file=".api_key",
engine="python",
)
self.fs = self.connection.get_feature_store()
self.fs = self.project.get_feature_store()

# test settings
self.external = self.hopsworks_config.get("external", False)
Expand All @@ -38,6 +36,7 @@ def __init__(self, environment=None):
"recreate_feature_group", False
)
self.batch_size = self.hopsworks_config.get("batch_size", 100)
self.tablespace = self.hopsworks_config.get("tablespace", None)

def get_or_create_fg(self):
locust_fg = self.fs.get_or_create_feature_group(
Expand All @@ -46,6 +45,7 @@ def get_or_create_fg(self):
primary_key=["ip"],
online_enabled=True,
stream=True,
online_config={'table_space': self.tablespace} if self.tablespace else None
)
return locust_fg

Expand All @@ -59,18 +59,15 @@ def insert_data(self, locust_fg):
return locust_fg

def get_or_create_fv(self, fg=None):
try:
return self.fs.get_feature_view("locust_fv", version=1)
except RestAPIError:
return self.fs.create_feature_view(
name="locust_fv",
query=fg.select_all(),
version=1,
)
if fg is None:
fg = self.get_or_create_fg()
return self.fs.get_or_create_feature_view(
name="locust_fv", version=1, query=fg.select_all()
)

def close(self):
if client._client is not None:
self.connection.close()
if self.project is not None:
hopsworks.logout()

def generate_insert_df(self, rows, schema_repetitions):
data = {"ip": range(0, rows)}
Expand Down
9 changes: 5 additions & 4 deletions locust_benchmark/hopsworks_config.json
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
{
"host": "localhost",
"host": "mercury.hops.works",
"port": 443,
"project": "test",
"project": "fabio_demo",
"external": true,
"rows": 100000,
"rows": 1000,
"schema_repetitions": 1,
"recreate_feature_group": true,
"batch_size": 100
"batch_size": 100,
"tablespace": "ts1"
}
37 changes: 11 additions & 26 deletions locust_benchmark/locustfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,16 @@
from common.hopsworks_client import HopsworksClient
from common.stop_watch import stopwatch
from locust import HttpUser, User, task, constant, events
from locust.runners import MasterRunner, LocalRunner
from locust.runners import MasterRunner
from urllib3 import PoolManager
import nest_asyncio


@events.init.add_listener
def on_locust_init(environment, **kwargs):
print("Locust process init")

if isinstance(environment.runner, (MasterRunner, LocalRunner)):
# create feature view
environment.hopsworks_client = HopsworksClient(environment)
fg = environment.hopsworks_client.get_or_create_fg()
environment.hopsworks_client.get_or_create_fv(fg)
environment.hopsworks_client = HopsworksClient(environment)
environment.hopsworks_client.get_or_create_fg()


@events.quitting.add_listener
Expand Down Expand Up @@ -61,59 +57,48 @@ def get_feature_vector(self):


class MySQLFeatureVectorLookup(User):
wait_time = constant(0)
weight = 5
# fixed_count = 1
wait_time = constant(0.001)
weight = 2

def __init__(self, environment):
super().__init__(environment)
self.env = environment
self.client = HopsworksClient(environment)
self.fv = self.client.get_or_create_fv()
self.client = environment.hopsworks_client

def on_start(self):
print("Init user")
self.fv = self.client.get_or_create_fv()
self.fv.init_serving(external=self.client.external)
nest_asyncio.apply()

def on_stop(self):
print("Closing user")

@task
def get_feature_vector(self):
self._get_feature_vector({"ip": random.randint(0, self.client.rows - 1)})
return self._get_feature_vector({"ip": random.randint(0, self.client.rows - 1)})

@stopwatch
def _get_feature_vector(self, pk):
self.fv.get_feature_vector(pk)


class MySQLFeatureVectorBatchLookup(User):
wait_time = constant(0)
wait_time = constant(0.001)
weight = 1
# fixed_count = 1

def __init__(self, environment):
super().__init__(environment)
self.env = environment
self.client = HopsworksClient(environment)
self.client = environment.hopsworks_client
self.fv = self.client.get_or_create_fv()

def on_start(self):
print("Init user")
self.fv.init_serving(external=self.client.external)
nest_asyncio.apply()

def on_stop(self):
print("Closing user")

@task
def get_feature_vector_batch(self):
pks = [
{"ip": random.randint(0, self.client.rows - 1)}
for i in range(self.client.batch_size)
]
self._get_feature_vectors(pks)
return self._get_feature_vectors(pks)

@stopwatch
def _get_feature_vectors(self, pk):
Expand Down
3 changes: 2 additions & 1 deletion locust_benchmark/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
markupsafe==2.0.1
locust==2.17.0
locust==2.23.1
nest_asyncio==1.6.0
git+https://github.com/logicalclocks/hopsworks-api@main#egg=hopsworks[python]&subdirectory=python
70 changes: 35 additions & 35 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,69 +17,69 @@ nav:
- Setup and Installation: https://docs.hopsworks.ai/
- Administration: https://docs.hopsworks.ai/
- API<div class="dropdown"><button class="dropbtn"> API </button> <div id="myDropdown" class="dropdown-content"> <a id="hopsworks_api_link" href="https://docs.hopsworks.ai/hopsworks-api/latest">Python API</a> <a id="hsfs_javadoc_link" href="https://docs.hopsworks.ai/hopsworks-api/latest/javadoc">Feature Store JavaDoc</a> </div></div>:
- Login: generated/api/login.md
- Platform API:
- Login: generated/api/login.md
- Connection: generated/api/connection.md
- Projects: generated/api/projects.md
- Jobs: generated/api/jobs.md
- Datasets: generated/api/datasets.md
- Environment: generated/api/environment.md
- Executions: generated/api/executions.md
- FlinkCluster: generated/api/flink_cluster.md
- Environment: generated/api/environment.md
- GitRepo: generated/api/git_repo.md
- GitProvider: generated/api/git_provider.md
- GitRemote: generated/api/git_remote.md
- Datasets: generated/api/datasets.md
- KafkaTopic: generated/api/kafka_topic.md
- GitRepo: generated/api/git_repo.md
- Jobs: generated/api/jobs.md
- KafkaSchema: generated/api/kafka_schema.md
- Secrets: generated/api/secrets.md
- KafkaTopic: generated/api/kafka_topic.md
- OpenSearch: generated/api/opensearch.md
- Projects: generated/api/projects.md
- Secrets: generated/api/secrets.md
- Feature Store API:
- Embedding:
- EmbeddingFeature: generated/api/embedding_feature_api.md
- EmbeddingIndex: generated/api/embedding_index_api.md
- SimilarityFunctionType: generated/api/similarity_function_type_api.md
- ExpectationSuite: generated/api/expectation_suite_api.md
- FeatureStore: generated/api/feature_store_api.md
- FeatureGroup: generated/api/feature_group_api.md
- ExternalFeatureGroup: generated/api/external_feature_group_api.md
- SpineGroup: generated/api/spine_group_api.md
- FeatureView: generated/api/feature_view_api.md
- TrainingDataset: generated/api/training_dataset_api.md
- Storage Connector: generated/api/storage_connector_api.md
- Feature: generated/api/feature_api.md
- Feature Monitoring:
- Configuration: generated/api/feature_monitoring_config_api.md
- Result: generated/api/feature_monitoring_result_api.md
- Window: generated/api/feature_monitoring_window_config_api.md
- FeatureGroup: generated/api/feature_group_api.md
- FeatureStore: generated/api/feature_store_api.md
- FeatureView: generated/api/feature_view_api.md
- Provenance Links: generated/api/links.md
- Query: generated/api/query_api.md
- SpineGroup: generated/api/spine_group_api.md
- Statistics:
- Feature descriptive statistics: generated/api/feature_descriptive_statistics_api.md
- Split Statistics: generated/api/split_statistics_api.md
- Statistics: generated/api/statistics_api.md
- Storage Connector: generated/api/storage_connector_api.md
- TrainingDataset: generated/api/training_dataset_api.md
- Transformation Functions:
- UDF: generated/api/udf.md
- HopsworksUDF: generated/api/hopsworks_udf.md
- TransformationFunction: generated/api/transformation_functions_api.md
- Transformation Statistics:
- TransformationStatistics: generated/api/transformation_statistics.md
- FeatureTransformationStatistics: generated/api/feature_transformation_statistics.md
- TransformationStatistics: generated/api/transformation_statistics.md
- TransformationFunction: generated/api/transformation_functions_api.md
- UDF: generated/api/udf.md
- ValidationReport: generated/api/validation_report_api.md
- Provenance Links: generated/api/links.md
- Statistics:
- Statistics: generated/api/statistics_api.md
- Split Statistics: generated/api/split_statistics_api.md
- Feature descriptive statistics: generated/api/feature_descriptive_statistics_api.md
- Feature Monitoring:
- Configuration: generated/api/feature_monitoring_config_api.md
- Result: generated/api/feature_monitoring_result_api.md
- Window: generated/api/feature_monitoring_window_config_api.md
- Embedding:
- EmbeddingIndex: generated/api/embedding_index_api.md
- EmbeddingFeature: generated/api/embedding_feature_api.md
- SimilarityFunctionType: generated/api/similarity_function_type_api.md
- Machine Learning API:
- Model Registry:
- Model Registry: generated/model-registry/model_registry_api.md
- Model: generated/model-registry/model_api.md
- Model Registry: generated/model-registry/model_registry_api.md
- Model Schema: generated/model-registry/model_schema_api.md
- Model Serving:
- Model Serving: generated/model-serving/model_serving_api.md
- Deployment: generated/model-serving/deployment_api.md
- Deployment state: generated/model-serving/predictor_state_api.md
- Deployment state condition: generated/model-serving/predictor_state_condition_api.md
- Predictor: generated/model-serving/predictor_api.md
- Transformer: generated/model-serving/transformer_api.md
- Inference Logger: generated/model-serving/inference_logger_api.md
- Inference Batcher: generated/model-serving/inference_batcher_api.md
- Inference Logger: generated/model-serving/inference_logger_api.md
- Model Serving: generated/model-serving/model_serving_api.md
- Predictor: generated/model-serving/predictor_api.md
- Resources: generated/model-serving/resources_api.md
- Transformer: generated/model-serving/transformer_api.md
# Added to allow navigation using the side drawer
- Feature Store JavaDoc: https://docs.hopsworks.ai/feature-store-javadoc/latest/
- Contributing: CONTRIBUTING.md
Expand Down
13 changes: 12 additions & 1 deletion python/hopsworks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import tempfile
import warnings
from pathlib import Path
from typing import Literal, Union

from hopsworks import client, constants, project, version
from hopsworks.client.exceptions import (
Expand Down Expand Up @@ -83,6 +84,7 @@ def login(
api_key_file: str = None,
hostname_verification: bool = False,
trust_store_path: str = None,
engine: Union[None, Literal["spark"], Literal["python"], Literal["training"]] = None,
) -> project.Project:
"""Connect to [Serverless Hopsworks](https://app.hopsworks.ai) by calling the `hopsworks.login()` function with no arguments.
Expand Down Expand Up @@ -122,6 +124,13 @@ def login(
api_key_file: Path to file wih Api Key
hostname_verification: Whether to verify Hopsworks' certificate
trust_store_path: Path on the file system containing the Hopsworks certificates
engine: Which engine to use, `"spark"`, `"python"` or `"training"`. Defaults to `None`,
which initializes the engine to Spark if the environment provides Spark, for
example on Hopsworks and Databricks, or falls back to Python if Spark is not
available, e.g. on local Python environments or AWS SageMaker. This option
allows you to override this behaviour. `"training"` engine is useful when only
feature store metadata is needed, for example training dataset location and label
information when Hopsworks training experiment is conducted.
# Returns
`Project`: The Project object to perform operations on
# Raises
Expand All @@ -138,7 +147,7 @@ def login(

# If inside hopsworks, just return the current project for now
if "REST_ENDPOINT" in os.environ:
_hw_connection = _hw_connection(hostname_verification=hostname_verification)
_hw_connection = _hw_connection(hostname_verification=hostname_verification, engine=engine)
_connected_project = _hw_connection.get_project()
_initialize_module_apis()
print("\nLogged in to project, explore it here " + _connected_project.get_url())
Expand Down Expand Up @@ -207,6 +216,7 @@ def login(
_hw_connection = _hw_connection(
host=host,
port=port,
engine=engine,
api_key_file=api_key_path,
hostname_verification=hostname_verification,
trust_store_path=trust_store_path,
Expand Down Expand Up @@ -246,6 +256,7 @@ def login(
_hw_connection = _hw_connection(
host=host,
port=port,
engine=engine,
api_key_value=api_key,
hostname_verification=hostname_verification,
trust_store_path=trust_store_path,
Expand Down
Loading

0 comments on commit c0e5885

Please sign in to comment.