From 426f730bb15753e1a9a5309163ae64ab6893e0fa Mon Sep 17 00:00:00 2001 From: Aleksey Veresov Date: Mon, 28 Oct 2024 13:28:14 +0100 Subject: [PATCH] Replace hsfs with hopsworks where it is possible in docs (#374) --- .../client/online_store_rest_client.py | 2 +- python/hopsworks_common/connection.py | 69 ++++++++++++++++++- python/hopsworks_common/project.py | 2 +- python/hsfs/feature_store.py | 2 +- python/hsfs/feature_view.py | 14 ++-- python/hsfs/training_dataset.py | 6 +- python/hsml/core/dataset_api.py | 5 +- 7 files changed, 84 insertions(+), 16 deletions(-) diff --git a/python/hopsworks_common/client/online_store_rest_client.py b/python/hopsworks_common/client/online_store_rest_client.py index 9ad05e9a3..b66897b09 100644 --- a/python/hopsworks_common/client/online_store_rest_client.py +++ b/python/hopsworks_common/client/online_store_rest_client.py @@ -305,7 +305,7 @@ def _check_hopsworks_connection(self) -> None: assert ( client.get_instance() is not None and client.get_instance()._connected ), """Hopsworks Client is not connected. Please connect to Hopsworks cluster - via hopsworks.login or hsfs.connection before initialising the Online Store REST Client. + via hopsworks.login before initialising the Online Store REST Client. """ _logger.debug("Hopsworks connection is active.") diff --git a/python/hopsworks_common/connection.py b/python/hopsworks_common/connection.py index da7ca9e52..08ad1f8a2 100644 --- a/python/hopsworks_common/connection.py +++ b/python/hopsworks_common/connection.py @@ -477,7 +477,74 @@ def connection( api_key_file: Optional[str] = None, api_key_value: Optional[str] = None, ) -> Connection: - """Connection factory method, accessible through `hopsworks.connection()`.""" + """Connection factory method, accessible through `hopsworks.connection()`. + + This class provides convenience classmethods accessible from the `hopsworks`-module: + + !!! 
example "Connection factory" + For convenience, `hopsworks` provides a factory method, accessible from the top level + module, so you don't have to import the `Connection` class manually: + + ```python + import hopsworks + conn = hopsworks.connection() + ``` + + !!! hint "Save API Key as File" + To get started quickly, you can simply create a file with the previously + created Hopsworks API Key and place it on the environment from which you + wish to connect to Hopsworks. + + You can then connect by simply passing the path to the key file when + instantiating a connection: + + ```python hl_lines="6" + import hopsworks + conn = hopsworks.connection( + 'my_instance', # DNS of your Hopsworks instance + 443, # Port to reach your Hopsworks instance, defaults to 443 + api_key_file='hopsworks.key', # The file containing the API key generated above + hostname_verification=True, # Disable for self-signed certificates + ) + project = conn.get_project("my_project") + ``` + + Clients in external clusters need to connect to Hopsworks using an + API key. The API key is generated inside the Hopsworks platform, and requires at + least the "project" scope to be able to access a project. + For more information, see the [integration guides](../setup.md). + + # Arguments + host: The hostname of the Hopsworks instance in the form of `[UUID].cloud.hopsworks.ai`, + defaults to `None`. Do **not** use the URL including `https://` when connecting + programmatically. + port: The port on which the Hopsworks instance can be reached, + defaults to `443`. + project: The name of the project to connect to. When running on Hopsworks, this + defaults to the project from which the client is run. + Defaults to `None`. + engine: Which engine to use, `"spark"`, `"python"` or `"training"`. Defaults to `None`, + which initializes the engine to Spark if the environment provides Spark, for + example on Hopsworks and Databricks, or falls back on Hive in Python if Spark is not + available, e.g.
on local Python environments or AWS SageMaker. This option + allows you to override this behaviour. `"training"` engine is useful when only + feature store metadata is needed, for example training dataset location and label + information when Hopsworks training experiment is conducted. + hostname_verification: Whether or not to verify Hopsworks' certificate, defaults + to `True`. + trust_store_path: Path on the file system containing the Hopsworks certificates, + defaults to `None`. + cert_folder: The directory to store retrieved HopsFS certificates, defaults to + `"/tmp"`. Only required when running without a Spark environment. + api_key_file: Path to a file containing the API Key, defaults to `None`. + api_key_value: API Key as string, if provided, `api_key_file` will be ignored, + however, this should be used with care, especially if the used notebook or + job script is accessible by multiple parties. Defaults to `None`. + + # Returns + `Connection`. Connection handle to perform operations on a + Hopsworks project. + """ return cls( host, port, diff --git a/python/hopsworks_common/project.py b/python/hopsworks_common/project.py index df82b3f79..7705b603b 100644 --- a/python/hopsworks_common/project.py +++ b/python/hopsworks_common/project.py @@ -129,7 +129,7 @@ def get_feature_store( name: Project name of the feature store. engine: Which engine to use, `"spark"`, `"python"` or `"training"`. Defaults to `"python"` when connected to [Serverless Hopsworks](https://app.hopsworks.ai). - See hsfs.Connection.connection documentation for more information. + See [`hopsworks.connection`](connection.md#connection) documentation for more information. 
# Returns `hsfs.feature_store.FeatureStore`: The Feature Store API # Raises diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py index 2a384c961..d12c501aa 100644 --- a/python/hsfs/feature_store.py +++ b/python/hsfs/feature_store.py @@ -456,7 +456,7 @@ def sql( For spark engine: Dictionary of read options for Spark. For python engine: If running queries on the online feature store, users can provide an entry `{'external': True}`, - this instructs the library to use the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) to establish the connection to the online feature store. + this instructs the library to use the `host` parameter in the [`hopsworks.login()`](login.md#login) to establish the connection to the online feature store. If not set, or set to False, the online feature store storage connector is used which relies on the private ip. Defaults to `{}`. diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index 7527f4de7..4ba22bc4b 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -332,7 +332,7 @@ def init_serving( Transformation statistics are fetched from training dataset and applied to the feature vector. external: boolean, optional. If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. @@ -587,7 +587,7 @@ def get_feature_vector( providing feature values which are not available in the feature store. external: boolean, optional. 
If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. @@ -700,7 +700,7 @@ def get_feature_vectors( providing feature values which are not available in the feature store. external: boolean, optional. If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. @@ -772,7 +772,7 @@ def get_inference_helper( Set of required primary keys is [`feature_view.primary_keys`](#primary_keys) external: boolean, optional. If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. @@ -830,7 +830,7 @@ def get_inference_helpers( Set of required primary keys is [`feature_view.primary_keys`](#primary_keys) external: boolean, optional. 
If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. @@ -907,7 +907,7 @@ def find_neighbors( filter: A filter expression to restrict the search space (optional). external: boolean, optional. If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. @@ -3562,7 +3562,7 @@ def transform( feature_vector: `Union[List[Any], List[List[Any]], pd.DataFrame, pl.DataFrame]`. The feature vector to be transformed. external: boolean, optional. If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. 
diff --git a/python/hsfs/training_dataset.py b/python/hsfs/training_dataset.py index 92db6d23e..8d66efa65 100644 --- a/python/hsfs/training_dataset.py +++ b/python/hsfs/training_dataset.py @@ -1003,7 +1003,7 @@ def init_prepared_statement( initialised for retrieving serving vectors as a batch. external: boolean, optional. If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. @@ -1020,7 +1020,7 @@ def get_serving_vector( serving application. external: boolean, optional. If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. @@ -1042,7 +1042,7 @@ def get_serving_vectors( serving application. external: boolean, optional. If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. 
Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. diff --git a/python/hsml/core/dataset_api.py b/python/hsml/core/dataset_api.py index 06df9fba4..681fe3442 100644 --- a/python/hsml/core/dataset_api.py +++ b/python/hsml/core/dataset_api.py @@ -61,10 +61,11 @@ def upload( """Upload a file to the Hopsworks filesystem. ```python + import hopsworks - conn = hsml.connection(project="my-project") + project = hopsworks.login(project="my-project") - dataset_api = conn.get_dataset_api() + dataset_api = project.get_dataset_api() uploaded_file_path = dataset_api.upload("my_local_file.txt", "Resources")