diff --git a/python/hopsworks/__init__.py b/python/hopsworks/__init__.py
index 220dcadb8..532376283 100644
--- a/python/hopsworks/__init__.py
+++ b/python/hopsworks/__init__.py
@@ -84,7 +84,7 @@ def login(
     api_key_file: str = None,
     hostname_verification: bool = False,
     trust_store_path: str = None,
-    engine: Union[None, Literal["spark"], Literal["python"], Literal["training"]] = None,
+    engine: Union[None, Literal["spark"], Literal["python"], Literal["training"], Literal["spark-no-metastore"], Literal["spark-delta"]] = None,
 ) -> project.Project:
     """Connect to [Serverless Hopsworks](https://app.hopsworks.ai) by calling the `hopsworks.login()` function with no arguments.
@@ -124,13 +124,12 @@ def login(
         api_key_file: Path to file wih Api Key
         hostname_verification: Whether to verify Hopsworks' certificate
         trust_store_path: Path on the file system containing the Hopsworks certificates
-        engine: Which engine to use, `"spark"`, `"python"` or `"training"`. Defaults to `None`,
-            which initializes the engine to Spark if the environment provides Spark, for
-            example on Hopsworks and Databricks, or falls back to Python if Spark is not
-            available, e.g. on local Python environments or AWS SageMaker. This option
-            allows you to override this behaviour. `"training"` engine is useful when only
-            feature store metadata is needed, for example training dataset location and label
-            information when Hopsworks training experiment is conducted.
+        engine: Specifies the engine to use. Possible options are "spark", "python", "training", "spark-no-metastore", or "spark-delta". The default value is None, which automatically selects the engine based on the environment:
+            "spark": Used if Spark is available, such as in Hopsworks or Databricks environments.
+            "python": Used in local Python environments or AWS SageMaker when Spark is not available.
+            "training": Used when only feature store metadata is needed, such as for obtaining training dataset locations and label information during Hopsworks training experiments.
+            "spark-no-metastore": Functions like "spark" but does not rely on the Hive metastore.
+            "spark-delta": Minimizes dependencies further by avoiding both Hive metastore and HopsFS.
     # Returns
         `Project`: The Project object to perform operations on
     # Raises
diff --git a/python/hopsworks_common/client/external.py b/python/hopsworks_common/client/external.py
index 12f04de04..24317c7d6 100644
--- a/python/hopsworks_common/client/external.py
+++ b/python/hopsworks_common/client/external.py
@@ -148,6 +148,15 @@ def provide_project(self, project):
             for conf_key, conf_value in configuration_dict.items():
                 _spark_session._jsc.hadoopConfiguration().set(conf_key, conf_value)
 
+        elif self._engine == "spark-delta":
+            _logger.debug(
+                "Running in Spark environment without a metastore or HopsFS, initializing Spark session"
+            )
+            _spark_session = SparkSession.builder \
+                .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
+                .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
+                .getOrCreate()
+
         hopsworks_common.client.get_connection()._provide_project()
 
     def download_certs(self):
diff --git a/python/hopsworks_common/connection.py b/python/hopsworks_common/connection.py
index b04109b7d..0b727a38f 100644
--- a/python/hopsworks_common/connection.py
+++ b/python/hopsworks_common/connection.py
@@ -99,8 +99,8 @@ class Connection:
             defaults to the project from where the client is run from. Defaults to `None`.
         engine: Specifies the engine to use. Possible options are "spark", "python", "training", "spark-no-metastore", or "spark-delta". The default value is None, which automatically selects the engine based on the environment:
-            "spark": Used if Spark is available and the connection is not to serverless Hopsworks, such as in Hopsworks or Databricks environments.
-            "python": Used in local Python environments or AWS SageMaker when Spark is not available or the connection is done to serverless Hopsworks.
+            "spark": Used if Spark is available, such as in Hopsworks or Databricks environments.
+            "python": Used in local Python environments or AWS SageMaker when Spark is not available.
             "training": Used when only feature store metadata is needed, such as for obtaining training dataset locations and label information during Hopsworks training experiments.
             "spark-no-metastore": Functions like "spark" but does not rely on the Hive metastore.
             "spark-delta": Minimizes dependencies further by avoiding both Hive metastore and HopsFS.
@@ -361,7 +361,7 @@ def connect(self) -> None:
             else:
                 raise ConnectionError(
                     "Engine you are trying to initialize is unknown. "
-                    "Supported engines are `'spark'`, `'python'` and `'training'`."
+                    "Supported engines are `'spark'`, `'python'`, `'training'`, `'spark-no-metastore'`, and `'spark-delta'`."
                 )
 
         # init client
@@ -518,7 +518,7 @@ def connection(
         project: The name of the project to connect to. When running on Hopsworks, this
             defaults to the project from where the client is run from. Defaults to `None`.
-        engine: Which engine to use, `"spark"`, `"python"` or `"training"`. Defaults to `None`,
+        engine: Which engine to use, `"spark"`, `"python"`, `"training"`, `"spark-no-metastore"` or `"spark-delta"`. Defaults to `None`,
             which initializes the engine to Spark if the environment provides Spark, for
             example on Hopsworks and Databricks, or falls back to Python if Spark is not
             available, e.g. on local Python environments or AWS SageMaker. This option
diff --git a/python/hsfs/engine/__init__.py b/python/hsfs/engine/__init__.py
index a4ee95daa..29fcda25d 100644
--- a/python/hsfs/engine/__init__.py
+++ b/python/hsfs/engine/__init__.py
@@ -41,7 +41,7 @@ def init(engine_type: str) -> None:
         raise ValueError(
             "Hive engine is not supported in hopsworks client version >= 4.0."
         )
-    elif engine_type == "spark-no-metastore":
+    elif engine_type == "spark-no-metastore" or engine_type == "spark-delta":
         _engine = spark_no_metastore.Engine()
     elif engine_type in python_types:
         try:
diff --git a/python/hsfs/engine/spark_no_metastore.py b/python/hsfs/engine/spark_no_metastore.py
index 89505b797..bea80ddca 100644
--- a/python/hsfs/engine/spark_no_metastore.py
+++ b/python/hsfs/engine/spark_no_metastore.py
@@ -32,6 +32,6 @@ def __init__(self) -> None:
         super().__init__()
 
-    def _sql_offline(self, sql_query):
+    def _sql_offline(self, sql_query, feature_store):
         # Spark no metastore does not require the feature_store argument
         return self._spark_session.sql(sql_query)
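For context, a minimal usage sketch of the new engine option (not part of the diff itself): the host and API key values below are placeholders, and `"spark-delta"` assumes the Delta Lake dependencies are already on the Spark classpath, since the session builder in `external.py` above only sets the Delta extension and catalog configs.

```python
import hopsworks

# Sketch only: host and api_key_value are placeholders, not values from this change.
# With engine="spark-delta", the external client creates a SparkSession configured
# with the Delta extension and catalog, avoiding both Hive metastore and HopsFS.
project = hopsworks.login(
    host="my-hopsworks.example.com",
    api_key_value="MY_API_KEY",
    engine="spark-delta",
)

# Feature store access is unchanged downstream of the engine selection.
fs = project.get_feature_store()
```

Note that both new engine types dispatch to the existing `spark_no_metastore.Engine` in `hsfs/engine/__init__.py`; the only behavioral difference is the Delta-enabled `SparkSession` bootstrap performed by the external client for `"spark-delta"`.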