Init MLflow support #2

Merged · 38 commits · Jun 20, 2023

Commits
3c62ef5
Init MLflow support
mwiewior Apr 25, 2023
a22d632
Fixing linters
mwiewior Apr 25, 2023
2d0d397
Addin support in starter
mwiewior Apr 25, 2023
150a48d
Fix for isort
mwiewior Apr 25, 2023
54f5b60
Fixing Unit tests
mwiewior Apr 25, 2023
6e91f0f
Fixing Unit tests
mwiewior Apr 25, 2023
b5c3947
Fixing ut linting
mwiewior Apr 25, 2023
5d65a8b
Applying comments
mwiewior May 15, 2023
7435c6e
Passing mlflow config
mwiewior Jun 7, 2023
9480fe1
mlflow stage_name
mwiewior Jun 7, 2023
7349e15
mlflow stage_name
mwiewior Jun 7, 2023
c880eb5
mlflow stage_name
mwiewior Jun 7, 2023
83dcd45
Adding mlflow_helpers
mwiewior Jun 7, 2023
6daf8d2
Fixing stage name
mwiewior Jun 7, 2023
115a60d
Removing eval
mwiewior Jun 8, 2023
5d64d40
Adding pipeline name to each task/node
mwiewior Jun 8, 2023
94647ca
Docs update
mwiewior Jun 8, 2023
acccc84
Doc for implementation details
mwiewior Jun 8, 2023
7dc2061
UDF inference
mwiewior Jun 12, 2023
57abf33
Adding run finalizer hook
mwiewior Jun 14, 2023
85eea44
Starter updates for MLflow
mwiewior Jun 15, 2023
d81b84d
Fixes for metrics and model upload
mwiewior Jun 15, 2023
013522f
Jinja cookie cutting corrections for mlflow enablement
Lasica Jun 16, 2023
2c31cc3
limiting kedro version to 0.18.8 because of dataset bugs
Lasica Jun 16, 2023
ddc021b
removed extra _ in name
Lasica Jun 16, 2023
f70bc4b
Un-hardcode MLflow config
marrrcin Jun 19, 2023
062273e
fix calling mlflow procedure
Lasica Jun 19, 2023
efec3dc
fixing mlflow task name
Lasica Jun 19, 2023
f223c07
updated docs
Lasica Jun 19, 2023
99674ae
changelog
Lasica Jun 19, 2023
d44034c
added spellcheck to precommit, fixed spellcheck issues
Lasica Jun 19, 2023
637c73a
docs: updated placeholder
Lasica Jun 19, 2023
9d0a63b
refactor: fixed typo in function name
Lasica Jun 19, 2023
5ec874d
refactor: Changed enable mlflow param to allow lowercase
Lasica Jun 19, 2023
b506515
docs: added link to mlflow snowflake integration
Lasica Jun 19, 2023
587b848
docs: fix syntax highlight
Lasica Jun 19, 2023
84335f7
Merge branch 'develop' into feature/mlflow-support
Lasica Jun 20, 2023
9d990f9
docs: spellcheck dict
Lasica Jun 20, 2023
Binary file added docs/images/mlflow-support.png
1 change: 1 addition & 0 deletions docs/index.rst
@@ -15,6 +15,7 @@ Welcome to Kedro Snowflake plugin documentation!
   Quickstart <source/03_quickstart.rst>
   Data Assets <source/04_data_assets.rst>
   Development <source/05_development.md>
   MLflow support <source/06_mlflow.md>


Indices and tables
88 changes: 88 additions & 0 deletions docs/source/06_mlflow.md
@@ -0,0 +1,88 @@
# [Beta] MLflow support

## High level architecture
The key challenge is providing access to external service endpoints (such as MLflow), which is not yet supported natively in Snowpark (the External Access feature is on the Snowflake roadmap). Snowflake external
functions are the preferred workaround.
![MLflow and Kedro-snowflake](../images/mlflow-support.png)

## Implementation details
Kedro-Snowflake <-> MLflow integration is based on the following concepts (a setup sketch follows the list):
* [Snowflake external functions](https://docs.snowflake.com/en/sql-reference/external-functions-introduction), which
are used for wrapping POST requests to the MLflow instance. In the minimal setup, wrapping external functions must be created for the following MLflow REST API calls:
  * [Create run](https://mlflow.org/docs/latest/rest-api.html#create-run)
  * [Update run](https://mlflow.org/docs/latest/rest-api.html#update-run)
  * [Log param](https://mlflow.org/docs/latest/rest-api.html#log-param)
  * [Log metric](https://mlflow.org/docs/latest/rest-api.html#log-metric)
  * [Search experiment](https://mlflow.org/docs/latest/rest-api.html#search-experiments)
* [Snowflake external function translators](https://docs.snowflake.com/en/sql-reference/external-functions-translators) for
changing the format of the data sent to and received from the MLflow instance.
* [Snowflake API integration](https://docs.snowflake.com/en/sql-reference/sql/create-api-integration) for setting up
a communication channel from the Snowflake instance to the cloud HTTPS proxy/gateway service
where your MLflow instance is hosted (e.g. Amazon API Gateway, Google Cloud API Gateway or Azure API Management).
* [Snowflake storage integration](https://docs.snowflake.com/en/sql-reference/sql/create-storage-integration) to enable
your Snowflake instance to upload artifacts (e.g. serialized models) to the cloud storage (Amazon S3, Azure Blob Storage, Google Cloud Storage) used by the
MLflow instance.
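
The exact SQL depends on where your MLflow instance and gateway live. Below is a minimal sketch of the one-time setup, driven from a Snowpark session in the same style the plugin itself uses; the integration name, role ARN, gateway URL and connection parameters are placeholders, not values shipped with the plugin:

```python
from snowflake.snowpark import Session

# Placeholder connection parameters - substitute your own account details.
session = Session.builder.configs({
    "account": "my_account",
    "user": "my_user",
    "password": "***",
    "warehouse": "demo",
    "database": "demo",
    "schema": "demo",
}).create()

# API integration towards the HTTPS gateway fronting MLflow
# (AWS API Gateway shown; requires a sufficiently privileged role).
session.sql("""
    create or replace api integration mlflow_api_integration
        api_provider = aws_api_gateway
        api_aws_role_arn = 'arn:aws:iam::123456789012:role/mlflow-gateway'
        api_allowed_prefixes = ('https://example.execute-api.eu-west-1.amazonaws.com/prod/')
        enabled = true
""").collect()

# One wrapping external function: MLflow's "create run" REST endpoint.
# A request/response translator pair would reshape Snowflake's batch format
# into the JSON body MLflow expects (omitted here for brevity).
session.sql("""
    create or replace external function demo.demo.mlflow_run_create(experiment_id varchar)
        returns variant
        api_integration = mlflow_api_integration
        as 'https://example.execute-api.eu-west-1.amazonaws.com/prod/api/2.0/mlflow/runs/create'
""").collect()
```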
## Configuration example

```yaml
mlflow:
  # MLflow experiment name for tracking runs
  experiment_name: demo-mlops
  stage: "@MLFLOW_STAGE"
  # Snowflake external functions needed for calling MLflow instance
  functions:
    experiment_get_by_name: demo.demo.mlflow_experiment_get_by_name
    run_create: demo.demo.mlflow_run_create
    run_update: demo.demo.mlflow_run_update
    run_log_metric: demo.demo.mlflow_run_log_metric
    run_log_parameter: demo.demo.mlflow_run_log_parameter
```
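
With such a configuration in place, the plugin resolves the MLflow experiment id by calling the configured `experiment_get_by_name` function (see `generator.py` in this PR). A minimal sketch of the equivalent manual call, re-using the `session` from the setup sketch above:

```python
# Mirrors the lookup performed by the pipeline generator: the external
# function wraps MLflow's search-experiments REST call and returns a
# VARIANT whose body carries the experiment metadata.
experiment_id = session.sql(
    "SELECT demo.demo.mlflow_experiment_get_by_name('demo-mlops')"
    ":body.experiments[0].experiment_id"
).collect()[0][0].strip(" \"'\t\r\n")
print(experiment_id)
```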

## Kedro starter
The provided Kedro starter (Snowflights) has built-in MLflow support.
You can enable it during project setup, e.g.:
```bash
TBD
```

## Deployment to Snowflake and inference

### Deployment

### Inference with User Defined Function (UDF)
```sql
select
    MLFLOW$SNOWFLIGHTS_MODEL(
        "engines",
        "passenger_capacity",
        "crew",
        "d_check_complete",
        "moon_clearance_complete",
        "iata_approved",
        "company_rating",
        "review_scores_rating"
    ) AS price
from
    (
        select
            1 as "engines",
            100 as "passenger_capacity",
            5 as "crew",
            true as "d_check_complete",
            true as "moon_clearance_complete",
            true as "iata_approved",
            10.0 as "company_rating",
            5.0 as "review_scores_rating"
        union all
        select
            2,
            20,
            5,
            false,
            false,
            false,
            3.0,
            5.0
    );
```
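
The same model can also be invoked from Snowpark. A sketch, assuming the Snowflights model UDF has been deployed and re-using the `session` from the earlier sketches:

```python
from snowflake.snowpark.functions import call_udf, col

# Build a one-row dataframe with the model's input features.
df = session.create_dataframe(
    [(1, 100, 5, True, True, True, 10.0, 5.0)],
    schema=["engines", "passenger_capacity", "crew", "d_check_complete",
            "moon_clearance_complete", "iata_approved", "company_rating",
            "review_scores_rating"],
)

# Call the deployed UDF positionally, exactly as the SQL example does.
predictions = df.select(
    call_udf(
        "MLFLOW$SNOWFLIGHTS_MODEL",
        col("engines"), col("passenger_capacity"), col("crew"),
        col("d_check_complete"), col("moon_clearance_complete"),
        col("iata_approved"), col("company_rating"),
        col("review_scores_rating"),
    ).alias("price")
)
predictions.show()
```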
1 change: 1 addition & 0 deletions docs/spellcheck_exceptions.txt
@@ -105,3 +105,4 @@ kedroazureml
ly
svg
MLOps
natively
31 changes: 31 additions & 0 deletions kedro_snowflake/config.py
@@ -43,6 +43,7 @@ def check_credentials(cls, values):
class DependenciesConfig(BaseModel):
    packages: List[str] = [
        "snowflake-snowpark-python",
        "mlflow",
        "cachetools",
        "pluggy",
        "PyYAML==6.0",
@@ -80,9 +81,25 @@ class SnowflakeRuntimeConfig(BaseModel):
    pipeline_name_mapping: Optional[Dict[str, str]] = {"__default__": "default"}


class MLflowFunctionsConfig(BaseModel):
    experiment_get_by_name: str = "mlflow_experiment_get_by_name"
    run_create: str = "mlflow_run_create"
    run_update: str = "mlflow_run_update"
    run_log_metric: str = "mlflow_run_log_metric"
    run_log_parameter: str = "mlflow_run_log_parameter"


class SnowflakeMLflowConfig(BaseModel):
    experiment_name: Optional[str]
    functions: MLflowFunctionsConfig
    run_id: Optional[str]
    stage: Optional[str]


class SnowflakeConfig(BaseModel):
    connection: SnowflakeConnectionConfig
    runtime: SnowflakeRuntimeConfig
    mlflow: SnowflakeMLflowConfig


class KedroSnowflakeConfig(BaseModel):
@@ -136,6 +153,7 @@ class KedroSnowflakeConfig(BaseModel):
      # https://repo.anaconda.com/pkgs/snowflake/
      packages:
        - snowflake-snowpark-python
        - mlflow
        - cachetools
        - pluggy
        - PyYAML==6.0
@@ -152,9 +170,22 @@ class KedroSnowflakeConfig(BaseModel):
        - more-itertools
        - openpyxl
        - backoff
        - pydantic
    # Optionally provide mapping for user-friendly pipeline names
    pipeline_name_mapping:
      __default__: default
  # EXPERIMENTAL: Either MLflow experiment name to enable MLflow tracking
  # or leave empty
  mlflow:
    experiment_name: ~
    stage: ~
    # Snowflake external functions needed for calling MLflow instance
    functions:
      experiment_get_by_name: mlflow_experiment_get_by_name
      run_create: mlflow_run_create
      run_update: mlflow_run_update
      run_log_metric: mlflow_run_log_metric
      run_log_parameter: mlflow_run_log_parameter
""".strip()

# This auto-validates the template above during import
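
For illustration only (this snippet is not part of the PR): the `mlflow` section of the YAML above parses into the models added here, with an unset `experiment_name` (`~`) turning the integration off:

```python
from kedro_snowflake.config import SnowflakeMLflowConfig

cfg = SnowflakeMLflowConfig(
    experiment_name="demo-mlops",
    stage="@MLFLOW_STAGE",
    functions={
        "experiment_get_by_name": "demo.demo.mlflow_experiment_get_by_name",
        "run_create": "demo.demo.mlflow_run_create",
        "run_update": "demo.demo.mlflow_run_update",
        "run_log_metric": "demo.demo.mlflow_run_log_metric",
        "run_log_parameter": "demo.demo.mlflow_run_log_parameter",
    },
)
# run_id starts out empty; the MLflow root sproc fills it in at run time,
# mirroring the mlflow_enabled check in generator.py below.
assert cfg.run_id is None
```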
123 changes: 112 additions & 11 deletions kedro_snowflake/generator.py
@@ -1,3 +1,4 @@
@@ -1,3 +1,4 @@
import json
import logging
import os
import re
@@ -59,6 +60,9 @@ def __init__(
        self.config = config
        self.pipeline_name = pipeline_name
        self.extra_env = extra_env
        self.mlflow_enabled = (
            True if self.config.snowflake.mlflow.experiment_name else False
        )

    def _get_pipeline_name_for_snowflake(self):
        return (self.config.snowflake.runtime.pipeline_name_mapping or {}).get(
@@ -78,7 +82,9 @@ def _generate_task_sql(
            warehouse=self.connection_parameters["warehouse"],
            after_tasks=",".join(after_tasks),
            task_body=self.TASK_BODY_TEMPLATE.format(
                root_task_name=self._root_task_name
                if not self.mlflow_enabled
                else self._mlflow_root_task_name,
                environment=self.kedro_environment,
                sproc_name=self.SPROC_NAME,
                pipeline_name=pipeline_name,
@@ -101,31 +107,71 @@ def _generate_root_task_sql(self):
            schedule=self.config.snowflake.runtime.schedule,
        )

    def _sanitize_node_name(self, node_name: str) -> str:
        return re.sub(r"\W", "_", node_name)
    def _generate_root_task_suspend_sql(self):
        return """
alter task {task_name} suspend;
        """.strip().format(
            task_name=self._root_task_name
        )

    def _generate_mlflow_drop_task_sql(self):
        return """
drop task if exists {task_name};
        """.strip().format(
            task_name=self._mlflow_root_task_name
        )

    def _generate_mlflow_root_task_sql(self):
        return """
> Contributor: Re-use self.TASK_TEMPLATE?
create or replace task {task_name}
    warehouse = '{warehouse}'
    after {after_task}
as
    call {root_sproc}();
        """.strip().format(
            task_name=self._mlflow_root_task_name,
            warehouse=self.connection_parameters["warehouse"],
            root_sproc=self._mlfow_root_sproc_name,
            after_task=self._root_task_name,
        )

    def _standardize_node_name(self, node_name: str) -> str:
        sanity_node_name = re.sub(r"\W", "_", node_name)
        return f"kedro_{self._get_pipeline_name_for_snowflake()}_{sanity_node_name}"

    def _generate_snowflake_tasks_sql(
        self,
        pipeline: Pipeline,
    ) -> List[str]:
        sql_statements = [self._generate_root_task_sql()]
        sql_statements = [
            self._generate_root_task_sql(),
            self._generate_root_task_suspend_sql(),
> Contributor: Should be immediately execute suspend after execute? Will it work?
> Contributor Author: Why not? For a brand new pipeline it is always suspended - so it does not take any effect - and for a replaced one it should just change the state?
> Contributor Author: Or maybe it would make sense to always drop the whole pipeline and then recreate it from scratch - it would then fix #3?
> Contributor: Could be, let's discuss it. For the suspend vs execute - wouldn't the pipeline stop executing, because of the "suspend"?
        ]
        if self.mlflow_enabled:
            sql_statements.append(self._generate_mlflow_root_task_sql())
        else:
            sql_statements.append(self._generate_mlflow_drop_task_sql())

        node_dependencies = (
            pipeline.node_dependencies
        )  # <-- this one is not topological
        for node in pipeline.nodes:  # <-- this one is topological
            after_tasks = [self._root_task_name] + [
                self._sanitize_node_name(n.name) for n in node_dependencies[node]
                f"{self._standardize_node_name(n.name)}"
                for n in node_dependencies[node]
            ]
            if self.mlflow_enabled:
                after_tasks.append(self._mlflow_root_task_name)
            sql_statements.append(
                self._generate_task_sql(
                    self._sanitize_node_name(node.name),
                    self._standardize_node_name(node.name),
                    after_tasks,
                    self.pipeline_name,
                    [node.name],
                    self.extra_params,
                )
            )

        return sql_statements

    def _generate_task_execute_sql(self):
@@ -137,14 +183,25 @@ def _generate_task_execute_sql(self):

    @property
    def _root_task_name(self):
        root_task_name = f"kedro_snowflake_start_{self._get_pipeline_name_for_snowflake()}_task".upper()
        root_task_name = (
            f"kedro_{self._get_pipeline_name_for_snowflake()}_start_task".upper()
        )
        return root_task_name

    @property
    def _root_sproc_name(self):
        return (
            f"kedro_snowflake_start_{self._get_pipeline_name_for_snowflake()}".upper()
    def _mlflow_root_task_name(self):
        mlflow_root_task_name = (
            f"kedro_{self._get_pipeline_name_for_snowflake()}_mlflow_start_task".upper()
        )
        return mlflow_root_task_name

    @property
    def _root_sproc_name(self):
        return f"kedro_{self._get_pipeline_name_for_snowflake()}_start".upper()

    @property
    def _mlfow_root_sproc_name(self):
        return f"kedro_{self._get_pipeline_name_for_snowflake()}_start_mlflow".upper()

    def generate(self) -> KedroSnowflakePipeline:
        """Generate a SnowflakePipeline object from a Kedro pipeline.
@@ -202,6 +259,13 @@ def generate(self) -> KedroSnowflakePipeline:
                snowflake_stage_name
            )

            if self.mlflow_enabled:
                mlflow_root_sproc = (  # noqa: F841
                    self._construct_kedro_snowflake_mlflow_root_sproc(
                        snowflake_stage_name
                    )
                )

            logger.info("Creating Kedro Snowflake Sproc")
            snowflake_sproc = self._construct_kedro_snowflake_sproc(
                imports=self._generate_imports_for_sproc(
@@ -219,7 +283,7 @@ def generate(self) -> KedroSnowflakePipeline:
            pipeline_sql_statements,
            self._generate_task_execute_sql(),
            self._root_task_name,
            [self._sanitize_node_name(n.name) for n in pipeline.nodes],
            [self._standardize_node_name(n.name) for n in pipeline.nodes],
        )

    def _generate_imports_for_sproc(self, dependencies_dir, snowflake_stage_name):
@@ -270,6 +334,43 @@ def _drop_and_recreate_stages(self, *stages):
    def snowflake_session(self):
        return Session.builder.configs(self.connection_parameters).create()

    def _construct_kedro_snowflake_mlflow_root_sproc(self, stage_location: str):
        experiment_name = self.config.snowflake.mlflow.experiment_name
        experiment_get_by_name_func = (
            self.config.snowflake.mlflow.functions.experiment_get_by_name
        )
        run_create_func = self.config.snowflake.mlflow.functions.run_create
        experiment_id = (
            self.snowflake_session.sql(
                f"SELECT {experiment_get_by_name_func}('{experiment_name}'):body.experiments[0].experiment_id"
            ).collect()[0][0]
        ).strip(" \"'\t\r\n")
        mlflow_config = self.config.snowflake.mlflow.dict()

        def mlflow_start_run(session: Session) -> str:
            run_id = (
                session.sql(
                    f"SELECT {run_create_func}({experiment_id}):body.run.info.run_id"
                ).collect()[0][0]
            ).strip(" \"'\t\r\n")
            mlflow_config["run_id"] = run_id
            mlflow_config_json = json.dumps(mlflow_config)
            session.sql(
                f"call system$set_return_value('{mlflow_config_json}');"
            ).collect()
            return run_id

        return sproc(
            func=mlflow_start_run,
            name=self._mlfow_root_sproc_name,
            is_permanent=True,
            replace=True,
            stage_location=stage_location,
            packages=["snowflake-snowpark-python"],
            execute_as="caller",
            session=self.snowflake_session,
        )

    def _construct_kedro_snowflake_root_sproc(self, stage_location: str):
        def kedro_start_run(session: Session) -> str:
            from uuid import uuid4
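
The MLflow root sproc above hands the enriched config to downstream tasks through Snowflake's task return-value mechanism (`system$set_return_value`). A sketch of the consuming side, which this diff does not show - the task name argument is illustrative:

```python
import json

from snowflake.snowpark import Session


def read_mlflow_config(session: Session, predecessor_task_name: str) -> dict:
    # Inside a downstream task, the JSON written by mlflow_start_run via
    # system$set_return_value can be read back from the predecessor task.
    raw = session.sql(
        f"select system$get_predecessor_return_value('{predecessor_task_name}')"
    ).collect()[0][0]
    # Contains experiment_name, functions, stage, and the freshly minted run_id.
    return json.loads(raw)


# e.g. cfg = read_mlflow_config(session, "KEDRO_DEFAULT_MLFLOW_START_TASK")
# cfg["run_id"] then identifies the MLflow run for metric/param logging.
```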
8 changes: 5 additions & 3 deletions kedro_snowflake/starters/snowflights/cookiecutter.json
@@ -6,7 +6,9 @@
    "snowflake_account": "",
    "snowflake_user": "",
    "snowflake_warehouse": "",
    "snowflake_database": "KEDRO",
    "snowflake_schema": "PUBLIC",
    "snowflake_password_env_variable": "SNOWFLAKE_PASSWORD"
    "snowflake_database": "DEMO",
    "snowflake_schema": "DEMO",
    "snowflake_password_env_variable": "SNOWFLAKE_PASSWORD",
    "pipeline_name": "default",
    "enable_mlflow_integration": "False"
}