ML/AutoML: Fix testing after streamlining the connectivity configuration
Concatenating the `schema` query parameter to the SQLAlchemy connection
string correctly is crucial. To avoid anomalies and confusion, this patch
configures both types of connection strings (regular data vs. MLflow
tracking) side by side, making it easier to understand what is going on.
amotl committed Dec 4, 2023
1 parent 1d0afc6 commit 5f6e1f4
Showing 8 changed files with 114 additions and 78 deletions.
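The side-by-side configuration described in the commit message can be sketched as follows. The names (`CONNECTION_STRING`, `DBURI_DATA`, `DBURI_MLFLOW`) mirror those introduced by the diff; the localhost default is just an illustrative fallback.

```python
import os

# Base connection string. The default assumes a local CrateDB without TLS;
# override it via the CRATEDB_CONNECTION_STRING environment variable.
CONNECTION_STRING = os.environ.get(
    "CRATEDB_CONNECTION_STRING",
    "crate://crate@localhost/?ssl=false",
)

# Derive both connection strings side by side, so the `schema` query
# parameter is concatenated consistently: one schema for regular data,
# one for MLflow tracking and model registry data.
DBURI_DATA = f"{CONNECTION_STRING}&schema=testdrive"
DBURI_MLFLOW = f"{CONNECTION_STRING}&schema=mlflow"
```

Because the base string already carries a `?` query separator, appending with `&` yields a well-formed URL in both cases.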
30 changes: 25 additions & 5 deletions topic/machine-learning/automl/README.md
Original file line number Diff line number Diff line change
@@ -71,11 +71,31 @@ and [CrateDB].
performing model. The notebook also shows how to use CrateDB as storage for
both the raw data and the experiment tracking and model registry data.

- Accompanied to the Jupyter Notebook files, there are also basic variants of
the above examples,
[automl_timeseries_forecasting_with_pycaret.py](automl_timeseries_forecasting_with_pycaret.py),
[automl_classification_with_pycaret.py](automl_classification_with_pycaret.py).
- Accompanying the Jupyter Notebook files, there are also basic standalone
program variants of the above examples.
- [automl_timeseries_forecasting_with_pycaret.py](automl_timeseries_forecasting_with_pycaret.py),
- [automl_classification_with_pycaret.py](automl_classification_with_pycaret.py).


## Software Tests

The resources are validated by corresponding software tests on CI. You can
also use those on your workstation. For example, to invoke the test cases
validating the notebook about classification with PyCaret, run:

```shell
pytest -k automl_classification_with_pycaret.ipynb
```

Alternatively, you can validate all resources in this folder by invoking a
test runner program on the top-level folder of this repository. This is the
same code path the CI jobs take.

```shell
pip install -r requirements.txt
ngr test topic/machine-learning/automl
```


[PyCaret]: https://github.com/pycaret/pycaret
[CrateDB]: https://github.com/crate/crate
[Introduction to hyperparameter tuning]: https://medium.com/analytics-vidhya/comparison-of-hyperparameter-tuning-algorithms-grid-search-random-search-bayesian-optimization-5326aaef1bd1
@@ -167,17 +167,21 @@
"source": [
"import os\n",
"\n",
"# For CrateDB Cloud, use:\n",
"# Define database connectivity when connecting to CrateDB Cloud.\n",
"CONNECTION_STRING = os.environ.get(\n",
" \"CRATEDB_CONNECTION_STRING\",\n",
" \"crate://username:password@hostname/?ssl=true\",\n",
")\n",
"\n",
"# For an self-deployed CrateDB, e.g. via Docker, please use:\n",
"# Define database connectivity when connecting to CrateDB on localhost.\n",
"# CONNECTION_STRING = os.environ.get(\n",
"# \"CRATEDB_CONNECTION_STRING\",\n",
"# \"crate://crate@localhost/?ssl=false\",\n",
"# )"
"# )\n",
"\n",
"# Compute derived connection strings for SQLAlchemy use vs. MLflow use.\n",
"DBURI_DATA = f\"{CONNECTION_STRING}&schema=testdrive\"\n",
"DBURI_MLFLOW = f\"{CONNECTION_STRING}&schema=mlflow\""
]
},
{
@@ -188,11 +192,13 @@
"\n",
"For convenience, this notebook comes with an accompanying CSV dataset which you\n",
"can quickly import into the database. Upload the CSV file to your CrateDB cloud\n",
"cluster, as described [here](https://cratedb.com/docs/cloud/en/latest/reference/overview.html#import).\n",
"cluster, as described at [CrateDB Cloud » Import].\n",
"To follow this notebook, choose `pycaret_churn` for your table name.\n",
"\n",
"This will automatically create a new database table and import the data.\n",
"\n",
"[CrateDB Cloud » Import]: https://cratedb.com/docs/cloud/en/latest/reference/overview.html#import\n",
"\n",
"### Alternative data import using code\n",
"\n",
"If you prefer to use code to import your data, please execute the following lines which read the CSV\n",
@@ -212,12 +218,16 @@
"if os.path.exists(\".env\"):\n",
" dotenv.load_dotenv(\".env\", override=True)\n",
"\n",
"engine = sa.create_engine(CONNECTION_STRING, echo=os.environ.get('DEBUG'))\n",
"# Connect to database.\n",
"engine = sa.create_engine(DBURI_DATA, echo=bool(os.environ.get('DEBUG')))\n",
"\n",
"# Import data.\n",
"df = pd.read_csv(\"https://github.com/crate/cratedb-datasets/raw/main/machine-learning/automl/churn-dataset.csv\")\n",
"df.to_sql(\"pycaret_churn\", engine, schema=\"testdrive\", index=False, chunksize=1000, if_exists=\"replace\")\n",
"\n",
"# CrateDB is eventually consistent, so synchronize write operations.\n",
"with engine.connect() as conn:\n",
" df.to_sql(\"pycaret_churn\", conn, index=False, chunksize=1000, if_exists=\"replace\")\n",
" conn.execute(sa.text(\"REFRESH TABLE pycaret_churn;\"))"
" conn.execute(sa.text(\"REFRESH TABLE pycaret_churn\"))"
]
},
{
@@ -250,16 +260,14 @@
"if os.path.exists(\".env\"):\n",
" dotenv.load_dotenv(\".env\", override=True)\n",
"\n",
"engine = sa.create_engine(CONNECTION_STRING, echo=os.environ.get('DEBUG'))\n",
"engine = sa.create_engine(DBURI_DATA, echo=bool(os.environ.get('DEBUG')))\n",
"\n",
"with engine.connect() as conn:\n",
" with conn.execute(sa.text(\"SELECT * FROM pycaret_churn\")) as cursor:\n",
" data = pd.DataFrame(cursor.fetchall(), columns=cursor.keys())\n",
"\n",
"# We set the MLFLOW_TRACKING_URI to our CrateDB instance. We'll see later why\n",
"os.environ[\n",
" \"MLFLOW_TRACKING_URI\"\n",
"] = f\"{CONNECTION_STRING}&schema=mlflow\""
"# Configure MLflow to use CrateDB.\n",
"os.environ[\"MLFLOW_TRACKING_URI\"] = DBURI_MLFLOW"
]
},
{
@@ -3441,9 +3449,8 @@
"metadata": {},
"outputs": [],
"source": [
"os.environ[\n",
" \"MLFLOW_TRACKING_URI\"\n",
"] = f\"{CONNECTION_STRING}&schema=mlflow\""
"# Configure MLflow to use CrateDB.\n",
"os.environ[\"MLFLOW_TRACKING_URI\"] = DBURI_MLFLOW"
]
},
{
@@ -17,16 +17,26 @@
dotenv.load_dotenv(".env", override=True)


# Configure database connection string.
dburi = f"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl={os.environ['CRATE_SSL']}"
os.environ["MLFLOW_TRACKING_URI"] = f"{dburi}&schema=mlflow"
# Configure to connect to CrateDB server on localhost.
CONNECTION_STRING = os.environ.get(
"CRATEDB_CONNECTION_STRING",
"crate://crate@localhost/?ssl=false",
)

# Compute derived connection strings for SQLAlchemy use vs. MLflow use.
DBURI_DATA = f"{CONNECTION_STRING}&schema=testdrive"
DBURI_MLFLOW = f"{CONNECTION_STRING}&schema=mlflow"

# Propagate database connectivity settings.
engine = sa.create_engine(DBURI_DATA, echo=bool(os.environ.get("DEBUG")))
os.environ["MLFLOW_TRACKING_URI"] = DBURI_MLFLOW


def fetch_data():
"""
Fetch data from CrateDB, using SQL and SQLAlchemy, and wrap result into pandas data frame.
"""
engine = sa.create_engine(dburi, echo=True)
engine = sa.create_engine(DBURI_DATA, echo=True)

with engine.connect() as conn:
with conn.execute(sa.text("SELECT * FROM pycaret_churn")) as cursor:
@@ -160,17 +160,21 @@
"source": [
"import os\n",
"\n",
"# For CrateDB Cloud, use:\n",
"# Define database connectivity when connecting to CrateDB Cloud.\n",
"CONNECTION_STRING = os.environ.get(\n",
" \"CRATEDB_CONNECTION_STRING\",\n",
" \"crate://username:password@hostname/?ssl=true\",\n",
")\n",
"\n",
"# For an self-deployed CrateDB, e.g. via Docker, please use:\n",
"# Define database connectivity when connecting to CrateDB on localhost.\n",
"# CONNECTION_STRING = os.environ.get(\n",
"# \"CRATEDB_CONNECTION_STRING\",\n",
"# \"crate://crate@localhost/?ssl=false\",\n",
"# )"
"# )\n",
"\n",
"# Compute derived connection strings for SQLAlchemy use vs. MLflow use.\n",
"DBURI_DATA = f\"{CONNECTION_STRING}&schema=testdrive\"\n",
"DBURI_MLFLOW = f\"{CONNECTION_STRING}&schema=mlflow\""
]
},
{
@@ -239,21 +243,21 @@
"data[\"total_sales\"] = data[\"unit_price\"] * data[\"quantity\"]\n",
"data[\"date\"] = pd.to_datetime(data[\"date\"])\n",
"\n",
"# Insert the data into CrateDB\n",
"engine = sa.create_engine(CONNECTION_STRING, echo=os.environ.get(\"DEBUG\"))\n",
"# Connect to database.\n",
"engine = sa.create_engine(DBURI_DATA, echo=bool(os.environ.get(\"DEBUG\")))\n",
"\n",
"with engine.connect() as conn:\n",
" data.to_sql(\n",
" \"sales_data_for_forecast\",\n",
" conn,\n",
" index=False,\n",
" chunksize=1000,\n",
" if_exists=\"replace\",\n",
" )\n",
"# Import data.\n",
"data.to_sql(\n",
" \"sales_data_for_forecast\",\n",
" engine,\n",
" index=False,\n",
" chunksize=1000,\n",
" if_exists=\"replace\",\n",
")\n",
"\n",
" # Refresh table to make sure the data is available for querying - as CrateDB\n",
" # is eventually consistent\n",
" conn.execute(sa.text(\"REFRESH TABLE sales_data_for_forecast;\"))"
"# CrateDB is eventually consistent, so synchronize write operations.\n",
"with engine.connect() as conn:\n",
" conn.execute(sa.text(\"REFRESH TABLE sales_data_for_forecast\"))"
]
},
{
@@ -288,8 +292,8 @@
"\n",
"data[\"month\"] = pd.to_datetime(data['month'], unit='ms')\n",
"\n",
"# We set the MLFLOW_TRACKING_URI to our CrateDB instance. We'll see later why\n",
"os.environ[\"MLFLOW_TRACKING_URI\"] = f\"{CONNECTION_STRING}&schema=mlflow\""
"# Configure MLflow to use CrateDB.\n",
"os.environ[\"MLFLOW_TRACKING_URI\"] = DBURI_MLFLOW"
]
},
{
@@ -2122,9 +2126,8 @@
"metadata": {},
"outputs": [],
"source": [
"os.environ[\n",
" \"MLFLOW_TRACKING_URI\"\n",
"] = f\"{CONNECTION_STRING}&schema=mlflow\""
"# Configure MLflow to use CrateDB.\n",
"os.environ[\"MLFLOW_TRACKING_URI\"] = DBURI_MLFLOW"
]
},
{
@@ -17,10 +17,19 @@
if os.path.isfile(".env"):
load_dotenv(".env", override=True)

# Configure database connection string.
dburi = f"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl={os.environ['CRATE_SSL']}"
engine = sa.create_engine(dburi, echo=os.environ.get("DEBUG"))
os.environ["MLFLOW_TRACKING_URI"] = f"{dburi}&schema=mlflow"
# Configure to connect to CrateDB server on localhost.
CONNECTION_STRING = os.environ.get(
"CRATEDB_CONNECTION_STRING",
"crate://crate@localhost/?ssl=false",
)

# Compute derived connection strings for SQLAlchemy use vs. MLflow use.
DBURI_DATA = f"{CONNECTION_STRING}&schema=testdrive"
DBURI_MLFLOW = f"{CONNECTION_STRING}&schema=mlflow"

# Propagate database connectivity settings.
engine = sa.create_engine(DBURI_DATA, echo=bool(os.environ.get("DEBUG")))
os.environ["MLFLOW_TRACKING_URI"] = DBURI_MLFLOW


def prepare_data():
@@ -37,7 +46,7 @@ def prepare_data():
data["date"] = pd.to_datetime(data["date"])

# Insert the data into CrateDB
engine = sa.create_engine(dburi, echo=os.environ.get("DEBUG"))
engine = sa.create_engine(DBURI_DATA, echo=bool(os.environ.get("DEBUG")))

with engine.connect() as conn:
data.to_sql(
1 change: 1 addition & 0 deletions topic/machine-learning/automl/backlog.md
@@ -1,3 +1,4 @@
# Backlog

- Describe / program how to import `churn-dataset.csv`.
- Format and lint notebooks using `black` and `ruff`.
36 changes: 10 additions & 26 deletions topic/machine-learning/automl/pyproject.toml
@@ -1,15 +1,11 @@
[tool.pytest.ini_options]
minversion = "2.0"
addopts = """
-rfEX -p pytester --strict-markers --verbosity=3 --capture=no
-rfEX -p pytester --strict-markers --verbosity=3
"""
# --cov=. --cov-report=term-missing --cov-report=xml
env = [
"CRATEDB_CONNECTION_STRING=crate://crate@localhost/?schema=testdrive",
"CRATE_USER=crate",
"CRATE_PASSWORD=",
"CRATE_HOST=localhost",
"CRATE_SSL=false",
"CRATEDB_CONNECTION_STRING=crate://crate@localhost/?ssl=false",
"PYDEVD_DISABLE_FILE_VALIDATION=1",
]

@@ -26,8 +22,8 @@ markers = [
# pytest-notebook settings
nb_test_files = true
nb_coverage = false
# 120 seconds is too less on CI/GHA
nb_exec_timeout = 300
# Default cell timeout is 120 seconds. For heavy computing, it needs to be increased.
nb_exec_timeout = 240
nb_diff_replace = [
# Compensate output of `crash`.
'"/cells/*/outputs/*/text" "\(\d.\d+ sec\)" "(0.000 sec)"',
@@ -47,24 +43,12 @@ nb_diff_ignore = [
"/cells/*/outputs/*/metadata/nbreg",
# Ignore images.
"/cells/*/outputs/*/data/image/png",
# FIXME: Those pacifiers should be revisited.
# Some are warnings, some are semantic ambiguities.
# Maybe they can be improved in one way or another,
# for improved QA.
"/cells/5/outputs",
"/cells/14/outputs",
"/cells/16/outputs",
"/cells/16/outputs",
"/cells/18/outputs",
"/cells/22/outputs",
"/cells/24/outputs",
"/cells/30/outputs/0/data/application/vnd.jupyter.widget-view+json",
"/cells/34/outputs",
"/cells/36/outputs",
"/cells/40/outputs",
# automl_timeseries_forecasting_with_pycaret.ipynb
"/cells/19/outputs",
"/cells/33/outputs",
# Ignore all cell output. It is too tedious to compare and maintain.
# The validation thus only covers the _execution_ of notebook cells,
# catching syntax errors, module import flaws, and runtime errors.
# However, it will not catch regressions in actual cell output,
# or whether any output is produced at all.
"/cells/*/outputs",
]

[tool.coverage.run]
6 changes: 4 additions & 2 deletions topic/machine-learning/automl/test.py
@@ -1,7 +1,7 @@
"""
## About
Test cases for classification model examples with CrateDB, PyCaret and MLflow.
Test cases for classification and forecasting examples with CrateDB, PyCaret, and MLflow.
## Synopsis
@@ -17,6 +17,7 @@
pytest -k notebook
```
"""
import os
from pathlib import Path

import pytest
@@ -32,7 +33,8 @@ def cratedb() -> DatabaseAdapter:
"""
Provide test cases with a connection to CrateDB, with additional tooling.
"""
return DatabaseAdapter(dburi="crate://crate@localhost:4200")
dburi = os.environ.get("CRATEDB_CONNECTION_STRING")
return DatabaseAdapter(dburi=f"{dburi}&schema=testdrive")


@pytest.fixture(scope="function", autouse=True)
