From d67d459d51b1c83cf2151e80fa9e159770409cb7 Mon Sep 17 00:00:00 2001 From: Maksym Zhytnikov <63515947+Maxxx-zh@users.noreply.github.com> Date: Sat, 18 May 2024 13:21:54 +0300 Subject: [PATCH] [FSTORE-1396] Tutorials Update: Save/load XGBoost model as json file (#265) * Replace pickling xgboost models with saving them as json objects for the model registry --- .../1_citibike_feature_backfill.ipynb | 95 ++++---- .../2_citibike_feature_pipeline.ipynb | 67 ++--- .../3_citibike_training_pipeline.ipynb | 82 ++++--- .../citibike/4_citibike_batch_inference.ipynb | 61 ++--- .../citibike/features/citibike.py | 14 +- .../citibike/streamlit_batch_inference_app.py | 21 +- .../1_credit_scores_feature_backfill.ipynb | 132 +++++----- .../2_credit_scores_feature_pipeline.ipynb | 88 +++---- .../3_credit_scores_training_pipeline.ipynb | 93 +++---- .../4_credit_scores_batch_inference.ipynb | 51 ++-- .../1_nyc_taxi_fares_feature_backfill.ipynb | 44 ++-- .../2_nyc_taxi_fares_feature_pipeline.ipynb | 63 ++--- .../3_nyc_taxi_fares_training_pipeline.ipynb | 25 +- .../4_nyc_taxi_fares_batch_inference.ipynb | 53 ++-- .../streamlit_batch_inference_app.py | 14 +- .../timeseries/features/price.py | 2 +- .../custom_transformation_functions.ipynb | 163 +++++++------ churn/1_churn_feature_pipeline.ipynb | 60 ++--- churn/2_churn_training_pipeline.ipynb | 39 +-- churn/3_churn_batch_inference.ipynb | 93 +++---- churn/streamlit_batch_inference_app.py | 15 +- .../1_fraud_batch_feature_pipeline.ipynb | 84 +++---- .../2_fraud_batch_training_pipeline.ipynb | 17 +- fraud_batch/3_fraud_batch_inference.ipynb | 49 ++-- .../1_fraud_online_feature_pipeline.ipynb | 84 +++---- .../2_fraud_online_training_pipeline.ipynb | 24 +- .../3_fraud_online_inference_pipeline.ipynb | 40 +-- .../1-loan-approval-feature-pipeline.ipynb | 228 +++++------------- .../2-loan-approval-training-pipeline.ipynb | 92 +++---- .../3-loan-approval-batch-inference.ipynb | 54 ++--- 30 files changed, 957 insertions(+), 990 deletions(-) mode 
change 100755 => 100644 fraud_batch/1_fraud_batch_feature_pipeline.ipynb mode change 100755 => 100644 fraud_online/1_fraud_online_feature_pipeline.ipynb diff --git a/advanced_tutorials/citibike/1_citibike_feature_backfill.ipynb b/advanced_tutorials/citibike/1_citibike_feature_backfill.ipynb index 9144b5db..8dafddbb 100644 --- a/advanced_tutorials/citibike/1_citibike_feature_backfill.ipynb +++ b/advanced_tutorials/citibike/1_citibike_feature_backfill.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "7601bd38", + "id": "3853d219", "metadata": { "id": "ccbbf2cc" }, @@ -23,7 +23,7 @@ }, { "cell_type": "markdown", - "id": "ed32afc3", + "id": "6d0a2c45", "metadata": { "id": "akyCpdrP0GDH" }, @@ -34,7 +34,7 @@ { "cell_type": "code", "execution_count": null, - "id": "990e400c", + "id": "22629764", "metadata": {}, "outputs": [], "source": [ @@ -45,7 +45,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9592775e", + "id": "cf987f2e", "metadata": { "id": "c3fd23b4" }, @@ -58,7 +58,10 @@ "\n", "from pandas.tseries.holiday import USFederalHolidayCalendar\n", "\n", - "from features import citibike, meteorological_measurements\n", + "from features import (\n", + " citibike, \n", + " meteorological_measurements,\n", + ")\n", "\n", "# Mute warnings\n", "import warnings\n", @@ -67,7 +70,7 @@ }, { "cell_type": "markdown", - "id": "c0557cde", + "id": "0166a759", "metadata": { "id": "KVH8VU5g0JDP" }, @@ -77,7 +80,7 @@ }, { "cell_type": "markdown", - "id": "86c5470d", + "id": "e37497d5", "metadata": {}, "source": [ "## ๐Ÿ’ฝ Load the historical data and ๐Ÿ› ๏ธ Perform Feature Engineering\n", @@ -91,7 +94,7 @@ }, { "cell_type": "markdown", - "id": "22340095", + "id": "30f64562", "metadata": { "id": "285d06c9" }, @@ -118,7 +121,7 @@ }, { "cell_type": "markdown", - "id": "ae44de76", + "id": "da2ffb7c", "metadata": {}, "source": [ "Let's download some data [from here](https://s3.amazonaws.com/tripdata/index.html) and perform preprocessing (removal of redundant 
columns and data grouping)" @@ -127,7 +130,7 @@ { "cell_type": "code", "execution_count": null, - "id": "54627832", + "id": "b19dedd6", "metadata": { "scrolled": true }, @@ -141,7 +144,7 @@ { "cell_type": "code", "execution_count": null, - "id": "95eb7665", + "id": "9f6559bb", "metadata": {}, "outputs": [], "source": [ @@ -152,7 +155,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d496fdd7", + "id": "267edd82", "metadata": {}, "outputs": [], "source": [ @@ -172,7 +175,7 @@ { "cell_type": "code", "execution_count": null, - "id": "58f8f21f", + "id": "6f4ef6d9", "metadata": { "scrolled": true }, @@ -188,7 +191,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cba9dfee", + "id": "d916a111", "metadata": {}, "outputs": [], "source": [ @@ -198,7 +201,7 @@ }, { "cell_type": "markdown", - "id": "aedc5568", + "id": "645c42b0", "metadata": {}, "source": [ "### ๐Ÿ“’ Citibike stations info" @@ -207,7 +210,7 @@ { "cell_type": "code", "execution_count": null, - "id": "63a68bbd", + "id": "29d136ad", "metadata": {}, "outputs": [], "source": [ @@ -218,7 +221,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d960d9ac", + "id": "72e40b6b", "metadata": {}, "outputs": [], "source": [ @@ -235,7 +238,7 @@ { "cell_type": "code", "execution_count": null, - "id": "105028e5", + "id": "66243560", "metadata": {}, "outputs": [], "source": [ @@ -246,7 +249,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f0360a2a", + "id": "f5322a07", "metadata": {}, "outputs": [], "source": [ @@ -273,7 +276,7 @@ }, { "cell_type": "markdown", - "id": "517224b3", + "id": "1eee1015", "metadata": {}, "source": [ "### ๐Ÿ“… US holidays" @@ -282,7 +285,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ca65a9b6", + "id": "03775cbe", "metadata": {}, "outputs": [], "source": [ @@ -305,7 +308,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fa12d5a0", + "id": "ff976b80", "metadata": {}, "outputs": [], "source": [ @@ -325,7 +328,7 @@ { "cell_type": 
"code", "execution_count": null, - "id": "e93487c9", + "id": "db30ed56", "metadata": {}, "outputs": [], "source": [ @@ -340,7 +343,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0c044390", + "id": "7574b15d", "metadata": {}, "outputs": [], "source": [ @@ -357,7 +360,7 @@ { "cell_type": "code", "execution_count": null, - "id": "73a33b2d", + "id": "76b5c737", "metadata": {}, "outputs": [], "source": [ @@ -366,7 +369,7 @@ }, { "cell_type": "markdown", - "id": "7e5f8bd1", + "id": "0405e960", "metadata": {}, "source": [ "### ๐ŸŒค Meteorological measurements from VisualCrossing" @@ -374,7 +377,7 @@ }, { "cell_type": "markdown", - "id": "ba160d85-10f3-4d7b-b9b4-efd706b6ceba", + "id": "96d72937", "metadata": {}, "source": [ "You will parse weather data so you should get an API key from [VisualCrossing](https://www.visualcrossing.com/). You can use [this link](https://www.visualcrossing.com/weather-api).\n", @@ -391,7 +394,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c619138e", + "id": "271bed12", "metadata": {}, "outputs": [], "source": [ @@ -405,7 +408,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0b2515c9", + "id": "2ad07dd2", "metadata": {}, "outputs": [], "source": [ @@ -421,7 +424,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b67b32d9", + "id": "d4875559", "metadata": {}, "outputs": [], "source": [ @@ -439,7 +442,7 @@ }, { "cell_type": "markdown", - "id": "49834242", + "id": "4a826897", "metadata": { "id": "H1aYmOX60MXj" }, @@ -449,7 +452,7 @@ }, { "cell_type": "markdown", - "id": "dab06068", + "id": "b204ebdc", "metadata": {}, "source": [ "## ๐Ÿ“ก Connecting to Hopsworks Feature Store " @@ -458,7 +461,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c553b1f8", + "id": "ba3395d2", "metadata": { "id": "cae776d7" }, @@ -473,7 +476,7 @@ }, { "cell_type": "markdown", - "id": "da6ba183", + "id": "be0dfb9d", "metadata": {}, "source": [ "---" @@ -481,7 +484,7 @@ }, { "cell_type": "markdown", - "id": 
"5b93d95e", + "id": "f55978ba", "metadata": {}, "source": [ "## ๐Ÿช„ Creating Feature Groups \n", @@ -492,7 +495,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8d715bd8", + "id": "6501923e", "metadata": {}, "outputs": [], "source": [ @@ -508,7 +511,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3e10b145", + "id": "81c75fd2", "metadata": { "scrolled": true }, @@ -520,7 +523,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c7af8432", + "id": "98c5b065", "metadata": { "id": "c691d509" }, @@ -537,7 +540,7 @@ { "cell_type": "code", "execution_count": null, - "id": "29631f51", + "id": "3e821b95", "metadata": { "id": "67228279" }, @@ -549,7 +552,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8a5c6266", + "id": "9f1736bf", "metadata": {}, "outputs": [], "source": [ @@ -565,7 +568,7 @@ { "cell_type": "code", "execution_count": null, - "id": "014d4c35", + "id": "aab10254", "metadata": {}, "outputs": [], "source": [ @@ -575,7 +578,7 @@ { "cell_type": "code", "execution_count": null, - "id": "847ab26c", + "id": "010f1a00", "metadata": {}, "outputs": [], "source": [ @@ -591,7 +594,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d72fd5f1", + "id": "695d18c5", "metadata": { "scrolled": true }, @@ -602,7 +605,7 @@ }, { "cell_type": "markdown", - "id": "a373a025", + "id": "77dddcf2", "metadata": {}, "source": [ "## โญ๏ธ **Next:** Part 02: Feature Pipeline \n", @@ -617,7 +620,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -631,7 +634,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/advanced_tutorials/citibike/2_citibike_feature_pipeline.ipynb b/advanced_tutorials/citibike/2_citibike_feature_pipeline.ipynb index a7378fee..737a00f9 100644 --- 
a/advanced_tutorials/citibike/2_citibike_feature_pipeline.ipynb +++ b/advanced_tutorials/citibike/2_citibike_feature_pipeline.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "74b6c01c", + "id": "d179fa38", "metadata": {}, "source": [ "# **Hopsworks Feature Store** - Part 02: Feature Pipeline\n", @@ -15,7 +15,7 @@ }, { "cell_type": "markdown", - "id": "8d022a10", + "id": "66df50de", "metadata": {}, "source": [ "### ๐Ÿ“ Imports" @@ -24,7 +24,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7faa949f", + "id": "5b87f18e", "metadata": {}, "outputs": [], "source": [ @@ -32,7 +32,10 @@ "import pandas as pd\n", "import os\n", "\n", - "from features import citibike, meteorological_measurements\n", + "from features import (\n", + " citibike, \n", + " meteorological_measurements,\n", + ")\n", "\n", "# Mute warnings\n", "import warnings\n", @@ -41,7 +44,7 @@ }, { "cell_type": "markdown", - "id": "d3dc1ac1", + "id": "b33339b0", "metadata": {}, "source": [ "---" @@ -49,7 +52,7 @@ }, { "cell_type": "markdown", - "id": "77939976", + "id": "1dfacd9f", "metadata": {}, "source": [ "## ๐Ÿ“ก Connecting to Hopsworks Feature Store " @@ -58,7 +61,7 @@ { "cell_type": "code", "execution_count": null, - "id": "608986f1", + "id": "3046b3e4", "metadata": {}, "outputs": [], "source": [ @@ -72,7 +75,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2cc9d553", + "id": "bd7ce002", "metadata": {}, "outputs": [], "source": [ @@ -90,7 +93,7 @@ }, { "cell_type": "markdown", - "id": "bf4c2d52", + "id": "6c7b15a8", "metadata": {}, "source": [ "### ๐Ÿ“… Getting tha last date\n" @@ -99,7 +102,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6f339455", + "id": "ffeaec13", "metadata": { "scrolled": true }, @@ -113,7 +116,7 @@ { "cell_type": "code", "execution_count": null, - "id": "41c8bce4", + "id": "e800d919", "metadata": {}, "outputs": [], "source": [ @@ -125,7 +128,7 @@ { "cell_type": "code", "execution_count": null, - "id": "93633194", + "id": 
"a97c80e1", "metadata": {}, "outputs": [], "source": [ @@ -137,7 +140,7 @@ }, { "cell_type": "markdown", - "id": "dce30ea9", + "id": "05fabd24", "metadata": {}, "source": [ "---" @@ -145,7 +148,7 @@ }, { "cell_type": "markdown", - "id": "2bfde864", + "id": "fd72808f", "metadata": {}, "source": [ "## ๐Ÿช„ Parsing new data" @@ -153,7 +156,7 @@ }, { "cell_type": "markdown", - "id": "d41cb2c5", + "id": "32c5efb2", "metadata": {}, "source": [ "### ๐Ÿšฒ Citibike usage info" @@ -162,7 +165,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2ca902e0", + "id": "63aa1c27", "metadata": {}, "outputs": [], "source": [ @@ -177,7 +180,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6efc7e2f", + "id": "19a46498", "metadata": {}, "outputs": [], "source": [ @@ -190,7 +193,7 @@ { "cell_type": "code", "execution_count": null, - "id": "62cb7d60", + "id": "198d0ba4", "metadata": {}, "outputs": [], "source": [ @@ -206,7 +209,7 @@ }, { "cell_type": "markdown", - "id": "aa39e68f", + "id": "8bf55a65", "metadata": {}, "source": [ "### ๐ŸŒค Meteorological measurements from VisualCrossing" @@ -214,7 +217,7 @@ }, { "cell_type": "markdown", - "id": "623ed453", + "id": "cf040fe7", "metadata": {}, "source": [ "You will parse weather data so you should get an API key from [VisualCrossing](https://www.visualcrossing.com/). 
You can use [this link](https://www.visualcrossing.com/weather-api).\n", @@ -231,7 +234,7 @@ { "cell_type": "code", "execution_count": null, - "id": "525d223f", + "id": "af212086", "metadata": {}, "outputs": [], "source": [ @@ -245,7 +248,7 @@ { "cell_type": "code", "execution_count": null, - "id": "183769e6", + "id": "3773aa84", "metadata": {}, "outputs": [], "source": [ @@ -261,7 +264,7 @@ { "cell_type": "code", "execution_count": null, - "id": "74198dc8", + "id": "ce61c382", "metadata": {}, "outputs": [], "source": [ @@ -273,7 +276,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3b343078", + "id": "28655bc8", "metadata": {}, "outputs": [], "source": [ @@ -288,7 +291,7 @@ }, { "cell_type": "markdown", - "id": "ebdb3fa7", + "id": "f3426f85", "metadata": {}, "source": [ "---" @@ -296,7 +299,7 @@ }, { "cell_type": "markdown", - "id": "d95d789c", + "id": "f5dad7d4", "metadata": {}, "source": [ "## โฌ†๏ธ Uploading new data to the Feature Store" @@ -305,7 +308,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d207b245", + "id": "9b144f46", "metadata": {}, "outputs": [], "source": [ @@ -316,7 +319,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dfd8086a", + "id": "0c4a51b2", "metadata": {}, "outputs": [], "source": [ @@ -326,7 +329,7 @@ }, { "cell_type": "markdown", - "id": "4f39826e", + "id": "a4b670d1", "metadata": {}, "source": [ "---" @@ -334,7 +337,7 @@ }, { "cell_type": "markdown", - "id": "139be1e7", + "id": "84acdbfe", "metadata": {}, "source": [ "## โญ๏ธ **Next:** Part 03: Training Pipeline \n", @@ -345,7 +348,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -359,7 +362,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/advanced_tutorials/citibike/3_citibike_training_pipeline.ipynb 
b/advanced_tutorials/citibike/3_citibike_training_pipeline.ipynb index 0e64708a..2e3d5fb6 100644 --- a/advanced_tutorials/citibike/3_citibike_training_pipeline.ipynb +++ b/advanced_tutorials/citibike/3_citibike_training_pipeline.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "fd5a44c9", + "id": "d1b75d88", "metadata": {}, "source": [ "# **Hopsworks Feature Store** - Part 03: Training Pipeline\n", @@ -21,7 +21,7 @@ }, { "cell_type": "markdown", - "id": "5272c5bf", + "id": "5be42c18", "metadata": {}, "source": [ "### ๐Ÿ“ Imports" @@ -30,11 +30,10 @@ { "cell_type": "code", "execution_count": null, - "id": "f2216406", + "id": "8c8527eb", "metadata": {}, "outputs": [], "source": [ - "import joblib\n", "import os\n", "\n", "import pandas as pd\n", @@ -54,7 +53,7 @@ }, { "cell_type": "markdown", - "id": "5f08ae28", + "id": "329c3705", "metadata": {}, "source": [ "---" @@ -62,7 +61,7 @@ }, { "cell_type": "markdown", - "id": "3c4e057b", + "id": "5327ca0a", "metadata": {}, "source": [ "## ๐Ÿ“ก Connecting to Hopsworks Feature Store " @@ -71,7 +70,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f2910bd8", + "id": "134eefda", "metadata": { "scrolled": true }, @@ -87,7 +86,7 @@ { "cell_type": "code", "execution_count": null, - "id": "09927639", + "id": "b6870ce1", "metadata": {}, "outputs": [], "source": [ @@ -110,7 +109,7 @@ }, { "cell_type": "markdown", - "id": "5c162680", + "id": "b4c983ba", "metadata": {}, "source": [ "---" @@ -118,7 +117,7 @@ }, { "cell_type": "markdown", - "id": "2963c406", + "id": "937eea82", "metadata": {}, "source": [ "## ๐Ÿ– Feature View Creation and Retrieving \n", @@ -129,7 +128,7 @@ { "cell_type": "code", "execution_count": null, - "id": "abddfc94", + "id": "492fe596", "metadata": {}, "outputs": [], "source": [ @@ -148,7 +147,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e4f346fb", + "id": "b293bc00", "metadata": { "scrolled": true }, @@ -160,7 +159,7 @@ }, { "cell_type": "markdown", - "id": 
"aa092bc9", + "id": "8ad20ee3", "metadata": {}, "source": [ "`Feature Views` stands between **Feature Groups** and **Training Dataset**. ะกombining **Feature Groups** we can create **Feature Views** which store a metadata of our data. Having **Feature Views** we can create **Training Dataset**.\n", @@ -185,7 +184,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0bf70fac", + "id": "71cd5362", "metadata": {}, "outputs": [], "source": [ @@ -199,7 +198,7 @@ }, { "cell_type": "markdown", - "id": "99c6f378", + "id": "c2664d7d", "metadata": {}, "source": [ "---\n", @@ -236,7 +235,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a6a7871c", + "id": "abf7cd6c", "metadata": { "scrolled": true }, @@ -253,10 +252,16 @@ { "cell_type": "code", "execution_count": null, - "id": "eb27d58b", + "id": "f9970969", "metadata": {}, "outputs": [], "source": [ + "# Set the multi-level index for the training set using 'date' and 'station_id' columns\n", + "X_train = X_train.set_index([\"date\", \"station_id\"])\n", + "\n", + "# Set the multi-level index for the test set using 'date' and 'station_id' columns\n", + "X_test = X_test.set_index([\"date\", \"station_id\"])\n", + "\n", "# Convert the specified columns in the training set to float type\n", "X_train.iloc[:, 1:-1] = X_train.iloc[:, 1:-1].astype(float)\n", "\n", @@ -270,16 +275,10 @@ { "cell_type": "code", "execution_count": null, - "id": "4d68fc49", + "id": "bec1b8cb", "metadata": {}, "outputs": [], "source": [ - "# Set the multi-level index for the training set using 'date' and 'station_id' columns\n", - "X_train = X_train.set_index([\"date\", \"station_id\"])\n", - "\n", - "# Set the multi-level index for the test set using 'date' and 'station_id' columns\n", - "X_test = X_test.set_index([\"date\", \"station_id\"])\n", - "\n", "# Drop rows with missing values in the training set\n", "X_train.dropna(inplace=True)\n", "\n", @@ -298,7 +297,7 @@ }, { "cell_type": "markdown", - "id": "436bd004", + "id": 
"2d8f9077", "metadata": {}, "source": [ "---\n", @@ -308,7 +307,7 @@ { "cell_type": "code", "execution_count": null, - "id": "11efe09b", + "id": "4fd16a7a", "metadata": {}, "outputs": [], "source": [ @@ -322,7 +321,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2b8e790a", + "id": "692cf009", "metadata": {}, "outputs": [], "source": [ @@ -337,7 +336,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a2db802f", + "id": "e55a7910", "metadata": {}, "outputs": [], "source": [ @@ -362,7 +361,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f60d8fc0", + "id": "b52f9ca7", "metadata": {}, "outputs": [], "source": [ @@ -372,7 +371,7 @@ }, { "cell_type": "markdown", - "id": "641ade81", + "id": "32a55b93", "metadata": {}, "source": [ "---\n", @@ -386,7 +385,7 @@ { "cell_type": "code", "execution_count": null, - "id": "aecf4adf", + "id": "55d29272", "metadata": {}, "outputs": [], "source": [ @@ -398,7 +397,10 @@ "output_schema = Schema(y_train)\n", "\n", "# Create a model schema with the input and output schemas\n", - "model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)\n", + "model_schema = ModelSchema(\n", + " input_schema=input_schema, \n", + " output_schema=output_schema,\n", + ")\n", "\n", "# Convert the model schema to a dictionary\n", "model_schema.to_dict()" @@ -406,7 +408,7 @@ }, { "cell_type": "markdown", - "id": "3c97a550", + "id": "f66fcfcb", "metadata": {}, "source": [ "## ๐Ÿ—„ Model Registry\n", @@ -417,7 +419,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1feacc3e", + "id": "a0838eaa", "metadata": {}, "outputs": [], "source": [ @@ -426,8 +428,8 @@ "if not os.path.isdir(model_dir):\n", " os.mkdir(model_dir)\n", "\n", - "# Save the XGBoost regressor model to the specified directory\n", - "joblib.dump(regressor, model_dir + '/citibike_xgb_model.pkl')\n", + "# Save the XGBoost regressor model as json file to the specified directory\n", + "regressor.save_model(model_dir + 
\"/model.json\")\n", "\n", "# Save the residual plot figure as an image in the model directory\n", "fig.savefig(model_dir + \"/residplot.png\")" @@ -436,7 +438,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0102be25", + "id": "bebbb812", "metadata": {}, "outputs": [], "source": [ @@ -458,7 +460,7 @@ }, { "cell_type": "markdown", - "id": "4d5ec11d", + "id": "87a7b133", "metadata": {}, "source": [ "## โญ๏ธ **Next:** Part 04: Batch Inference \n", @@ -469,7 +471,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -483,7 +485,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/advanced_tutorials/citibike/4_citibike_batch_inference.ipynb b/advanced_tutorials/citibike/4_citibike_batch_inference.ipynb index 5c8a4054..9ac03167 100644 --- a/advanced_tutorials/citibike/4_citibike_batch_inference.ipynb +++ b/advanced_tutorials/citibike/4_citibike_batch_inference.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "c958e52b", + "id": "0986a873", "metadata": {}, "source": [ "# **Hopsworks Feature Store** - Part 04: Batch Inference\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "8855ee1a", + "id": "62c1023b", "metadata": {}, "source": [ "## ๐Ÿ“ Imports" @@ -25,16 +25,16 @@ { "cell_type": "code", "execution_count": null, - "id": "019c9226", + "id": "216b9341", "metadata": {}, "outputs": [], "source": [ - "import joblib" + "from xgboost import XGBRegressor" ] }, { "cell_type": "markdown", - "id": "ce2fe8a8", + "id": "80bbe6ea", "metadata": {}, "source": [ "## ๐Ÿ“ก Connecting to Hopsworks Feature Store " @@ -43,7 +43,7 @@ { "cell_type": "code", "execution_count": null, - "id": "39f83bc9", + "id": "49f4411c", "metadata": {}, "outputs": [], "source": [ @@ -56,7 +56,7 @@ }, { "cell_type": "markdown", - "id": "87485ee0", + "id": 
"b46ff232", "metadata": {}, "source": [ "## โš™๏ธ Feature View Retrieval\n" @@ -65,7 +65,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e622d6b4", + "id": "0222c382", "metadata": {}, "outputs": [], "source": [ @@ -78,7 +78,7 @@ }, { "cell_type": "markdown", - "id": "e1dac8b6", + "id": "43641348", "metadata": {}, "source": [ "## ๐Ÿ—„ Model Registry\n" @@ -87,7 +87,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ca35a9f4", + "id": "30843255", "metadata": {}, "outputs": [], "source": [ @@ -97,7 +97,7 @@ }, { "cell_type": "markdown", - "id": "6f3589dc", + "id": "93255afe", "metadata": {}, "source": [ "## ๐Ÿ“ฎ Retrieving model from Model Registry " @@ -106,7 +106,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6ac8014f", + "id": "b13fd4c8", "metadata": {}, "outputs": [], "source": [ @@ -123,20 +123,21 @@ { "cell_type": "code", "execution_count": null, - "id": "1a1221c7", + "id": "ca2d53b4", "metadata": {}, "outputs": [], "source": [ - "# Load the XGBoost model from the downloaded model directory\n", - "retrieved_xgboost_model = joblib.load(saved_model_dir + \"/citibike_xgb_model.pkl\")\n", + "# Initialize the model\n", + "model = XGBRegressor()\n", "\n", - "# Display the retrieved XGBoost model\n", - "retrieved_xgboost_model" + "# Load the model from a saved JSON file\n", + "model.load_model(saved_model_dir + \"/model.json\")\n", + "model" ] }, { "cell_type": "markdown", - "id": "3ad20124", + "id": "24dc2b3d", "metadata": {}, "source": [ "## ๐Ÿค– Making the predictions " @@ -144,7 +145,7 @@ }, { "cell_type": "markdown", - "id": "21c82dc9", + "id": "ffad1cc9", "metadata": {}, "source": [ "### โœจ Load Batch Data" @@ -153,7 +154,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7d10c618", + "id": "2fffa43f", "metadata": {}, "outputs": [], "source": [ @@ -173,26 +174,26 @@ { "cell_type": "code", "execution_count": null, - "id": "7b6410e0", + "id": "32efe93e", "metadata": {}, "outputs": [], "source": [ - "# Convert the 
specified columns in the batch data to float type\n", - "batch_data.iloc[:, 1:-1] = batch_data.iloc[:, 1:-1].astype(float)\n", - "\n", "# Set the multi-level index for the batch data using 'date' and 'station_id' columns\n", - "X_batch = batch_data.set_index([\"date\", \"station_id\"])" + "X_batch = batch_data.set_index([\"date\", \"station_id\"])\n", + "\n", + "# Convert the specified columns in the batch data to float type\n", + "X_batch.iloc[:, 1:-1] = X_batch.iloc[:, 1:-1].astype(float)" ] }, { "cell_type": "code", "execution_count": null, - "id": "9922d9ef", + "id": "8d424819", "metadata": {}, "outputs": [], "source": [ "# Make predictions using the retrieved XGBoost model on the batch data\n", - "predictions = retrieved_xgboost_model.predict(X_batch)\n", + "predictions = model.predict(X_batch)\n", "\n", "# Display the first 10 predictions\n", "predictions[:10]" @@ -200,7 +201,7 @@ }, { "cell_type": "markdown", - "id": "89bc07f2", + "id": "354d65a7", "metadata": {}, "source": [ "---\n", @@ -216,7 +217,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -230,7 +231,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/advanced_tutorials/citibike/features/citibike.py b/advanced_tutorials/citibike/features/citibike.py index 04bc4ce6..eef5b623 100644 --- a/advanced_tutorials/citibike/features/citibike.py +++ b/advanced_tutorials/citibike/features/citibike.py @@ -71,10 +71,12 @@ def get_last_date_in_fg(fg) -> str: Returns: str: Last date string in the format '%Y-%m-%d'. 
""" - for col in fg.statistics.content["columns"]: - if col["column"] == "timestamp": - res = col["maximum"] - return convert_unix_to_date(res) + date_max = [ + int(feature.max) + for feature in fg.statistics.feature_descriptive_statistics + if feature.feature_name == 'timestamp' + ][0] + return convert_unix_to_date(date_max) ############################################################################### @@ -144,10 +146,10 @@ def update_month_data(main_df: pd.DataFrame, month: str, year: str) -> pd.DataFr print(f"_____ Processing {month}/{year}... _____") if f"{year}{month}" in ["202206", "202207"]: - citibike = "citbike" + citibike = "citibike" else: citibike = "citibike" - url = f'https://s3.amazonaws.com/tripdata/{year}{month}-{citibike}-tripdata.csv.zip' + url = f'https://s3.amazonaws.com/tripdata/JC-{year}{month}-{citibike}-tripdata.csv.zip' print(url) filename = "data/" + url.split('/')[-1].split(".")[0] + ".csv" fn_list = filename.split(".") diff --git a/advanced_tutorials/citibike/streamlit_batch_inference_app.py b/advanced_tutorials/citibike/streamlit_batch_inference_app.py index b772d534..e336ad0b 100644 --- a/advanced_tutorials/citibike/streamlit_batch_inference_app.py +++ b/advanced_tutorials/citibike/streamlit_batch_inference_app.py @@ -1,14 +1,17 @@ from datetime import timedelta, datetime from random import sample import os -import joblib import pandas as pd +from xgboost import XGBRegressor import plotly.express as px import streamlit as st import hopsworks -from features import citibike, meteorological_measurements +from features import ( + citibike, + meteorological_measurements, +) def print_fancy_header(text, font_size=22, color="#ff5f27"): @@ -162,7 +165,11 @@ def get_model(project, model_name, file_name): if list_of_files: model_path = list_of_files[0] - model = joblib.load(model_path) + # Initialize the model + model = XGBRegressor() + + # Load the model from a saved JSON file + model.load_model("/model.json") else: if not 
os.path.exists(file_name): mr = project.get_model_registry() @@ -173,8 +180,12 @@ def get_model(project, model_name, file_name): EVALUATION_METRIC, SORT_METRICS_BY) model_dir = model.download() - model = joblib.load(model_dir + f"/{file_name}") - + + # Initialize the model + model = XGBRegressor() + + # Load the model from a saved JSON file + model.load_model(model_dir + "/model.json") return model print_fancy_header('\n ๐Ÿค– Getting the model...') diff --git a/advanced_tutorials/credit_scores/1_credit_scores_feature_backfill.ipynb b/advanced_tutorials/credit_scores/1_credit_scores_feature_backfill.ipynb index 643741bf..f3935a84 100644 --- a/advanced_tutorials/credit_scores/1_credit_scores_feature_backfill.ipynb +++ b/advanced_tutorials/credit_scores/1_credit_scores_feature_backfill.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "c656257d", + "id": "57fda212", "metadata": {}, "source": [ "# **Hopsworks Feature Store** - Part 01: Feature Backfill\n", @@ -20,7 +20,7 @@ }, { "cell_type": "markdown", - "id": "066cfc1d", + "id": "f8aef9c3", "metadata": {}, "source": [ "## ๐Ÿ“ Imports" @@ -29,7 +29,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c2fb6ff7", + "id": "a04f9f76", "metadata": {}, "outputs": [], "source": [ @@ -39,7 +39,7 @@ { "cell_type": "code", "execution_count": null, - "id": "54ef807e", + "id": "76ff915b", "metadata": {}, "outputs": [], "source": [ @@ -57,7 +57,7 @@ }, { "cell_type": "markdown", - "id": "8f346331", + "id": "1e866352", "metadata": {}, "source": [ "## ๐Ÿ’ฝ Loading the Data " @@ -65,7 +65,7 @@ }, { "cell_type": "markdown", - "id": "fe19b02c", + "id": "bba27dfe", "metadata": {}, "source": [ "#### โ›ณ๏ธ Application Train dataset\n", @@ -76,7 +76,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f80905ab", + "id": "db7f16d6", "metadata": {}, "outputs": [], "source": [ @@ -90,7 +90,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d20080f2-8802-475c-8518-ad887d14d382", + "id": 
"7ac4fcde", "metadata": {}, "outputs": [], "source": [ @@ -99,7 +99,7 @@ }, { "cell_type": "markdown", - "id": "c662aa9a", + "id": "67cf47dd", "metadata": {}, "source": [ "#### โ›ณ๏ธ Bureau Balance dataset\n", @@ -110,7 +110,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d7786a1a", + "id": "bfbf2905", "metadata": {}, "outputs": [], "source": [ @@ -123,7 +123,7 @@ { "cell_type": "code", "execution_count": null, - "id": "926038e8-3ed1-4895-be51-2d41ac41638e", + "id": "7fdb786b", "metadata": {}, "outputs": [], "source": [ @@ -132,7 +132,7 @@ }, { "cell_type": "markdown", - "id": "74ea2756", + "id": "373e62be", "metadata": {}, "source": [ "#### โ›ณ๏ธ Bureau Dataset\n", @@ -143,7 +143,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dea341fa", + "id": "76e83f31", "metadata": {}, "outputs": [], "source": [ @@ -157,7 +157,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8953770c-1563-44af-8f5c-2159bc233422", + "id": "26618c66", "metadata": {}, "outputs": [], "source": [ @@ -166,7 +166,7 @@ }, { "cell_type": "markdown", - "id": "e4ebded9", + "id": "d2c81e95", "metadata": {}, "source": [ "#### โ›ณ๏ธ Credit Card Balance Dataset\n", @@ -177,7 +177,7 @@ { "cell_type": "code", "execution_count": null, - "id": "987feeaf", + "id": "20781cea", "metadata": {}, "outputs": [], "source": [ @@ -190,7 +190,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3794cabc-f1b1-4375-86ff-7855aab2b552", + "id": "02401dd0", "metadata": {}, "outputs": [], "source": [ @@ -199,7 +199,7 @@ }, { "cell_type": "markdown", - "id": "761f558e", + "id": "06ed02d7", "metadata": {}, "source": [ "#### โ›ณ๏ธ Installments Payments Dataset\n", @@ -210,7 +210,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b494b088", + "id": "a30b2039", "metadata": {}, "outputs": [], "source": [ @@ -224,7 +224,7 @@ { "cell_type": "code", "execution_count": null, - "id": "addb3009-7073-41fd-b176-ff9143757b10", + "id": "6d84e96d", "metadata": {}, "outputs": [], 
"source": [ @@ -233,7 +233,7 @@ }, { "cell_type": "markdown", - "id": "d1749444", + "id": "818dfd45", "metadata": {}, "source": [ "#### โ›ณ๏ธ POS (point of sales) and Cash Loans Balance Dataset\n", @@ -246,7 +246,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5b10a16f", + "id": "0590ef5b", "metadata": {}, "outputs": [], "source": [ @@ -259,7 +259,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ee35a51b-0b10-4b85-8e62-837a9ab04c90", + "id": "f81edfdb", "metadata": {}, "outputs": [], "source": [ @@ -268,7 +268,7 @@ }, { "cell_type": "markdown", - "id": "8674301f", + "id": "f17eddb3", "metadata": {}, "source": [ "#### โ›ณ๏ธ Previous Application Dataset\n", @@ -281,7 +281,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f269dce9", + "id": "fb26017d", "metadata": {}, "outputs": [], "source": [ @@ -295,7 +295,7 @@ { "cell_type": "code", "execution_count": null, - "id": "459f6a2d-2d4c-4b52-a1fd-49f9f06801dc", + "id": "e3ad79b1", "metadata": {}, "outputs": [], "source": [ @@ -304,7 +304,7 @@ }, { "cell_type": "markdown", - "id": "5bb1d5d4", + "id": "04031bfe", "metadata": {}, "source": [ "---\n", @@ -314,7 +314,7 @@ }, { "cell_type": "markdown", - "id": "a4388bf5", + "id": "357a82fb", "metadata": {}, "source": [ "#### โ›ณ๏ธ Dataset with amount of previous loans" @@ -323,7 +323,7 @@ { "cell_type": "code", "execution_count": null, - "id": "11abd601", + "id": "7f0ca14d", "metadata": {}, "outputs": [], "source": [ @@ -338,7 +338,7 @@ }, { "cell_type": "markdown", - "id": "36b74f22", + "id": "22dd8d88", "metadata": {}, "source": [ "---\n", @@ -349,7 +349,7 @@ { "cell_type": "code", "execution_count": null, - "id": "76110306", + "id": "be753565", "metadata": {}, "outputs": [], "source": [ @@ -367,7 +367,7 @@ }, { "cell_type": "markdown", - "id": "2a2c1615-3cfd-4269-92d2-9692333260f7", + "id": "2a19b083", "metadata": {}, "source": [ "---" @@ -375,7 +375,7 @@ }, { "cell_type": "markdown", - "id": "f6f79748", + "id": "3010ef38", 
"metadata": {}, "source": [ "## ๐Ÿ”ฎ Connecting to Hopsworks Feature Store " @@ -384,7 +384,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b8ce6369", + "id": "40952cd2", "metadata": {}, "outputs": [], "source": [ @@ -397,7 +397,7 @@ }, { "cell_type": "markdown", - "id": "ff8f9f0b", + "id": "f1583bb4", "metadata": {}, "source": [ "---\n", @@ -413,7 +413,7 @@ }, { "cell_type": "markdown", - "id": "bccf2036", + "id": "99a80d79", "metadata": {}, "source": [ "### โ›ณ๏ธ Creating Applications Feature Group " @@ -422,7 +422,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a1841c22", + "id": "07a61adb", "metadata": {}, "outputs": [], "source": [ @@ -438,7 +438,7 @@ }, { "cell_type": "markdown", - "id": "a3bf21e8", + "id": "ecec6457", "metadata": {}, "source": [ "#### โ›ณ๏ธ Bureau Balance Feature Group" @@ -447,7 +447,7 @@ { "cell_type": "code", "execution_count": null, - "id": "03aef307", + "id": "524e6d30", "metadata": {}, "outputs": [], "source": [ @@ -462,7 +462,7 @@ }, { "cell_type": "markdown", - "id": "f8c5d4f2", + "id": "b843a188", "metadata": {}, "source": [ "#### โ›ณ๏ธ Bureau Feature Group" @@ -471,7 +471,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ccc0c9bf", + "id": "054bd543", "metadata": {}, "outputs": [], "source": [ @@ -487,7 +487,7 @@ }, { "cell_type": "markdown", - "id": "f61e8d9a", + "id": "3aeebdbc", "metadata": {}, "source": [ "#### โ›ณ๏ธ Previous Application Feature Group" @@ -496,7 +496,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6054f0c2", + "id": "f1563811", "metadata": {}, "outputs": [], "source": [ @@ -512,7 +512,7 @@ }, { "cell_type": "markdown", - "id": "7c96104c", + "id": "a5f945b0", "metadata": {}, "source": [ "#### โ›ณ๏ธ Pos_Cash_Balance Feature Group" @@ -521,7 +521,7 @@ { "cell_type": "code", "execution_count": null, - "id": "16f3d825", + "id": "b3a58faf", "metadata": {}, "outputs": [], "source": [ @@ -539,7 +539,7 @@ }, { "cell_type": "markdown", - "id": "8cdbdfae", + "id": 
"df4b62a5", "metadata": {}, "source": [ "#### โ›ณ๏ธ Instalments Payments Feature Group" @@ -548,7 +548,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1d5b3ab7", + "id": "ef53f638", "metadata": {}, "outputs": [], "source": [ @@ -564,7 +564,7 @@ }, { "cell_type": "markdown", - "id": "07b5b3ee", + "id": "f8ecce49", "metadata": {}, "source": [ "#### โ›ณ๏ธ Credit Card Balance Feature Group" @@ -573,7 +573,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ef832847", + "id": "731c93d2", "metadata": {}, "outputs": [], "source": [ @@ -588,7 +588,7 @@ }, { "cell_type": "markdown", - "id": "0db06e97", + "id": "56e9dabb", "metadata": {}, "source": [ "#### โ›ณ๏ธ Previous Load Counts Feature Group" @@ -597,7 +597,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9dba1014", + "id": "37540f47", "metadata": {}, "outputs": [], "source": [ @@ -613,7 +613,7 @@ }, { "cell_type": "markdown", - "id": "53239f75", + "id": "a5d6c36a", "metadata": {}, "source": [ "---\n", @@ -624,7 +624,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b60c91ef", + "id": "c5cbec27", "metadata": {}, "outputs": [], "source": [ @@ -646,7 +646,7 @@ { "cell_type": "code", "execution_count": null, - "id": "391d53f0", + "id": "0ba3a43b", "metadata": {}, "outputs": [], "source": [ @@ -662,7 +662,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f9c3c88e", + "id": "e3d67482", "metadata": {}, "outputs": [], "source": [ @@ -678,7 +678,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d142a638", + "id": "f5291588", "metadata": {}, "outputs": [], "source": [ @@ -697,7 +697,7 @@ { "cell_type": "code", "execution_count": null, - "id": "35fe1c36", + "id": "fabe58e9", "metadata": {}, "outputs": [], "source": [ @@ -720,7 +720,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f391fb42", + "id": "d9943e37", "metadata": {}, "outputs": [], "source": [ @@ -742,7 +742,7 @@ { "cell_type": "code", "execution_count": null, - "id": "198ea486", + 
"id": "9e6c8306", "metadata": {}, "outputs": [], "source": [ @@ -764,7 +764,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1192adb5", + "id": "21a04b2f", "metadata": {}, "outputs": [], "source": [ @@ -789,7 +789,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ef05b0f8", + "id": "4c752b39", "metadata": {}, "outputs": [], "source": [ @@ -814,7 +814,7 @@ { "cell_type": "code", "execution_count": null, - "id": "092f6972", + "id": "2dbb0b76", "metadata": {}, "outputs": [], "source": [ @@ -837,7 +837,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0ea22d3b", + "id": "826f5989", "metadata": {}, "outputs": [], "source": [ @@ -861,7 +861,7 @@ }, { "cell_type": "markdown", - "id": "8f0431ac", + "id": "6537d223", "metadata": {}, "source": [ "---\n", @@ -873,7 +873,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -887,7 +887,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/advanced_tutorials/credit_scores/2_credit_scores_feature_pipeline.ipynb b/advanced_tutorials/credit_scores/2_credit_scores_feature_pipeline.ipynb index 5c736ba2..2e5c58f3 100644 --- a/advanced_tutorials/credit_scores/2_credit_scores_feature_pipeline.ipynb +++ b/advanced_tutorials/credit_scores/2_credit_scores_feature_pipeline.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "f7354b1d", + "id": "e4a4651b", "metadata": {}, "source": [ "# **Hopsworks Feature Store** - Part 02: Feature Pipeline\n", @@ -15,7 +15,7 @@ }, { "cell_type": "markdown", - "id": "39778877-81b1-431b-b3e9-b5e1f91fc07a", + "id": "6fc5c2aa", "metadata": {}, "source": [ "## ๐Ÿ“ Imports" @@ -24,7 +24,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b487597a", + "id": "c0f6efe0", "metadata": {}, "outputs": [], "source": [ @@ -38,7 +38,7 @@ }, { 
"cell_type": "markdown", - "id": "062a6265", + "id": "128d18f2", "metadata": {}, "source": [ "## ๐Ÿ”ฎ Connecting to Hopsworks Feature Store " @@ -47,7 +47,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c4aa2f9a", + "id": "264299c3", "metadata": {}, "outputs": [], "source": [ @@ -60,7 +60,7 @@ }, { "cell_type": "markdown", - "id": "7825f41d", + "id": "2113f6d2", "metadata": {}, "source": [ "## ๐Ÿช„ Retrieving Feature Groups" @@ -68,7 +68,7 @@ }, { "cell_type": "markdown", - "id": "a0efc688", + "id": "7d41f6a5", "metadata": {}, "source": [ "#### โ›ณ๏ธ Application Train Feature Group" @@ -77,7 +77,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f6467043", + "id": "5392fa62", "metadata": {}, "outputs": [], "source": [ @@ -92,7 +92,7 @@ }, { "cell_type": "markdown", - "id": "19daef3c", + "id": "09c50194", "metadata": {}, "source": [ "#### โ›ณ๏ธ Bureau Balance Feature Group" @@ -101,7 +101,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8a8647c3", + "id": "4407f16c", "metadata": {}, "outputs": [], "source": [ @@ -116,7 +116,7 @@ }, { "cell_type": "markdown", - "id": "a06132a4", + "id": "1636c8ec", "metadata": {}, "source": [ "#### โ›ณ๏ธ Bureau Feature Group" @@ -125,7 +125,7 @@ { "cell_type": "code", "execution_count": null, - "id": "29f0701b", + "id": "b4bd73a7", "metadata": {}, "outputs": [], "source": [ @@ -140,7 +140,7 @@ }, { "cell_type": "markdown", - "id": "70d72f96", + "id": "aa8d1f18", "metadata": {}, "source": [ "#### โ›ณ๏ธ Credit Card Balance Feature Group" @@ -149,7 +149,7 @@ { "cell_type": "code", "execution_count": null, - "id": "22a67c9c", + "id": "132e0515", "metadata": {}, "outputs": [], "source": [ @@ -164,7 +164,7 @@ }, { "cell_type": "markdown", - "id": "4466e30c", + "id": "680b4258", "metadata": {}, "source": [ "#### โ›ณ๏ธ Installments Payments Feature Group" @@ -173,7 +173,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b55c6bb8", + "id": "8a762268", "metadata": {}, "outputs": [], 
"source": [ @@ -188,7 +188,7 @@ }, { "cell_type": "markdown", - "id": "211c9ae4", + "id": "1deb0620", "metadata": {}, "source": [ "#### โ›ณ๏ธ POS (point of sales) and Cash Loans Balance Feature Group" @@ -197,7 +197,7 @@ { "cell_type": "code", "execution_count": null, - "id": "561cea06", + "id": "3e5424d9", "metadata": {}, "outputs": [], "source": [ @@ -212,7 +212,7 @@ }, { "cell_type": "markdown", - "id": "70a2ff26", + "id": "deb2d43d", "metadata": {}, "source": [ "#### โ›ณ๏ธ Previous Application Feature Group" @@ -221,7 +221,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9780899b", + "id": "dcb67123", "metadata": {}, "outputs": [], "source": [ @@ -236,7 +236,7 @@ }, { "cell_type": "markdown", - "id": "a7629071", + "id": "cc760910", "metadata": {}, "source": [ "#### โ›ณ๏ธ Previous Load Counts Feature Group" @@ -245,7 +245,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5a591631", + "id": "546a5b4c", "metadata": {}, "outputs": [], "source": [ @@ -260,7 +260,7 @@ }, { "cell_type": "markdown", - "id": "bbf8ecdb", + "id": "94c4ba70", "metadata": {}, "source": [ "---\n", @@ -271,7 +271,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cd420aa0", + "id": "fb7a171f", "metadata": {}, "outputs": [], "source": [ @@ -283,7 +283,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9991cd0e", + "id": "7bc5dcba", "metadata": {}, "outputs": [], "source": [ @@ -295,7 +295,7 @@ { "cell_type": "code", "execution_count": null, - "id": "34ed2b23", + "id": "a2e47f1a", "metadata": {}, "outputs": [], "source": [ @@ -306,7 +306,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0d0f30c4", + "id": "885c31d7", "metadata": {}, "outputs": [], "source": [ @@ -318,7 +318,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6f054238", + "id": "80007a73", "metadata": {}, "outputs": [], "source": [ @@ -329,7 +329,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3762a023", + "id": "15c769bc", "metadata": {}, 
"outputs": [], "source": [ @@ -341,7 +341,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0afa4f91", + "id": "9a610bdf", "metadata": {}, "outputs": [], "source": [ @@ -352,7 +352,7 @@ { "cell_type": "code", "execution_count": null, - "id": "67a6d629", + "id": "c7688844", "metadata": {}, "outputs": [], "source": [ @@ -364,7 +364,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ca00b80c", + "id": "fab35cbb", "metadata": {}, "outputs": [], "source": [ @@ -374,7 +374,7 @@ }, { "cell_type": "markdown", - "id": "4d0e1f50", + "id": "7afca5ac", "metadata": {}, "source": [ "### โฌ†๏ธ Uploading new data to the Feature Store" @@ -383,7 +383,7 @@ { "cell_type": "code", "execution_count": null, - "id": "88ccf658", + "id": "76e22b7f", "metadata": {}, "outputs": [], "source": [ @@ -393,7 +393,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a7d5e49f", + "id": "314519ea", "metadata": {}, "outputs": [], "source": [ @@ -403,7 +403,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a07c0a8d", + "id": "c0af16ec", "metadata": {}, "outputs": [], "source": [ @@ -413,7 +413,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ae679e44", + "id": "610e0982", "metadata": {}, "outputs": [], "source": [ @@ -423,7 +423,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f71a84e1", + "id": "8016add7", "metadata": {}, "outputs": [], "source": [ @@ -436,7 +436,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b9a1be50", + "id": "10e5f793", "metadata": {}, "outputs": [], "source": [ @@ -446,7 +446,7 @@ { "cell_type": "code", "execution_count": null, - "id": "18bbd6f6", + "id": "486f3751", "metadata": {}, "outputs": [], "source": [ @@ -456,7 +456,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b3297c8e", + "id": "187819d3", "metadata": {}, "outputs": [], "source": [ @@ -465,7 +465,7 @@ }, { "cell_type": "markdown", - "id": "f9b23f29", + "id": "0ca58c46", "metadata": {}, "source": [ "---\n", @@ -477,7 
+477,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -491,7 +491,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/advanced_tutorials/credit_scores/3_credit_scores_training_pipeline.ipynb b/advanced_tutorials/credit_scores/3_credit_scores_training_pipeline.ipynb index ff4aa2ee..4172e7da 100644 --- a/advanced_tutorials/credit_scores/3_credit_scores_training_pipeline.ipynb +++ b/advanced_tutorials/credit_scores/3_credit_scores_training_pipeline.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "ec1fb5d4", + "id": "e3835d20", "metadata": {}, "source": [ "# **Hopsworks Feature Store** - Part 03: Training Pipeline\n", @@ -24,7 +24,7 @@ }, { "cell_type": "markdown", - "id": "d9985c8d-9000-42e8-840a-ce63b3ae8a20", + "id": "6a039456", "metadata": {}, "source": [ "## ๐Ÿ“ Imports" @@ -33,7 +33,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1b129ae2", + "id": "78a3bd7f", "metadata": {}, "outputs": [], "source": [ @@ -43,11 +43,10 @@ { "cell_type": "code", "execution_count": null, - "id": "6a6e4839", + "id": "36ef21de", "metadata": {}, "outputs": [], "source": [ - "import joblib\n", "import os\n", "\n", "import matplotlib.pyplot as plt\n", @@ -66,7 +65,7 @@ }, { "cell_type": "markdown", - "id": "63e78e5c", + "id": "7e17c271", "metadata": {}, "source": [ "## ๐Ÿ”ฎ Connecting to Hopsworks Feature Store " @@ -75,7 +74,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8e230c78", + "id": "a9926d65", "metadata": {}, "outputs": [], "source": [ @@ -88,7 +87,7 @@ }, { "cell_type": "markdown", - "id": "d3b486e2", + "id": "a751877c", "metadata": {}, "source": [ "## ๐Ÿช„ Retrieving Feature Groups" @@ -97,7 +96,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6ef0178b", + "id": "49d2ea72", "metadata": {}, "outputs": 
[], "source": [ @@ -145,7 +144,7 @@ }, { "cell_type": "markdown", - "id": "af7bc837", + "id": "68d1d7fd", "metadata": {}, "source": [ "---\n", @@ -170,7 +169,7 @@ { "cell_type": "code", "execution_count": null, - "id": "78225ce5", + "id": "5b73ebfe", "metadata": {}, "outputs": [], "source": [ @@ -180,7 +179,7 @@ { "cell_type": "code", "execution_count": null, - "id": "629d2672", + "id": "e3be7448", "metadata": {}, "outputs": [], "source": [ @@ -190,7 +189,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0db3a1d3", + "id": "39e3b236", "metadata": {}, "outputs": [], "source": [ @@ -199,7 +198,7 @@ }, { "cell_type": "markdown", - "id": "41890ff4", + "id": "4af51378", "metadata": {}, "source": [ "---\n", @@ -210,7 +209,7 @@ { "cell_type": "code", "execution_count": null, - "id": "41c4be37", + "id": "62380318", "metadata": {}, "outputs": [], "source": [ @@ -240,7 +239,7 @@ }, { "cell_type": "markdown", - "id": "258201a7", + "id": "e020793a", "metadata": { "tags": [] }, @@ -257,7 +256,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3f4173f3", + "id": "b4add316", "metadata": {}, "outputs": [], "source": [ @@ -267,7 +266,7 @@ }, { "cell_type": "markdown", - "id": "1e20134b", + "id": "0e22bdc1", "metadata": {}, "source": [ "We can retrieve transformation function we need .\n", @@ -280,7 +279,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8a41e840", + "id": "d4417311", "metadata": {}, "outputs": [], "source": [ @@ -288,18 +287,19 @@ "cat_cols = selected_features_show5.dtypes[selected_features_show5.dtypes == 'object'].index\n", "\n", "# Retrieving the Label Encoder transformation function from Featuretools\n", - "le = fs.get_transformation_function(name='label_encoder') \n", + "label_encoder = fs.get_transformation_function(name='label_encoder') \n", "\n", "# Creating a dictionary of transformation functions, where each categorical column is associated with the Label Encoder\n", "transformation_functions = {\n", - " col: le\n", - " 
for col in cat_cols\n", + " col: label_encoder\n", + " for col \n", + " in cat_cols\n", "}" ] }, { "cell_type": "markdown", - "id": "5a161181", + "id": "3194fda2", "metadata": {}, "source": [ "---\n", @@ -328,7 +328,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0eddfb1d", + "id": "2973313e", "metadata": {}, "outputs": [], "source": [ @@ -344,7 +344,7 @@ }, { "cell_type": "markdown", - "id": "5d957dae", + "id": "28671167", "metadata": {}, "source": [ "---\n", @@ -381,7 +381,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b30b61c3", + "id": "bb7b350a", "metadata": {}, "outputs": [], "source": [ @@ -392,7 +392,7 @@ }, { "cell_type": "markdown", - "id": "a6d47d29", + "id": "67d287f1", "metadata": {}, "source": [ "---\n", @@ -402,7 +402,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e0902530", + "id": "3d07c823", "metadata": {}, "outputs": [], "source": [ @@ -415,7 +415,7 @@ }, { "cell_type": "markdown", - "id": "69c7eff1", + "id": "f3195458", "metadata": {}, "source": [ "---\n", @@ -425,7 +425,7 @@ { "cell_type": "code", "execution_count": null, - "id": "393ec28c", + "id": "b5f5b547", "metadata": {}, "outputs": [], "source": [ @@ -443,7 +443,7 @@ { "cell_type": "code", "execution_count": null, - "id": "896802e7", + "id": "94d781e3", "metadata": {}, "outputs": [], "source": [ @@ -472,7 +472,7 @@ { "cell_type": "code", "execution_count": null, - "id": "38891567", + "id": "4ff26ab1", "metadata": {}, "outputs": [], "source": [ @@ -491,7 +491,7 @@ }, { "cell_type": "markdown", - "id": "668f2efb", + "id": "4e841906", "metadata": {}, "source": [ "---\n", @@ -503,7 +503,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fd5ec387", + "id": "b1978dcb", "metadata": {}, "outputs": [], "source": [ @@ -513,7 +513,7 @@ }, { "cell_type": "markdown", - "id": "30b0c2a7", + "id": "e435716f", "metadata": {}, "source": [ "### โš™๏ธ Model Schema\n", @@ -526,7 +526,7 @@ { "cell_type": "code", "execution_count": null, - "id": 
"c671699c", + "id": "d7863e60", "metadata": {}, "outputs": [], "source": [ @@ -538,7 +538,10 @@ "output_schema = Schema(y_train)\n", "\n", "# Creating a model schema\n", - "model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)\n", + "model_schema = ModelSchema(\n", + " input_schema=input_schema, \n", + " output_schema=output_schema,\n", + ")\n", "\n", "# Converting the model schema to a dictionary representation\n", "schema_dict = model_schema.to_dict()" @@ -546,7 +549,7 @@ }, { "cell_type": "markdown", - "id": "2489db15", + "id": "b68f4493", "metadata": {}, "source": [ "### ๐Ÿ’ฝ Save a model" @@ -555,7 +558,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5400b050-a035-432c-9978-fcd42ba96854", + "id": "1c58e284", "metadata": {}, "outputs": [], "source": [ @@ -564,8 +567,8 @@ "if os.path.isdir(model_dir) == False:\n", " os.mkdir(model_dir)\n", "\n", - "# Saving the trained XGBoost model as a joblib file in the model directory\n", - "joblib.dump(xgboost_model, model_dir + '/credit_scores_model.pkl')\n", + "# Saving the trained XGBoost model as a json file in the model directory\n", + "xgboost_model.save_model(model_dir + \"/model.json\")\n", "\n", "# Saving the confusion matrix and feature importance plots as images in the model directory\n", "figure_cm.figure.savefig(model_dir + '/confusion_matrix.png')\n", @@ -575,7 +578,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ce5af23f", + "id": "70342ce2", "metadata": {}, "outputs": [], "source": [ @@ -594,7 +597,7 @@ }, { "cell_type": "markdown", - "id": "fda2c1c0", + "id": "00656e4d", "metadata": {}, "source": [ "## โญ๏ธ **Next:** Part 04: Batch Inference \n", @@ -605,7 +608,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -619,7 +622,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": 
"3.10.11" } }, "nbformat": 4, diff --git a/advanced_tutorials/credit_scores/4_credit_scores_batch_inference.ipynb b/advanced_tutorials/credit_scores/4_credit_scores_batch_inference.ipynb index 8290b765..fec9040c 100644 --- a/advanced_tutorials/credit_scores/4_credit_scores_batch_inference.ipynb +++ b/advanced_tutorials/credit_scores/4_credit_scores_batch_inference.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "c958e52b", + "id": "9a7e8c3f", "metadata": {}, "source": [ "# **Hopsworks Feature Store** - Part 04: Batch Inference\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "8855ee1a", + "id": "6406a5b4", "metadata": {}, "source": [ "## ๐Ÿ“ Imports" @@ -25,16 +25,16 @@ { "cell_type": "code", "execution_count": null, - "id": "019c9226", + "id": "e85d7fa1", "metadata": {}, "outputs": [], "source": [ - "import joblib" + "from xgboost import XGBClassifier" ] }, { "cell_type": "markdown", - "id": "ce2fe8a8", + "id": "63a37311", "metadata": {}, "source": [ "## ๐Ÿ“ก Connecting to Hopsworks Feature Store " @@ -43,7 +43,7 @@ { "cell_type": "code", "execution_count": null, - "id": "39f83bc9", + "id": "1d91074e", "metadata": {}, "outputs": [], "source": [ @@ -56,7 +56,7 @@ }, { "cell_type": "markdown", - "id": "87485ee0", + "id": "895e78b9", "metadata": {}, "source": [ "## โš™๏ธ Feature View Retrieval\n" @@ -65,7 +65,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e622d6b4", + "id": "0add3c12", "metadata": {}, "outputs": [], "source": [ @@ -78,7 +78,7 @@ }, { "cell_type": "markdown", - "id": "e1dac8b6", + "id": "e9dbf027", "metadata": {}, "source": [ "## ๐Ÿ—„ Model Registry\n" @@ -87,7 +87,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ca35a9f4", + "id": "a3a14916", "metadata": {}, "outputs": [], "source": [ @@ -97,7 +97,7 @@ }, { "cell_type": "markdown", - "id": "6f3589dc", + "id": "9073c8dc", "metadata": {}, "source": [ "## ๐Ÿ“ฎ Retrieving model from Model Registry " @@ -106,7 +106,7 @@ { "cell_type": "code", 
"execution_count": null, - "id": "6ac8014f", + "id": "63b4b803", "metadata": {}, "outputs": [], "source": [ @@ -123,20 +123,21 @@ { "cell_type": "code", "execution_count": null, - "id": "ac68c2a0", + "id": "3156cdfc", "metadata": {}, "outputs": [], "source": [ - "# Loading the XGBoost model from the saved model directory\n", - "retrieved_xgboost_model = joblib.load(saved_model_dir + \"/credit_scores_model.pkl\")\n", + "# Initialize the model\n", + "model = XGBClassifier()\n", "\n", - "# Displaying the retrieved XGBoost model\n", - "retrieved_xgboost_model" + "# Load the model from a saved JSON file\n", + "model.load_model(saved_model_dir + \"/model.json\")\n", + "model" ] }, { "cell_type": "markdown", - "id": "50a96ca9", + "id": "03492b0c", "metadata": {}, "source": [ "## ๐Ÿค– Making the predictions " @@ -144,7 +145,7 @@ }, { "cell_type": "markdown", - "id": "de0c050d", + "id": "8e610938", "metadata": {}, "source": [ "### โœจ Load Batch Data" @@ -153,7 +154,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fc56ebd1", + "id": "93fe4ec8", "metadata": {}, "outputs": [], "source": [ @@ -170,12 +171,12 @@ { "cell_type": "code", "execution_count": null, - "id": "fcbfef6e", + "id": "2169cbdc", "metadata": {}, "outputs": [], "source": [ "# Making predictions on the batch data using the retrieved XGBoost model\n", - "predictions = retrieved_xgboost_model.predict(batch_data)\n", + "predictions = model.predict(batch_data)\n", "\n", "# Displaying the first 10 predictions\n", "predictions[:10]" @@ -183,7 +184,7 @@ }, { "cell_type": "markdown", - "id": "89bc07f2", + "id": "e7d371d5", "metadata": {}, "source": [ "---\n", @@ -199,7 +200,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -213,7 +214,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.11" } }, "nbformat": 4, diff --git 
a/advanced_tutorials/nyc_taxi_fares/1_nyc_taxi_fares_feature_backfill.ipynb b/advanced_tutorials/nyc_taxi_fares/1_nyc_taxi_fares_feature_backfill.ipynb index 54372aab..78de0b35 100644 --- a/advanced_tutorials/nyc_taxi_fares/1_nyc_taxi_fares_feature_backfill.ipynb +++ b/advanced_tutorials/nyc_taxi_fares/1_nyc_taxi_fares_feature_backfill.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "33e115a6", + "id": "dd36272d", "metadata": {}, "source": [ "# **Hopsworks Feature Store** - Part 01: Feature Backfill\n", @@ -19,7 +19,7 @@ }, { "cell_type": "markdown", - "id": "49b3ce29", + "id": "43713d56", "metadata": {}, "source": [ "### ๐Ÿ“ Imports" @@ -28,7 +28,7 @@ { "cell_type": "code", "execution_count": null, - "id": "30786dcc", + "id": "2f322474", "metadata": {}, "outputs": [], "source": [ @@ -38,7 +38,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c3a82469", + "id": "aeed974f", "metadata": {}, "outputs": [], "source": [ @@ -51,7 +51,7 @@ }, { "cell_type": "markdown", - "id": "2956016d", + "id": "af8f24b1", "metadata": {}, "source": [ "___" @@ -59,7 +59,7 @@ }, { "cell_type": "markdown", - "id": "b9585831", + "id": "55fec98f", "metadata": {}, "source": [ "## ๐Ÿ’ฝ Loading Historical Data\n" @@ -67,7 +67,7 @@ }, { "cell_type": "markdown", - "id": "ffef12a5", + "id": "097fa6b8", "metadata": {}, "source": [ "### ๐Ÿš– Rides Data" @@ -76,7 +76,7 @@ { "cell_type": "code", "execution_count": null, - "id": "508ffc6a", + "id": "23c93fd1", "metadata": {}, "outputs": [], "source": [ @@ -89,7 +89,7 @@ }, { "cell_type": "markdown", - "id": "6b07cf18", + "id": "9b04266f", "metadata": {}, "source": [ "### ๐Ÿ’ธ Fares Data" @@ -98,7 +98,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0fc2f8a0", + "id": "3025713c", "metadata": {}, "outputs": [], "source": [ @@ -111,7 +111,7 @@ }, { "cell_type": "markdown", - "id": "ca08499c", + "id": "c0cdbddb", "metadata": {}, "source": [ "---" @@ -119,7 +119,7 @@ }, { "cell_type": "markdown", - "id": 
"67ba455d", + "id": "06f48ff5", "metadata": {}, "source": [ "## ๐Ÿ“ก Connecting to the Hopsworks Feature Store " @@ -128,7 +128,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5b89e69b", + "id": "2c2367c5", "metadata": {}, "outputs": [], "source": [ @@ -141,7 +141,7 @@ }, { "cell_type": "markdown", - "id": "1b75e62a", + "id": "18fdd1af", "metadata": {}, "source": [ "___" @@ -149,7 +149,7 @@ }, { "cell_type": "markdown", - "id": "977b0a17", + "id": "5919f8c4", "metadata": {}, "source": [ "## ๐Ÿช„ Creating Feature Groups" @@ -157,7 +157,7 @@ }, { "cell_type": "markdown", - "id": "0abd3542", + "id": "4d3bf547", "metadata": {}, "source": [ "### ๐Ÿš– Rides Data" @@ -166,7 +166,7 @@ { "cell_type": "code", "execution_count": null, - "id": "63a2eeb9", + "id": "e75628e8", "metadata": {}, "outputs": [], "source": [ @@ -185,7 +185,7 @@ }, { "cell_type": "markdown", - "id": "de46ef09", + "id": "af1042da", "metadata": {}, "source": [ "### ๐Ÿ’ธ Fares Data" @@ -194,7 +194,7 @@ { "cell_type": "code", "execution_count": null, - "id": "19183b27", + "id": "7ad00ce4", "metadata": {}, "outputs": [], "source": [ @@ -212,7 +212,7 @@ }, { "cell_type": "markdown", - "id": "35f036f1", + "id": "655abd26", "metadata": {}, "source": [ "---\n", @@ -224,7 +224,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -238,7 +238,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/advanced_tutorials/nyc_taxi_fares/2_nyc_taxi_fares_feature_pipeline.ipynb b/advanced_tutorials/nyc_taxi_fares/2_nyc_taxi_fares_feature_pipeline.ipynb index b8ab4ccd..4c3fda4f 100644 --- a/advanced_tutorials/nyc_taxi_fares/2_nyc_taxi_fares_feature_pipeline.ipynb +++ b/advanced_tutorials/nyc_taxi_fares/2_nyc_taxi_fares_feature_pipeline.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": 
"a071956b", + "id": "835eceed", "metadata": {}, "source": [ "# **Hopsworks Feature Store** - Part 02: Feature Pipeline\n", @@ -15,7 +15,7 @@ }, { "cell_type": "markdown", - "id": "48f55b16", + "id": "310cfc8a", "metadata": {}, "source": [ "### ๐Ÿ“ Imports" @@ -24,7 +24,7 @@ { "cell_type": "code", "execution_count": null, - "id": "29e6f1bb", + "id": "95150ba4", "metadata": {}, "outputs": [], "source": [ @@ -33,7 +33,10 @@ "import time \n", "import os \n", "\n", - "from features import nyc_taxi_rides, nyc_taxi_fares\n", + "from features import (\n", + " nyc_taxi_rides, \n", + " nyc_taxi_fares,\n", + ")\n", "\n", "# Mute warnings\n", "import warnings\n", @@ -42,7 +45,7 @@ }, { "cell_type": "markdown", - "id": "87a03dc4", + "id": "a3d5d7be", "metadata": {}, "source": [ "___" @@ -50,7 +53,7 @@ }, { "cell_type": "markdown", - "id": "50a543e2", + "id": "4603a252", "metadata": {}, "source": [ "## ๐Ÿช„ Generating new data" @@ -58,7 +61,7 @@ }, { "cell_type": "markdown", - "id": "802d8319", + "id": "474bd1b2", "metadata": {}, "source": [ "### ๐Ÿš– Rides Data" @@ -67,7 +70,7 @@ { "cell_type": "code", "execution_count": null, - "id": "829b67ed", + "id": "6db8cc92", "metadata": {}, "outputs": [], "source": [ @@ -75,13 +78,13 @@ "df_rides = nyc_taxi_rides.generate_rides_data(150)\n", "\n", "# Display the DataFrame containing the generated ride data\n", - "df_rides" + "df_rides.head(5)" ] }, { "cell_type": "code", "execution_count": null, - "id": "86d22e8c", + "id": "c4010b8e", "metadata": {}, "outputs": [], "source": [ @@ -95,7 +98,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0c1cc726", + "id": "f9fc44ab", "metadata": {}, "outputs": [], "source": [ @@ -106,7 +109,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c788df5e", + "id": "bdbe5cf9", "metadata": {}, "outputs": [], "source": [ @@ -117,7 +120,7 @@ }, { "cell_type": "markdown", - "id": "899fa056", + "id": "8cfc9fdf", "metadata": {}, "source": [ "### ๐Ÿ’ธ Fares Data" @@ -126,7 +129,7 @@ { 
"cell_type": "code", "execution_count": null, - "id": "0847286d", + "id": "36b883e0", "metadata": { "scrolled": true }, @@ -136,13 +139,13 @@ "df_fares = nyc_taxi_fares.generate_fares_data(150)\n", "\n", "# Display the DataFrame containing the generated fare data\n", - "df_fares" + "df_fares.head()" ] }, { "cell_type": "code", "execution_count": null, - "id": "b053a508", + "id": "054a0a76", "metadata": {}, "outputs": [], "source": [ @@ -153,7 +156,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1395c730", + "id": "ad1f6760", "metadata": {}, "outputs": [], "source": [ @@ -164,7 +167,7 @@ { "cell_type": "code", "execution_count": null, - "id": "31df375e", + "id": "d8a9b305", "metadata": {}, "outputs": [], "source": [ @@ -175,7 +178,7 @@ }, { "cell_type": "markdown", - "id": "47145017", + "id": "24d823c9", "metadata": {}, "source": [ "___" @@ -183,7 +186,7 @@ }, { "cell_type": "markdown", - "id": "cd24607e", + "id": "11699f3c", "metadata": {}, "source": [ "## ๐Ÿ“ก Connecting to the Hopsworks Feature Store " @@ -192,7 +195,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3dc300c0", + "id": "ef887c6e", "metadata": {}, "outputs": [], "source": [ @@ -206,7 +209,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3a5d908c", + "id": "23213c57", "metadata": {}, "outputs": [], "source": [ @@ -223,7 +226,7 @@ }, { "cell_type": "markdown", - "id": "55de84ac", + "id": "96c4aa87", "metadata": {}, "source": [ "---" @@ -231,7 +234,7 @@ }, { "cell_type": "markdown", - "id": "670b6336", + "id": "4ceb6069", "metadata": {}, "source": [ "## โฌ†๏ธ Uploading new data to the Feature Store" @@ -240,7 +243,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9ff63846", + "id": "5672f9ff", "metadata": {}, "outputs": [], "source": [ @@ -250,7 +253,7 @@ { "cell_type": "code", "execution_count": null, - "id": "43ede544", + "id": "64abc698", "metadata": { "scrolled": true }, @@ -261,7 +264,7 @@ }, { "cell_type": "markdown", - "id": "9b352a57", + 
"id": "470b5e79", "metadata": {}, "source": [ "---" @@ -269,7 +272,7 @@ }, { "cell_type": "markdown", - "id": "7f6152cd", + "id": "12d5f3e1", "metadata": {}, "source": [ "## โญ๏ธ **Next:** Part 03: Training Pipeline \n", @@ -280,7 +283,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -294,7 +297,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.11" }, "vscode": { "interpreter": { diff --git a/advanced_tutorials/nyc_taxi_fares/3_nyc_taxi_fares_training_pipeline.ipynb b/advanced_tutorials/nyc_taxi_fares/3_nyc_taxi_fares_training_pipeline.ipynb index 0c221790..7ec89b28 100644 --- a/advanced_tutorials/nyc_taxi_fares/3_nyc_taxi_fares_training_pipeline.ipynb +++ b/advanced_tutorials/nyc_taxi_fares/3_nyc_taxi_fares_training_pipeline.ipynb @@ -36,7 +36,10 @@ "\n", "import pandas as pd\n", "\n", - "from sklearn.metrics import mean_absolute_error, r2_score\n", + "from sklearn.metrics import (\n", + " mean_absolute_error, \n", + " r2_score,\n", + ")\n", "import xgboost as xgb\n", "\n", "import matplotlib.pyplot as plt\n", @@ -290,7 +293,12 @@ "})\n", "\n", "# Create a residual plot using Seaborn\n", - "residplot = sns.residplot(data=df_, x=\"y_true\", y=\"y_pred\", color='#613F75')\n", + "residplot = sns.residplot(\n", + " data=df_, \n", + " x=\"y_true\", \n", + " y=\"y_pred\", \n", + " color='#613F75',\n", + ")\n", "\n", "# Set plot title and axis labels\n", "plt.title('Model Residuals')\n", @@ -342,7 +350,10 @@ "output_schema = Schema(y_train)\n", "\n", "# Create a model schema using the defined input and output schemas\n", - "model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)\n", + "model_schema = ModelSchema(\n", + " input_schema=input_schema, \n", + " output_schema=output_schema,\n", + ")\n", "\n", "# Convert the model schema to a dictionary 
representation\n", "model_schema_dict = model_schema.to_dict()" @@ -370,8 +381,8 @@ "if not os.path.isdir(model_dir):\n", " os.mkdir(model_dir)\n", "\n", - "# Save the trained XGBoost regressor to a file in the specified directory\n", - "joblib.dump(regressor, model_dir + '/nyc_taxi_fares_model.pkl')\n", + "# Save the trained XGBoost regressor to a json file in the specified directory\n", + "regressor.save_model(model_dir + \"/model.json\")\n", "\n", "# Save the residual plot figure as an image file in the specified directory\n", "fig.savefig(model_dir + \"/residplot.png\")" @@ -421,7 +432,7 @@ "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -435,7 +446,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/advanced_tutorials/nyc_taxi_fares/4_nyc_taxi_fares_batch_inference.ipynb b/advanced_tutorials/nyc_taxi_fares/4_nyc_taxi_fares_batch_inference.ipynb index de1b3845..dff4b0a3 100644 --- a/advanced_tutorials/nyc_taxi_fares/4_nyc_taxi_fares_batch_inference.ipynb +++ b/advanced_tutorials/nyc_taxi_fares/4_nyc_taxi_fares_batch_inference.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "c958e52b", + "id": "c368eab4", "metadata": {}, "source": [ "# **Hopsworks Feature Store** - Part 04: Batch Inference\n", @@ -15,7 +15,7 @@ }, { "cell_type": "markdown", - "id": "8855ee1a", + "id": "255906b9", "metadata": {}, "source": [ "## ๐Ÿ“ Imports" @@ -24,16 +24,16 @@ { "cell_type": "code", "execution_count": null, - "id": "019c9226", + "id": "4ba1afa2", "metadata": {}, "outputs": [], "source": [ - "import joblib" + "from xgboost import XGBRegressor" ] }, { "cell_type": "markdown", - "id": "ce2fe8a8", + "id": "cb4a9dca", "metadata": {}, "source": [ "## ๐Ÿ“ก Connecting to Hopsworks Feature Store " 
@@ -42,7 +42,7 @@ { "cell_type": "code", "execution_count": null, - "id": "39f83bc9", + "id": "c9bce235", "metadata": {}, "outputs": [], "source": [ @@ -55,7 +55,7 @@ }, { "cell_type": "markdown", - "id": "87485ee0", + "id": "7cad25e1", "metadata": {}, "source": [ "## โš™๏ธ Feature View Retrieval\n" @@ -64,7 +64,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e622d6b4", + "id": "9a36a4b8", "metadata": {}, "outputs": [], "source": [ @@ -77,7 +77,7 @@ }, { "cell_type": "markdown", - "id": "e1dac8b6", + "id": "2b99f3b3", "metadata": {}, "source": [ "## ๐Ÿ—„ Model Registry\n" @@ -86,7 +86,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ca35a9f4", + "id": "3ce2c9f1", "metadata": {}, "outputs": [], "source": [ @@ -96,7 +96,7 @@ }, { "cell_type": "markdown", - "id": "6f3589dc", + "id": "f8eec8d4", "metadata": {}, "source": [ "## ๐Ÿ“ฎ Retrieving model from Model Registry " @@ -105,7 +105,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6ac8014f", + "id": "e0333663", "metadata": {}, "outputs": [], "source": [ @@ -122,20 +122,21 @@ { "cell_type": "code", "execution_count": null, - "id": "b48a0159", + "id": "b2df9868", "metadata": {}, "outputs": [], "source": [ - "# Load the saved XGBoost model from the downloaded directory\n", - "retrieved_xgboost_model = joblib.load(saved_model_dir + \"/nyc_taxi_fares_model.pkl\")\n", + "# Initialize the model\n", + "model = XGBRegressor()\n", "\n", - "# Display the retrieved XGBoost model\n", - "retrieved_xgboost_model" + "# Load the model from a saved JSON file\n", + "model.load_model(saved_model_dir + \"/model.json\")\n", + "model" ] }, { "cell_type": "markdown", - "id": "86af1be5", + "id": "2531a77a", "metadata": {}, "source": [ "## โœจ Load Batch Data" @@ -144,7 +145,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7dab6acd", + "id": "276fd009", "metadata": {}, "outputs": [], "source": [ @@ -163,7 +164,7 @@ }, { "cell_type": "markdown", - "id": "a41ffc73", + "id": "06f2f514", 
"metadata": {}, "source": [ "## ๐Ÿค– Making the predictions " @@ -172,12 +173,12 @@ { "cell_type": "code", "execution_count": null, - "id": "88e2fa47", + "id": "09d9847d", "metadata": {}, "outputs": [], "source": [ "# Use the retrieved XGBoost model to make predictions on the batch data\n", - "predictions = retrieved_xgboost_model.predict(batch_data)\n", + "predictions = model.predict(batch_data)\n", "\n", "# Display the first 10 predictions\n", "predictions[:10]" @@ -185,7 +186,7 @@ }, { "cell_type": "markdown", - "id": "b0b40f5b", + "id": "1f351099", "metadata": {}, "source": [ "It's important to know that every time you save a model with the same name, a new version of the model will be saved, so nothing will be overwritten. In this way, you can compare several versions of the same model - or create a model with a new name, if you prefer that." @@ -193,7 +194,7 @@ }, { "cell_type": "markdown", - "id": "89bc07f2", + "id": "a6ebe15e", "metadata": {}, "source": [ "---\n", @@ -209,7 +210,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -223,7 +224,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/advanced_tutorials/nyc_taxi_fares/streamlit_batch_inference_app.py b/advanced_tutorials/nyc_taxi_fares/streamlit_batch_inference_app.py index c9b6226a..42b3dc84 100644 --- a/advanced_tutorials/nyc_taxi_fares/streamlit_batch_inference_app.py +++ b/advanced_tutorials/nyc_taxi_fares/streamlit_batch_inference_app.py @@ -1,6 +1,6 @@ import streamlit as st import hopsworks -import joblib +from xgboost import XGBRegressor import pandas as pd import numpy as np import folium @@ -24,7 +24,11 @@ def get_model(project, model_name, file_name): if list_of_files: model_path = list_of_files[0] - model = joblib.load(model_path) + # Initialize the model + model = 
XGBRegressor() + + # Load the model from a saved JSON file + model.load_model(model_path) else: if not os.path.exists(TARGET_FILE): mr = project.get_model_registry() @@ -35,7 +39,11 @@ def get_model(project, model_name, file_name): EVALUATION_METRIC, SORT_METRICS_BY) model_dir = model.download() - model = joblib.load(model_dir + f"/{file_name}.pkl") + # Initialize the model + model = XGBRegressor() + + # Load the model from a saved JSON file + model.load_model(model_dir + "/model.json") return model diff --git a/advanced_tutorials/timeseries/features/price.py b/advanced_tutorials/timeseries/features/price.py index cf14602f..ad372ef8 100644 --- a/advanced_tutorials/timeseries/features/price.py +++ b/advanced_tutorials/timeseries/features/price.py @@ -66,7 +66,7 @@ def generate_historical_data(start_date: Optional[date] = None, end_date: Option data_list = [] - for date in tqdm(date_range, desc="Generating Data"): + for date in tqdm(date_range, desc="🔮 Generating Data"): generate_historical_day(date, start_date, data_list) df = pd.DataFrame(data_list, columns=['date', 'id', 'price']) diff --git a/advanced_tutorials/transformation_functions/custom/custom_transformation_functions.ipynb b/advanced_tutorials/transformation_functions/custom/custom_transformation_functions.ipynb index 2cd13a97..fc404587 100644 --- a/advanced_tutorials/transformation_functions/custom/custom_transformation_functions.ipynb +++ b/advanced_tutorials/transformation_functions/custom/custom_transformation_functions.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "0c382383", + "id": "d6fb35cf", "metadata": {}, "source": [ "# 👨🏻‍🏫 Custom Transformation Functions\n", @@ -12,7 +12,7 @@ }, { "cell_type": "markdown", - "id": "3994e496", + "id": "a19fd127", "metadata": {}, "source": [ "## 🗄️ Table of Contents\n", @@ -35,7 +35,7 @@ }, { "cell_type": "markdown", - "id": "a8743b40", + "id": "3cc6a7e9", "metadata": {}, "source": [ "\n", @@ -45,23 +45,23 @@ {
"cell_type": "code", "execution_count": null, - "id": "f10e0c78", + "id": "22c08e9f", "metadata": {}, "outputs": [], "source": [ "# Importing necessary libraries\n", - "import pandas as pd # For data manipulation and analysis using DataFrames\n", - "import numpy as np # For numerical computations and arrays\n", - "import os # For operating system-related functions\n", - "import joblib # For saving and loading model files\n", + "import pandas as pd # For data manipulation and analysis using DataFrames\n", + "import numpy as np # For numerical computations and arrays\n", + "import os # For operating system-related functions\n", + "import joblib # For saving and loading model files\n", "\n", - "import xgboost as xgb # For using the XGBoost machine learning library\n", + "import xgboost as xgb # For using the XGBoost machine learning library\n", "from sklearn.metrics import accuracy_score # For evaluating model accuracy" ] }, { "cell_type": "markdown", - "id": "814a7e1a", + "id": "97bc8784", "metadata": {}, "source": [ "---\n", @@ -73,7 +73,7 @@ }, { "cell_type": "markdown", - "id": "626ca429", + "id": "4562f488", "metadata": {}, "source": [ "\n", @@ -85,7 +85,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ed7aaf97", + "id": "1cfebd42", "metadata": {}, "outputs": [], "source": [ @@ -96,7 +96,7 @@ }, { "cell_type": "markdown", - "id": "fbd04249", + "id": "031974ca", "metadata": {}, "source": [ "Now let's add a target variable to the DataFrame. For simplicity and for demonstration purposes you will randomly assign either a 0 or a 1 to each row." 
@@ -105,7 +105,7 @@ { "cell_type": "code", "execution_count": null, - "id": "48e9dd48", + "id": "411520b2", "metadata": {}, "outputs": [], "source": [ @@ -119,7 +119,7 @@ }, { "cell_type": "markdown", - "id": "4bd983ab", + "id": "b2f4b822", "metadata": {}, "source": [ "\n", @@ -131,7 +131,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4826fc0e", + "id": "49800275", "metadata": {}, "outputs": [], "source": [ @@ -144,7 +144,7 @@ }, { "cell_type": "markdown", - "id": "55f77b7b", + "id": "60c9e83b", "metadata": {}, "source": [ "\n", @@ -160,7 +160,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ed3a1093", + "id": "d6e5c898", "metadata": {}, "outputs": [], "source": [ @@ -176,7 +176,7 @@ }, { "cell_type": "markdown", - "id": "c5d4d483", + "id": "eca75c35", "metadata": {}, "source": [ "---\n", @@ -188,7 +188,7 @@ }, { "cell_type": "markdown", - "id": "f38330c2", + "id": "06b8350d", "metadata": {}, "source": [ "\n", @@ -201,7 +201,7 @@ }, { "cell_type": "markdown", - "id": "694da24d", + "id": "7c66cd33", "metadata": {}, "source": [ "If your code is running internally within Hopsworks, to register custom transformation functions in the feature store they need to be either part of the library installed in Hopsworks or attached when starting a Jupyter notebook or Hopsworks job.\n", @@ -212,7 +212,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d460963b", + "id": "a3dc2540", "metadata": {}, "outputs": [], "source": [ @@ -222,7 +222,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7b67080b", + "id": "1dbb42d0", "metadata": {}, "outputs": [], "source": [ @@ -232,7 +232,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cbd50770", + "id": "52881e65", "metadata": {}, "outputs": [], "source": [ @@ -244,7 +244,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3efc5c75", + "id": "dd0bf161", "metadata": {}, "outputs": [], "source": [ @@ -255,7 +255,7 @@ }, { "cell_type": "markdown", - "id": "66b79ff3", + 
"id": "679112d9", "metadata": {}, "source": [ "\n", @@ -269,7 +269,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a1e9ba3e", + "id": "23d9adb7", "metadata": {}, "outputs": [], "source": [ @@ -284,7 +284,7 @@ }, { "cell_type": "markdown", - "id": "9d3c9e22", + "id": "f1ffb06c", "metadata": {}, "source": [ "You can register your transformation function using the `.create_transformation_function()` method with the next parameters:\n", @@ -301,7 +301,7 @@ { "cell_type": "code", "execution_count": null, - "id": "875367b2", + "id": "f187f4f5", "metadata": {}, "outputs": [], "source": [ @@ -326,7 +326,7 @@ }, { "cell_type": "markdown", - "id": "a4242b9e", + "id": "d229ab74", "metadata": {}, "source": [ "Now let's check if your custom transformation functions are present in the feature store." @@ -335,7 +335,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b5a3b9ff", + "id": "115f5ebc", "metadata": {}, "outputs": [], "source": [ @@ -350,7 +350,7 @@ }, { "cell_type": "markdown", - "id": "6c6e3352", + "id": "a6b8cf14", "metadata": {}, "source": [ "\n", @@ -364,7 +364,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f5dd6ce0", + "id": "cd0a0c45", "metadata": {}, "outputs": [], "source": [ @@ -383,7 +383,7 @@ }, { "cell_type": "markdown", - "id": "95df4a6f", + "id": "54875b4a", "metadata": {}, "source": [ "In Hopsworks Feature Store, a Query object allows you to select specific features from a feature group.\n", @@ -394,7 +394,7 @@ { "cell_type": "code", "execution_count": null, - "id": "98f4dbe0", + "id": "596fa8e0", "metadata": {}, "outputs": [], "source": [ @@ -407,7 +407,7 @@ }, { "cell_type": "markdown", - "id": "df294d0e", + "id": "83db5e88", "metadata": {}, "source": [ "After creating the Query object, you will create a feature view.\n", @@ -420,7 +420,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d1791335", + "id": "26622c9c", "metadata": {}, "outputs": [], "source": [ @@ -440,7 +440,7 @@ }, { "cell_type": 
"markdown", - "id": "f554ced8", + "id": "ccd708ba", "metadata": {}, "source": [ "## ๐Ÿ‹๏ธ Training Dataset Creation\n", @@ -456,7 +456,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8eb20081", + "id": "53b26f67", "metadata": {}, "outputs": [], "source": [ @@ -470,7 +470,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4a830bca", + "id": "30503127", "metadata": {}, "outputs": [], "source": [ @@ -480,7 +480,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d8342245", + "id": "92db837f", "metadata": {}, "outputs": [], "source": [ @@ -489,7 +489,7 @@ }, { "cell_type": "markdown", - "id": "d09cf48d", + "id": "248bc5d1", "metadata": {}, "source": [ "\n", @@ -503,7 +503,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ee5019a6", + "id": "9f1346a0", "metadata": {}, "outputs": [], "source": [ @@ -521,7 +521,7 @@ }, { "cell_type": "markdown", - "id": "49bedc3e", + "id": "b230ca05", "metadata": {}, "source": [ "## ๐Ÿ—„ Model Registry\n", @@ -535,7 +535,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b7a73fe0", + "id": "67616e13", "metadata": {}, "outputs": [], "source": [ @@ -544,7 +544,7 @@ }, { "cell_type": "markdown", - "id": "6dbde875", + "id": "f50bed8a", "metadata": {}, "source": [ "### โš™๏ธ Model Schema\n", @@ -555,7 +555,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8c52621c", + "id": "8f9569dc", "metadata": {}, "outputs": [], "source": [ @@ -574,7 +574,7 @@ }, { "cell_type": "markdown", - "id": "c1f6cb84", + "id": "8de1abf1", "metadata": {}, "source": [ "\n", @@ -588,7 +588,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e94a18cf", + "id": "2f790036", "metadata": {}, "outputs": [], "source": [ @@ -598,12 +598,12 @@ " os.mkdir(model_dir)\n", "\n", "# Save the model\n", - "joblib.dump(xgb_classifier, model_dir + '/xgb_classifier.pkl')" + "xgb_classifier.save_model(model_dir + \"/model.json\")" ] }, { "cell_type": "markdown", - "id": "f6c4dd90", + "id": "dfbba175", 
"metadata": {}, "source": [ "To register your model in the Hopsworks model registry you can use `.create_model()` method with the next parameters:\n", @@ -622,7 +622,7 @@ { "cell_type": "code", "execution_count": null, - "id": "da7415dc", + "id": "5adae94c", "metadata": {}, "outputs": [], "source": [ @@ -640,7 +640,7 @@ }, { "cell_type": "markdown", - "id": "33fcad3c", + "id": "9d416af2", "metadata": {}, "source": [ "---\n", @@ -652,7 +652,7 @@ }, { "cell_type": "markdown", - "id": "be4e238f", + "id": "d8c45f8e", "metadata": {}, "source": [ "\n", @@ -670,7 +670,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2b287444", + "id": "42290763", "metadata": {}, "outputs": [], "source": [ @@ -685,18 +685,21 @@ { "cell_type": "code", "execution_count": null, - "id": "9854b853", + "id": "28d78f36", "metadata": {}, "outputs": [], "source": [ - "# Retrieve the XGB model\n", - "retrieved_xgboost_model = joblib.load(saved_model_dir + \"/xgb_classifier.pkl\")\n", - "retrieved_xgboost_model" + "# Initialize the model\n", + "model = xgb.XGBClassifier()\n", + "\n", + "# Load the model from a saved JSON file\n", + "model.load_model(saved_model_dir + \"/model.json\")\n", + "model" ] }, { "cell_type": "markdown", - "id": "b8e2e23e", + "id": "bf8d901d", "metadata": {}, "source": [ "\n", @@ -707,7 +710,7 @@ }, { "cell_type": "markdown", - "id": "b3fbe4ff", + "id": "9098714a", "metadata": {}, "source": [ "To retrieve batch data from the feature view you need to use `init_batch_scoring` method of the feature view object.\n", @@ -720,7 +723,7 @@ { "cell_type": "code", "execution_count": null, - "id": "94123860", + "id": "b8be1550", "metadata": {}, "outputs": [], "source": [ @@ -734,7 +737,7 @@ }, { "cell_type": "markdown", - "id": "ed400e3c", + "id": "1bcf497b", "metadata": {}, "source": [ "Now let's use retrieved model to predict batch data." 
@@ -743,18 +746,18 @@ { "cell_type": "code", "execution_count": null, - "id": "9b96bb01", + "id": "c930266d", "metadata": {}, "outputs": [], "source": [ "# Predict batch data using retrieved model\n", - "predictions_batch = retrieved_xgboost_model.predict(batch_data)\n", + "predictions_batch = model.predict(batch_data)\n", "predictions_batch[:10]" ] }, { "cell_type": "markdown", - "id": "eaf4631d", + "id": "93db8b23", "metadata": {}, "source": [ "\n", @@ -768,7 +771,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2317eadf", + "id": "a52389de", "metadata": {}, "outputs": [], "source": [ @@ -787,7 +790,7 @@ }, { "cell_type": "markdown", - "id": "1686bcff", + "id": "e50f54a7", "metadata": {}, "source": [ "The next step is to initialize the feature view for serving and then retrieve a feature vector with specified primary keys." @@ -796,7 +799,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5cdb19e0", + "id": "b8f9595f", "metadata": {}, "outputs": [], "source": [ @@ -815,7 +818,7 @@ }, { "cell_type": "markdown", - "id": "5e581a8e", + "id": "9c659edd", "metadata": {}, "source": [ "Now you can use your model to predict the feature vector." @@ -824,18 +827,18 @@ { "cell_type": "code", "execution_count": null, - "id": "b6fc77d8", + "id": "6b6aa6c1", "metadata": {}, "outputs": [], "source": [ "# Predict feature vector using retrieved model\n", - "prediction_feature_vector = retrieved_xgboost_model.predict(to_numpy(feature_vector))\n", + "prediction_feature_vector = model.predict(to_numpy(feature_vector))\n", "prediction_feature_vector" ] }, { "cell_type": "markdown", - "id": "01e82837", + "id": "dd1e7328", "metadata": {}, "source": [ "In addition, you can retrieve several feature vectors. Just pass primary keys as a list of dictionaries." 
@@ -844,7 +847,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ac541e0b", + "id": "c9d8fbc8", "metadata": {}, "outputs": [], "source": [ @@ -862,7 +865,7 @@ }, { "cell_type": "markdown", - "id": "b6a5d21a", + "id": "ccfce535", "metadata": {}, "source": [ "Now you can use your model to predict feature vectors." @@ -871,18 +874,18 @@ { "cell_type": "code", "execution_count": null, - "id": "3ee7ffa0", + "id": "8db998a2", "metadata": {}, "outputs": [], "source": [ "# Predict feature vectors using retrieved model\n", - "prediction_feature_vectors = retrieved_xgboost_model.predict(to_numpy(feature_vectors))\n", + "prediction_feature_vectors = model.predict(to_numpy(feature_vectors))\n", "prediction_feature_vectors" ] }, { "cell_type": "markdown", - "id": "0b1149b1", + "id": "0c202c74", "metadata": {}, "source": [ "---" @@ -891,7 +894,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -905,7 +908,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/churn/1_churn_feature_pipeline.ipynb b/churn/1_churn_feature_pipeline.ipynb index 8a071f36..5427f97f 100644 --- a/churn/1_churn_feature_pipeline.ipynb +++ b/churn/1_churn_feature_pipeline.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "e31bb98b", + "id": "2de72615", "metadata": {}, "source": [ "# **Hopsworks Feature Store** - Part 01: Feature Pipeline\n", @@ -23,7 +23,7 @@ }, { "cell_type": "markdown", - "id": "4ab5155e", + "id": "e08a069b", "metadata": {}, "source": [ "The data you will use comes from three different CSV files:\n", @@ -40,7 +40,7 @@ }, { "cell_type": "markdown", - "id": "5b5c3917", + "id": "90f5f948", "metadata": {}, "source": [ "### ๐Ÿ“ Imports" @@ -49,7 +49,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9d961f87", + "id": "859fbe6b", "metadata": 
{}, "outputs": [], "source": [ @@ -59,20 +59,20 @@ { "cell_type": "code", "execution_count": null, - "id": "59ad59a9", + "id": "f73d7642", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", - "#ignore warnings\n", + "# Ignore warnings\n", "import warnings\n", "warnings.filterwarnings('ignore')" ] }, { "cell_type": "markdown", - "id": "56fbc5cc", + "id": "d7d39005", "metadata": {}, "source": [ "## ๐Ÿ’ฝ Loading the Data \n" @@ -81,7 +81,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cab1512d", + "id": "abff9db7", "metadata": {}, "outputs": [], "source": [ @@ -91,20 +91,20 @@ "# Read customer info data with datetime parsing\n", "customer_info_df = pd.read_csv(\n", " \"https://repo.hops.works/dev/davit/churn/customer_info.csv\",\n", - " parse_dates=['datetime']\n", + " parse_dates=['datetime'],\n", ")\n", "\n", "# Read subscriptions data with datetime parsing\n", "subscriptions_df = pd.read_csv(\n", " \"https://repo.hops.works/dev/davit/churn/subscriptions.csv\",\n", - " parse_dates=['datetime']\n", + " parse_dates=['datetime'],\n", ")" ] }, { "cell_type": "code", "execution_count": null, - "id": "95652438-1a10-401e-9438-ca06b6f5ea96", + "id": "47371dd8", "metadata": {}, "outputs": [], "source": [ @@ -114,7 +114,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0401bede-2d69-40a8-ad21-55602a7c72c2", + "id": "171c0a5d", "metadata": {}, "outputs": [], "source": [ @@ -124,7 +124,7 @@ { "cell_type": "code", "execution_count": null, - "id": "86a445a2-712b-4d0e-aea3-326932ef7dc3", + "id": "58f7c4ef", "metadata": {}, "outputs": [], "source": [ @@ -133,7 +133,7 @@ }, { "cell_type": "markdown", - "id": "cfddf1b6", + "id": "5a7a2f89", "metadata": {}, "source": [ "---\n", @@ -145,7 +145,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b26eb398", + "id": "4d0ddecb", "metadata": {}, "outputs": [], "source": [ @@ -164,7 +164,7 @@ }, { "cell_type": "markdown", - "id": "ae5008cf", + "id": "2ca91d6f", "metadata": {}, 
"source": [ "---\n", @@ -183,7 +183,7 @@ { "cell_type": "code", "execution_count": null, - "id": "81531cc3", + "id": "2f835078", "metadata": {}, "outputs": [], "source": [ @@ -196,7 +196,7 @@ }, { "cell_type": "markdown", - "id": "7a094e86", + "id": "2195a094", "metadata": {}, "source": [ "To create a feature group you need to give it a name and specify a primary key. It is also good to provide a description of the contents of the feature group." @@ -205,7 +205,7 @@ { "cell_type": "code", "execution_count": null, - "id": "921a5010", + "id": "5be5d163", "metadata": {}, "outputs": [], "source": [ @@ -221,7 +221,7 @@ }, { "cell_type": "markdown", - "id": "1a06298e", + "id": "9454e269", "metadata": {}, "source": [ "A full list of arguments can be found in the [documentation](https://docs.hopsworks.ai/feature-store-api/latest/generated/api/feature_store_api/#create_feature_group).\n", @@ -232,7 +232,7 @@ { "cell_type": "code", "execution_count": null, - "id": "05d9cbd8", + "id": "c820fdb0", "metadata": {}, "outputs": [], "source": [ @@ -243,7 +243,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fd8469e7", + "id": "9a751ee7", "metadata": {}, "outputs": [], "source": [ @@ -267,7 +267,7 @@ { "cell_type": "code", "execution_count": null, - "id": "942ecf89", + "id": "e97881ff", "metadata": {}, "outputs": [], "source": [ @@ -285,7 +285,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bf4e3842", + "id": "cbabcd85", "metadata": {}, "outputs": [], "source": [ @@ -305,7 +305,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2af92271", + "id": "0d43fb2e", "metadata": {}, "outputs": [], "source": [ @@ -324,7 +324,7 @@ { "cell_type": "code", "execution_count": null, - "id": "16aca181", + "id": "472be52b", "metadata": {}, "outputs": [], "source": [ @@ -349,7 +349,7 @@ }, { "cell_type": "markdown", - "id": "6be2416d", + "id": "0f38854f", "metadata": {}, "source": [ "All three feature groups are now accessible and searchable in the UI\n", @@ 
-359,7 +359,7 @@ }, { "cell_type": "markdown", - "id": "26023b4f", + "id": "817cab5b", "metadata": {}, "source": [ "---\n", @@ -376,7 +376,7 @@ "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -390,7 +390,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/churn/2_churn_training_pipeline.ipynb b/churn/2_churn_training_pipeline.ipynb index 8a041acb..0d34eddd 100644 --- a/churn/2_churn_training_pipeline.ipynb +++ b/churn/2_churn_training_pipeline.ipynb @@ -47,7 +47,6 @@ "metadata": {}, "outputs": [], "source": [ - "import joblib\n", "import os\n", "from PIL import Image\n", "\n", @@ -164,7 +163,7 @@ " \"multiplelines\", \"internetservice\", \"onlinesecurity\", \"onlinebackup\",\n", " \"deviceprotection\", \"techsupport\", \"streamingmovies\", \"streamingtv\",\n", " \"phoneservice\", \"paperlessbilling\", \"contract\", \"paymentmethod\", \"gender\", \n", - " \"dependents\", \"partner\"\n", + " \"dependents\", \"partner\",\n", "]\n", "\n", "# Map features to their corresponding transformation functions\n", @@ -297,10 +296,10 @@ "outputs": [], "source": [ "# Create an instance of the XGBClassifier with a specified scale_pos_weight\n", - "classifier = xgb.XGBClassifier(scale_pos_weight=3)\n", + "model = xgb.XGBClassifier(scale_pos_weight=3)\n", "\n", "# Fit the classifier on the training data\n", - "classifier.fit(X_train, y_train)" + "model.fit(X_train, y_train)" ] }, { @@ -318,7 +317,10 @@ "outputs": [], "source": [ "# Generate the confusion matrix using the true labels (y_test) and predicted labels from the classifier\n", - "conf_matrix = confusion_matrix(y_test, classifier.predict(X_test)).astype(int)\n", + "conf_matrix = confusion_matrix(\n", + " y_test, \n", + " model.predict(X_test),\n", + 
").astype(int)\n", "\n", "# Create a DataFrame from the confusion matrix results with appropriate labels\n", "df_cm = pd.DataFrame(\n", @@ -329,7 +331,12 @@ "\n", "# Create a heatmap using seaborn with annotations\n", "figure_cm = plt.figure(figsize=(10, 7))\n", - "figure_cm = sns.heatmap(df_cm, annot=True, annot_kws={\"size\": 14}, fmt='.10g')\n", + "figure_cm = sns.heatmap(\n", + " df_cm, \n", + " annot=True, \n", + " annot_kws={\"size\": 14}, \n", + " fmt='.10g',\n", + ")\n", "\n", "# Set the title for the confusion matrix plot\n", "plt.title('Confusion Matrix', fontsize=17)\n", @@ -385,7 +392,10 @@ "output_schema = Schema(y_train)\n", "\n", "# Create a ModelSchema object specifying the input and output schemas\n", - "model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)\n", + "model_schema = ModelSchema(\n", + " input_schema=input_schema, \n", + " output_schema=output_schema,\n", + ")\n", "\n", "# Convert the model schema to a dictionary\n", "model_schema.to_dict()" @@ -404,11 +414,8 @@ "if not os.path.isdir(model_dir):\n", " os.mkdir(model_dir)\n", "\n", - "# Specify the file name for the pickled model\n", - "pkl_file_name = model_dir + '/churnmodel.pkl'\n", - "\n", - "# Save the trained classifier using joblib\n", - "joblib.dump(classifier, pkl_file_name)\n", + "# Save the trained classifier as json file\n", + "model.save_model(model_dir + \"/model.json\")\n", "\n", "# Save the confusion matrix heatmap as an image in the model directory\n", "figure_cm.figure.savefig(model_dir + '/confusion_matrix.png')" @@ -440,7 +447,9 @@ "\n", "In the following notebook you will use your model for batch inference.\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/churn/3_churn_batch_inference.ipynb)" + "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/churn/3_churn_batch_inference.ipynb)\n", + "\n", + "---" ] } ], @@ -449,7 +458,7 @@ "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -463,7 +472,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/churn/3_churn_batch_inference.ipynb b/churn/3_churn_batch_inference.ipynb index 8fd870e3..c907d5a1 100644 --- a/churn/3_churn_batch_inference.ipynb +++ b/churn/3_churn_batch_inference.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "7ffd2cd9", + "id": "2710cb61", "metadata": {}, "source": [ "# **Hopsworks Feature Store** - Part 03: Batch Inference\n", @@ -12,7 +12,7 @@ }, { "cell_type": "markdown", - "id": "c0f756d8", + "id": "91221f50", "metadata": {}, "source": [ "### ๐Ÿ“ Imports" @@ -21,7 +21,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ee2ab866", + "id": "a35f9502", "metadata": {}, "outputs": [], "source": [ @@ -31,12 +31,14 @@ { "cell_type": "code", "execution_count": null, - "id": "a11d2b28", + "id": "5634f0a1", "metadata": {}, "outputs": [], "source": [ - "import joblib\n", - "from xgboost import plot_importance\n", + "from xgboost import (\n", + " XGBClassifier, \n", + " plot_importance,\n", + ")\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", @@ -46,7 +48,7 @@ }, { "cell_type": "markdown", - "id": "21af1eec", + "id": "65b0eb1d", "metadata": {}, "source": [ "## ๐Ÿ“ก Connecting to Hopsworks Feature Store " @@ -55,7 +57,7 @@ { "cell_type": "code", "execution_count": null, - "id": "61b5b046", + "id": "c94353e7", "metadata": {}, "outputs": [], "source": [ @@ -68,7 +70,7 @@ }, { 
"cell_type": "markdown", - "id": "c0a0c398", + "id": "b4d72861", "metadata": {}, "source": [ "## โš™๏ธ Feature View Retrieval\n" @@ -77,7 +79,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f4b10aff", + "id": "d55ff11f", "metadata": {}, "outputs": [], "source": [ @@ -90,7 +92,7 @@ }, { "cell_type": "markdown", - "id": "7e7bec90", + "id": "6f7b31a0", "metadata": {}, "source": [ "## ๐Ÿ—„ Model Registry\n" @@ -99,7 +101,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d187a98a", + "id": "8d0d05bc", "metadata": {}, "outputs": [], "source": [ @@ -109,7 +111,7 @@ }, { "cell_type": "markdown", - "id": "b43409c0", + "id": "38d83203", "metadata": {}, "source": [ "---\n", @@ -122,7 +124,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e2563560", + "id": "cc8a1687", "metadata": {}, "outputs": [], "source": [ @@ -139,20 +141,21 @@ { "cell_type": "code", "execution_count": null, - "id": "5e2159d5", + "id": "ca62f805", "metadata": {}, "outputs": [], "source": [ - "# Load the saved XGBoost model using joblib from the downloaded model directory\n", - "retrieved_xgboost_model = joblib.load(saved_model_dir + \"/churnmodel.pkl\")\n", + "# Initialize the model\n", + "model = XGBClassifier()\n", "\n", - "# Display the retrieved XGBoost model\n", - "retrieved_xgboost_model" + "# Load the model from a saved JSON file\n", + "model.load_model(saved_model_dir + \"/model.json\")\n", + "model" ] }, { "cell_type": "markdown", - "id": "53ee5a05", + "id": "eb7680ed", "metadata": {}, "source": [ "---\n", @@ -162,7 +165,7 @@ { "cell_type": "code", "execution_count": null, - "id": "aec4ef7b", + "id": "67a4ac87", "metadata": {}, "outputs": [], "source": [ @@ -180,7 +183,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ebb02e76", + "id": "880d106f", "metadata": {}, "outputs": [], "source": [ @@ -196,7 +199,7 @@ }, { "cell_type": "markdown", - "id": "43a879bf", + "id": "c522b923", "metadata": {}, "source": [ "Let's predict the all for all customer 
data and then visualize predictions." @@ -205,7 +208,7 @@ { "cell_type": "code", "execution_count": null, - "id": "422ed460", + "id": "7e48ba91", "metadata": {}, "outputs": [], "source": [ @@ -213,7 +216,7 @@ "batch_data.drop('customerid', axis=1, inplace=True)\n", "\n", "# Use the retrieved XGBoost model to make predictions on the batch data\n", - "predictions = retrieved_xgboost_model.predict(batch_data)\n", + "predictions = model.predict(batch_data)\n", "\n", "# Transform numeric predictions to human-readable labels\n", "predictions = transform_preds(predictions)\n", @@ -224,7 +227,7 @@ }, { "cell_type": "markdown", - "id": "d95fa5e6", + "id": "e5a41f2e", "metadata": {}, "source": [ "---\n", @@ -236,7 +239,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fd8413b1", + "id": "8dcefec6", "metadata": {}, "outputs": [], "source": [ @@ -264,7 +267,7 @@ }, { "cell_type": "markdown", - "id": "cbae562e", + "id": "4ee29f72", "metadata": {}, "source": [ "Lets plot feature importance " @@ -273,13 +276,13 @@ { "cell_type": "code", "execution_count": null, - "id": "daaa2131", + "id": "669cd297", "metadata": {}, "outputs": [], "source": [ "# Plot feature importance using XGBoost's plot_importance function\n", "figure_imp = plot_importance(\n", - " retrieved_xgboost_model, # The retrieved XGBoost model\n", + " model, # The retrieved XGBoost model\n", " max_num_features=10, # Maximum number of features to display\n", " importance_type='weight', # Type of importance to display ('weight' represents the number of times a feature appears in a tree across all trees)\n", ")\n", @@ -291,7 +294,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7b9256cf", + "id": "b554d9c5", "metadata": {}, "outputs": [], "source": [ @@ -312,7 +315,7 @@ }, { "cell_type": "markdown", - "id": "9d4d273e", + "id": "97b466fb", "metadata": {}, "source": [ "Lets visualise couple of more imporant features such as `streamingtv` and `streamingmovies`" @@ -321,7 +324,7 @@ { "cell_type": 
"code", "execution_count": null, - "id": "9bbddda9", + "id": "1f7ad6b4", "metadata": {}, "outputs": [], "source": [ @@ -343,7 +346,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f91b7d1d", + "id": "6d83fe76", "metadata": {}, "outputs": [], "source": [ @@ -365,7 +368,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4598b999", + "id": "22b5744a", "metadata": {}, "outputs": [], "source": [ @@ -387,7 +390,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fdaaa590", + "id": "785a8c93", "metadata": {}, "outputs": [], "source": [ @@ -409,7 +412,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7674b581", + "id": "f844088e", "metadata": {}, "outputs": [], "source": [ @@ -431,7 +434,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e95d8849", + "id": "3b1bae8e", "metadata": {}, "outputs": [], "source": [ @@ -452,7 +455,7 @@ }, { "cell_type": "markdown", - "id": "2421f67a", + "id": "203cb336", "metadata": {}, "source": [ "---\n", @@ -471,7 +474,7 @@ }, { "cell_type": "markdown", - "id": "471dd2be", + "id": "969cdae1", "metadata": {}, "source": [ "---\n", @@ -488,7 +491,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d557008b", + "id": "76d0033b", "metadata": {}, "outputs": [], "source": [ @@ -505,7 +508,7 @@ }, { "cell_type": "markdown", - "id": "98dc224a", + "id": "0a4eeba0", "metadata": {}, "source": [ "![fg-statistics](../churn/images/churn_statistics.gif)\n", @@ -518,7 +521,7 @@ }, { "cell_type": "markdown", - "id": "211ff1e0", + "id": "e3d0628e", "metadata": {}, "source": [ "---\n", @@ -534,7 +537,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -548,7 +551,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/churn/streamlit_batch_inference_app.py 
b/churn/streamlit_batch_inference_app.py index bb544d4a..8639718e 100644 --- a/churn/streamlit_batch_inference_app.py +++ b/churn/streamlit_batch_inference_app.py @@ -2,7 +2,7 @@ import hopsworks import plotly.graph_objs as go import plotly.express as px -import joblib +from xgboost import XGBClassifier import math import pandas as pd @@ -50,9 +50,18 @@ def retrive_data(feature_view=feature_view): @st.cache_data() def get_model(project=project): mr = project.get_model_registry() - model = mr.get_model("churnmodel", version=1) + model = mr.get_model( + name="churnmodel", + version=1, + ) model_dir = model.download() - return joblib.load(model_dir + "/churnmodel.pkl") + + # Initialize the model + model = XGBClassifier() + + # Load the model from a saved JSON file + model.load_model(model_dir + "/model.json") + return model model = get_model() diff --git a/fraud_batch/1_fraud_batch_feature_pipeline.ipynb b/fraud_batch/1_fraud_batch_feature_pipeline.ipynb old mode 100755 new mode 100644 index 9def1498..1c9767d8 --- a/fraud_batch/1_fraud_batch_feature_pipeline.ipynb +++ b/fraud_batch/1_fraud_batch_feature_pipeline.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "c8572ae1", + "id": "f60c52ce", "metadata": { "tags": [] }, @@ -27,7 +27,7 @@ }, { "cell_type": "markdown", - "id": "3f7a0ac5", + "id": "7ec7724a", "metadata": {}, "source": [ "## ๐Ÿ“ Imports" @@ -36,7 +36,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3c57262d", + "id": "42efa933", "metadata": {}, "outputs": [], "source": [ @@ -46,7 +46,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cd1541d9", + "id": "a4120a95", "metadata": {}, "outputs": [], "source": [ @@ -64,7 +64,7 @@ }, { "cell_type": "markdown", - "id": "7d2e72bb", + "id": "010286f1", "metadata": {}, "source": [ "## ๐Ÿ’ฝ Loading the Data \n", @@ -84,7 +84,7 @@ { "cell_type": "code", "execution_count": null, - "id": "26bad78a", + "id": "491a37b4", "metadata": {}, "outputs": [], "source": [ @@ -100,7 +100,7 
@@ { "cell_type": "code", "execution_count": null, - "id": "b2bbe076", + "id": "9ed8c4f0", "metadata": {}, "outputs": [], "source": [ @@ -118,7 +118,7 @@ { "cell_type": "code", "execution_count": null, - "id": "46874951", + "id": "2543c511", "metadata": {}, "outputs": [], "source": [ @@ -135,7 +135,7 @@ }, { "cell_type": "markdown", - "id": "45720ec1", + "id": "717f9102", "metadata": {}, "source": [ "---" @@ -143,7 +143,7 @@ }, { "cell_type": "markdown", - "id": "41be3e46", + "id": "16357302", "metadata": {}, "source": [ "## ๐Ÿ› ๏ธ Feature Engineering \n", @@ -158,7 +158,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3407147d", + "id": "0b919bf8", "metadata": {}, "outputs": [], "source": [ @@ -181,7 +181,7 @@ { "cell_type": "code", "execution_count": null, - "id": "05f1d168-0849-427e-a6f9-08bc282c28d6", + "id": "5d6fd9ae", "metadata": {}, "outputs": [], "source": [ @@ -191,7 +191,7 @@ }, { "cell_type": "markdown", - "id": "86626356", + "id": "92820fab", "metadata": {}, "source": [ "Next, you will create features that for each credit card aggregate data from multiple time steps.\n", @@ -203,7 +203,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2ac9cb86", + "id": "47116f13", "metadata": {}, "outputs": [], "source": [ @@ -223,7 +223,7 @@ }, { "cell_type": "markdown", - "id": "80c0764b", + "id": "0b3abcfc", "metadata": {}, "source": [ "Next lets compute windowed aggregates. Here you will use 4-hour windows, but feel free to experiment with different window lengths by setting `window_len` below to a value of your choice." 
@@ -232,7 +232,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f95d09f1", + "id": "f9bf9dd7", "metadata": {}, "outputs": [], "source": [ @@ -248,7 +248,7 @@ }, { "cell_type": "markdown", - "id": "e4cc229e", + "id": "b3021fa4", "metadata": {}, "source": [ "### โš™๏ธ Convert date time object to unix epoch in milliseconds " @@ -257,7 +257,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3bbf8852", + "id": "837bafd5", "metadata": {}, "outputs": [], "source": [ @@ -270,7 +270,7 @@ }, { "cell_type": "markdown", - "id": "70f69c73", + "id": "a3df26a9", "metadata": {}, "source": [ "## ๐Ÿ‘ฎ๐Ÿปโ€โ™‚๏ธ Great Expectations " @@ -279,7 +279,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5bf25c13", + "id": "c9391579", "metadata": {}, "outputs": [], "source": [ @@ -299,7 +299,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8e420315", + "id": "8969ff27", "metadata": {}, "outputs": [], "source": [ @@ -340,7 +340,7 @@ }, { "cell_type": "markdown", - "id": "21be72c5", + "id": "adf03efd", "metadata": {}, "source": [ "---" @@ -348,7 +348,7 @@ }, { "cell_type": "markdown", - "id": "be723483", + "id": "7c069f5a", "metadata": {}, "source": [ "## ๐Ÿ“ก Connecting to Hopsworks Feature Store " @@ -356,7 +356,7 @@ }, { "cell_type": "markdown", - "id": "da57c80c", + "id": "ac32437d", "metadata": { "tags": [] }, @@ -373,7 +373,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f7329f2c", + "id": "287491fa", "metadata": {}, "outputs": [], "source": [ @@ -386,7 +386,7 @@ }, { "cell_type": "markdown", - "id": "30358563", + "id": "23b61089", "metadata": {}, "source": [ "To create a feature group you need to give it a name and specify a primary key. It is also good to provide a description of the contents of the feature group and a version number, if it is not defined it will automatically be incremented to `1`." 
@@ -395,7 +395,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a7dea9fc", + "id": "58353322", "metadata": {}, "outputs": [], "source": [ @@ -412,7 +412,7 @@ }, { "cell_type": "markdown", - "id": "af58171c", + "id": "832333fa", "metadata": {}, "source": [ "A full list of arguments can be found in the [documentation](https://docs.hopsworks.ai/feature-store-api/latest/generated/api/feature_store_api/#create_feature_group).\n", @@ -423,18 +423,19 @@ { "cell_type": "code", "execution_count": null, - "id": "64eef16a", + "id": "7f82e4a2", "metadata": {}, "outputs": [], "source": [ "# Insert data into feature group\n", - "trans_fg.insert(trans_df)" + "trans_fg.insert(trans_df)\n", + "print('โœ… Done!')" ] }, { "cell_type": "code", "execution_count": null, - "id": "39d48ea9", + "id": "0c6832a9", "metadata": {}, "outputs": [], "source": [ @@ -461,7 +462,7 @@ }, { "cell_type": "markdown", - "id": "dae17609", + "id": "36f7e30a", "metadata": {}, "source": [ "At the creation of the feature group, you will be prompted with an URL that will directly link to it; there you will be able to explore some of the aspects of your newly created feature group.\n", @@ -471,7 +472,7 @@ }, { "cell_type": "markdown", - "id": "0d90bbfd", + "id": "1250c3ce", "metadata": {}, "source": [ "You can move on and do the same thing for the feature group with our windows aggregation." 
@@ -480,7 +481,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e12d198e", + "id": "6b8ef9f6", "metadata": {}, "outputs": [], "source": [ @@ -497,18 +498,19 @@ { "cell_type": "code", "execution_count": null, - "id": "19458252", + "id": "1970657b", "metadata": {}, "outputs": [], "source": [ "# Insert data into feature group\n", - "window_aggs_fg.insert(window_aggs_df)" + "window_aggs_fg.insert(window_aggs_df)\n", + "print('โœ… Done!')" ] }, { "cell_type": "code", "execution_count": null, - "id": "e967a252", + "id": "68b0c84f", "metadata": {}, "outputs": [], "source": [ @@ -528,7 +530,7 @@ }, { "cell_type": "markdown", - "id": "2030383e", + "id": "e22dce87", "metadata": {}, "source": [ "Both feature groups are now accessible and searchable in the UI\n", @@ -538,7 +540,7 @@ }, { "cell_type": "markdown", - "id": "8e6d4483", + "id": "4f8b5d8a", "metadata": {}, "source": [ "## โญ๏ธ **Next:** Part 02: Training Pipeline\n", @@ -553,7 +555,7 @@ "hash": "e1ddeae6eefc765c17da80d38ea59b893ab18c0c0904077a035ef84cfe367f83" }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -567,7 +569,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/fraud_batch/2_fraud_batch_training_pipeline.ipynb b/fraud_batch/2_fraud_batch_training_pipeline.ipynb index 0cce3e40..a1252678 100644 --- a/fraud_batch/2_fraud_batch_training_pipeline.ipynb +++ b/fraud_batch/2_fraud_batch_training_pipeline.ipynb @@ -42,7 +42,6 @@ "metadata": {}, "outputs": [], "source": [ - "import joblib\n", "import os\n", "\n", "import pandas as pd\n", @@ -225,7 +224,7 @@ "TEST_SIZE = 0.2\n", "\n", "X_train, X_test, y_train, y_test = feature_view.train_test_split(\n", - " test_size = TEST_SIZE,\n", + " test_size=TEST_SIZE,\n", ")" ] }, @@ -308,10 +307,10 @@ "outputs": [], "source": [ "# Create an instance of the 
XGBClassifier\n", - "clf = xgb.XGBClassifier()\n", + "model = xgb.XGBClassifier()\n", "\n", "# Fit the classifier on the training data\n", - "clf.fit(X_train.values, y_train)" + "model.fit(X_train, y_train)" ] }, { @@ -321,10 +320,10 @@ "outputs": [], "source": [ "# Predict the training data using the trained classifier\n", - "y_pred_train = clf.predict(X_train.values)\n", + "y_pred_train = model.predict(X_train)\n", "\n", "# Predict the test data using the trained classifier\n", - "y_pred_test = clf.predict(X_test.values)" + "y_pred_test = model.predict(X_test)" ] }, { @@ -439,7 +438,7 @@ " os.mkdir(model_dir)\n", "\n", "# Save the trained XGBoost model using joblib\n", - "joblib.dump(clf, model_dir + '/xgboost_fraud_batch_model.pkl')\n", + "model.save_model(model_dir + \"/model.json\")\n", "\n", "# Save the confusion matrix heatmap as an image in the model directory\n", "fig.savefig(model_dir + \"/confusion_matrix.png\")" @@ -483,7 +482,7 @@ "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -497,7 +496,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/fraud_batch/3_fraud_batch_inference.ipynb b/fraud_batch/3_fraud_batch_inference.ipynb index fab4c5fc..6b72e59d 100644 --- a/fraud_batch/3_fraud_batch_inference.ipynb +++ b/fraud_batch/3_fraud_batch_inference.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "c9fec917", + "id": "0be78559", "metadata": {}, "source": [ "# **Hopsworks Feature Store** - Part 03: Batch Inference\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "731cffd4", + "id": "43743c5d", "metadata": {}, "source": [ "## ๐Ÿ“ Imports" @@ -25,16 +25,16 @@ { "cell_type": "code", "execution_count": null, - "id": "b96c942c", + "id": "13f0abf3", "metadata": 
{}, "outputs": [], "source": [ - "import joblib" + "from xgboost import XGBClassifier" ] }, { "cell_type": "markdown", - "id": "dfb09860", + "id": "6a310165", "metadata": {}, "source": [ "## ๐Ÿ“ก Connecting to Hopsworks Feature Store " @@ -43,7 +43,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6c71483d", + "id": "3fe3e577", "metadata": {}, "outputs": [], "source": [ @@ -56,7 +56,7 @@ }, { "cell_type": "markdown", - "id": "c637bf4c", + "id": "f5b8fa76", "metadata": {}, "source": [ "## โš™๏ธ Feature View Retrieval\n" @@ -65,7 +65,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9b8bb207", + "id": "4cd9db18", "metadata": {}, "outputs": [], "source": [ @@ -78,7 +78,7 @@ }, { "cell_type": "markdown", - "id": "8ec3cafe", + "id": "0067bc06", "metadata": {}, "source": [ "## ๐Ÿ—„ Model Registry\n" @@ -87,7 +87,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1b412d1e", + "id": "4e6b2247", "metadata": {}, "outputs": [], "source": [ @@ -97,7 +97,7 @@ }, { "cell_type": "markdown", - "id": "d0587a64", + "id": "05dcb148", "metadata": {}, "source": [ "## ๐Ÿš€ Fetch and test the model\n", @@ -108,7 +108,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f0be44ff", + "id": "8a34352e", "metadata": {}, "outputs": [], "source": [ @@ -125,20 +125,21 @@ { "cell_type": "code", "execution_count": null, - "id": "88b09ee2", + "id": "d27adabf", "metadata": {}, "outputs": [], "source": [ - "# Load the saved XGBoost model using joblib from the downloaded model directory\n", - "retrieved_xgboost_model = joblib.load(saved_model_dir + \"/xgboost_fraud_batch_model.pkl\")\n", + "# Initialize the model\n", + "model = XGBClassifier()\n", "\n", - "# Display the retrieved XGBoost model\n", - "retrieved_xgboost_model" + "# Load the model from a saved JSON file\n", + "model.load_model(saved_model_dir + \"/model.json\")\n", + "model" ] }, { "cell_type": "markdown", - "id": "aba7238a", + "id": "ae9b9f92", "metadata": {}, "source": [ "---\n", @@ -148,7 
+149,7 @@ { "cell_type": "code", "execution_count": null, - "id": "34ac6398", + "id": "8fc906eb", "metadata": {}, "outputs": [], "source": [ @@ -168,12 +169,12 @@ { "cell_type": "code", "execution_count": null, - "id": "0695d933", + "id": "2e58817d", "metadata": {}, "outputs": [], "source": [ "# Use the retrieved XGBoost model to make predictions on the batch data\n", - "predictions = retrieved_xgboost_model.predict(batch_data)\n", + "predictions = model.predict(batch_data)\n", "\n", "# Display the first five predictions\n", "predictions[:5]" @@ -181,7 +182,7 @@ }, { "cell_type": "markdown", - "id": "5ccd12a2", + "id": "ee5fcba9", "metadata": {}, "source": [ "---\n", @@ -197,7 +198,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -211,7 +212,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/fraud_online/1_fraud_online_feature_pipeline.ipynb b/fraud_online/1_fraud_online_feature_pipeline.ipynb old mode 100755 new mode 100644 index 17acf932..0201475e --- a/fraud_online/1_fraud_online_feature_pipeline.ipynb +++ b/fraud_online/1_fraud_online_feature_pipeline.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "debc8314", + "id": "c997f05a", "metadata": { "tags": [] }, @@ -27,7 +27,7 @@ }, { "cell_type": "markdown", - "id": "14ce4ec5", + "id": "3ebdad2e", "metadata": {}, "source": [ "## ๐Ÿ“ Imports" @@ -36,7 +36,7 @@ { "cell_type": "code", "execution_count": null, - "id": "138332f0", + "id": "1aa7ce8a", "metadata": {}, "outputs": [], "source": [ @@ -46,7 +46,7 @@ { "cell_type": "code", "execution_count": null, - "id": "63616634", + "id": "49806257", "metadata": {}, "outputs": [], "source": [ @@ -64,7 +64,7 @@ }, { "cell_type": "markdown", - "id": "f38b023c", + "id": "f87d8f95", "metadata": {}, "source": [ "First of all you will load the 
data and do some feature engineering on it." @@ -72,7 +72,7 @@ }, { "cell_type": "markdown", - "id": "23ad6b6b", + "id": "66d04213", "metadata": {}, "source": [ "## ๐Ÿ’ฝ Loading the Data \n", @@ -90,7 +90,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b78b9ee2", + "id": "27f2b52e", "metadata": {}, "outputs": [], "source": [ @@ -113,7 +113,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e0eb13d3", + "id": "713a9568", "metadata": {}, "outputs": [], "source": [ @@ -130,7 +130,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ee728353", + "id": "4ad0edf3", "metadata": {}, "outputs": [], "source": [ @@ -147,7 +147,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1970d127", + "id": "8efc0deb", "metadata": {}, "outputs": [], "source": [ @@ -157,7 +157,7 @@ }, { "cell_type": "markdown", - "id": "10cca817", + "id": "fe5105a1", "metadata": {}, "source": [ "---" @@ -165,7 +165,7 @@ }, { "cell_type": "markdown", - "id": "09c67522", + "id": "42b88055", "metadata": {}, "source": [ "## ๐Ÿ› ๏ธ Feature Engineering \n", @@ -179,7 +179,7 @@ }, { "cell_type": "markdown", - "id": "983ef759", + "id": "99b27bbd", "metadata": {}, "source": [ "Now you are ready to start by computing the distance between consecutive transactions, lets call it `loc_delta`.\n", @@ -189,7 +189,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6aaf551d-6495-420f-a878-d4be39e38add", + "id": "6f7d5009", "metadata": {}, "outputs": [], "source": [ @@ -202,7 +202,7 @@ }, { "cell_type": "markdown", - "id": "95d84907", + "id": "284eb2c1", "metadata": {}, "source": [ "## ๐Ÿ‘ฎ๐Ÿปโ€โ™‚๏ธ Great Expectations " @@ -211,7 +211,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7f02da7b", + "id": "5b97f5f2", "metadata": {}, "outputs": [], "source": [ @@ -231,7 +231,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0dfd28b6", + "id": "bbc8e914", "metadata": {}, "outputs": [], "source": [ @@ -273,7 +273,7 @@ { "cell_type": "code", 
"execution_count": null, - "id": "cd0eeba4", + "id": "d2331129", "metadata": {}, "outputs": [], "source": [ @@ -290,7 +290,7 @@ { "cell_type": "code", "execution_count": null, - "id": "08eddf29", + "id": "51383029", "metadata": {}, "outputs": [], "source": [ @@ -318,7 +318,7 @@ }, { "cell_type": "markdown", - "id": "1a7e126d", + "id": "74e826bb", "metadata": {}, "source": [ "---" @@ -326,7 +326,7 @@ }, { "cell_type": "markdown", - "id": "be723483", + "id": "cf53a5e9", "metadata": {}, "source": [ "## ๐Ÿ“ก Connecting to Hopsworks Feature Store " @@ -334,7 +334,7 @@ }, { "cell_type": "markdown", - "id": "da57c80c", + "id": "ab3ac23b", "metadata": {}, "source": [ "### ๐Ÿช„ Creating Feature Groups \n", @@ -349,7 +349,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9ce63afd", + "id": "35f1e17e", "metadata": {}, "outputs": [], "source": [ @@ -363,7 +363,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f0ec5ccf-056d-48fb-baa2-0bc6e0cdf0f8", + "id": "7af46c39", "metadata": {}, "outputs": [], "source": [ @@ -372,7 +372,7 @@ }, { "cell_type": "markdown", - "id": "704da9d2", + "id": "15b742ad", "metadata": {}, "source": [ "To create a feature group you need to give it a name and specify a primary key. It is also good to provide a description of the contents of the feature group and a version number, if it is not defined it will automatically be incremented to `1`." @@ -381,7 +381,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f21eb1b5", + "id": "3e926dc7", "metadata": {}, "outputs": [], "source": [ @@ -399,7 +399,7 @@ }, { "cell_type": "markdown", - "id": "c339bd87", + "id": "a16ae49d", "metadata": {}, "source": [ "Here you have also set `online_enabled=True`, which enables low latency access to the data. 
A full list of arguments can be found in the [documentation](https://docs.hopsworks.ai/feature-store-api/latest/generated/api/feature_store_api/#create_feature_group).\n", @@ -410,18 +410,19 @@ { "cell_type": "code", "execution_count": null, - "id": "6d4955fa", + "id": "9a366430", "metadata": {}, "outputs": [], "source": [ "# Insert data into feature group\n", - "trans_fg.insert(trans_df)" + "trans_fg.insert(trans_df)\n", + "print('โœ… Done!')" ] }, { "cell_type": "code", "execution_count": null, - "id": "5db676d1", + "id": "3d7de1db", "metadata": {}, "outputs": [], "source": [ @@ -443,7 +444,7 @@ }, { "cell_type": "markdown", - "id": "6a616b7f", + "id": "ffbe721c", "metadata": {}, "source": [ "You can move on and do the same thing for the profile and label feature groups." @@ -452,7 +453,7 @@ { "cell_type": "code", "execution_count": null, - "id": "20af4024", + "id": "e8027f2d", "metadata": {}, "outputs": [], "source": [ @@ -466,13 +467,14 @@ " expectation_suite=expectation_suite_profiles,\n", ")\n", "# Insert data into feature group\n", - "profile_fg.insert(profiles_df)" + "profile_fg.insert(profiles_df)\n", + "print('โœ… Done!')" ] }, { "cell_type": "code", "execution_count": null, - "id": "aed0619e", + "id": "ef348581", "metadata": {}, "outputs": [], "source": [ @@ -488,7 +490,7 @@ }, { "cell_type": "markdown", - "id": "a45a9e20", + "id": "56611f28", "metadata": {}, "source": [ "Click on the hyperlink printed in the cell output above to inspect your feature group in the UI.\n", @@ -498,7 +500,7 @@ }, { "cell_type": "markdown", - "id": "58652612-f30e-4556-b415-53ab940380bd", + "id": "36294255", "metadata": { "jp-MarkdownHeadingCollapsed": true, "tags": [] @@ -517,7 +519,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9c2a980b-4e05-4fb1-ab25-da593a215d0e", + "id": "53d1da04", "metadata": {}, "outputs": [], "source": [ @@ -534,7 +536,7 @@ }, { "cell_type": "markdown", - "id": "c4d777a4-02c4-443f-88d7-4822b2861faf", + "id": "896b26c3", "metadata": { 
"jp-MarkdownHeadingCollapsed": true, "tags": [] @@ -550,7 +552,7 @@ }, { "cell_type": "markdown", - "id": "a4d232d6", + "id": "c65cde95", "metadata": {}, "source": [ "## โญ๏ธ **Next:** Part 02 Training Pipeline \n", @@ -564,7 +566,7 @@ "hash": "e1ddeae6eefc765c17da80d38ea59b893ab18c0c0904077a035ef84cfe367f83" }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -578,7 +580,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/fraud_online/2_fraud_online_training_pipeline.ipynb b/fraud_online/2_fraud_online_training_pipeline.ipynb index 20c7925c..91ec9b68 100644 --- a/fraud_online/2_fraud_online_training_pipeline.ipynb +++ b/fraud_online/2_fraud_online_training_pipeline.ipynb @@ -324,10 +324,10 @@ "outputs": [], "source": [ "# Initialize an XGBoost classifier\n", - "clf = xgb.XGBClassifier()\n", + "model = xgb.XGBClassifier()\n", "\n", "# Train the classifier using the training features (X_train) and labels (y_train)\n", - "clf.fit(X_train.values, y_train)" + "model.fit(X_train, y_train)" ] }, { @@ -337,10 +337,10 @@ "outputs": [], "source": [ "# Predict the training set\n", - "y_pred_train = clf.predict(X_train.values)\n", + "y_pred_train = model.predict(X_train)\n", "\n", "# Predict the test set\n", - "y_pred_test = clf.predict(X_test.values)" + "y_pred_test = model.predict(X_test)" ] }, { @@ -450,7 +450,10 @@ "output_schema = Schema(y_train)\n", "\n", "# Create a ModelSchema using the input and output schemas\n", - "model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)\n", + "model_schema = ModelSchema(\n", + " input_schema=input_schema, \n", + " output_schema=output_schema,\n", + ")\n", "\n", "# Convert the ModelSchema to a dictionary representation\n", "model_schema.to_dict()" @@ -479,7 +482,7 @@ " os.mkdir(model_dir)\n", "\n", "# Save the 
trained XGBoost model to a file within the model directory\n", - "joblib.dump(clf, f\"{model_dir}/xgboost_fraud_online_model.pkl\")\n", + "joblib.dump(model, f\"{model_dir}/xgboost_fraud_online_model.pkl\")\n", "\n", "# Save the confusion matrix plot to an image file within the model directory\n", "fig.savefig(f\"{model_dir}/confusion_matrix.png\")" @@ -565,7 +568,10 @@ " self.fs = fs_conn.get_feature_store()\n", " \n", " # Get feature view\n", - " self.fv = self.fs.get_feature_view(\"transactions_fraud_online_fv\", 1)\n", + " self.fv = self.fs.get_feature_view(\n", + " name=\"transactions_fraud_online_fv\", \n", + " version=1,\n", + " )\n", " \n", " # Initialize serving\n", " self.fv.init_serving(1)\n", @@ -739,7 +745,7 @@ "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -753,7 +759,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/fraud_online/3_fraud_online_inference_pipeline.ipynb b/fraud_online/3_fraud_online_inference_pipeline.ipynb index 5b949709..1642b079 100644 --- a/fraud_online/3_fraud_online_inference_pipeline.ipynb +++ b/fraud_online/3_fraud_online_inference_pipeline.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "c958e52b", + "id": "d28eba60", "metadata": {}, "source": [ "# **Hopsworks Feature Store** - Part 03: Inference Pipeline\n" @@ -10,7 +10,7 @@ }, { "cell_type": "markdown", - "id": "ce2fe8a8", + "id": "f16367c8", "metadata": {}, "source": [ "## ๐Ÿ“ก Connecting to Hopsworks Feature Store " @@ -19,7 +19,7 @@ { "cell_type": "code", "execution_count": null, - "id": "39f83bc9", + "id": "ed952ece", "metadata": {}, "outputs": [], "source": [ @@ -32,7 +32,7 @@ }, { "cell_type": "markdown", - "id": "87485ee0", + "id": "e98e32ce", "metadata": {}, "source": [ "## 
โš™๏ธ Feature Group Retrieval\n", @@ -42,7 +42,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e622d6b4", + "id": "d2a8475b", "metadata": {}, "outputs": [], "source": [ @@ -56,7 +56,7 @@ { "cell_type": "code", "execution_count": null, - "id": "98ffb788", + "id": "c37c5197", "metadata": {}, "outputs": [], "source": [ @@ -69,7 +69,7 @@ }, { "cell_type": "markdown", - "id": "e1dac8b6", + "id": "6d5dade0", "metadata": {}, "source": [ "## ๐Ÿ—„ Model Registry\n" @@ -78,7 +78,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ca35a9f4", + "id": "be66f4c8", "metadata": {}, "outputs": [], "source": [ @@ -88,7 +88,7 @@ }, { "cell_type": "markdown", - "id": "6f3589dc", + "id": "903df073", "metadata": {}, "source": [ "## ๐Ÿš€ Fetch Deployment" @@ -97,7 +97,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6ac8014f", + "id": "4303ac82", "metadata": {}, "outputs": [], "source": [ @@ -116,7 +116,7 @@ }, { "cell_type": "markdown", - "id": "7764feba", + "id": "045ba7e4", "metadata": {}, "source": [ "## ๐Ÿ”ฎ Predicting using deployment\n", @@ -130,7 +130,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b0fd763a", + "id": "42196023", "metadata": {}, "outputs": [], "source": [ @@ -142,7 +142,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7b31447d", + "id": "596f3241", "metadata": {}, "outputs": [], "source": [ @@ -155,7 +155,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cf86d364", + "id": "f260b7b6", "metadata": {}, "outputs": [], "source": [ @@ -170,7 +170,7 @@ }, { "cell_type": "markdown", - "id": "40fc846a", + "id": "0b02e2bd", "metadata": {}, "source": [ "### Stop Deployment\n", @@ -180,7 +180,7 @@ { "cell_type": "code", "execution_count": null, - "id": "230cc59f", + "id": "b90b4c19", "metadata": {}, "outputs": [], "source": [ @@ -190,7 +190,7 @@ }, { "cell_type": "markdown", - "id": "69d76f0d", + "id": "93360014", "metadata": {}, "source": [ "## ๐Ÿ‘พ StreamLit App\n", @@ -205,7 +205,7 @@ }, { 
"cell_type": "markdown", - "id": "c14d7bd1", + "id": "8d98d2a0", "metadata": {}, "source": [ "---\n", @@ -221,7 +221,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -235,7 +235,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/loan_approval/1-loan-approval-feature-pipeline.ipynb b/loan_approval/1-loan-approval-feature-pipeline.ipynb index b7c33389..e17fcacd 100644 --- a/loan_approval/1-loan-approval-feature-pipeline.ipynb +++ b/loan_approval/1-loan-approval-feature-pipeline.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "4e1e2fcc", + "id": "ba7044aa", "metadata": { "papermill": { "duration": 0.029083, @@ -28,7 +28,7 @@ }, { "cell_type": "markdown", - "id": "ce396bcb", + "id": "50fa7c09", "metadata": {}, "source": [ "## ๐Ÿ“ Imports \n" @@ -37,7 +37,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7b696de8", + "id": "83d06dc9", "metadata": {}, "outputs": [], "source": [ @@ -47,15 +47,9 @@ { "cell_type": "code", "execution_count": null, - "id": "9cb24c87", + "id": "bfd82941", "metadata": { "_kg_hide-input": true, - "execution": { - "iopub.execute_input": "2023-01-31T14:11:35.164738Z", - "iopub.status.busy": "2023-01-31T14:11:35.163899Z", - "iopub.status.idle": "2023-01-31T14:11:44.332161Z", - "shell.execute_reply": "2023-01-31T14:11:44.331116Z" - }, "papermill": { "duration": 9.198485, "end_time": "2023-01-31T14:11:44.334641", @@ -80,7 +74,7 @@ }, { "cell_type": "markdown", - "id": "b38e7831", + "id": "77ef7fcb", "metadata": {}, "source": [ "## ๐Ÿ’ฝ Loading the Data \n" @@ -89,7 +83,7 @@ { "cell_type": "code", "execution_count": null, - "id": "078e4a4e", + "id": "f42fcc34", "metadata": {}, "outputs": [], "source": [ @@ -98,7 +92,7 @@ }, { "cell_type": "markdown", - "id": "888e779f", + "id": "b38b8bf3", "metadata": 
{}, "source": [ "### โ›ณ๏ธ Loans Data \n" @@ -107,14 +101,8 @@ { "cell_type": "code", "execution_count": null, - "id": "276d0ec7", + "id": "d53efe1e", "metadata": { - "execution": { - "iopub.execute_input": "2023-01-31T14:11:44.602805Z", - "iopub.status.busy": "2023-01-31T14:11:44.602108Z", - "iopub.status.idle": "2023-01-31T14:11:47.931616Z", - "shell.execute_reply": "2023-01-31T14:11:47.930713Z" - }, "papermill": { "duration": 3.467387, "end_time": "2023-01-31T14:11:47.933929", @@ -134,7 +122,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c4f6800b", + "id": "5a161ddf", "metadata": {}, "outputs": [], "source": [ @@ -147,14 +135,8 @@ { "cell_type": "code", "execution_count": null, - "id": "d0def3d9", + "id": "1ae4fedd", "metadata": { - "execution": { - "iopub.execute_input": "2023-01-31T14:11:48.208667Z", - "iopub.status.busy": "2023-01-31T14:11:48.208157Z", - "iopub.status.idle": "2023-01-31T14:11:48.428818Z", - "shell.execute_reply": "2023-01-31T14:11:48.427863Z" - }, "papermill": { "duration": 0.360858, "end_time": "2023-01-31T14:11:48.431029", @@ -173,7 +155,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7d12bfb4", + "id": "21132787", "metadata": {}, "outputs": [], "source": [ @@ -182,7 +164,7 @@ }, { "cell_type": "markdown", - "id": "2a4f12ac", + "id": "83d26694", "metadata": {}, "source": [ "### โ›ณ๏ธ Applicants Data \n" @@ -191,7 +173,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bf64e24d", + "id": "7589dcdf", "metadata": {}, "outputs": [], "source": [ @@ -203,7 +185,7 @@ { "cell_type": "code", "execution_count": null, - "id": "58521fcc", + "id": "0af0bf78", "metadata": {}, "outputs": [], "source": [ @@ -219,7 +201,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6b95e5b8", + "id": "e8e7cfdd", "metadata": {}, "outputs": [], "source": [ @@ -228,7 +210,7 @@ }, { "cell_type": "markdown", - "id": "7dd7cc11", + "id": "18784e9f", "metadata": { "execution": { "iopub.execute_input": 
"2023-01-31T14:11:48.971332Z", @@ -252,14 +234,8 @@ { "cell_type": "code", "execution_count": null, - "id": "74bf2b1e", + "id": "a6f9bbf3", "metadata": { - "execution": { - "iopub.execute_input": "2023-01-31T14:12:06.335754Z", - "iopub.status.busy": "2023-01-31T14:12:06.334576Z", - "iopub.status.idle": "2023-01-31T14:12:06.424596Z", - "shell.execute_reply": "2023-01-31T14:12:06.423659Z" - }, "papermill": { "duration": 0.240425, "end_time": "2023-01-31T14:12:06.426982", @@ -283,14 +259,8 @@ { "cell_type": "code", "execution_count": null, - "id": "095cd7fe", + "id": "62820cd3", "metadata": { - "execution": { - "iopub.execute_input": "2023-01-31T14:12:16.815316Z", - "iopub.status.busy": "2023-01-31T14:12:16.814929Z", - "iopub.status.idle": "2023-01-31T14:12:17.095807Z", - "shell.execute_reply": "2023-01-31T14:12:17.094774Z" - }, "papermill": { "duration": 0.429455, "end_time": "2023-01-31T14:12:17.098362", @@ -311,7 +281,7 @@ }, { "cell_type": "markdown", - "id": "9d1bac3f", + "id": "9e781ab0", "metadata": { "papermill": { "duration": 0.151301, @@ -335,14 +305,8 @@ { "cell_type": "code", "execution_count": null, - "id": "53a5c4a7", + "id": "40cb93e7", "metadata": { - "execution": { - "iopub.execute_input": "2023-01-31T14:12:32.338934Z", - "iopub.status.busy": "2023-01-31T14:12:32.338562Z", - "iopub.status.idle": "2023-01-31T14:12:32.817616Z", - "shell.execute_reply": "2023-01-31T14:12:32.816627Z" - }, "papermill": { "duration": 0.636201, "end_time": "2023-01-31T14:12:32.820041", @@ -367,7 +331,7 @@ }, { "cell_type": "markdown", - "id": "0174e794", + "id": "d0b24843", "metadata": { "papermill": { "duration": 0.152855, @@ -389,7 +353,7 @@ }, { "cell_type": "markdown", - "id": "74d3e2a5", + "id": "bd0a69be", "metadata": { "papermill": { "duration": 0.153798, @@ -407,14 +371,8 @@ { "cell_type": "code", "execution_count": null, - "id": "4e69498c", + "id": "0d06d64f", "metadata": { - "execution": { - "iopub.execute_input": "2023-01-31T14:12:39.733271Z", - 
"iopub.status.busy": "2023-01-31T14:12:39.732764Z", - "iopub.status.idle": "2023-01-31T14:12:39.823678Z", - "shell.execute_reply": "2023-01-31T14:12:39.822304Z" - }, "papermill": { "duration": 0.253402, "end_time": "2023-01-31T14:12:39.826253", @@ -431,7 +389,7 @@ }, { "cell_type": "markdown", - "id": "f1ad195b", + "id": "8839641f", "metadata": { "papermill": { "duration": 0.241511, @@ -449,14 +407,8 @@ { "cell_type": "code", "execution_count": null, - "id": "08cd74d5", + "id": "8aa885b5", "metadata": { - "execution": { - "iopub.execute_input": "2023-01-31T14:12:42.216196Z", - "iopub.status.busy": "2023-01-31T14:12:42.214985Z", - "iopub.status.idle": "2023-01-31T14:12:42.301892Z", - "shell.execute_reply": "2023-01-31T14:12:42.300870Z" - }, "papermill": { "duration": 0.253314, "end_time": "2023-01-31T14:12:42.304445", @@ -473,7 +425,7 @@ }, { "cell_type": "markdown", - "id": "a946b370", + "id": "6a1c32f1", "metadata": { "papermill": { "duration": 0.154896, @@ -491,14 +443,8 @@ { "cell_type": "code", "execution_count": null, - "id": "87222d20", + "id": "0f408ab6", "metadata": { - "execution": { - "iopub.execute_input": "2023-01-31T14:12:43.955316Z", - "iopub.status.busy": "2023-01-31T14:12:43.954293Z", - "iopub.status.idle": "2023-01-31T14:12:44.029021Z", - "shell.execute_reply": "2023-01-31T14:12:44.027978Z" - }, "papermill": { "duration": 0.232715, "end_time": "2023-01-31T14:12:44.031617", @@ -515,7 +461,7 @@ }, { "cell_type": "markdown", - "id": "00fca018", + "id": "06c94b32", "metadata": { "papermill": { "duration": 0.154279, @@ -534,7 +480,7 @@ }, { "cell_type": "markdown", - "id": "be604f1e", + "id": "1dfecfcc", "metadata": { "papermill": { "duration": 0.157466, @@ -552,14 +498,8 @@ { "cell_type": "code", "execution_count": null, - "id": "655aa19d", + "id": "07785292", "metadata": { - "execution": { - "iopub.execute_input": "2023-01-31T14:12:46.237863Z", - "iopub.status.busy": "2023-01-31T14:12:46.237469Z", - "iopub.status.idle": "2023-01-31T14:12:46.332340Z", 
- "shell.execute_reply": "2023-01-31T14:12:46.331345Z" - }, "papermill": { "duration": 0.253754, "end_time": "2023-01-31T14:12:46.335011", @@ -578,14 +518,8 @@ { "cell_type": "code", "execution_count": null, - "id": "75bb4c76", + "id": "66c2feed", "metadata": { - "execution": { - "iopub.execute_input": "2023-01-31T14:12:46.991455Z", - "iopub.status.busy": "2023-01-31T14:12:46.990863Z", - "iopub.status.idle": "2023-01-31T14:12:57.341490Z", - "shell.execute_reply": "2023-01-31T14:12:57.340452Z" - }, "papermill": { "duration": 10.520365, "end_time": "2023-01-31T14:12:57.343915", @@ -609,7 +543,7 @@ }, { "cell_type": "markdown", - "id": "bffb4178", + "id": "002836ec", "metadata": { "papermill": { "duration": 0.156009, @@ -628,14 +562,8 @@ { "cell_type": "code", "execution_count": null, - "id": "38ed484b", + "id": "4bde103f", "metadata": { - "execution": { - "iopub.execute_input": "2023-01-31T14:12:58.425005Z", - "iopub.status.busy": "2023-01-31T14:12:58.424030Z", - "iopub.status.idle": "2023-01-31T14:12:58.632168Z", - "shell.execute_reply": "2023-01-31T14:12:58.631141Z" - }, "papermill": { "duration": 0.369642, "end_time": "2023-01-31T14:12:58.634630", @@ -652,7 +580,7 @@ }, { "cell_type": "markdown", - "id": "dbd71208", + "id": "bd7c5a9c", "metadata": { "papermill": { "duration": 0.154966, @@ -669,7 +597,7 @@ }, { "cell_type": "markdown", - "id": "20623395", + "id": "71f050ac", "metadata": { "papermill": { "duration": 0.156275, @@ -686,7 +614,7 @@ }, { "cell_type": "markdown", - "id": "de3a0265", + "id": "74cb3c4e", "metadata": { "papermill": { "duration": 0.15528, @@ -706,14 +634,8 @@ { "cell_type": "code", "execution_count": null, - "id": "ad383419", + "id": "fcbf45d8", "metadata": { - "execution": { - "iopub.execute_input": "2023-01-31T14:13:01.592535Z", - "iopub.status.busy": "2023-01-31T14:13:01.591599Z", - "iopub.status.idle": "2023-01-31T14:13:01.656733Z", - "shell.execute_reply": "2023-01-31T14:13:01.655753Z" - }, "papermill": { "duration": 0.225278, 
"end_time": "2023-01-31T14:13:01.659576", @@ -730,7 +652,7 @@ }, { "cell_type": "markdown", - "id": "b78ba175", + "id": "c33c695f", "metadata": { "papermill": { "duration": 0.162961, @@ -749,14 +671,8 @@ { "cell_type": "code", "execution_count": null, - "id": "3cc3188b", + "id": "9fc7e99b", "metadata": { - "execution": { - "iopub.execute_input": "2023-01-31T14:13:03.458992Z", - "iopub.status.busy": "2023-01-31T14:13:03.458459Z", - "iopub.status.idle": "2023-01-31T14:13:03.641611Z", - "shell.execute_reply": "2023-01-31T14:13:03.628414Z" - }, "papermill": { "duration": 0.362285, "end_time": "2023-01-31T14:13:03.644153", @@ -777,14 +693,8 @@ { "cell_type": "code", "execution_count": null, - "id": "8a04d628", + "id": "97a9662b", "metadata": { - "execution": { - "iopub.execute_input": "2023-01-31T14:13:04.068331Z", - "iopub.status.busy": "2023-01-31T14:13:04.067955Z", - "iopub.status.idle": "2023-01-31T14:13:04.103983Z", - "shell.execute_reply": "2023-01-31T14:13:04.102946Z" - }, "papermill": { "duration": 0.238062, "end_time": "2023-01-31T14:13:04.105979", @@ -802,14 +712,8 @@ { "cell_type": "code", "execution_count": null, - "id": "dafe46bf", + "id": "9f97ee87", "metadata": { - "execution": { - "iopub.execute_input": "2023-01-31T14:13:04.919809Z", - "iopub.status.busy": "2023-01-31T14:13:04.919417Z", - "iopub.status.idle": "2023-01-31T14:13:04.986378Z", - "shell.execute_reply": "2023-01-31T14:13:04.985236Z" - }, "papermill": { "duration": 0.226129, "end_time": "2023-01-31T14:13:04.989199", @@ -826,7 +730,7 @@ }, { "cell_type": "markdown", - "id": "3be6a8ab", + "id": "c00651cc", "metadata": { "papermill": { "duration": 0.155658, @@ -846,7 +750,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d3c3bb7c", + "id": "357d9e7f", "metadata": {}, "outputs": [], "source": [ @@ -855,7 +759,7 @@ }, { "cell_type": "markdown", - "id": "3af9fbb5", + "id": "f1e87b8a", "metadata": { "papermill": { "duration": 0.158483, @@ -874,14 +778,8 @@ { "cell_type": "code", 
"execution_count": null, - "id": "9e8e8d21", + "id": "df7bcff9", "metadata": { - "execution": { - "iopub.execute_input": "2023-01-31T14:13:06.287247Z", - "iopub.status.busy": "2023-01-31T14:13:06.286161Z", - "iopub.status.idle": "2023-01-31T14:13:06.337535Z", - "shell.execute_reply": "2023-01-31T14:13:06.336363Z" - }, "papermill": { "duration": 0.221956, "end_time": "2023-01-31T14:13:06.339961", @@ -900,7 +798,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2b29436e", + "id": "ab9f4744", "metadata": {}, "outputs": [], "source": [ @@ -923,7 +821,7 @@ }, { "cell_type": "markdown", - "id": "5b9edb3f", + "id": "90788109", "metadata": {}, "source": [ "## ๐Ÿ”ฎ Connect to Hopsworks Feature Store" @@ -932,7 +830,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c991c5e9", + "id": "6cfde176", "metadata": {}, "outputs": [], "source": [ @@ -945,7 +843,7 @@ }, { "cell_type": "markdown", - "id": "528ae792", + "id": "275a2b8e", "metadata": {}, "source": [ "## ๐Ÿช„ Creating Feature Groups \n" @@ -954,7 +852,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8b0001e4", + "id": "9c548bf7", "metadata": {}, "outputs": [], "source": [ @@ -972,7 +870,7 @@ { "cell_type": "code", "execution_count": null, - "id": "360c1a96", + "id": "9b7d5edb", "metadata": {}, "outputs": [], "source": [ @@ -989,7 +887,7 @@ }, { "cell_type": "markdown", - "id": "48103c37", + "id": "37b0eddf", "metadata": {}, "source": [ "### Configure upload batch size for performance (latency vs throughput)\n", @@ -1013,28 +911,30 @@ { "cell_type": "code", "execution_count": null, - "id": "040bf356", + "id": "80a50d09", "metadata": {}, "outputs": [], "source": [ "# Insert data into the \"loans\" feature group\n", - "loans_fg.insert(loans_df)" + "loans_fg.insert(loans_df)\n", + "print('โœ… Done!')" ] }, { "cell_type": "code", "execution_count": null, - "id": "97c9f72f", + "id": "c9a7c228", "metadata": {}, "outputs": [], "source": [ "# Insert data into the \"applicants\" feature 
group\n", - "applicants_fg.insert(applicants_df)" + "applicants_fg.insert(applicants_df)\n", + "print('โœ… Done!')" ] }, { "cell_type": "markdown", - "id": "c3f24d00", + "id": "08c47109", "metadata": {}, "source": [ "## ๐Ÿ“– Update the description of any features found in the data dictionary\n", @@ -1045,7 +945,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7e956993", + "id": "009888f0", "metadata": {}, "outputs": [], "source": [ @@ -1056,7 +956,7 @@ { "cell_type": "code", "execution_count": null, - "id": "93889452", + "id": "15b84302", "metadata": {}, "outputs": [], "source": [ @@ -1084,7 +984,7 @@ }, { "cell_type": "markdown", - "id": "4b3f7025", + "id": "0b5e5e3e", "metadata": {}, "source": [ "---\n", @@ -1097,7 +997,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -1111,7 +1011,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.11" }, "papermill": { "default_parameters": {}, diff --git a/loan_approval/2-loan-approval-training-pipeline.ipynb b/loan_approval/2-loan-approval-training-pipeline.ipynb index a42950ba..c7d1fa2b 100644 --- a/loan_approval/2-loan-approval-training-pipeline.ipynb +++ b/loan_approval/2-loan-approval-training-pipeline.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "4e1e2fcc", + "id": "ea002020", "metadata": { "papermill": { "duration": 0.029083, @@ -30,7 +30,7 @@ }, { "cell_type": "markdown", - "id": "cfb490f6", + "id": "d446dbd1", "metadata": {}, "source": [ "## ๐Ÿ“ Imports \n" @@ -39,7 +39,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a3cdbf34", + "id": "1f8de2a6", "metadata": {}, "outputs": [], "source": [ @@ -49,15 +49,9 @@ { "cell_type": "code", "execution_count": null, - "id": "9cb24c87", + "id": "e0b5ebdf", "metadata": { "_kg_hide-input": true, - "execution": { - "iopub.execute_input": 
"2023-01-31T14:11:35.164738Z", - "iopub.status.busy": "2023-01-31T14:11:35.163899Z", - "iopub.status.idle": "2023-01-31T14:11:44.332161Z", - "shell.execute_reply": "2023-01-31T14:11:44.331116Z" - }, "papermill": { "duration": 9.198485, "end_time": "2023-01-31T14:11:44.334641", @@ -76,8 +70,10 @@ "import warnings\n", "warnings.filterwarnings('ignore')\n", "from sklearn.metrics import (\n", - " accuracy_score, confusion_matrix, classification_report, \n", - " roc_auc_score\n", + " accuracy_score, \n", + " confusion_matrix, \n", + " classification_report, \n", + " roc_auc_score,\n", ")\n", "from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay\n", "from sklearn.linear_model import LogisticRegression\n", @@ -94,7 +90,7 @@ }, { "cell_type": "markdown", - "id": "5b9edb3f", + "id": "2c80b367", "metadata": {}, "source": [ "## ๐Ÿ”ฎ Connect to Hopsworks Feature Store" @@ -103,7 +99,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4a36d859", + "id": "5b5e80d7", "metadata": {}, "outputs": [], "source": [ @@ -116,7 +112,7 @@ }, { "cell_type": "markdown", - "id": "9dde0a81", + "id": "bb9ddbce", "metadata": { "papermill": { "duration": 0.158827, @@ -134,7 +130,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bec1a57b", + "id": "092ad164", "metadata": {}, "outputs": [], "source": [ @@ -153,7 +149,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fc541a8a", + "id": "3c8d1156", "metadata": {}, "outputs": [], "source": [ @@ -168,7 +164,7 @@ { "cell_type": "code", "execution_count": null, - "id": "26d001a5", + "id": "a87c1044", "metadata": {}, "outputs": [], "source": [ @@ -184,7 +180,7 @@ { "cell_type": "code", "execution_count": null, - "id": "18ae1976", + "id": "67c8efea", "metadata": {}, "outputs": [], "source": [ @@ -196,7 +192,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c741e55c", + "id": "11bff38d", "metadata": {}, "outputs": [], "source": [ @@ -206,7 +202,7 @@ { "cell_type": "code", "execution_count": null, 
- "id": "14822496", + "id": "2eb40a8a", "metadata": {}, "outputs": [], "source": [ @@ -215,7 +211,7 @@ }, { "cell_type": "markdown", - "id": "0b156600", + "id": "fa1add94", "metadata": {}, "source": [ "## ๐Ÿ‘ฉ๐Ÿปโ€๐Ÿ”ฌ Feature Transformation\n" @@ -224,7 +220,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a0ffed41", + "id": "8de9ec25", "metadata": {}, "outputs": [], "source": [ @@ -253,7 +249,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2319b999", + "id": "8e162d58", "metadata": {}, "outputs": [], "source": [ @@ -280,7 +276,7 @@ "# and categorical features are processed by the categorical_transformer\n", "preprocessor = ColumnTransformer(\n", " transformers=[\n", - " (\"num\", numeric_transformer, numeric_features), # Apply numeric transformer to numeric features\n", + " (\"num\", numeric_transformer, numeric_features), # Apply numeric transformer to numeric features\n", " (\"cat\", categorical_transformer, categorical_features), # Apply categorical transformer to categorical features\n", " ]\n", ")" @@ -289,7 +285,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3c83f66d", + "id": "01aab79e", "metadata": {}, "outputs": [], "source": [ @@ -303,7 +299,7 @@ { "cell_type": "code", "execution_count": null, - "id": "eeaa534d", + "id": "9f369b4c", "metadata": {}, "outputs": [], "source": [ @@ -312,7 +308,7 @@ }, { "cell_type": "markdown", - "id": "64ddcb0d", + "id": "cdba44c9", "metadata": { "papermill": { "duration": 0.162943, @@ -330,14 +326,8 @@ { "cell_type": "code", "execution_count": null, - "id": "47a8291a", + "id": "f4a7ab73", "metadata": { - "execution": { - "iopub.execute_input": "2023-01-31T14:13:13.702918Z", - "iopub.status.busy": "2023-01-31T14:13:13.702551Z", - "iopub.status.idle": "2023-01-31T14:13:13.709718Z", - "shell.execute_reply": "2023-01-31T14:13:13.708705Z" - }, "papermill": { "duration": 0.176326, "end_time": "2023-01-31T14:13:13.711748", @@ -372,7 +362,7 @@ { "cell_type": "code", "execution_count": 
null, - "id": "9af753fe", + "id": "14760c12", "metadata": {}, "outputs": [], "source": [ @@ -391,7 +381,7 @@ { "cell_type": "code", "execution_count": null, - "id": "558157e2", + "id": "30b316e7", "metadata": {}, "outputs": [], "source": [ @@ -411,14 +401,8 @@ { "cell_type": "code", "execution_count": null, - "id": "0221c340", + "id": "5924aa50", "metadata": { - "execution": { - "iopub.execute_input": "2023-01-31T14:32:39.229358Z", - "iopub.status.busy": "2023-01-31T14:32:39.229002Z", - "iopub.status.idle": "2023-01-31T14:32:40.630885Z", - "shell.execute_reply": "2023-01-31T14:32:40.629547Z" - }, "papermill": { "duration": 2.472623, "end_time": "2023-01-31T14:32:40.633694", @@ -449,7 +433,7 @@ }, { "cell_type": "markdown", - "id": "d2d59196", + "id": "70923c22", "metadata": {}, "source": [ "## ๐Ÿ—„๏ธ Register the Model with Model Registry\n" @@ -458,7 +442,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f6abb6d9", + "id": "61da403f", "metadata": { "papermill": { "duration": 1.025564, @@ -478,7 +462,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c980e883", + "id": "cc22005a", "metadata": {}, "outputs": [], "source": [ @@ -498,7 +482,7 @@ }, { "cell_type": "markdown", - "id": "b5147669", + "id": "647102d1", "metadata": {}, "source": [ "### โš™๏ธ Model Schema\n", @@ -511,7 +495,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f6fffe5e", + "id": "20d2d6db", "metadata": {}, "outputs": [], "source": [ @@ -533,7 +517,7 @@ }, { "cell_type": "markdown", - "id": "b35d0d26", + "id": "bbb513ea", "metadata": {}, "source": [ "## ๐Ÿ“ Register model\n", @@ -544,7 +528,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9c1bee22", + "id": "a18cb44c", "metadata": {}, "outputs": [], "source": [ @@ -562,7 +546,7 @@ }, { "cell_type": "markdown", - "id": "a8689024", + "id": "604caaa4", "metadata": {}, "source": [ "---\n", @@ -574,7 +558,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": 
"Python 3", "language": "python", "name": "python3" }, @@ -588,7 +572,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.11" }, "papermill": { "default_parameters": {}, diff --git a/loan_approval/3-loan-approval-batch-inference.ipynb b/loan_approval/3-loan-approval-batch-inference.ipynb index 97f1da9b..8ad00268 100644 --- a/loan_approval/3-loan-approval-batch-inference.ipynb +++ b/loan_approval/3-loan-approval-batch-inference.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "384b4632", + "id": "3795581f", "metadata": {}, "source": [ "## ๐Ÿš€ Batch Inference Pipeline\n", @@ -17,7 +17,7 @@ }, { "cell_type": "markdown", - "id": "19c44be4", + "id": "d5c48a06", "metadata": {}, "source": [ "## ๐Ÿ“ Imports \n" @@ -26,7 +26,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b3a6fa29", + "id": "6677341a", "metadata": {}, "outputs": [], "source": [ @@ -36,7 +36,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8efc53a7", + "id": "28f7509e", "metadata": {}, "outputs": [], "source": [ @@ -53,22 +53,22 @@ { "cell_type": "code", "execution_count": null, - "id": "c61e3adc", + "id": "76b5901d", "metadata": {}, "outputs": [], "source": [ "# Define version numbers for feature view and model\n", - "fv_version = 1\n", - "model_version = 1\n", + "FV_VERSION = 1\n", + "MODEL_VERSION = 1\n", "\n", "# Define start and end times for the data\n", - "start_time_data = \"2016-11-01\"\n", - "end_time_data = \"2016-12-01\"" + "START_TIME_DATA = \"2016-11-01\"\n", + "END_TIME_DATA = \"2016-12-01\"" ] }, { "cell_type": "markdown", - "id": "70fe09ca", + "id": "c6ebb07e", "metadata": {}, "source": [ "## ๐Ÿ”ฎ Connect to Hopsworks Feature Store" @@ -77,7 +77,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7f8cd469", + "id": "ec202adb", "metadata": {}, "outputs": [], "source": [ @@ -90,7 +90,7 @@ }, { "cell_type": "markdown", - "id": "b66df4b2", + "id": "c35f50e8", 
"metadata": {}, "source": [ "## โš™๏ธ Feature View Retrieval\n" @@ -99,20 +99,20 @@ { "cell_type": "code", "execution_count": null, - "id": "34d8e82a", + "id": "fde529ba", "metadata": {}, "outputs": [], "source": [ "# Get the 'loans_approvals' feature view\n", "feature_view = fs.get_feature_view(\n", " name=\"loans_approvals\", \n", - " version=fv_version,\n", + " version=FV_VERSION,\n", ")" ] }, { "cell_type": "markdown", - "id": "0890a082", + "id": "871fb3e6", "metadata": {}, "source": [ "## ๐Ÿ—„ Model Registry\n" @@ -121,7 +121,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f8194512", + "id": "629006f7", "metadata": {}, "outputs": [], "source": [ @@ -131,7 +131,7 @@ }, { "cell_type": "markdown", - "id": "d251a56a", + "id": "e33a656a", "metadata": {}, "source": [ "## ๐Ÿš€ Fetch and test the model" @@ -140,14 +140,14 @@ { "cell_type": "code", "execution_count": null, - "id": "c435cd7c", + "id": "296b3056", "metadata": {}, "outputs": [], "source": [ "# Retrieve the model from the Model Registry using the name \"lending_model\" and specified version\n", "model = mr.get_model(\n", " \"lending_model\",\n", - " version=model_version,\n", + " version=MODEL_VERSION,\n", ")\n", "\n", "# Download the model directory from the Model Registry\n", @@ -159,7 +159,7 @@ }, { "cell_type": "markdown", - "id": "814f9bfd", + "id": "a4d09d2a", "metadata": {}, "source": [ "## ๐Ÿ”ฎ Batch Prediction " @@ -168,7 +168,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cc915814", + "id": "6ff15884", "metadata": {}, "outputs": [], "source": [ @@ -177,8 +177,8 @@ "\n", "# Get batch data for a specified time range from start_time_data to end_time_data\n", "batch_data = feature_view.get_batch_data(\n", - " start_time=start_time_data,\n", - " end_time=end_time_data,\n", + " start_time=START_TIME_DATA,\n", + " end_time=END_TIME_DATA,\n", ")\n", "\n", "# Display the first three rows of the batch data\n", @@ -188,7 +188,7 @@ { "cell_type": "code", "execution_count": null, 
- "id": "37127def", + "id": "0fa57b92", "metadata": {}, "outputs": [], "source": [ @@ -201,7 +201,7 @@ }, { "cell_type": "markdown", - "id": "faf04cb0", + "id": "734918f3", "metadata": {}, "source": [ "---\n", @@ -217,7 +217,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -231,7 +231,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.11" }, "papermill": { "default_parameters": {},