From 8ad263d5d20427f6264d07b6d8982e092ea4475f Mon Sep 17 00:00:00 2001
From: Maksym Zhytnikov <63515947+Maxxx-zh@users.noreply.github.com>
Date: Thu, 27 Jun 2024 12:20:45 +0300
Subject: [PATCH] [FSTORE-1437] Hospital Wait Time Tutorial (#272)
* hospital waiting time forecasting tutorial
---
README.md | 1 +
.../1_feature_pipeline.ipynb | 597 ++++++++++++++++++
.../2_training_pipeline.ipynb | 537 ++++++++++++++++
.../3_inference_pipeline.ipynb | 229 +++++++
.../hospital_wait_time/requirements.txt | 1 +
5 files changed, 1365 insertions(+)
create mode 100644 advanced_tutorials/hospital_wait_time/1_feature_pipeline.ipynb
create mode 100644 advanced_tutorials/hospital_wait_time/2_training_pipeline.ipynb
create mode 100644 advanced_tutorials/hospital_wait_time/3_inference_pipeline.ipynb
create mode 100644 advanced_tutorials/hospital_wait_time/requirements.txt
diff --git a/README.md b/README.md
index 01793905..be42aee0 100644
--- a/README.md
+++ b/README.md
@@ -48,6 +48,7 @@ In order to understand the tutorials you need to be familiar with general concep
- [Credit Scores](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/credit_scores): Predict clients' repayment abilities.
- [Electricity](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/electricity): Predict the electricity prices in several Swedish cities based on weather conditions, previous prices, and Swedish holidays.
- [NYC Taxi Fares](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/nyc_taxi_fares): Predict the fare amount for a taxi ride in New York City given the pickup and dropoff locations.
+ - [Hospital Wait Time](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/hospital_wait_time): Predict the waiting time for a deceased donor kidney transplant using a Prophet model.
- [Recommender System](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/recommender-system): Build a recommender system for fashion items.
- [TimeSeries](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/timeseries): Timeseries price prediction.
- [LLM PDF](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/llm_pdfs): An AI assistant that utilizes a Retrieval-Augmented Generation (RAG) system to provide accurate answers to user questions by retrieving relevant context from PDF documents.
diff --git a/advanced_tutorials/hospital_wait_time/1_feature_pipeline.ipynb b/advanced_tutorials/hospital_wait_time/1_feature_pipeline.ipynb
new file mode 100644
index 00000000..ebf89dff
--- /dev/null
+++ b/advanced_tutorials/hospital_wait_time/1_feature_pipeline.ipynb
@@ -0,0 +1,597 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "5615c2ae",
+ "metadata": {},
+ "source": [
+ "## 📝 Imports "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9fd527fe",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "\n",
+ "# Mute warnings\n",
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "97f5d51d",
+ "metadata": {},
+ "source": [
+ "## 💽 Data Loading\n",
+ "\n",
+ "In this tutorial, you will predict the waiting time for a deceased donor kidney transplant, i.e. estimate how long a patient might need to wait from the time they are registered on the transplant list until a suitable donor kidney becomes available for transplantation."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "819100e5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "patient_demographics_data = pd.read_csv(\n",
+ " 'https://repo.hops.works/dev/davit/hospital_wait_time/patient_demographics.csv', \n",
+ " parse_dates=['date'],\n",
+ ")\n",
+ "patient_demographics_data.head(3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9d827df9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "medical_background_data = pd.read_csv(\n",
+ " 'https://repo.hops.works/dev/davit/hospital_wait_time/medical_background.csv', \n",
+ " parse_dates=['date'],\n",
+ ")\n",
+ "medical_background_data.head(3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7f7973d2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "transplant_compatibility_data = pd.read_csv(\n",
+ " 'https://repo.hops.works/dev/davit/hospital_wait_time/transplant_compatibility.csv', \n",
+ " parse_dates=['date'],\n",
+ ")\n",
+ "transplant_compatibility_data.columns = transplant_compatibility_data.columns.str.lower()\n",
+ "transplant_compatibility_data.head(3)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c7a4c7c6",
+ "metadata": {},
+ "source": [
+ "## 👨🏻🍳 Data Preparation\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "23284e32",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "patient_demographics_data.isna().sum()[patient_demographics_data.isna().sum() > 0] / len(patient_demographics_data)*100"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5532bdbf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "medical_background_data.isna().sum()[medical_background_data.isna().sum() > 0] / len(medical_background_data)*100"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "31876981",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "transplant_compatibility_data.isna().sum()[transplant_compatibility_data.isna().sum() > 0] / len(transplant_compatibility_data)*100"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "058cf4f1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "medical_background_data['dialysis_duration'] = medical_background_data['dialysis_duration'].fillna(1).replace(0, 1)\n",
+ "medical_background_data['dialysis_duration'] = np.log(medical_background_data['dialysis_duration'] + 1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a6a1faef",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def remove_outliers_iqr(dataframe, iqr_multiplier=1.5):\n",
+ " # Select numerical columns for outlier removal\n",
+ " numerical_columns = dataframe.select_dtypes(\n",
+ " include=['int64', 'float64']).columns\n",
+ "\n",
+ " # Loop through numerical columns to identify and remove outliers using IQR\n",
+ " for column in numerical_columns:\n",
+ " Q1 = dataframe[column].quantile(0.25)\n",
+ " Q3 = dataframe[column].quantile(0.75)\n",
+ " IQR = Q3 - Q1\n",
+ " lower_bound = Q1 - iqr_multiplier * IQR\n",
+ " upper_bound = Q3 + iqr_multiplier * IQR\n",
+ "\n",
+ " outliers = dataframe[(dataframe[column] < lower_bound) | (\n",
+ " dataframe[column] > upper_bound)]\n",
+ "\n",
+ " # Remove outliers\n",
+ " dataframe = dataframe[~dataframe.index.isin(outliers.index)]\n",
+ "\n",
+ " return dataframe"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7cacfd97",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "patient_demographics_data_filtered = remove_outliers_iqr(patient_demographics_data, iqr_multiplier=1.5)\n",
+ "print(f'⛳️ Original shape: {patient_demographics_data.shape}')\n",
+ "print(f'⛳️ Cleared shape: {patient_demographics_data_filtered.shape}')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "de3c3316",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "medical_background_data_filtered = remove_outliers_iqr(medical_background_data, iqr_multiplier=1.5)\n",
+ "print(f'⛳️ Original shape: {medical_background_data.shape}')\n",
+ "print(f'⛳️ Cleared shape: {medical_background_data_filtered.shape}')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "45b9c45b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "transplant_compatibility_data_filtered = remove_outliers_iqr(transplant_compatibility_data, iqr_multiplier=1.5)\n",
+ "print(f'⛳️ Original shape: {transplant_compatibility_data.shape}')\n",
+ "print(f'⛳️ Cleared shape: {transplant_compatibility_data_filtered.shape}')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3f7ac6f7",
+ "metadata": {},
+ "source": [
+ "## 👮🏻♂️ Great Expectations "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f99a5cff",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import great_expectations as ge\n",
+ "from great_expectations.core import ExpectationSuite, ExpectationConfiguration"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ee28579a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Convert your DataFrame to a Great Expectations DataFrame\n",
+ "ge_df_patient_demographics = ge.from_pandas(patient_demographics_data_filtered)\n",
+ "\n",
+ "# Retrieve the expectation suite associated with the ge DataFrame\n",
+ "expectation_suite_patient_demographics = ge_df_patient_demographics.get_expectation_suite()\n",
+ "\n",
+ "# Set the expectation suite name\n",
+ "expectation_suite_patient_demographics.expectation_suite_name = \"patient_registration_suite\"\n",
+ "\n",
+ "# Expectation: 'id' should always be unique and not null\n",
+ "expectation_suite_patient_demographics.add_expectation(\n",
+ " ExpectationConfiguration(\n",
+ " expectation_type=\"expect_column_values_to_be_unique\",\n",
+ " kwargs={\"column\": \"id\"},\n",
+ " )\n",
+ ")\n",
+ "expectation_suite_patient_demographics.add_expectation(\n",
+ " ExpectationConfiguration(\n",
+ " expectation_type=\"expect_column_values_to_not_be_null\",\n",
+ " kwargs={\"column\": \"id\"},\n",
+ " )\n",
+ ")\n",
+ "\n",
+ "# Expectation: 'date' should be a valid date and not null\n",
+ "expectation_suite_patient_demographics.add_expectation(\n",
+ " ExpectationConfiguration(\n",
+ " expectation_type=\"expect_column_values_to_be_of_type\",\n",
+ " kwargs={\n",
+ " \"column\": \"date\",\n",
+ " \"type_\": \"datetime64[ns]\",\n",
+ " }\n",
+ " )\n",
+ ")\n",
+ "expectation_suite_patient_demographics.add_expectation(\n",
+ " ExpectationConfiguration(\n",
+ " expectation_type=\"expect_column_values_to_not_be_null\",\n",
+ " kwargs={\"column\": \"date\"},\n",
+ " )\n",
+ ")\n",
+ "\n",
+ "# Expectation: 'age_at_list_registration' to be non-negative\n",
+ "expectation_suite_patient_demographics.add_expectation(\n",
+ " ExpectationConfiguration(\n",
+ " expectation_type=\"expect_column_values_to_be_between\",\n",
+ " kwargs={\n",
+ " \"column\": \"age_at_list_registration\",\n",
+ " \"min_value\": 0,\n",
+ " \"max_value\": None,\n",
+ " }\n",
+ " )\n",
+ ")\n",
+ "\n",
+ "# Expectation: 'gender' to be within expected values\n",
+ "expectation_suite_patient_demographics.add_expectation(\n",
+ " ExpectationConfiguration(\n",
+ " expectation_type=\"expect_column_values_to_be_in_set\",\n",
+ " kwargs={\n",
+ " \"column\": \"gender\",\n",
+ " \"value_set\": [\"M\", \"F\"],\n",
+ " }\n",
+ " )\n",
+ ")\n",
+ "\n",
+ "# Expectation: 'age_cat' to contain expected categories\n",
+ "expectation_suite_patient_demographics.add_expectation(\n",
+ " ExpectationConfiguration(\n",
+ " expectation_type=\"expect_column_values_to_be_in_set\",\n",
+ " kwargs={\n",
+ " \"column\": \"age_cat\",\n",
+ " \"value_set\": [\"Over60\", \"From18to60\", \"Below18\"],\n",
+ " }\n",
+ " )\n",
+ ")\n",
+ "\n",
+ "print(\"✅ Expectations defined and saved successfully.\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "640bbf2a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ge_df_medical_background = ge.from_pandas(medical_background_data_filtered)\n",
+ "\n",
+ "# Retrieve and set the expectation suite\n",
+ "expectation_suite_medical_background = ge_df_medical_background.get_expectation_suite()\n",
+ "expectation_suite_medical_background.expectation_suite_name = \"medical_background_suite\"\n",
+ "\n",
+ "# Expectations for 'id' and 'date'\n",
+ "expectation_suite_medical_background.add_expectation(\n",
+ " ExpectationConfiguration(\n",
+ " expectation_type=\"expect_column_values_to_be_unique\",\n",
+ " kwargs={\"column\": \"id\"},\n",
+ " )\n",
+ ")\n",
+ "expectation_suite_medical_background.add_expectation(\n",
+ " ExpectationConfiguration(\n",
+ " expectation_type=\"expect_column_values_to_not_be_null\",\n",
+ " kwargs={\"column\": \"id\"},\n",
+ " )\n",
+ ")\n",
+ "expectation_suite_medical_background.add_expectation(\n",
+ " ExpectationConfiguration(\n",
+ " expectation_type=\"expect_column_values_to_be_of_type\",\n",
+ " kwargs={\n",
+ " \"column\": \"date\",\n",
+ " \"type_\": \"datetime64[ns]\",\n",
+ " }\n",
+ " )\n",
+ ")\n",
+ "expectation_suite_medical_background.add_expectation(\n",
+ " ExpectationConfiguration(\n",
+ " expectation_type=\"expect_column_values_to_not_be_null\",\n",
+ " kwargs={\"column\": \"date\"},\n",
+ " )\n",
+ ")\n",
+ "\n",
+ "# Expectation for 'dialysis_duration'\n",
+ "expectation_suite_medical_background.add_expectation(\n",
+ " ExpectationConfiguration(\n",
+ " expectation_type=\"expect_column_values_to_be_between\",\n",
+ " kwargs={\n",
+ " \"column\": \"dialysis_duration\",\n",
+ " \"min_value\": 0,\n",
+ " \"max_value\": None,\n",
+ " }\n",
+ " )\n",
+ ")\n",
+ "\n",
+ "# Expectation for 'blood_gp'\n",
+ "expectation_suite_medical_background.add_expectation(\n",
+ " ExpectationConfiguration(\n",
+ " expectation_type=\"expect_column_values_to_be_in_set\",\n",
+ " kwargs={\n",
+ " \"column\": \"blood_gp\",\n",
+ " \"value_set\": [\"A\", \"B\", \"AB\", \"O\"],\n",
+ " }\n",
+ " )\n",
+ ")\n",
+ "\n",
+ "# Gestation and Prior Transplant Expectations\n",
+ "for column in [\"gestation\", \"prior_transplant\"]:\n",
+ " expectation_suite_medical_background.add_expectation(\n",
+ " ExpectationConfiguration(\n",
+ " expectation_type=\"expect_column_values_to_be_in_set\",\n",
+ " kwargs={\n",
+ " \"column\": column,\n",
+ " \"value_set\": [\"YES\", \"NO\"],\n",
+ " }\n",
+ " )\n",
+ " )\n",
+ "\n",
+ "# Expectation for 'number_prior_transplant' to be non-negative\n",
+ "expectation_suite_medical_background.add_expectation(\n",
+ " ExpectationConfiguration(\n",
+ " expectation_type=\"expect_column_values_to_be_between\",\n",
+ " kwargs={\n",
+ " \"column\": \"number_prior_transplant\",\n",
+ " \"min_value\": 0,\n",
+ " \"max_value\": None,\n",
+ " }\n",
+ " )\n",
+ ")\n",
+ "\n",
+ "print(\"✅ Expectations defined and saved successfully.\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6ca9a557",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ge_df_transplant_compatibility = ge.from_pandas(transplant_compatibility_data_filtered)\n",
+ "\n",
+ "# Retrieve and set the expectation suite\n",
+ "expectation_suite_transplant_compatibility = ge_df_transplant_compatibility.get_expectation_suite()\n",
+ "expectation_suite_transplant_compatibility.expectation_suite_name = \"transplant_compatibility_and_outcome_suite\"\n",
+ "\n",
+ "# Expectations for 'id' and 'date'\n",
+ "expectation_suite_transplant_compatibility.add_expectation(\n",
+ " ExpectationConfiguration(\n",
+ " expectation_type=\"expect_column_values_to_be_unique\",\n",
+ " kwargs={\"column\": \"id\"},\n",
+ " )\n",
+ ")\n",
+ "expectation_suite_transplant_compatibility.add_expectation(\n",
+ " ExpectationConfiguration(\n",
+ " expectation_type=\"expect_column_values_to_not_be_null\",\n",
+ " kwargs={\"column\": \"id\"},\n",
+ " )\n",
+ ")\n",
+ "expectation_suite_transplant_compatibility.add_expectation(\n",
+ " ExpectationConfiguration(\n",
+ " expectation_type=\"expect_column_values_to_be_of_type\",\n",
+ " kwargs={\n",
+ " \"column\": \"date\",\n",
+ " \"type_\": \"datetime64[ns]\",\n",
+ " }\n",
+ " )\n",
+ ")\n",
+ "\n",
+ "# Expectation for 'cPRA' to be between 0 and 100\n",
+ "expectation_suite_transplant_compatibility.add_expectation(\n",
+ " ExpectationConfiguration(\n",
+ " expectation_type=\"expect_column_values_to_be_between\",\n",
+ " kwargs={\n",
+ " \"column\": \"cpra\",\n",
+ " \"min_value\": 0,\n",
+ " \"max_value\": 100,\n",
+ " }\n",
+ " )\n",
+ ")\n",
+ "\n",
+ "# HLA Marker Expectations (checking the columns are of integer type)\n",
+ "for hla_marker in [\"hla_a1\", \"hla_a2\", \"hla_b1\", \"hla_b2\", \"hla_dr1\", \"hla_dr2\"]:\n",
+ " expectation_suite_transplant_compatibility.add_expectation(\n",
+ " ExpectationConfiguration(\n",
+ " expectation_type=\"expect_column_values_to_be_of_type\",\n",
+ " kwargs={\n",
+ " \"column\": hla_marker,\n",
+ " \"type_\": \"int\",\n",
+ " }\n",
+ " )\n",
+ " )\n",
+ "\n",
+ "# Expectation for 'if_transplanted'\n",
+ "expectation_suite_transplant_compatibility.add_expectation(\n",
+ " ExpectationConfiguration(\n",
+ " expectation_type=\"expect_column_values_to_be_in_set\",\n",
+ " kwargs={\n",
+ " \"column\": \"if_transplanted\",\n",
+ " \"value_set\": [\"YES\", \"NO\"],\n",
+ " }\n",
+ " )\n",
+ ")\n",
+ "\n",
+ "# Expectation for 'duration' to be non-negative\n",
+ "expectation_suite_transplant_compatibility.add_expectation(\n",
+ " ExpectationConfiguration(\n",
+ " expectation_type=\"expect_column_values_to_be_between\",\n",
+ " kwargs={\n",
+ " \"column\": \"duration\",\n",
+ " \"min_value\": 0,\n",
+ " \"max_value\": None,\n",
+ " }\n",
+ " )\n",
+ ")\n",
+ "\n",
+ "print(\"✅ Expectations defined and saved successfully.\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ed8fa713",
+ "metadata": {},
+ "source": [
+ "## 📡 Connecting to Hopsworks Feature Store "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "82777378",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import hopsworks\n",
+ "\n",
+ "project = hopsworks.login()\n",
+ "\n",
+ "fs = project.get_feature_store()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ff82304a",
+ "metadata": {},
+ "source": [
+ "## 🪄 Creating Feature Groups \n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d21bff59",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get or create the 'patient_info' feature group\n",
+ "patient_info_fg = fs.get_or_create_feature_group(\n",
+ " name=\"patient_info\",\n",
+ " version=1,\n",
+ " description=\"Demographic Features\",\n",
+ " primary_key=[\"id\"],\n",
+ " event_time=\"date\",\n",
+ " expectation_suite=expectation_suite_patient_demographics,\n",
+ ")\n",
+ "\n",
+ "patient_info_fg.insert(patient_demographics_data_filtered)\n",
+ "print('✅ Done')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a524f361",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get or create the 'medical_info' feature group\n",
+ "medical_info_fg = fs.get_or_create_feature_group(\n",
+ " name=\"medical_info\",\n",
+ " version=1,\n",
+ " description=\"Medical background features\",\n",
+ " primary_key=[\"id\"],\n",
+ " event_time=\"date\",\n",
+ " expectation_suite=expectation_suite_medical_background,\n",
+ ")\n",
+ "\n",
+ "medical_info_fg.insert(medical_background_data_filtered)\n",
+ "print('✅ Done')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f476f24b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get or create the 'transplant_compatibility' feature group\n",
+ "transplant_compatibility_fg = fs.get_or_create_feature_group(\n",
+ " name=\"transplant_compatibility\",\n",
+ " version=1,\n",
+ " description=\"Transplant compatibility features\",\n",
+ " primary_key=[\"id\"],\n",
+ " event_time=\"date\",\n",
+ " expectation_suite=expectation_suite_transplant_compatibility,\n",
+ ")\n",
+ "\n",
+ "transplant_compatibility_fg.insert(transplant_compatibility_data_filtered)\n",
+ "print('✅ Done')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3e15c005",
+ "metadata": {},
+ "source": [
+ "---"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.1.-1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/advanced_tutorials/hospital_wait_time/2_training_pipeline.ipynb b/advanced_tutorials/hospital_wait_time/2_training_pipeline.ipynb
new file mode 100644
index 00000000..e8583a6a
--- /dev/null
+++ b/advanced_tutorials/hospital_wait_time/2_training_pipeline.ipynb
@@ -0,0 +1,537 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "b6699fe1",
+ "metadata": {},
+ "source": [
+ "## 📝 Imports "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a39a3d61",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import joblib\n",
+ "import os\n",
+ "import datetime\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from matplotlib import pyplot\n",
+ "\n",
+ "from sklearn.metrics import mean_absolute_error\n",
+ "from prophet import Prophet\n",
+ "from prophet.serialize import model_to_json\n",
+ "\n",
+ "# Mute warnings\n",
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6f80e358",
+ "metadata": {},
+ "source": [
+ "## 📡 Connecting to Hopsworks Feature Store "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f88a3faa",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import hopsworks\n",
+ "\n",
+ "project = hopsworks.login()\n",
+ "\n",
+ "fs = project.get_feature_store()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d65e85bf",
+ "metadata": {},
+ "source": [
+ "### 🔪 Feature Selection \n",
+ "\n",
+ "You will start by selecting all the features you want to include for model training/inference."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a11ec70f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Retrieve the 'patient_info' feature group\n",
+ "patient_info_fg = fs.get_feature_group(\n",
+ " name=\"patient_info\",\n",
+ " version=1,\n",
+ ")\n",
+ "\n",
+ "# Retrieve the 'medical_info' feature group\n",
+ "medical_info_fg = fs.get_feature_group(\n",
+ " name=\"medical_info\",\n",
+ " version=1,\n",
+ ")\n",
+ "\n",
+ "# Retrieve the 'transplant_compatibility' feature group\n",
+ "transplant_compatibility_fg = fs.get_feature_group(\n",
+ " name=\"transplant_compatibility\",\n",
+ " version=1,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "17ba77ed",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Select features for training data.\n",
+ "selected_features = patient_info_fg.select_all([\"id\", \"date\"])\\\n",
+ " .join(medical_info_fg.select_except([\"id\", \"date\"]))\\\n",
+ " .join(transplant_compatibility_fg.select_except([\"id\", \"date\"])\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "495cbb5f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Preview the first 5 rows of the selected features\n",
+ "selected_features.show(5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d5902c9e",
+ "metadata": {},
+ "source": [
+ "## ⚙️ Transformation Functions \n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ba4ae19c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "[f.name for f in fs.get_transformation_functions()]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6c32555b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "label_encoder = fs.get_transformation_function(name=\"label_encoder\")\n",
+ "\n",
+ "standard_scaler = fs.get_transformation_function(name=\"standard_scaler\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b4580c73",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "features_category = ['gender', 'age_cat', 'blood_gp', 'underlying_disease', 'gestation', 'prior_transplant', 'if_transplanted']\n",
+ "\n",
+ "transformation_functions_category = {\n",
+ " feature_name: label_encoder\n",
+ " for feature_name\n",
+ " in features_category\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "be5eb3c5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "features_numerical = [\n",
+ " 'age_at_list_registration', 'dialysis_duration', 'number_prior_transplant', 'cpra', 'hla_a1', 'hla_a2', 'hla_b1', 'hla_b2', 'hla_dr1', 'hla_dr2',\n",
+ "]\n",
+ "\n",
+ "transformation_functions_numerical = {\n",
+ " feature_name: standard_scaler\n",
+ " for feature_name\n",
+ " in features_numerical\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1a7e76ee",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Join transformation_functions_category and transformation_functions_numerical dictionaries into one\n",
+ "transformation_functions = transformation_functions_category | transformation_functions_numerical"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "91636dc3",
+ "metadata": {},
+ "source": [
+ "## ⚙️ Feature View Creation \n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "576617c8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get or create the 'medical_features' feature view\n",
+ "feature_view = fs.get_or_create_feature_view(\n",
+ " name='medical_features',\n",
+ " version=1,\n",
+ " query=selected_features,\n",
+ " labels=[\"duration\"],\n",
+ " transformation_functions=transformation_functions,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a3bb3b8e",
+ "metadata": {},
+ "source": [
+ "## 🏋️ Training Dataset Creation\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7851e335",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Compute the date boundaries of a chronological 80/20 train/test split\n",
+ "df = patient_info_fg.read()\n",
+ "\n",
+ "def split_dfs(df): \n",
+ " df = df.sort_values(by='date') \n",
+ " trainvals = df[:int(len(df)*0.8)] \n",
+ " testvals = df[int(len(df)*0.8):] \n",
+ " return {\n",
+ " 'train_start': min(trainvals.date).date(), \n",
+ " 'train_end': max(trainvals.date).date(), \n",
+ " 'test_start': min(testvals.date).date(), \n",
+ " 'test_end': max(testvals.date).date(),\n",
+ " }\n",
+ "\n",
+ "split_dict = split_dfs(df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c7a8f6f1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "split_dict"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "69f4373c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_train, X_test, y_train, y_test = feature_view.train_test_split(\n",
+ " train_start=split_dict['train_start'],\n",
+ " train_end=split_dict['train_end'],\n",
+ " test_start=split_dict['test_start'],\n",
+ " test_end=split_dict['test_end'], \n",
+ " event_time=True,\n",
+ ")\n",
+ "X_train.head(3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2facefa1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "y_train.head(3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d510db36",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sort the X_train DataFrame based on the \"date\" column in ascending order\n",
+ "X_train = X_train.sort_values(\"date\")\n",
+ "# Reindex the y_train Series to match the order of rows in the sorted X_train DataFrame\n",
+ "y_train = y_train.reindex(X_train.index)\n",
+ "\n",
+ "# Sort the X_test DataFrame based on the \"date\" column in ascending order\n",
+ "X_test = X_test.sort_values(\"date\")\n",
+ "# Reindex the y_test Series to match the order of rows in the sorted X_test DataFrame\n",
+ "y_test = y_test.reindex(X_test.index)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a7e10eb2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_train['y'] = y_train\n",
+ "X_train['ds'] = X_train.date\n",
+ "X_train['ds'] = pd.to_datetime(X_train.ds)\n",
+ "X_train['ds'] = X_train.ds.map(lambda x: x.replace(tzinfo=None))\n",
+ "X_train.drop(columns=[\"date\"], axis=1, inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "dfbb7b31",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_test['ds'] = X_test.date\n",
+ "X_test['ds'] = pd.to_datetime(X_test.ds)\n",
+ "X_test['ds'] = X_test.ds.map(lambda x: x.replace(tzinfo=None))\n",
+ "X_test.drop(columns=[\"date\"], axis=1, inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3847431e",
+ "metadata": {},
+ "source": [
+ "## 🧬 Modeling\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d639b394",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Initialize the Prophet model with the appropriate seasonalities\n",
+ "model = Prophet(\n",
+ " daily_seasonality=False,\n",
+ " weekly_seasonality=True,\n",
+ " yearly_seasonality=True,\n",
+ ")\n",
+ "\n",
+ "# Add monthly seasonality with a period of 30.5 days (average length of a month)\n",
+ "model.add_seasonality(\n",
+ " name='monthly', \n",
+ " period=30.5, \n",
+ " fourier_order=5,\n",
+ " mode='additive',\n",
+ ")\n",
+ "\n",
+ "# Add the additional regressors\n",
+ "additional_regressors = [\n",
+ " 'age_at_list_registration','cpra', 'hla_a1', 'hla_a2', 'hla_b1', 'hla_b2', 'hla_dr1', 'hla_dr2',\n",
+ "]\n",
+ "\n",
+ "for regressor in additional_regressors:\n",
+ " model.add_regressor(regressor)\n",
+ "\n",
+ "# Fit the model\n",
+ "model.fit(X_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ce527621",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "forecast = model.predict(X_test)\n",
+ "\n",
+ "# Summarize the forecast\n",
+ "print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].head())\n",
+ "\n",
+ "# Plot the forecast\n",
+ "fig = model.plot(forecast)\n",
+ "\n",
+ "pyplot.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8e4217c0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.plot_components(forecast)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d8701339",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Calculate MAE between expected and predicted values on the test set\n",
+ "y_pred = forecast['yhat']\n",
+ "mae = mean_absolute_error(y_test, y_pred)\n",
+ "print('MAE: %.3f' % mae)\n",
+ "# Collect the evaluation metrics to attach to the registered model\n",
+ "\n",
+ "metrics = {\n",
+ " \"mae\": round(mae,2)\n",
+ "}\n",
+ "metrics"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ea6cfd10",
+ "metadata": {},
+ "source": [
+ "### ⚙️ Model Schema\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0bf4bb02",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from hsml.schema import Schema\n",
+ "from hsml.model_schema import ModelSchema\n",
+ "\n",
+ "# Define the input schema using the values of X_test\n",
+ "input_schema = Schema(X_test.values)\n",
+ "\n",
+ "# Define the output schema using y_train\n",
+ "output_schema = Schema(y_train)\n",
+ "\n",
+ "# Create a ModelSchema object specifying the input and output schemas\n",
+ "model_schema = ModelSchema(\n",
+ " input_schema=input_schema, \n",
+ " output_schema=output_schema,\n",
+ ")\n",
+ "\n",
+ "# Convert the model schema to a dictionary for further inspection or serialization\n",
+ "model_schema.to_dict()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "93a92ddd",
+ "metadata": {},
+ "source": [
+ "## 📝 Register model\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1dbc8bae",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Specify the directory where the model will be saved\n",
+ "model_dir = \"forecast_model\"\n",
+ "\n",
+ "# Check if the directory exists, and create it if it doesn't\n",
+ "if not os.path.isdir(model_dir):\n",
+ " os.mkdir(model_dir)\n",
+ "\n",
+ "# Serialize the trained Prophet model to a JSON file\n",
+ "with open(model_dir + '/serialized_model.json', 'w') as fout:\n",
+ " fout.write(model_to_json(model)) # Save model\n",
+ " \n",
+ "# Save the forecast plot as an image file in the model directory\n",
+ "fig.savefig(model_dir + \"/forecast.png\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "707022f2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get the model registry\n",
+ "mr = project.get_model_registry()\n",
+ "\n",
+ "# Create a new model in the model registry\n",
+ "forecast_model = mr.python.create_model(\n",
+ " name=\"waiting_time_forecast_model\", # Name for the model\n",
+ " metrics=metrics, # Metrics used for evaluation\n",
+ " model_schema=model_schema, # Schema defining the model's input and output\n",
+ " input_example=X_test.sample(), # Example input data for reference\n",
+ " description=\"Waiting time for a deceased donor kidney transplant forecasting model\", # Description of the model\n",
+ ")\n",
+ "\n",
+ "# Register the model by uploading the contents of the model directory to the model registry\n",
+ "forecast_model.save(model_dir)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0db0bf98",
+ "metadata": {},
+ "source": [
+ "---"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/advanced_tutorials/hospital_wait_time/3_inference_pipeline.ipynb b/advanced_tutorials/hospital_wait_time/3_inference_pipeline.ipynb
new file mode 100644
index 00000000..af472ff3
--- /dev/null
+++ b/advanced_tutorials/hospital_wait_time/3_inference_pipeline.ipynb
@@ -0,0 +1,229 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "72f42eba",
+ "metadata": {},
+ "source": [
+ "## 📝 Imports"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8158dcec",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import joblib\n",
+ "import pandas as pd\n",
+ "from datetime import datetime\n",
+ "from prophet.serialize import model_from_json\n",
+ "from matplotlib import pyplot\n",
+ "import warnings\n",
+ "warnings.filterwarnings('ignore')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "faa87ecf",
+ "metadata": {},
+ "source": [
+ "## 📡 Connecting to Hopsworks Feature Store "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b4330a4a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import hopsworks\n",
+ "\n",
+ "project = hopsworks.login()\n",
+ "\n",
+ "fs = project.get_feature_store()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dff5ebe0",
+ "metadata": {},
+ "source": [
+ "## ⚙️ Feature View Retrieval\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bd3f8c1c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Retrieve the 'medical_features' feature view\n",
+ "feature_view = fs.get_feature_view(\n",
+ " name='medical_features',\n",
+ " version=1,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "21b6f351",
+ "metadata": {},
+ "source": [
+ "## 🗄 Model Registry\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ecad81cc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get the model registry\n",
+ "mr = project.get_model_registry()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "515fe05a",
+ "metadata": {},
+ "source": [
+ "## 🚀 Fetch and test the model\n",
+ "\n",
+ "Finally, you can start making predictions with your model!\n",
+ "\n",
+ "Retrieve your model from the Hopsworks model registry."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fbb5cdc7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Retrieve the model from the model registry\n",
+ "retrieved_model = mr.get_model(\n",
+ " name=\"waiting_time_forecast_model\",\n",
+ " version=1,\n",
+ ")\n",
+ "\n",
+ "# Download the saved model files to a local directory\n",
+ "saved_model_dir = retrieved_model.download()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9dfaba98",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open(saved_model_dir + '/serialized_model.json', 'r') as fin:\n",
+ " model = model_from_json(fin.read()) # Load model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2e5b40c5",
+ "metadata": {},
+ "source": [
+ "## 🔮 Batch Prediction \n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a5e49a6e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Initialize batch scoring\n",
+ "feature_view.init_batch_scoring(1)\n",
+ "\n",
+ "# Get the batch data\n",
+ "batch_data = feature_view.get_batch_data(\n",
+ " start_time=datetime(2015, 10, 19), \n",
+ " end_time=datetime(2017, 12, 29), \n",
+ " event_time=True,\n",
+ ")\n",
+ "\n",
+ "batch_data['ds'] = batch_data.date\n",
+ "batch_data['ds'] = pd.to_datetime(batch_data.ds)\n",
+ "batch_data['ds'] = batch_data.ds.map(lambda x: x.replace(tzinfo=None))\n",
+ "batch_data.drop(columns=[\"date\"], axis=1, inplace=True)\n",
+ "batch_data = batch_data.sort_values(\"ds\")\n",
+ "\n",
+ "# Display the first 3 rows\n",
+ "batch_data.head(3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8ff70533",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# use the model to make a forecast\n",
+ "forecast = model.predict(batch_data)\n",
+ "\n",
+ "# summarize the forecast\n",
+ "print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].head())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b6821c72",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.plot(forecast)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4c6c4a04",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.plot_components(forecast)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "583f95e2",
+ "metadata": {},
+ "source": [
+ "---"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/advanced_tutorials/hospital_wait_time/requirements.txt b/advanced_tutorials/hospital_wait_time/requirements.txt
new file mode 100644
index 00000000..3f5db197
--- /dev/null
+++ b/advanced_tutorials/hospital_wait_time/requirements.txt
@@ -0,0 +1 @@
+prophet==1.1.5