From 18285a28b94ba7080b8070d3f47bb3e52ede2e84 Mon Sep 17 00:00:00 2001
From: Maksym Zhytnikov <63515947+Maxxx-zh@users.noreply.github.com>
Date: Sat, 18 May 2024 13:19:40 +0300
Subject: [PATCH] [FSTORE-1395] Update the Langchain usage in Fraud Cheque
 Detection (#263)

* Update the Langchain usage in Fraud Cheque Detection
---
 .../3_inference_pipeline.ipynb                | 629 ++++++++++--------
 .../features/cheque_validation.py             |  34 +
 .../functions/llm_chain.py                    |  13 +-
 3 files changed, 397 insertions(+), 279 deletions(-)
 create mode 100644 advanced_tutorials/fraud_cheque_detection/features/cheque_validation.py
diff --git a/advanced_tutorials/fraud_cheque_detection/3_inference_pipeline.ipynb b/advanced_tutorials/fraud_cheque_detection/3_inference_pipeline.ipynb
index c47eb16f..0ff6d7b2 100644
--- a/advanced_tutorials/fraud_cheque_detection/3_inference_pipeline.ipynb
+++ b/advanced_tutorials/fraud_cheque_detection/3_inference_pipeline.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "id": "2349d93d",
+   "id": "292e9456",
    "metadata": {},
    "source": [
     "## <span style='color:#ff5f27'> 📝 Imports"
@@ -11,19 +11,36 @@
   {
    "cell_type": "code",
    "execution_count": 1,
-   "id": "56998928",
+   "id": "8bed2e2a",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
+      "\u001b[0m"
+     ]
+    }
+   ],
    "source": [
-    "# !pip install -r requirements.txt -q"
+    "!pip install -r requirements.txt -q"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 2,
-   "id": "2dd41008",
+   "id": "e1343eb1",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2024-05-14 20:34:55,326 INFO: generated new fontManager\n"
+     ]
+    }
+   ],
    "source": [
     "from xgboost import XGBClassifier\n",
     "import pandas as pd\n",
@@ -42,6 +59,7 @@
     "    generate_response,\n",
     "    format_response,\n",
     ")\n",
+    "from features.cheque_validation import get_cheque_ids\n",
     "\n",
     "import config\n",
     "\n",
@@ -51,7 +69,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "ec0ad367",
+   "id": "a9fd8c23",
    "metadata": {},
    "source": [
     "## <span style=\"color:#ff5f27;\"> 🔮 Connecting to Hopsworks Feature Store </span>"
@@ -60,7 +78,7 @@
   {
    "cell_type": "code",
    "execution_count": 3,
-   "id": "0d3fec1f",
+   "id": "c34837f2",
    "metadata": {},
    "outputs": [
     {
@@ -84,7 +102,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "7f23551b",
+   "id": "a3569ef7",
    "metadata": {},
    "source": [
     "## <span style=\"color:#ff5f27;\"> 🪝 Download the Fraud Detection Model from Model Registry </span>"
@@ -93,7 +111,7 @@
   {
    "cell_type": "code",
    "execution_count": 4,
-   "id": "786730c4",
+   "id": "fbecf2c2",
    "metadata": {},
    "outputs": [
     {
@@ -120,7 +138,7 @@
   {
    "cell_type": "code",
    "execution_count": 5,
-   "id": "b7969d2f",
+   "id": "1f819625",
    "metadata": {},
    "outputs": [
     {
@@ -177,7 +195,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "a1decac7",
+   "id": "8e4032ca",
    "metadata": {},
    "source": [
     "## <span style='color:#ff5f27'> 🍩 Donut Model Loading </span>\n"
@@ -186,16 +204,134 @@
   {
    "cell_type": "code",
    "execution_count": 6,
-   "id": "93d56010",
+   "id": "13efd4a6",
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "DeprecationWarning: `np.bool8` is a deprecated alias for `np.bool_`.  (Deprecated NumPy 1.24)\n",
+      "DeprecationWarning: `np.bool8` is a deprecated alias for `np.bool_`.  (Deprecated NumPy 1.24)\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "897f241321c24306b798e1bf61e30d3f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "preprocessor_config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
       "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
      ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9430a034095843239e505a408f88c709",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer_config.json:   0%|          | 0.00/588 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "83c6ea3fe9f541aeb3fe622bc4053066",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "sentencepiece.bpe.model:   0%|          | 0.00/1.30M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "cc2b5725d4dc4931bd89fa684d44d0cc",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer.json:   0%|          | 0.00/4.01M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e18614d426ff44cabf50f28ba9a9ab58",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "added_tokens.json:   0%|          | 0.00/448 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "cdcb13af5664410982a26a5184bdec7d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "special_tokens_map.json:   0%|          | 0.00/355 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "21d88a9e43664a418d56ba86b169c401",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "config.json:   0%|          | 0.00/5.10k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8e4886d494bc4d72b6a8f2addb292065",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "pytorch_model.bin:   0%|          | 0.00/809M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     }
    ],
    "source": [
@@ -206,7 +342,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "22b44861",
+   "id": "9ce9bcae",
    "metadata": {},
    "source": [
     "## <span style='color:#ff5f27'> 🚀 Cheque Text Parsing </span>\n"
@@ -215,7 +351,7 @@
   {
    "cell_type": "code",
    "execution_count": 7,
-   "id": "f92cdc66",
+   "id": "3752b514",
    "metadata": {},
    "outputs": [
     {
@@ -252,7 +388,7 @@
   {
    "cell_type": "code",
    "execution_count": 8,
-   "id": "8f713ace",
+   "id": "e1f40889",
    "metadata": {},
    "outputs": [
     {
@@ -289,7 +425,7 @@
   {
    "cell_type": "code",
    "execution_count": 9,
-   "id": "9480cbee",
+   "id": "d73d918b",
    "metadata": {},
    "outputs": [
     {
@@ -326,7 +462,7 @@
   {
    "cell_type": "code",
    "execution_count": 10,
-   "id": "6cb2ad72",
+   "id": "c78caf91",
    "metadata": {},
    "outputs": [
     {
@@ -362,7 +498,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "a8bbfeca",
+   "id": "5eaf39ca",
    "metadata": {},
    "source": [
     "## <span style='color:#ff5f27'>👨🏻‍⚖️ Check Evaluation </span>\n"
@@ -371,7 +507,7 @@
   {
    "cell_type": "code",
    "execution_count": 11,
-   "id": "c478028e",
+   "id": "1418a08c",
    "metadata": {},
    "outputs": [
     {
@@ -397,7 +533,7 @@
   {
    "cell_type": "code",
    "execution_count": 12,
-   "id": "369f7f16",
+   "id": "de7e3d92",
    "metadata": {},
    "outputs": [
     {
@@ -423,7 +559,7 @@
   {
    "cell_type": "code",
    "execution_count": 13,
-   "id": "6eadc72a",
+   "id": "fa656a86",
    "metadata": {},
    "outputs": [
     {
@@ -457,7 +593,7 @@
   {
    "cell_type": "code",
    "execution_count": 14,
-   "id": "6b380bf0",
+   "id": "3bc9afe8",
    "metadata": {},
    "outputs": [
     {
@@ -483,7 +619,7 @@
   {
    "cell_type": "code",
    "execution_count": 15,
-   "id": "4e984b0b",
+   "id": "b91d0b40",
    "metadata": {},
    "outputs": [
     {
@@ -508,7 +644,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "9f4fadd7",
+   "id": "4f8f7c7b",
    "metadata": {},
    "source": [
     "## <span style='color:#ff5f27'>🔗 LLM Chain Loading </span>\n"
@@ -517,7 +653,7 @@
   {
    "cell_type": "code",
    "execution_count": 16,
-   "id": "10509472",
+   "id": "da9bb569",
    "metadata": {},
    "outputs": [
     {
@@ -527,6 +663,48 @@
       "🔑 Enter your HuggingFace API key:  ·····································\n"
      ]
     },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "3e1fe0d2ef6e4b34a816e86005a35597",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d4f5c33caca64c7992acd99e6db5b20b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8f88714102ed45cd9eb376ede0df79c9",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
     {
      "name": "stderr",
      "output_type": "stream",
@@ -534,17 +712,115 @@
       "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
      ]
     },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d1c6cfbceb3b4e7d9ff2774f1fc4b9e5",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f2264a7465184aca806c80f70890d10e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c1feed4cf343482faf2ccb98c0ce4826",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "47065e2a435e49da9516574eb5e6e66b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b784fc47ca4347dcabf05e9213acaf37",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "50f0eb9737f8408889916e9af25e956e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f2ac16977cad4ec395ad56733c8e47b6",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "2024-04-25 11:44:49,740 INFO: We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).\n"
+      "2024-05-14 20:38:41,448 INFO: We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).\n"
      ]
     },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "d272c02d4d5c46b5abdaf3b9408390be",
+       "model_id": "abf328bcf45a4d289250a31e639cb59c",
        "version_major": 2,
        "version_minor": 0
       },
@@ -555,6 +831,20 @@
      "metadata": {},
      "output_type": "display_data"
     },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f024e59bc507483cb462d7256de8ce2a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
     {
      "name": "stdout",
      "output_type": "stream",
@@ -569,7 +859,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "95ddf185",
+   "id": "a02af846",
    "metadata": {},
    "source": [
     "## <span style='color:#ff5f27'>🚀 Inference </span>\n"
@@ -578,14 +868,14 @@
   {
    "cell_type": "code",
    "execution_count": 17,
-   "id": "106759c5",
+   "id": "e60661ec",
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n"
+      "Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.\n"
      ]
     },
     {
@@ -600,7 +890,7 @@
       "Verdict: valid\n",
       "----------\n",
       "\n",
-      "Valid | The cheque is considered valid because the amount in words \"Three Thousand One Hundred and Sixty Eight\" matches the amount in numbers \"3168\" and the spelling is correct.\n"
+      "Valid | The cheque is considered valid because the amount in words \"Three Thousand One Hundred and Sixty Eight\" matches the amount in numbers 3168 and the spelling is correct.\n"
      ]
     }
    ],
@@ -619,14 +909,14 @@
   {
    "cell_type": "code",
    "execution_count": 18,
-   "id": "cafe9af8",
+   "id": "232f8e4b",
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n"
+      "Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.\n"
      ]
     },
     {
@@ -660,14 +950,14 @@
   {
    "cell_type": "code",
    "execution_count": 19,
-   "id": "ac2a6c75",
+   "id": "7795f95d",
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n"
+      "Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.\n"
      ]
     },
     {
@@ -682,7 +972,7 @@
       "Verdict: fraud\n",
       "----------\n",
       "\n",
-      "Fraud | The cheque is fraudulent due to a mismatch between the numeric and alphabetic values. The amount in words is \"Three Hundred and Thirty Six\", but the amount in numbers is 3318. This mismatch indicates that the cheque may be fraudulent, as a genuine cheque would typically have matching amounts in words and numbers.\n"
+      "Fraud | The cheque is fraudulent due to a mismatch between the numeric and alphabetic values. The amount in words is \"Three Hundred and Thirty Six\", while the amount in numbers is 3318. The discrepancy between the two values suggests that the cheque has been tampered with, making it a fraudulent transaction.\n"
      ]
     }
    ],
@@ -701,14 +991,14 @@
   {
    "cell_type": "code",
    "execution_count": 20,
-   "id": "0d3214b2",
+   "id": "b8897a01",
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n"
+      "Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.\n"
      ]
     },
     {
@@ -741,7 +1031,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "5301f605",
+   "id": "c67483cf",
    "metadata": {},
    "source": [
     "## <span style='color:#ff5f27'>🗄️ Batch Inference </span>\n"
@@ -750,7 +1040,7 @@
   {
    "cell_type": "code",
    "execution_count": 21,
-   "id": "bc04a8b3",
+   "id": "6953f9a6",
    "metadata": {},
    "outputs": [
     {
@@ -790,20 +1080,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
-   "id": "8e0e8849",
+   "execution_count": null,
+   "id": "cec1a184",
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
-      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
-      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
-      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
-      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
-      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.\n",
       "--- Logging error ---\n",
       "Traceback (most recent call last):\n",
       "  File \"/srv/hops/anaconda/envs/theenv/lib/python3.10/logging/__init__.py\", line 1100, in emit\n",
@@ -882,14 +1172,12 @@
       "    cheque_batch_validation = [\n",
       "  File \"<ipython-input-22-4338d9687455>\", line 2, in <listcomp>\n",
       "    generate_response(\n",
-      "  File \"/home/yarnapp/hopsfs/Jupyter/functions/llm_chain.py\", line 179, in generate_response\n",
+      "  File \"/home/yarnapp/hopsfs/Jupyter/functions/llm_chain.py\", line 225, in generate_response\n",
       "    model_output = llm_chain.invoke({\n",
-      "  File \"/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/langchain/chains/base.py\", line 153, in invoke\n",
-      "    self._call(inputs, run_manager=run_manager)\n",
-      "  File \"/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/langchain/chains/llm.py\", line 103, in _call\n",
-      "    response = self.generate([inputs], run_manager=run_manager)\n",
-      "  File \"/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/langchain/chains/llm.py\", line 115, in generate\n",
-      "    return self.llm.generate_prompt(\n",
+      "  File \"/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/langchain_core/runnables/base.py\", line 2499, in invoke\n",
+      "    input = step.invoke(\n",
+      "  File \"/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/langchain_core/language_models/llms.py\", line 276, in invoke\n",
+      "    self.generate_prompt(\n",
       "  File \"/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/langchain_core/language_models/llms.py\", line 633, in generate_prompt\n",
       "    return self.generate(prompt_strings, stop=stop, callbacks=callbacks, **kwargs)\n",
       "  File \"/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/langchain_core/language_models/llms.py\", line 803, in generate\n",
@@ -906,84 +1194,9 @@
       "    self.warning(*args, **kwargs)\n",
       "Message: 'You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset'\n",
       "Arguments: (<class 'UserWarning'>,)\n",
-      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
-      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
-      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
-      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
-      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
-      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
-      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
-      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
-      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n"
+      "Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.\n"
      ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>status</th>\n",
-       "      <th>description</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>Fraud</td>\n",
-       "      <td>The cheque is fraudulent due to a mismatch bet...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>Valid</td>\n",
-       "      <td>The cheque is considered valid because the amo...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>Valid</td>\n",
-       "      <td>The cheque is considered valid because the amo...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>Valid</td>\n",
-       "      <td>The cheque is considered valid because the amo...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>Fraud</td>\n",
-       "      <td>The cheque is considered fraudulent because th...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "  status                                        description\n",
-       "0  Fraud  The cheque is fraudulent due to a mismatch bet...\n",
-       "1  Valid  The cheque is considered valid because the amo...\n",
-       "2  Valid  The cheque is considered valid because the amo...\n",
-       "3  Valid  The cheque is considered valid because the amo...\n",
-       "4  Fraud  The cheque is considered fraudulent because th..."
-      ]
-     },
-     "execution_count": 22,
-     "metadata": {},
-     "output_type": "execute_result"
     }
    ],
    "source": [
@@ -1016,7 +1229,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "c71a66e1",
+   "id": "72b14181",
    "metadata": {},
    "source": [
     "## <span style=\"color:#ff5f27;\"> 🪄 Feature Group Creation </span>"
@@ -1024,8 +1237,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
-   "id": "6c50f039",
+   "execution_count": null,
+   "id": "a56569be",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1040,8 +1253,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
-   "id": "838bb5d6",
+   "execution_count": null,
+   "id": "968b7b16",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1058,94 +1271,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
-   "id": "6c2cd294",
+   "execution_count": null,
+   "id": "40eb1c4d",
    "metadata": {},
    "outputs": [],
-   "source": [
-    "def get_cheque_ids(feature_group, data):\n",
-    "    try:    \n",
-    "        cheque_id_max = [\n",
-    "            int(feature.max) \n",
-    "            for feature \n",
-    "            in feature_group.statistics.feature_descriptive_statistics \n",
-    "            if feature.feature_name == 'cheque_id'\n",
-    "        ][0]\n",
-    "        data['cheque_id'] = [*range(cheque_id_max+1, cheque_id_max+1 + data.shape[0])]\n",
-    "        return data\n",
-    "    \n",
-    "    except:\n",
-    "        # Resetting the index without dropping it to make it a column\n",
-    "        return data.reset_index(drop=False).rename(columns={'index': 'cheque_id'})"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
-   "id": "8e7f7efe",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>cheque_id</th>\n",
-       "      <th>status</th>\n",
-       "      <th>description</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>0</td>\n",
-       "      <td>Fraud</td>\n",
-       "      <td>The cheque is fraudulent due to a mismatch bet...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>1</td>\n",
-       "      <td>Valid</td>\n",
-       "      <td>The cheque is considered valid because the amo...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>2</td>\n",
-       "      <td>Valid</td>\n",
-       "      <td>The cheque is considered valid because the amo...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   cheque_id status                                        description\n",
-       "0          0  Fraud  The cheque is fraudulent due to a mismatch bet...\n",
-       "1          1  Valid  The cheque is considered valid because the amo...\n",
-       "2          2  Valid  The cheque is considered valid because the amo..."
-      ]
-     },
-     "execution_count": 26,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
    "source": [
     "cheque_validation_df_w_index = get_cheque_ids(\n",
     "    cheque_validation_fg, \n",
@@ -1156,52 +1285,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
-   "id": "e2badd15",
+   "execution_count": null,
+   "id": "83d93efd",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Feature Group created successfully, explore it at \n",
-      "https://snurran.hops.works/p/11385/fs/11333/fg/12483\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "f7710fae38f1462d8570c776f49e5aba",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Uploading Dataframe: 0.00% |          | Rows 0/15 | Elapsed Time: 00:00 | Remaining Time: ?"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Launching job: cheque_validation_fg_1_offline_fg_materialization\n",
-      "Job started successfully, you can follow the progress at \n",
-      "https://snurran.hops.works/p/11385/jobs/named/cheque_validation_fg_1_offline_fg_materialization/executions\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "(<hsfs.core.job.Job at 0x7f2d18310640>, None)"
-      ]
-     },
-     "execution_count": 27,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "cheque_validation_fg.insert(\n",
     "    cheque_validation_df_w_index,\n",
@@ -1210,7 +1297,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "2645d8f8",
+   "id": "97599fa4",
    "metadata": {},
    "source": [
     "---"
diff --git a/advanced_tutorials/fraud_cheque_detection/features/cheque_validation.py b/advanced_tutorials/fraud_cheque_detection/features/cheque_validation.py
new file mode 100644
index 00000000..54d641df
--- /dev/null
+++ b/advanced_tutorials/fraud_cheque_detection/features/cheque_validation.py
@@ -0,0 +1,34 @@
+import pandas as pd
+
+def get_cheque_ids(feature_group, data: pd.DataFrame) -> pd.DataFrame:
+    """
+    Generate a sequence of new cheque IDs for a DataFrame based on the maximum existing cheque ID found in a feature group.
+
+    The function first looks up the maximum 'cheque_id' in the feature group's descriptive statistics.
+    If that maximum is found, it assigns consecutive cheque IDs to the rows of `data` (in place), starting from the next integer.
+    If any exception is raised in the process (e.g., the feature group or its 'cheque_id' statistics are not available),
+    it instead resets the DataFrame's index and renames the resulting 'index' column to 'cheque_id'.
+
+    Parameters:
+        feature_group: Hopsworks Feature Group.
+        data (pd.DataFrame): The DataFrame to which the cheque ID will be added.
+
+    Returns:
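+        pd.DataFrame: The DataFrame with a 'cheque_id' column added; this is `data` itself when the IDs come from the statistics, and a new DataFrame when falling back to the index.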
+    """
+    try:
+        # Extract the maximum 'cheque_id' from feature_group if it exists
+        cheque_id_max = [
+            int(feature.max) 
+            for feature in feature_group.statistics.feature_descriptive_statistics 
+            if feature.feature_name == 'cheque_id'
+        ][0]
+        
+        # Generate new cheque IDs starting from the maximum found + 1
+        data['cheque_id'] = [*range(cheque_id_max + 1, cheque_id_max + 1 + data.shape[0])]
+    
+    except Exception as e:
+        # In case of any error during ID generation, fall back to using the DataFrame index as 'cheque_id'
+        data = data.reset_index(drop=False).rename(columns={'index': 'cheque_id'})
+    
+    return data
diff --git a/advanced_tutorials/fraud_cheque_detection/functions/llm_chain.py b/advanced_tutorials/fraud_cheque_detection/functions/llm_chain.py
index d63f1533..911c55ca 100644
--- a/advanced_tutorials/fraud_cheque_detection/functions/llm_chain.py
+++ b/advanced_tutorials/fraud_cheque_detection/functions/llm_chain.py
@@ -5,7 +5,7 @@
 import torch
 from langchain.llms import HuggingFacePipeline
 from langchain.prompts import PromptTemplate
-from langchain.chains.llm import LLMChain
+from langchain.schema.output_parser import StrOutputParser
 
 from functions.utils import (
     load_image,
@@ -135,11 +135,8 @@ def get_llm_chain(model_id: str = "meta-llama/Meta-Llama-3-8B-Instruct"):
     )
 
     # Create the LLM chain
-    llm_chain = LLMChain(
-        llm=pipeline_llm,                    # The text generation pipeline
-        prompt=prompt,                       # The structured prompt template
-        verbose=False,                       # Controls verbose output during execution
-    )
+    llm_chain = prompt | pipeline_llm | StrOutputParser()
+
     return llm_chain
 
 
@@ -230,8 +227,8 @@ def generate_response(
     })
 
     # Process the model output to extract the relevant response part
-    return model_output['text'].split(
-        '<|start_header_id|>assistant<|end_header_id|>'
+    return model_output.split(
+       '<|start_header_id|>assistant<|end_header_id|>'
     )[-1].strip()
  
     

	status	description
0	Fraud	The cheque is fraudulent due to a mismatch bet...
1	Valid	The cheque is considered valid because the amo...
2	Valid	The cheque is considered valid because the amo...
3	Valid	The cheque is considered valid because the amo...
4	Fraud	The cheque is considered fraudulent because th...