From 18285a28b94ba7080b8070d3f47bb3e52ede2e84 Mon Sep 17 00:00:00 2001 From: Maksym Zhytnikov <63515947+Maxxx-zh@users.noreply.github.com> Date: Sat, 18 May 2024 13:19:40 +0300 Subject: [PATCH] [FSTORE-1395] Update the Langchain usage in Fraud Cheque Detection (#263) * Update the Langchain usage in Fraud Cheque Detection --- .../3_inference_pipeline.ipynb | 629 ++++++++++-------- .../features/cheque_validation.py | 34 + .../functions/llm_chain.py | 13 +- 3 files changed, 397 insertions(+), 279 deletions(-) create mode 100644 advanced_tutorials/fraud_cheque_detection/features/cheque_validation.py diff --git a/advanced_tutorials/fraud_cheque_detection/3_inference_pipeline.ipynb b/advanced_tutorials/fraud_cheque_detection/3_inference_pipeline.ipynb index c47eb16f..0ff6d7b2 100644 --- a/advanced_tutorials/fraud_cheque_detection/3_inference_pipeline.ipynb +++ b/advanced_tutorials/fraud_cheque_detection/3_inference_pipeline.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "2349d93d", + "id": "292e9456", "metadata": {}, "source": [ "## ๐Ÿ“ Imports" @@ -11,19 +11,36 @@ { "cell_type": "code", "execution_count": 1, - "id": "56998928", + "id": "8bed2e2a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + } + ], "source": [ - "# !pip install -r requirements.txt -q" + "!pip install -r requirements.txt -q" ] }, { "cell_type": "code", "execution_count": 2, - "id": "2dd41008", + "id": "e1343eb1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-05-14 20:34:55,326 INFO: generated new fontManager\n" + ] + } + ], "source": [ "from xgboost import XGBClassifier\n", "import pandas as pd\n", @@ -42,6 +59,7 @@ " generate_response,\n", " format_response,\n", ")\n", + "from features.cheque_validation import get_cheque_ids\n", "\n", "import config\n", "\n", @@ -51,7 +69,7 @@ }, { "cell_type": "markdown", - "id": "ec0ad367", + "id": "a9fd8c23", "metadata": {}, "source": [ "## ๐Ÿ”ฎ Connecting to Hopsworks Feature Store " @@ -60,7 +78,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "0d3fec1f", + "id": "c34837f2", "metadata": {}, "outputs": [ { @@ -84,7 +102,7 @@ }, { "cell_type": "markdown", - "id": "7f23551b", + "id": "a3569ef7", "metadata": {}, "source": [ "## ๐Ÿช Download the Fraud Detection Model from Model Registry " @@ -93,7 +111,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "786730c4", + "id": "fbecf2c2", "metadata": {}, "outputs": [ { @@ -120,7 +138,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "b7969d2f", + "id": "1f819625", "metadata": {}, "outputs": [ { @@ -177,7 +195,7 @@ }, { "cell_type": "markdown", - "id": "a1decac7", + "id": "8e4032ca", "metadata": {}, "source": [ "## ๐Ÿฉ Donut Model Loading \n" @@ -186,16 +204,134 @@ { "cell_type": "code", "execution_count": 6, - "id": "93d56010", + "id": "13efd4a6", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "DeprecationWarning: `np.bool8` is a deprecated alias for `np.bool_`. (Deprecated NumPy 1.24)\n", + "DeprecationWarning: `np.bool8` is a deprecated alias for `np.bool_`. (Deprecated NumPy 1.24)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "897f241321c24306b798e1bf61e30d3f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "preprocessor_config.json: 0%| | 0.00/361 [00:00 ๐Ÿš€ Cheque Text Parsing \n" @@ -215,7 +351,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "f92cdc66", + "id": "3752b514", "metadata": {}, "outputs": [ { @@ -252,7 +388,7 @@ { "cell_type": "code", "execution_count": 8, - "id": "8f713ace", + "id": "e1f40889", "metadata": {}, "outputs": [ { @@ -289,7 +425,7 @@ { "cell_type": "code", "execution_count": 9, - "id": "9480cbee", + "id": "d73d918b", "metadata": {}, "outputs": [ { @@ -326,7 +462,7 @@ { "cell_type": "code", "execution_count": 10, - "id": "6cb2ad72", + "id": "c78caf91", "metadata": {}, "outputs": [ { @@ -362,7 +498,7 @@ }, { "cell_type": "markdown", - "id": "a8bbfeca", + "id": "5eaf39ca", "metadata": {}, "source": [ "## ๐Ÿ‘จ๐Ÿปโ€โš–๏ธ Check Evaluation \n" @@ -371,7 +507,7 @@ { "cell_type": "code", "execution_count": 11, - "id": "c478028e", + "id": "1418a08c", "metadata": {}, "outputs": [ { @@ -397,7 +533,7 @@ { "cell_type": "code", "execution_count": 12, - "id": "369f7f16", + "id": "de7e3d92", "metadata": {}, "outputs": [ { @@ -423,7 +559,7 @@ { "cell_type": "code", "execution_count": 13, - "id": "6eadc72a", + "id": "fa656a86", "metadata": {}, "outputs": [ { @@ -457,7 +593,7 @@ { "cell_type": "code", "execution_count": 14, - "id": "6b380bf0", + "id": "3bc9afe8", "metadata": {}, "outputs": [ { @@ -483,7 +619,7 @@ { "cell_type": "code", "execution_count": 15, - "id": "4e984b0b", + "id": "b91d0b40", "metadata": {}, "outputs": [ { @@ -508,7 +644,7 @@ }, { "cell_type": "markdown", - "id": "9f4fadd7", + "id": "4f8f7c7b", "metadata": {}, "source": [ "## ๐Ÿ”— LLM Chain Loading \n" @@ -517,7 +653,7 @@ { "cell_type": "code", "execution_count": 16, - "id": "10509472", + "id": "da9bb569", "metadata": {}, "outputs": [ { @@ -527,6 +663,48 @@ "๐Ÿ”‘ Enter your HuggingFace API key: ยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยท\n" ] }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3e1fe0d2ef6e4b34a816e86005a35597", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "tokenizer_config.json: 0%| | 0.00/51.0k [00:00๐Ÿš€ Inference \n" @@ -578,14 +868,14 @@ { "cell_type": "code", "execution_count": 17, - "id": "106759c5", + "id": "e60661ec", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n" + "Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.\n" ] }, { @@ -600,7 +890,7 @@ "Verdict: valid\n", "----------\n", "\n", - "Valid | The cheque is considered valid because the amount in words \"Three Thousand One Hundred and Sixty Eight\" matches the amount in numbers \"3168\" and the spelling is correct.\n" + "Valid | The cheque is considered valid because the amount in words \"Three Thousand One Hundred and Sixty Eight\" matches the amount in numbers 3168 and the spelling is correct.\n" ] } ], @@ -619,14 +909,14 @@ { "cell_type": "code", "execution_count": 18, - "id": "cafe9af8", + "id": "232f8e4b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n" + "Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.\n" ] }, { @@ -660,14 +950,14 @@ { "cell_type": "code", "execution_count": 19, - "id": "ac2a6c75", + "id": "7795f95d", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n" + "Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.\n" ] }, { @@ -682,7 +972,7 @@ "Verdict: fraud\n", "----------\n", "\n", - "Fraud | The cheque is fraudulent due to a mismatch between the numeric and alphabetic values. The amount in words is \"Three Hundred and Thirty Six\", but the amount in numbers is 3318. This mismatch indicates that the cheque may be fraudulent, as a genuine cheque would typically have matching amounts in words and numbers.\n" + "Fraud | The cheque is fraudulent due to a mismatch between the numeric and alphabetic values. The amount in words is \"Three Hundred and Thirty Six\", while the amount in numbers is 3318. The discrepancy between the two values suggests that the cheque has been tampered with, making it a fraudulent transaction.\n" ] } ], @@ -701,14 +991,14 @@ { "cell_type": "code", "execution_count": 20, - "id": "0d3214b2", + "id": "b8897a01", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n" + "Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.\n" ] }, { @@ -741,7 +1031,7 @@ }, { "cell_type": "markdown", - "id": "5301f605", + "id": "c67483cf", "metadata": {}, "source": [ "## ๐Ÿ—„๏ธ Batch Inference \n" @@ -750,7 +1040,7 @@ { "cell_type": "code", "execution_count": 21, - "id": "bc04a8b3", + "id": "6953f9a6", "metadata": {}, "outputs": [ { @@ -790,20 +1080,20 @@ }, { "cell_type": "code", - "execution_count": 22, - "id": "8e0e8849", + "execution_count": null, + "id": "cec1a184", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n", - "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n", - "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n", - "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n", - "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n", - "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n", + "Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.\n", + "Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.\n", + "Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.\n", + "Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.\n", + "Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.\n", + "Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.\n", "--- Logging error ---\n", "Traceback (most recent call last):\n", " File \"/srv/hops/anaconda/envs/theenv/lib/python3.10/logging/__init__.py\", line 1100, in emit\n", @@ -882,14 +1172,12 @@ " cheque_batch_validation = [\n", " File \"\", line 2, in \n", " generate_response(\n", - " File \"/home/yarnapp/hopsfs/Jupyter/functions/llm_chain.py\", line 179, in generate_response\n", + " File \"/home/yarnapp/hopsfs/Jupyter/functions/llm_chain.py\", line 225, in generate_response\n", " model_output = llm_chain.invoke({\n", - " File \"/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/langchain/chains/base.py\", line 153, in invoke\n", - " self._call(inputs, run_manager=run_manager)\n", - " File \"/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/langchain/chains/llm.py\", line 103, in _call\n", - " response = self.generate([inputs], run_manager=run_manager)\n", - " File \"/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/langchain/chains/llm.py\", line 115, in generate\n", - " return self.llm.generate_prompt(\n", + " File \"/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/langchain_core/runnables/base.py\", line 2499, in invoke\n", + " input = step.invoke(\n", + " File \"/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/langchain_core/language_models/llms.py\", line 276, in invoke\n", + " self.generate_prompt(\n", " File \"/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/langchain_core/language_models/llms.py\", line 633, in generate_prompt\n", " return self.generate(prompt_strings, stop=stop, callbacks=callbacks, **kwargs)\n", " File \"/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/langchain_core/language_models/llms.py\", line 803, in generate\n", @@ -906,84 +1194,9 @@ " self.warning(*args, **kwargs)\n", "Message: 'You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset'\n", "Arguments: (,)\n", - "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n", - "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n", - "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n", - "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n", - "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n", - "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n", - "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n", - "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n", - "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n" + "Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.\n", + "Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.\n" ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
statusdescription
0FraudThe cheque is fraudulent due to a mismatch bet...
1ValidThe cheque is considered valid because the amo...
2ValidThe cheque is considered valid because the amo...
3ValidThe cheque is considered valid because the amo...
4FraudThe cheque is considered fraudulent because th...
\n", - "
" - ], - "text/plain": [ - " status description\n", - "0 Fraud The cheque is fraudulent due to a mismatch bet...\n", - "1 Valid The cheque is considered valid because the amo...\n", - "2 Valid The cheque is considered valid because the amo...\n", - "3 Valid The cheque is considered valid because the amo...\n", - "4 Fraud The cheque is considered fraudulent because th..." - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ @@ -1016,7 +1229,7 @@ }, { "cell_type": "markdown", - "id": "c71a66e1", + "id": "72b14181", "metadata": {}, "source": [ "## ๐Ÿช„ Feature Group Creation " @@ -1024,8 +1237,8 @@ }, { "cell_type": "code", - "execution_count": 23, - "id": "6c50f039", + "execution_count": null, + "id": "a56569be", "metadata": {}, "outputs": [], "source": [ @@ -1040,8 +1253,8 @@ }, { "cell_type": "code", - "execution_count": 24, - "id": "838bb5d6", + "execution_count": null, + "id": "968b7b16", "metadata": {}, "outputs": [], "source": [ @@ -1058,94 +1271,10 @@ }, { "cell_type": "code", - "execution_count": 25, - "id": "6c2cd294", + "execution_count": null, + "id": "40eb1c4d", "metadata": {}, "outputs": [], - "source": [ - "def get_cheque_ids(feature_group, data):\n", - " try: \n", - " cheque_id_max = [\n", - " int(feature.max) \n", - " for feature \n", - " in feature_group.statistics.feature_descriptive_statistics \n", - " if feature.feature_name == 'cheque_id'\n", - " ][0]\n", - " data['cheque_id'] = [*range(cheque_id_max+1, cheque_id_max+1 + data.shape[0])]\n", - " return data\n", - " \n", - " except:\n", - " # Resetting the index without dropping it to make it a column\n", - " return data.reset_index(drop=False).rename(columns={'index': 'cheque_id'})" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "8e7f7efe", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
cheque_idstatusdescription
00FraudThe cheque is fraudulent due to a mismatch bet...
11ValidThe cheque is considered valid because the amo...
22ValidThe cheque is considered valid because the amo...
\n", - "
" - ], - "text/plain": [ - " cheque_id status description\n", - "0 0 Fraud The cheque is fraudulent due to a mismatch bet...\n", - "1 1 Valid The cheque is considered valid because the amo...\n", - "2 2 Valid The cheque is considered valid because the amo..." - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "cheque_validation_df_w_index = get_cheque_ids(\n", " cheque_validation_fg, \n", @@ -1156,52 +1285,10 @@ }, { "cell_type": "code", - "execution_count": 27, - "id": "e2badd15", + "execution_count": null, + "id": "83d93efd", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Feature Group created successfully, explore it at \n", - "https://snurran.hops.works/p/11385/fs/11333/fg/12483\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f7710fae38f1462d8570c776f49e5aba", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Uploading Dataframe: 0.00% | | Rows 0/15 | Elapsed Time: 00:00 | Remaining Time: ?" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Launching job: cheque_validation_fg_1_offline_fg_materialization\n", - "Job started successfully, you can follow the progress at \n", - "https://snurran.hops.works/p/11385/jobs/named/cheque_validation_fg_1_offline_fg_materialization/executions\n" - ] - }, - { - "data": { - "text/plain": [ - "(, None)" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "cheque_validation_fg.insert(\n", " cheque_validation_df_w_index,\n", @@ -1210,7 +1297,7 @@ }, { "cell_type": "markdown", - "id": "2645d8f8", + "id": "97599fa4", "metadata": {}, "source": [ "---" diff --git a/advanced_tutorials/fraud_cheque_detection/features/cheque_validation.py b/advanced_tutorials/fraud_cheque_detection/features/cheque_validation.py new file mode 100644 index 00000000..54d641df --- /dev/null +++ b/advanced_tutorials/fraud_cheque_detection/features/cheque_validation.py @@ -0,0 +1,34 @@ +import pandas as pd + +def get_cheque_ids(feature_group, data: pd.DataFrame) -> pd.DataFrame: + """ + Generate a sequence of new cheque IDs for a DataFrame based on the maximum existing cheque ID found in a feature group. + + The function first attempts to find the maximum 'cheque_id' using the feature group statistics. + If it finds this max ID, it generates a new sequence of cheque IDs for the DataFrame starting from the next integer. + If it encounters any issue in the process (e.g., the feature group does not exist), it resets the DataFrame's index + to create a 'cheque_id' based on the row index. + + Parameters: + feature_group: Hopsworks Feature Group. + data (pd.DataFrame): The DataFrame to which the cheque ID will be added. + + Returns: + pd.DataFrame: The modified DataFrame with a new 'cheque_id' column added. + """ + try: + # Extract the maximum 'cheque_id' from feature_group if it exists + cheque_id_max = [ + int(feature.max) + for feature in feature_group.statistics.feature_descriptive_statistics + if feature.feature_name == 'cheque_id' + ][0] + + # Generate new cheque IDs starting from the maximum found + 1 + data['cheque_id'] = [*range(cheque_id_max + 1, cheque_id_max + 1 + data.shape[0])] + + except Exception as e: + # In case of any error during ID generation, fallback to using DataFrame index as 'cheque_id' + data = data.reset_index(drop=False).rename(columns={'index': 'cheque_id'}) + + return data diff --git a/advanced_tutorials/fraud_cheque_detection/functions/llm_chain.py b/advanced_tutorials/fraud_cheque_detection/functions/llm_chain.py index d63f1533..911c55ca 100644 --- a/advanced_tutorials/fraud_cheque_detection/functions/llm_chain.py +++ b/advanced_tutorials/fraud_cheque_detection/functions/llm_chain.py @@ -5,7 +5,7 @@ import torch from langchain.llms import HuggingFacePipeline from langchain.prompts import PromptTemplate -from langchain.chains.llm import LLMChain +from langchain.schema.output_parser import StrOutputParser from functions.utils import ( load_image, @@ -135,11 +135,8 @@ def get_llm_chain(model_id: str = "meta-llama/Meta-Llama-3-8B-Instruct"): ) # Create the LLM chain - llm_chain = LLMChain( - llm=pipeline_llm, # The text generation pipeline - prompt=prompt, # The structured prompt template - verbose=False, # Controls verbose output during execution - ) + llm_chain = prompt | pipeline_llm | StrOutputParser() + return llm_chain @@ -230,8 +227,8 @@ def generate_response( }) # Process the model output to extract the relevant response part - return model_output['text'].split( - '<|start_header_id|>assistant<|end_header_id|>' + return model_output.split( + '<|start_header_id|>assistant<|end_header_id|>' )[-1].strip()