From 299d2f025795f101769daf7390db26b59638e00c Mon Sep 17 00:00:00 2001
From: Prithvi Kannan <46332835+prithvikannan@users.noreply.github.com>
Date: Fri, 20 Oct 2023 12:09:14 -0700
Subject: [PATCH] Add example notebooks for QA (#10018)

---
 .../LLM Evaluation Examples -- QA.ipynb       | 1755 +++++++++++++++++
 1 file changed, 1755 insertions(+)
 create mode 100644 examples/evaluation/LLM Evaluation Examples -- QA.ipynb

diff --git a/examples/evaluation/LLM Evaluation Examples -- QA.ipynb b/examples/evaluation/LLM Evaluation Examples -- QA.ipynb
new file mode 100644
index 0000000000000..a3dfcef0454cb
--- /dev/null
+++ b/examples/evaluation/LLM Evaluation Examples -- QA.ipynb	
@@ -0,0 +1,1755 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "0a87a4cd-8a01-4e35-8a71-eaf91ed4ddd2",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "# LLM Evaluation with MLflow Example Notebook\n",
+    "\n",
+    "In this notebook, we will demonstrate how to evaluate various LLMs and RAG systems with MLflow, leveraging simple metrics such as perplexity and toxicity, as well as LLM-judged metrics such as relevance, and even custom LLM-judged metrics such as professionalism."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "cce6412a-2279-4ec1-a344-fa76fec70ee1",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
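+    "Set your OpenAI API key. Replace the placeholder value in the cell below with your own key, and never commit a real key to version control."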
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "fb946228-62fb-4d68-9732-75935c9cb401",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "bec25067-224d-4ee8-9b5d-0beeb6cde684",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "os.environ[\"OPENAI_API_KEY\"] = \"redacted\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import openai\n",
+    "import pandas as pd\n",
+    "\n",
+    "import mlflow"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "a9bbfc03-793e-4b95-b009-ef30dccd7e7d",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "## Basic Question-Answering Evaluation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "ff253b9e-59e8-40e0-92d8-8f9ef85348fd",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "Create a test dataset with an `inputs` column that will be passed to the model and a `ground_truth` column that the model's generated output will be compared against."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "6199fb3f-5951-42fe-891a-2227010b630a",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "eval_df = pd.DataFrame(\n",
+    "    {\n",
+    "        \"inputs\": [\n",
+    "            \"How does useEffect() work?\",\n",
+    "            \"What does the static keyword in a function mean?\",\n",
+    "            \"What does the 'finally' block in Python do?\",\n",
+    "            \"What is the difference between multiprocessing and multithreading?\",\n",
+    "        ],\n",
+    "        \"ground_truth\": [\n",
+    "            \"The useEffect() hook tells React that your component needs to do something after render. React will remember the function you passed (we’ll refer to it as our “effect”), and call it later after performing the DOM updates.\",\n",
+    "            \"Static members belongs to the class, rather than a specific instance. This means that only one instance of a static member exists, even if you create multiple objects of the class, or if you don't create any. It will be shared by all objects.\",\n",
+    "            \"'Finally' defines a block of code to run when the try... except...else block is final. The finally block will be executed no matter if the try block raises an error or not.\",\n",
+    "            \"Multithreading refers to the ability of a processor to execute multiple threads concurrently, where each thread runs a process. Whereas multiprocessing refers to the ability of a system to run multiple processors in parallel, where each processor can run one or more threads.\",\n",
+    "        ],\n",
+    "    }\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "06825224-49bd-452d-8dab-b11ca8130017",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "Create a simple OpenAI model that asks `gpt-3.5-turbo` to answer each question in two sentences, then call `mlflow.evaluate()` with the model and the evaluation dataframe."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "7b67eb6f-c91a-4f9a-ac0d-01fd22b087c8",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023/10/19 22:35:53 WARNING mlflow.models.model: Model logged without a signature. Signatures will be required for upcoming model registry features as they validate model inputs and denote the expected schema of model outputs. Please visit https://www.mlflow.org/docs/2.7.2/models.html#set-signature-on-logged-model for instructions on setting a model signature on your logged model.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d4bf4e330c1541819217be2deee3dd2b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d75c7bb35a16424c84e727f09a45de3f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023/10/19 22:35:58 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.\n",
+      "2023/10/19 22:36:04 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: token_count\n",
+      "2023/10/19 22:36:04 INFO mlflow.metrics.metric_definitions: Computing token count metric:\n",
+      "2023/10/19 22:36:04 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: toxicity\n",
+      "2023/10/19 22:36:04 INFO mlflow.metrics.metric_definitions: Loading toxicity metric:\n",
+      "Using default facebook/roberta-hate-speech-dynabench-r4-target checkpoint\n",
+      "2023/10/19 22:36:05 WARNING mlflow.metrics.metric_definitions: Failed to load 'toxicity' metric (error: RuntimeError(\"Failed to import transformers.models.roberta.modeling_tf_roberta because of the following error (look up to see its traceback):\\nNo module named 'keras.engine'\")), skipping metric logging.\n",
+      "2023/10/19 22:36:05 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: perplexity\n",
+      "2023/10/19 22:36:05 INFO mlflow.metrics.metric_definitions: Loading perplexity metric:\n",
+      "2023/10/19 22:36:05 INFO mlflow.metrics.metric_definitions: Computing perplexity metric:\n",
+      "Using pad_token, but it is not set yet.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "6a1638efbf604adcb5fce17dfa781ef3",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023/10/19 22:36:08 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: flesch_kincaid_grade_level\n",
+      "2023/10/19 22:36:08 INFO mlflow.metrics.metric_definitions: Computing flesch kincaid metric:\n",
+      "2023/10/19 22:36:08 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: ari_grade_level\n",
+      "2023/10/19 22:36:08 INFO mlflow.metrics.metric_definitions: Computing automated readability index metric:\n",
+      "2023/10/19 22:36:08 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: exact_match\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "{'perplexity/v1/mean': 19.467615604400635,\n",
+       " 'perplexity/v1/variance': 18.95112384684103,\n",
+       " 'perplexity/v1/p90': 23.42769298553467,\n",
+       " 'flesch_kincaid_grade_level/v1/mean': 14.55,\n",
+       " 'flesch_kincaid_grade_level/v1/variance': 26.192499999999995,\n",
+       " 'flesch_kincaid_grade_level/v1/p90': 20.26,\n",
+       " 'ari_grade_level/v1/mean': 16.2,\n",
+       " 'ari_grade_level/v1/variance': 38.725,\n",
+       " 'ari_grade_level/v1/p90': 23.240000000000002,\n",
+       " 'exact_match/v1': 0.0}"
+      ]
+     },
+     "execution_count": 54,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "with mlflow.start_run() as run:\n",
+    "    system_prompt = \"Answer the following question in two sentences\"\n",
+    "    basic_qa_model = mlflow.openai.log_model(\n",
+    "        model=\"gpt-3.5-turbo\",\n",
+    "        task=openai.ChatCompletion,\n",
+    "        artifact_path=\"model\",\n",
+    "        messages=[\n",
+    "            {\"role\": \"system\", \"content\": system_prompt},\n",
+    "            {\"role\": \"user\", \"content\": \"{question}\"},\n",
+    "        ],\n",
+    "    )\n",
+    "    results = mlflow.evaluate(\n",
+    "        basic_qa_model.model_uri,\n",
+    "        eval_df,\n",
+    "        targets=\"ground_truth\",  # specify which column corresponds to the expected output\n",
+    "        model_type=\"question-answering\",  # model type indicates which metrics are relevant for this task\n",
+    "        evaluators=\"default\",\n",
+    "    )\n",
+    "results.metrics"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "6d078816-1de1-4a6e-b757-5c9cbe056638",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "Inspect the evaluation results table as a dataframe to see row-by-row metrics and further assess model performance."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "28688e6c-6a2d-40bd-a737-58cfe70f2e10",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d33967e4787748c69ff9873b6e46227c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<style scoped>\n",
+       "  .table-result-container {\n",
+       "    max-height: 300px;\n",
+       "    overflow: auto;\n",
+       "  }\n",
+       "  table, th, td {\n",
+       "    border: 1px solid black;\n",
+       "    border-collapse: collapse;\n",
+       "  }\n",
+       "  th, td {\n",
+       "    padding: 5px;\n",
+       "  }\n",
+       "  th {\n",
+       "    text-align: left;\n",
+       "  }\n",
+       "</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>inputs</th><th>ground_truth</th><th>outputs</th><th>token_count</th><th>perplexity/v1/score</th><th>flesch_kincaid_grade_level/v1/score</th><th>ari_grade_level/v1/score</th></tr></thead><tbody><tr><td>How does useEffect() work?</td><td>The useEffect() hook tells React that your component needs to do something after render. React will remember the function you passed (we’ll refer to it as our “effect”), and call it later after performing the DOM updates.</td><td>useEffect() is a hook in React that allows you to perform side effects in functional components. It takes a function as its first argument and runs that function after the component has rendered and whenever any of its dependencies have changed.</td><td>45</td><td>23.4797859192</td><td>11.3</td><td>12.4</td></tr><tr><td>What does the static keyword in a function mean?</td><td>Static members belongs to the class, rather than a specific instance. This means that only one instance of a static member exists, even if you create multiple objects of the class, or if you don't create any. It will be shared by all objects.</td><td>The static keyword in a function means that the function can only be accessed within the same file it is declared in, and is not accessible to other files. It is used to limit the scope of the function and improve encapsulation.</td><td>46</td><td>18.20271492</td><td>10.3</td><td>10.0</td></tr><tr><td>What does the 'finally' block in Python do?</td><td>'Finally' defines a block of code to run when the try... except...else block is final. The finally block will be executed no matter if the try block raises an error or not.</td><td>The 'finally' block in Python is used to define a block of code that will be executed regardless of whether an exception has occurred or not, ensuring that certain clean-up actions are always performed. 
It is typically used to close files, release resources, or clean up any operations that need to be done before exiting a try-except block.</td><td>68</td><td>23.306142807</td><td>13.4</td><td>16.1</td></tr><tr><td>What is the difference between multiprocessing and multithreading?</td><td>Multithreading refers to the ability of a processor to execute multiple threads concurrently, where each thread runs a process. Whereas multiprocessing refers to the ability of a system to run multiple processors in parallel, where each processor can run one or more threads.</td><td>Multiprocessing involves the simultaneous execution of multiple tasks on multiple processors, while multithreading involves the execution of multiple threads within a single process, allowing for concurrent execution of different parts of the program.</td><td>39</td><td>12.8818187714</td><td>23.2</td><td>26.3</td></tr></tbody></table></div>"
+      ]
+     },
+     "metadata": {
+      "application/vnd.databricks.v1+output": {
+       "addedWidgets": {},
+       "aggData": [],
+       "aggError": "",
+       "aggOverflow": false,
+       "aggSchema": [],
+       "aggSeriesLimitReached": false,
+       "aggType": "",
+       "arguments": {},
+       "columnCustomDisplayInfos": {},
+       "data": [
+        [
+         "How does useEffect() work?",
+         "The useEffect() hook tells React that your component needs to do something after render. React will remember the function you passed (we’ll refer to it as our “effect”), and call it later after performing the DOM updates.",
+         "useEffect() is a hook in React that allows you to perform side effects in functional components. It takes a function as its first argument and runs that function after the component has rendered and whenever any of its dependencies have changed.",
+         45,
+         23.4797859192,
+         11.3,
+         12.4
+        ],
+        [
+         "What does the static keyword in a function mean?",
+         "Static members belongs to the class, rather than a specific instance. This means that only one instance of a static member exists, even if you create multiple objects of the class, or if you don't create any. It will be shared by all objects.",
+         "The static keyword in a function means that the function can only be accessed within the same file it is declared in, and is not accessible to other files. It is used to limit the scope of the function and improve encapsulation.",
+         46,
+         18.20271492,
+         10.3,
+         10
+        ],
+        [
+         "What does the 'finally' block in Python do?",
+         "'Finally' defines a block of code to run when the try... except...else block is final. The finally block will be executed no matter if the try block raises an error or not.",
+         "The 'finally' block in Python is used to define a block of code that will be executed regardless of whether an exception has occurred or not, ensuring that certain clean-up actions are always performed. It is typically used to close files, release resources, or clean up any operations that need to be done before exiting a try-except block.",
+         68,
+         23.306142807,
+         13.4,
+         16.1
+        ],
+        [
+         "What is the difference between multiprocessing and multithreading?",
+         "Multithreading refers to the ability of a processor to execute multiple threads concurrently, where each thread runs a process. Whereas multiprocessing refers to the ability of a system to run multiple processors in parallel, where each processor can run one or more threads.",
+         "Multiprocessing involves the simultaneous execution of multiple tasks on multiple processors, while multithreading involves the execution of multiple threads within a single process, allowing for concurrent execution of different parts of the program.",
+         39,
+         12.8818187714,
+         23.2,
+         26.3
+        ]
+       ],
+       "datasetInfos": [],
+       "dbfsResultPath": null,
+       "isJsonSchema": true,
+       "metadata": {},
+       "overflow": false,
+       "plotOptions": {
+        "customPlotOptions": {},
+        "displayType": "table",
+        "pivotAggregation": null,
+        "pivotColumns": null,
+        "xColumns": null,
+        "yColumns": null
+       },
+       "removedWidgets": [],
+       "schema": [
+        {
+         "metadata": "{}",
+         "name": "inputs",
+         "type": "\"string\""
+        },
+        {
+         "metadata": "{}",
+         "name": "ground_truth",
+         "type": "\"string\""
+        },
+        {
+         "metadata": "{}",
+         "name": "outputs",
+         "type": "\"string\""
+        },
+        {
+         "metadata": "{}",
+         "name": "token_count",
+         "type": "\"long\""
+        },
+        {
+         "metadata": "{}",
+         "name": "perplexity/v1/score",
+         "type": "\"double\""
+        },
+        {
+         "metadata": "{}",
+         "name": "flesch_kincaid_grade_level/v1/score",
+         "type": "\"double\""
+        },
+        {
+         "metadata": "{}",
+         "name": "ari_grade_level/v1/score",
+         "type": "\"double\""
+        }
+       ],
+       "type": "table"
+      }
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "results.tables[\"eval_results_table\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "1a7363c9-3b73-4e3f-bf7c-1d6887fb4f9e",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "## LLM-judged answer similarity with OpenAI GPT-4"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "cd23fe79-cfbf-42a7-a3f3-14badfe20db5",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "Construct an answer similarity metric using the `answer_similarity()` metric factory function."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "88b35b52-5b8f-4b72-9de8-fec05f01e722",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "EvaluationMetric(name=answer_similarity, greater_is_better=True, long_name=answer_similarity, version=v1, metric_details=\n",
+      "Task:\n",
+      "You are an impartial judge. You will be given an input that was sent to a machine\n",
+      "learning model, and you will be given an output that the model produced. You\n",
+      "may also be given additional information that was used by the model to generate the output.\n",
+      "\n",
+      "Your task is to determine a numerical score called answer_similarity based on the input and output.\n",
+      "A definition of answer_similarity and a grading rubric are provided below.\n",
+      "You must use the grading rubric to determine your score. You must also justify your score.\n",
+      "\n",
+      "Examples could be included below for reference. Make sure to use them as references and to\n",
+      "understand them before completing the task.\n",
+      "\n",
+      "Input:\n",
+      "{input}\n",
+      "\n",
+      "Output:\n",
+      "{output}\n",
+      "\n",
+      "{grading_context_columns}\n",
+      "\n",
+      "Metric definition:\n",
+      "Answer similarity is evaluated on the degree of semantic similarity of the provided output to the provided targets, which is the ground truth. Scores can be assigned based on the gradual similarity in meaning and description to the provided targets, where a higher score indicates greater alignment between the provided output and provided targets.\n",
+      "\n",
+      "Grading rubric:\n",
+      "Answer similarity: Below are the details for different scores:\n",
+      "- Score 1: the output has little to no semantic similarity to the provided targets.\n",
+      "- Score 2: the output displays partial semantic similarity to the provided targets on some aspects.\n",
+      "- Score 3: the output has moderate semantic similarity to the provided targets.\n",
+      "- Score 4: the output aligns with the provided targets in most aspects and has substantial semantic similarity.\n",
+      "- Score 5: the output closely aligns with the provided targets in all significant aspects.\n",
+      "\n",
+      "Examples:\n",
+      "\n",
+      "Input:\n",
+      "What is MLflow?\n",
+      "\n",
+      "Output:\n",
+      "MLflow is an open-source platform for managing machine learning workflows, including experiment tracking, model packaging, versioning, and deployment, simplifying the ML lifecycle.\n",
+      "\n",
+      "Additional information used by the model:\n",
+      "key: ground_truth\n",
+      "value:\n",
+      "MLflow is an open-source platform for managing the end-to-end machine learning (ML) lifecycle. It was developed by Databricks, a company that specializes in big data and machine learning solutions. MLflow is designed to address the challenges that data scientists and machine learning engineers face when developing, training, and deploying machine learning models.\n",
+      "\n",
+      "score: 4\n",
+      "justification: The definition effectively explains what MLflow is its purpose, and its developer. It could be more concise for a 5-score.\n",
+      "        \n",
+      "\n",
+      "You must return the following fields in your response one below the other:\n",
+      "score: Your numerical score for the model's answer_similarity based on the rubric\n",
+      "justification: Your step-by-step reasoning about the model's answer_similarity score\n",
+      "    )\n"
+     ]
+    }
+   ],
+   "source": [
+    "from mlflow.metrics import EvaluationExample, answer_similarity\n",
+    "\n",
+    "# Create an example that illustrates what answer_similarity means for this problem.\n",
+    "example = EvaluationExample(\n",
+    "    input=\"What is MLflow?\",\n",
+    "    output=\"MLflow is an open-source platform for managing machine \"\n",
+    "    \"learning workflows, including experiment tracking, model packaging, \"\n",
+    "    \"versioning, and deployment, simplifying the ML lifecycle.\",\n",
+    "    score=4,\n",
+    "    justification=\"The definition effectively explains what MLflow is \"\n",
+    "    \"its purpose, and its developer. It could be more concise for a 5-score.\",\n",
+    "    grading_context={\n",
+    "        \"ground_truth\": \"MLflow is an open-source platform for managing \"\n",
+    "        \"the end-to-end machine learning (ML) lifecycle. It was developed by Databricks, \"\n",
+    "        \"a company that specializes in big data and machine learning solutions. MLflow is \"\n",
+    "        \"designed to address the challenges that data scientists and machine learning \"\n",
+    "        \"engineers face when developing, training, and deploying machine learning models.\"\n",
+    "    },\n",
+    ")\n",
+    "\n",
+    "# Construct the metric using OpenAI GPT-4 as the judge\n",
+    "answer_similarity_metric = answer_similarity(model=\"openai:/gpt-4\", examples=[example])\n",
+    "\n",
+    "print(answer_similarity_metric)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "d627f7ab-a7e1-430d-9431-9ce4bd810fa7",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "Call `mlflow.evaluate()` again, this time passing your new `answer_similarity_metric` via `extra_metrics`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "cae9d80b-39a2-4e98-ac08-bfa5ba387b8f",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4a9ad395386743a0a44cce1875382e27",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023/10/19 22:37:06 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.\n",
+      "2023/10/19 22:37:12 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: token_count\n",
+      "2023/10/19 22:37:12 INFO mlflow.metrics.metric_definitions: Computing token count metric:\n",
+      "2023/10/19 22:37:12 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: toxicity\n",
+      "2023/10/19 22:37:12 INFO mlflow.metrics.metric_definitions: Loading toxicity metric:\n",
+      "Using default facebook/roberta-hate-speech-dynabench-r4-target checkpoint\n",
+      "2023/10/19 22:37:13 WARNING mlflow.metrics.metric_definitions: Failed to load 'toxicity' metric (error: RuntimeError(\"Failed to import transformers.models.roberta.modeling_tf_roberta because of the following error (look up to see its traceback):\\nNo module named 'keras.engine'\")), skipping metric logging.\n",
+      "2023/10/19 22:37:13 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: perplexity\n",
+      "2023/10/19 22:37:13 INFO mlflow.metrics.metric_definitions: Loading perplexity metric:\n",
+      "2023/10/19 22:37:13 INFO mlflow.metrics.metric_definitions: Computing perplexity metric:\n",
+      "Using pad_token, but it is not set yet.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7c5bd7822f964bdebdb32d219376765e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023/10/19 22:37:15 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: flesch_kincaid_grade_level\n",
+      "2023/10/19 22:37:15 INFO mlflow.metrics.metric_definitions: Computing flesch kincaid metric:\n",
+      "2023/10/19 22:37:15 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: ari_grade_level\n",
+      "2023/10/19 22:37:15 INFO mlflow.metrics.metric_definitions: Computing automated readability index metric:\n",
+      "2023/10/19 22:37:15 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: exact_match\n",
+      "2023/10/19 22:37:15 INFO mlflow.models.evaluation.default_evaluator: Evaluating metrics: answer_similarity\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "{'perplexity/v1/mean': 14.810191035270691,\n",
+       " 'perplexity/v1/variance': 55.66698687548323,\n",
+       " 'perplexity/v1/p90': 23.251440143585206,\n",
+       " 'flesch_kincaid_grade_level/v1/mean': 13.649999999999999,\n",
+       " 'flesch_kincaid_grade_level/v1/variance': 6.9025,\n",
+       " 'flesch_kincaid_grade_level/v1/p90': 16.4,\n",
+       " 'ari_grade_level/v1/mean': 16.05,\n",
+       " 'ari_grade_level/v1/variance': 9.427500000000002,\n",
+       " 'ari_grade_level/v1/p90': 19.32,\n",
+       " 'exact_match/v1': 0.0,\n",
+       " 'answer_similarity/v1/mean': 4.25,\n",
+       " 'answer_similarity/v1/variance': 0.1875,\n",
+       " 'answer_similarity/v1/p90': 4.7}"
+      ]
+     },
+     "execution_count": 57,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "with mlflow.start_run() as run:\n",
+    "    results = mlflow.evaluate(\n",
+    "        basic_qa_model.model_uri,\n",
+    "        eval_df,\n",
+    "        targets=\"ground_truth\",\n",
+    "        model_type=\"question-answering\",\n",
+    "        evaluators=\"default\",\n",
+    "        extra_metrics=[answer_similarity_metric],  # use the answer similarity metric created above\n",
+    "    )\n",
+    "results.metrics"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "df98aa92-4ce4-43dd-9677-68911a0a103d",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "See the row-by-row LLM-judged answer similarity score and justifications"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "6f41f22d-e287-4aad-8231-986252ad6682",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "92ba7ee8d4194761b75415c1cc16c211",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<style scoped>\n",
+       "  .table-result-container {\n",
+       "    max-height: 300px;\n",
+       "    overflow: auto;\n",
+       "  }\n",
+       "  table, th, td {\n",
+       "    border: 1px solid black;\n",
+       "    border-collapse: collapse;\n",
+       "  }\n",
+       "  th, td {\n",
+       "    padding: 5px;\n",
+       "  }\n",
+       "  th {\n",
+       "    text-align: left;\n",
+       "  }\n",
+       "</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>inputs</th><th>ground_truth</th><th>outputs</th><th>token_count</th><th>perplexity/v1/score</th><th>flesch_kincaid_grade_level/v1/score</th><th>ari_grade_level/v1/score</th><th>answer_similarity/v1/score</th><th>answer_similarity/v1/justification</th></tr></thead><tbody><tr><td>How does useEffect() work?</td><td>The useEffect() hook tells React that your component needs to do something after render. React will remember the function you passed (we’ll refer to it as our “effect”), and call it later after performing the DOM updates.</td><td>useEffect() is a function in React that allows you to perform side effects in a functional component. It takes two arguments: a callback function that will be run after the component renders, and an array of dependencies to determine when the callback should be invoked.</td><td>51</td><td>15.5819368362</td><td>12.1</td><td>13.5</td><td>4</td><td>The output provided by the model aligns well with the additional information provided. Both the output and the additional information explain that useEffect() is a function in React that performs actions after the component renders. The output also mentions the two arguments that useEffect() takes, which is not mentioned in the additional information. However, the output does not mention that React will remember the function passed to useEffect() and call it after performing the DOM updates, which is mentioned in the additional information. Therefore, while the output is largely accurate and aligns with the additional information in most aspects, it does not align in all significant aspects, which is why it receives a score of 4.</td></tr><tr><td>What does the static keyword in a function mean?</td><td>Static members belongs to the class, rather than a specific instance. 
This means that only one instance of a static member exists, even if you create multiple objects of the class, or if you don't create any. It will be shared by all objects.</td><td>In C++, the static keyword used in a function declaration means that the function is limited to the scope of the translation unit and cannot be accessed by other files. In C#, the static keyword used in a function declaration means that the function belongs to the class itself and not to any specific instance of the class.</td><td>62</td><td>10.348818779</td><td>12.2</td><td>14.7</td><td>4</td><td>The output provided by the model aligns well with the target information. It correctly explains the meaning of the static keyword in a function in both C++ and C#. However, it does not explicitly mention that only one instance of a static member exists, even if you create multiple objects of the class, or if you don't create any, which is mentioned in the target information. Therefore, while the output is largely accurate and informative, it lacks some details present in the target information, which prevents it from achieving a perfect score.</td></tr><tr><td>What does the 'finally' block in Python do?</td><td>'Finally' defines a block of code to run when the try... except...else block is final. The finally block will be executed no matter if the try block raises an error or not.</td><td>The 'finally' block in Python is used to specify code that will always be executed, whether or not an exception is raised in the try block. This block is typically used for clean-up actions, such as closing files or releasing resources, to ensure they are always performed, regardless of any exceptions.</td><td>60</td><td>26.5383701324</td><td>12.1</td><td>14.7</td><td>5</td><td>The output provided by the model aligns very closely with the provided targets. 
Both the output and the targets explain that the 'finally' block in Python is used to specify code that will always be executed, regardless of whether an exception is raised in the try block. The output also adds that this block is typically used for clean-up actions, which is an important aspect of the 'finally' block's functionality. Therefore, the output has a high degree of semantic similarity to the provided targets, warranting a score of 5.</td></tr><tr><td>What is the difference between multiprocessing and multithreading?</td><td>Multithreading refers to the ability of a processor to execute multiple threads concurrently, where each thread runs a process. Whereas multiprocessing refers to the ability of a system to run multiple processors in parallel, where each processor can run one or more threads.</td><td>Multiprocessing is the use of multiple processors or cores to execute multiple tasks simultaneously, while multithreading is the use of multiple threads within a single process to execute different parts of the code concurrently. In simple terms, multiprocessing involves running multiple processes, while multithreading involves running multiple threads within a single process.</td><td>62</td><td>6.7716383934</td><td>18.2</td><td>21.3</td><td>4</td><td>The output provided by the model aligns closely with the provided targets. Both the output and the targets explain the difference between multiprocessing and multithreading, and they both mention that multiprocessing involves multiple processors or cores, while multithreading involves multiple threads within a single process. However, the output does not mention that each processor in multiprocessing can run one or more threads, which is mentioned in the targets. Therefore, the output aligns with the targets in most aspects, but not all, which is why a score of 4 is given.</td></tr></tbody></table></div>"
+      ]
+     },
+     "metadata": {
+      "application/vnd.databricks.v1+output": {
+       "addedWidgets": {},
+       "aggData": [],
+       "aggError": "",
+       "aggOverflow": false,
+       "aggSchema": [],
+       "aggSeriesLimitReached": false,
+       "aggType": "",
+       "arguments": {},
+       "columnCustomDisplayInfos": {},
+       "data": [
+        [
+         "How does useEffect() work?",
+         "The useEffect() hook tells React that your component needs to do something after render. React will remember the function you passed (we’ll refer to it as our “effect”), and call it later after performing the DOM updates.",
+         "useEffect() is a function in React that allows you to perform side effects in a functional component. It takes two arguments: a callback function that will be run after the component renders, and an array of dependencies to determine when the callback should be invoked.",
+         51,
+         15.5819368362,
+         12.1,
+         13.5,
+         4,
+         "The output provided by the model aligns well with the additional information provided. Both the output and the additional information explain that useEffect() is a function in React that performs actions after the component renders. The output also mentions the two arguments that useEffect() takes, which is not mentioned in the additional information. However, the output does not mention that React will remember the function passed to useEffect() and call it after performing the DOM updates, which is mentioned in the additional information. Therefore, while the output is largely accurate and aligns with the additional information in most aspects, it does not align in all significant aspects, which is why it receives a score of 4."
+        ],
+        [
+         "What does the static keyword in a function mean?",
+         "Static members belongs to the class, rather than a specific instance. This means that only one instance of a static member exists, even if you create multiple objects of the class, or if you don't create any. It will be shared by all objects.",
+         "In C++, the static keyword used in a function declaration means that the function is limited to the scope of the translation unit and cannot be accessed by other files. In C#, the static keyword used in a function declaration means that the function belongs to the class itself and not to any specific instance of the class.",
+         62,
+         10.348818779,
+         12.2,
+         14.7,
+         4,
+         "The output provided by the model aligns well with the target information. It correctly explains the meaning of the static keyword in a function in both C++ and C#. However, it does not explicitly mention that only one instance of a static member exists, even if you create multiple objects of the class, or if you don't create any, which is mentioned in the target information. Therefore, while the output is largely accurate and informative, it lacks some details present in the target information, which prevents it from achieving a perfect score."
+        ],
+        [
+         "What does the 'finally' block in Python do?",
+         "'Finally' defines a block of code to run when the try... except...else block is final. The finally block will be executed no matter if the try block raises an error or not.",
+         "The 'finally' block in Python is used to specify code that will always be executed, whether or not an exception is raised in the try block. This block is typically used for clean-up actions, such as closing files or releasing resources, to ensure they are always performed, regardless of any exceptions.",
+         60,
+         26.5383701324,
+         12.1,
+         14.7,
+         5,
+         "The output provided by the model aligns very closely with the provided targets. Both the output and the targets explain that the 'finally' block in Python is used to specify code that will always be executed, regardless of whether an exception is raised in the try block. The output also adds that this block is typically used for clean-up actions, which is an important aspect of the 'finally' block's functionality. Therefore, the output has a high degree of semantic similarity to the provided targets, warranting a score of 5."
+        ],
+        [
+         "What is the difference between multiprocessing and multithreading?",
+         "Multithreading refers to the ability of a processor to execute multiple threads concurrently, where each thread runs a process. Whereas multiprocessing refers to the ability of a system to run multiple processors in parallel, where each processor can run one or more threads.",
+         "Multiprocessing is the use of multiple processors or cores to execute multiple tasks simultaneously, while multithreading is the use of multiple threads within a single process to execute different parts of the code concurrently. In simple terms, multiprocessing involves running multiple processes, while multithreading involves running multiple threads within a single process.",
+         62,
+         6.7716383934,
+         18.2,
+         21.3,
+         4,
+         "The output provided by the model aligns closely with the provided targets. Both the output and the targets explain the difference between multiprocessing and multithreading, and they both mention that multiprocessing involves multiple processors or cores, while multithreading involves multiple threads within a single process. However, the output does not mention that each processor in multiprocessing can run one or more threads, which is mentioned in the targets. Therefore, the output aligns with the targets in most aspects, but not all, which is why a score of 4 is given."
+        ]
+       ],
+       "datasetInfos": [],
+       "dbfsResultPath": null,
+       "isJsonSchema": true,
+       "metadata": {},
+       "overflow": false,
+       "plotOptions": {
+        "customPlotOptions": {},
+        "displayType": "table",
+        "pivotAggregation": null,
+        "pivotColumns": null,
+        "xColumns": null,
+        "yColumns": null
+       },
+       "removedWidgets": [],
+       "schema": [
+        {
+         "metadata": "{}",
+         "name": "inputs",
+         "type": "\"string\""
+        },
+        {
+         "metadata": "{}",
+         "name": "ground_truth",
+         "type": "\"string\""
+        },
+        {
+         "metadata": "{}",
+         "name": "outputs",
+         "type": "\"string\""
+        },
+        {
+         "metadata": "{}",
+         "name": "token_count",
+         "type": "\"long\""
+        },
+        {
+         "metadata": "{}",
+         "name": "perplexity/v1/score",
+         "type": "\"double\""
+        },
+        {
+         "metadata": "{}",
+         "name": "flesch_kincaid_grade_level/v1/score",
+         "type": "\"double\""
+        },
+        {
+         "metadata": "{}",
+         "name": "ari_grade_level/v1/score",
+         "type": "\"double\""
+        },
+        {
+         "metadata": "{}",
+         "name": "answer_similarity/v1/score",
+         "type": "\"long\""
+        },
+        {
+         "metadata": "{}",
+         "name": "answer_similarity/v1/justification",
+         "type": "\"string\""
+        }
+       ],
+       "type": "table"
+      }
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Per-row results: inputs, model outputs, and every metric score with its justification\n",
+    "eval_results_table = results.tables[\"eval_results_table\"]\n",
+    "eval_results_table"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "85402663-b9d7-4812-a7d2-32aa5b929687",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "## Custom LLM-judged metric for professionalism"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "a8765226-5d95-49e8-88d8-5ba442ea3b9b",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "Create a custom metric that will be used to determine the professionalism of the model outputs. Use `make_genai_metric` with a metric definition, a grading prompt, one or more grading examples, and a judge model configuration."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "45cca2ec-e06b-4d51-9dde-3cc630df9244",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "EvaluationMetric(name=professionalism, greater_is_better=True, long_name=professionalism, version=v1, metric_details=\n",
+      "Task:\n",
+      "You are an impartial judge. You will be given an input that was sent to a machine\n",
+      "learning model, and you will be given an output that the model produced. You\n",
+      "may also be given additional information that was used by the model to generate the output.\n",
+      "\n",
+      "Your task is to determine a numerical score called professionalism based on the input and output.\n",
+      "A definition of professionalism and a grading rubric are provided below.\n",
+      "You must use the grading rubric to determine your score. You must also justify your score.\n",
+      "\n",
+      "Examples could be included below for reference. Make sure to use them as references and to\n",
+      "understand them before completing the task.\n",
+      "\n",
+      "Input:\n",
+      "{input}\n",
+      "\n",
+      "Output:\n",
+      "{output}\n",
+      "\n",
+      "{grading_context_columns}\n",
+      "\n",
+      "Metric definition:\n",
+      "Professionalism refers to the use of a formal, respectful, and appropriate style of communication that is tailored to the context and audience. It often involves avoiding overly casual language, slang, or colloquialisms, and instead using clear, concise, and respectful language\n",
+      "\n",
+      "Grading rubric:\n",
+      "Professionalism: If the answer is written using a professional tone, below are the details for different scores: - Score 1: Language is extremely casual, informal, and may include slang or colloquialisms. Not suitable for professional contexts.- Score 2: Language is casual but generally respectful and avoids strong informality or slang. Acceptable in some informal professional settings.- Score 3: Language is balanced and avoids extreme informality or formality. Suitable for most professional contexts. - Score 4: Language is noticeably formal, respectful, and avoids casual elements. Appropriate for business or academic settings. - Score 5: Language is excessively formal, respectful, and avoids casual elements. Appropriate for the most formal settings such as textbooks. \n",
+      "\n",
+      "Examples:\n",
+      "\n",
+      "Input:\n",
+      "What is MLflow?\n",
+      "\n",
+      "Output:\n",
+      "MLflow is like your friendly neighborhood toolkit for managing your machine learning projects. It helps you track experiments, package your code and models, and collaborate with your team, making the whole ML workflow smoother. It's like your Swiss Army knife for machine learning!\n",
+      "\n",
+      "\n",
+      "\n",
+      "score: 2\n",
+      "justification: The response is written in a casual tone. It uses contractions, filler words such as 'like', and exclamation points, which make it sound less professional. \n",
+      "        \n",
+      "\n",
+      "You must return the following fields in your response one below the other:\n",
+      "score: Your numerical score for the model's professionalism based on the rubric\n",
+      "justification: Your step-by-step reasoning about the model's professionalism score\n",
+      "    )\n"
+     ]
+    }
+   ],
+   "source": [
+    "from mlflow.metrics import EvaluationExample, make_genai_metric\n",
+    "\n",
+    "# Custom LLM-judged metric: a judge model scores each model output for\n",
+    "# professionalism using the definition, grading rubric, and example below.\n",
+    "professionalism_metric = make_genai_metric(\n",
+    "    name=\"professionalism\",\n",
+    "    definition=(\n",
+    "        \"Professionalism refers to the use of a formal, respectful, and appropriate style of communication that is tailored to the context and audience. It often involves avoiding overly casual language, slang, or colloquialisms, and instead using clear, concise, and respectful language\"\n",
+    "    ),\n",
+    "    # Adjacent string literals are concatenated with no separator, so every\n",
+    "    # rubric item must end with a space; otherwise the judge prompt runs the\n",
+    "    # scores together (for example 'contexts.- Score 2').\n",
+    "    grading_prompt=(\n",
+    "        \"Professionalism: If the answer is written using a professional tone, below \"\n",
+    "        \"are the details for different scores: \"\n",
+    "        \"- Score 1: Language is extremely casual, informal, and may include slang or colloquialisms. Not suitable for professional contexts. \"\n",
+    "        \"- Score 2: Language is casual but generally respectful and avoids strong informality or slang. Acceptable in some informal professional settings. \"\n",
+    "        \"- Score 3: Language is balanced and avoids extreme informality or formality. Suitable for most professional contexts. \"\n",
+    "        \"- Score 4: Language is noticeably formal, respectful, and avoids casual elements. Appropriate for business or academic settings. \"\n",
+    "        \"- Score 5: Language is excessively formal, respectful, and avoids casual elements. Appropriate for the most formal settings such as textbooks. \"\n",
+    "    ),\n",
+    "    # Graded reference example that is embedded in the judge prompt\n",
+    "    examples=[\n",
+    "        EvaluationExample(\n",
+    "            input=\"What is MLflow?\",\n",
+    "            output=(\n",
+    "                \"MLflow is like your friendly neighborhood toolkit for managing your machine learning projects. It helps you track experiments, package your code and models, and collaborate with your team, making the whole ML workflow smoother. It's like your Swiss Army knife for machine learning!\"\n",
+    "            ),\n",
+    "            score=2,\n",
+    "            justification=(\n",
+    "                \"The response is written in a casual tone. It uses contractions, filler words such as 'like', and exclamation points, which make it sound less professional. \"\n",
+    "            ),\n",
+    "        )\n",
+    "    ],\n",
+    "    version=\"v1\",\n",
+    "    model=\"openai:/gpt-4\",  # judge model that assigns the scores\n",
+    "    parameters={\"temperature\": 0.0},  # keep the judge's grading as repeatable as possible\n",
+    "    grading_context_columns=[],  # no extra columns are given to the judge\n",
+    "    aggregations=[\"mean\", \"variance\", \"p90\"],\n",
+    "    greater_is_better=True,\n",
+    ")\n",
+    "\n",
+    "# Print the metric to inspect the full prompt that will be sent to the judge\n",
+    "print(professionalism_metric)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "bc615396-b1c1-4302-872d-d19be010382a",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "TODO: Try out your new professionalism metric on a sample output to make sure it behaves as you expect"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "0ca7e945-113a-49ac-8324-2f94efa45771",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "Call `mlflow.evaluate` with your new professionalism metric. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "07bb41ae-c878-4384-b36e-3dfb9b8ac6d9",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8b7b987f9e46430fac997e9867255c5f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023/10/19 22:41:22 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.\n",
+      "2023/10/19 22:41:27 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: token_count\n",
+      "2023/10/19 22:41:27 INFO mlflow.metrics.metric_definitions: Computing token count metric:\n",
+      "2023/10/19 22:41:27 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: toxicity\n",
+      "2023/10/19 22:41:27 INFO mlflow.metrics.metric_definitions: Loading toxicity metric:\n",
+      "Using default facebook/roberta-hate-speech-dynabench-r4-target checkpoint\n",
+      "2023/10/19 22:41:28 WARNING mlflow.metrics.metric_definitions: Failed to load 'toxicity' metric (error: RuntimeError(\"Failed to import transformers.models.roberta.modeling_tf_roberta because of the following error (look up to see its traceback):\\nNo module named 'keras.engine'\")), skipping metric logging.\n",
+      "2023/10/19 22:41:28 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: perplexity\n",
+      "2023/10/19 22:41:28 INFO mlflow.metrics.metric_definitions: Loading perplexity metric:\n",
+      "2023/10/19 22:41:28 INFO mlflow.metrics.metric_definitions: Computing perplexity metric:\n",
+      "Using pad_token, but it is not set yet.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "fd93cae37c8245079a8ca93e160eeb0c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023/10/19 22:41:31 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: flesch_kincaid_grade_level\n",
+      "2023/10/19 22:41:31 INFO mlflow.metrics.metric_definitions: Computing flesch kincaid metric:\n",
+      "2023/10/19 22:41:31 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: ari_grade_level\n",
+      "2023/10/19 22:41:31 INFO mlflow.metrics.metric_definitions: Computing automated readability index metric:\n",
+      "2023/10/19 22:41:31 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: exact_match\n",
+      "2023/10/19 22:41:31 INFO mlflow.models.evaluation.default_evaluator: Evaluating metrics: professionalism\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'perplexity/v1/mean': 19.43236994743347, 'perplexity/v1/variance': 5.924225461480717, 'perplexity/v1/p90': 21.905458068847658, 'flesch_kincaid_grade_level/v1/mean': 15.4, 'flesch_kincaid_grade_level/v1/variance': 28.564999999999998, 'flesch_kincaid_grade_level/v1/p90': 21.28, 'ari_grade_level/v1/mean': 18.625, 'ari_grade_level/v1/variance': 39.266875000000006, 'ari_grade_level/v1/p90': 25.520000000000003, 'professionalism/v1/mean': 4.0, 'professionalism/v1/variance': 0.0, 'professionalism/v1/p90': 4.0}\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Evaluate the same QA model again, this time adding the custom\n",
+    "# professionalism metric to the default question-answering metrics.\n",
+    "# NOTE(review): no targets column is passed in this call, which is presumably\n",
+    "# why exact_match is missing from the printed metrics - confirm if that is intended.\n",
+    "with mlflow.start_run() as run:\n",
+    "    results = mlflow.evaluate(\n",
+    "        model=basic_qa_model.model_uri,\n",
+    "        data=eval_df,\n",
+    "        model_type=\"question-answering\",\n",
+    "        evaluators=\"default\",\n",
+    "        extra_metrics=[professionalism_metric],\n",
+    "    )\n",
+    "\n",
+    "# Show the aggregate metric values computed for this run\n",
+    "print(results.metrics)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "486a7ee9-c557-4939-8ddc-bc282ecb4bc3",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7ce07542c6bf444eaf5752609663cd32",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<style scoped>\n",
+       "  .table-result-container {\n",
+       "    max-height: 300px;\n",
+       "    overflow: auto;\n",
+       "  }\n",
+       "  table, th, td {\n",
+       "    border: 1px solid black;\n",
+       "    border-collapse: collapse;\n",
+       "  }\n",
+       "  th, td {\n",
+       "    padding: 5px;\n",
+       "  }\n",
+       "  th {\n",
+       "    text-align: left;\n",
+       "  }\n",
+       "</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>inputs</th><th>ground_truth</th><th>outputs</th><th>token_count</th><th>perplexity/v1/score</th><th>flesch_kincaid_grade_level/v1/score</th><th>ari_grade_level/v1/score</th><th>professionalism/v1/score</th><th>professionalism/v1/justification</th></tr></thead><tbody><tr><td>How does useEffect() work?</td><td>The useEffect() hook tells React that your component needs to do something after render. React will remember the function you passed (we’ll refer to it as our “effect”), and call it later after performing the DOM updates.</td><td>useEffect() is a hook in React that allows you to perform side effects in functional components. It is triggered after the component is rendered and whenever the component's dependencies change, allowing you to run code at specific points during the component's lifecycle.</td><td>49</td><td>22.7291488647</td><td>12.7</td><td>15.0</td><td>4</td><td>The language used in the output is formal and respectful. It avoids casual elements and provides a clear, concise explanation of the concept. It is appropriate for business or academic settings. However, it is not excessively formal to the point of being suitable for the most formal settings such as textbooks, hence it doesn't score a 5.</td></tr><tr><td>What does the static keyword in a function mean?</td><td>Static members belongs to the class, rather than a specific instance. This means that only one instance of a static member exists, even if you create multiple objects of the class, or if you don't create any. It will be shared by all objects.</td><td>The \"static\" keyword in a function means that the function exists only within the scope of the current file, and cannot be called from other files. 
It also means that the function's local variables retain their values between multiple invocations of the function.</td><td>50</td><td>15.9082307816</td><td>10.5</td><td>13.5</td><td>4</td><td>The language used in the output is formal and respectful. It avoids casual elements and is appropriate for business or academic settings. The explanation is clear and concise, making it easy to understand. The use of quotation marks around the term \"static\" shows a level of formality and precision. The language is not excessively formal, so it does not reach a score of 5, but it is more formal than a balanced language, which would score a 3. Therefore, a score of 4 is appropriate.</td></tr><tr><td>What does the 'finally' block in Python do?</td><td>'Finally' defines a block of code to run when the try... except...else block is final. The finally block will be executed no matter if the try block raises an error or not.</td><td>In Python, the 'finally' block is used to execute a set of statements whether an exception is raised or not, ensuring that certain code is always executed regardless of the outcome of a try-except block. It is commonly used for activities such as closing files or releasing resources that need to be done regardless of whether an exception occurred or not.</td><td>69</td><td>19.9835128784</td><td>14.0</td><td>16.7</td><td>4</td><td>The language used in the output is formal and respectful. It avoids casual elements and provides a clear, concise explanation of the 'finally' block in Python. The tone is appropriate for business or academic settings, hence the score of 4.</td></tr><tr><td>What is the difference between multiprocessing and multithreading?</td><td>Multithreading refers to the ability of a processor to execute multiple threads concurrently, where each thread runs a process. 
Whereas multiprocessing refers to the ability of a system to run multiple processors in parallel, where each processor can run one or more threads.</td><td>Multiprocessing involves running multiple processes simultaneously on multiple cores to increase efficiency and achieve parallelism, while multithreading involves running multiple threads within a single process, allowing for concurrent execution and sharing of resources.</td><td>40</td><td>19.108587265</td><td>24.4</td><td>29.3</td><td>4</td><td>The language used in the output is formal and respectful. It avoids casual elements and is appropriate for business or academic settings. The explanation is clear, concise, and uses technical terms appropriately. However, it is not excessively formal to the point of being suitable for the most formal settings such as textbooks, hence it does not merit a score of 5.</td></tr></tbody></table></div>"
+      ]
+     },
+     "metadata": {
+      "application/vnd.databricks.v1+output": {
+       "addedWidgets": {},
+       "aggData": [],
+       "aggError": "",
+       "aggOverflow": false,
+       "aggSchema": [],
+       "aggSeriesLimitReached": false,
+       "aggType": "",
+       "arguments": {},
+       "columnCustomDisplayInfos": {},
+       "data": [
+        [
+         "How does useEffect() work?",
+         "The useEffect() hook tells React that your component needs to do something after render. React will remember the function you passed (we’ll refer to it as our “effect”), and call it later after performing the DOM updates.",
+         "useEffect() is a hook in React that allows you to perform side effects in functional components. It is triggered after the component is rendered and whenever the component's dependencies change, allowing you to run code at specific points during the component's lifecycle.",
+         49,
+         22.7291488647,
+         12.7,
+         15,
+         4,
+         "The language used in the output is formal and respectful. It avoids casual elements and provides a clear, concise explanation of the concept. It is appropriate for business or academic settings. However, it is not excessively formal to the point of being suitable for the most formal settings such as textbooks, hence it doesn't score a 5."
+        ],
+        [
+         "What does the static keyword in a function mean?",
+         "Static members belongs to the class, rather than a specific instance. This means that only one instance of a static member exists, even if you create multiple objects of the class, or if you don't create any. It will be shared by all objects.",
+         "The \"static\" keyword in a function means that the function exists only within the scope of the current file, and cannot be called from other files. It also means that the function's local variables retain their values between multiple invocations of the function.",
+         50,
+         15.9082307816,
+         10.5,
+         13.5,
+         4,
+         "The language used in the output is formal and respectful. It avoids casual elements and is appropriate for business or academic settings. The explanation is clear and concise, making it easy to understand. The use of quotation marks around the term \"static\" shows a level of formality and precision. The language is not excessively formal, so it does not reach a score of 5, but it is more formal than a balanced language, which would score a 3. Therefore, a score of 4 is appropriate."
+        ],
+        [
+         "What does the 'finally' block in Python do?",
+         "'Finally' defines a block of code to run when the try... except...else block is final. The finally block will be executed no matter if the try block raises an error or not.",
+         "In Python, the 'finally' block is used to execute a set of statements whether an exception is raised or not, ensuring that certain code is always executed regardless of the outcome of a try-except block. It is commonly used for activities such as closing files or releasing resources that need to be done regardless of whether an exception occurred or not.",
+         69,
+         19.9835128784,
+         14,
+         16.7,
+         4,
+         "The language used in the output is formal and respectful. It avoids casual elements and provides a clear, concise explanation of the 'finally' block in Python. The tone is appropriate for business or academic settings, hence the score of 4."
+        ],
+        [
+         "What is the difference between multiprocessing and multithreading?",
+         "Multithreading refers to the ability of a processor to execute multiple threads concurrently, where each thread runs a process. Whereas multiprocessing refers to the ability of a system to run multiple processors in parallel, where each processor can run one or more threads.",
+         "Multiprocessing involves running multiple processes simultaneously on multiple cores to increase efficiency and achieve parallelism, while multithreading involves running multiple threads within a single process, allowing for concurrent execution and sharing of resources.",
+         40,
+         19.108587265,
+         24.4,
+         29.3,
+         4,
+         "The language used in the output is formal and respectful. It avoids casual elements and is appropriate for business or academic settings. The explanation is clear, concise, and uses technical terms appropriately. However, it is not excessively formal to the point of being suitable for the most formal settings such as textbooks, hence it does not merit a score of 5."
+        ]
+       ],
+       "datasetInfos": [],
+       "dbfsResultPath": null,
+       "isJsonSchema": true,
+       "metadata": {},
+       "overflow": false,
+       "plotOptions": {
+        "customPlotOptions": {},
+        "displayType": "table",
+        "pivotAggregation": null,
+        "pivotColumns": null,
+        "xColumns": null,
+        "yColumns": null
+       },
+       "removedWidgets": [],
+       "schema": [
+        {
+         "metadata": "{}",
+         "name": "inputs",
+         "type": "\"string\""
+        },
+        {
+         "metadata": "{}",
+         "name": "ground_truth",
+         "type": "\"string\""
+        },
+        {
+         "metadata": "{}",
+         "name": "outputs",
+         "type": "\"string\""
+        },
+        {
+         "metadata": "{}",
+         "name": "token_count",
+         "type": "\"long\""
+        },
+        {
+         "metadata": "{}",
+         "name": "perplexity/v1/score",
+         "type": "\"double\""
+        },
+        {
+         "metadata": "{}",
+         "name": "flesch_kincaid_grade_level/v1/score",
+         "type": "\"double\""
+        },
+        {
+         "metadata": "{}",
+         "name": "ari_grade_level/v1/score",
+         "type": "\"double\""
+        },
+        {
+         "metadata": "{}",
+         "name": "professionalism/v1/score",
+         "type": "\"long\""
+        },
+        {
+         "metadata": "{}",
+         "name": "professionalism/v1/justification",
+         "type": "\"string\""
+        }
+       ],
+       "type": "table"
+      }
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "results.tables[\"eval_results_table\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "52e9f69f-2f43-46ba-bf88-b4aebae741f4",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "The professionalism score of the `basic_qa_model` leaves room for improvement. Let's create a new model whose system prompt asks for formal answers and see whether it performs better."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "b4ea81e9-6e91-43e7-8539-8dab7b5f52de",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "Call `mlflow.evaluate()` using the new model. Observe that the professionalism score has increased!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "5b21ef8f-50ef-4229-83c9-cc2251a081e2",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023/10/19 22:41:57 WARNING mlflow.models.model: Model logged without a signature. Signatures will be required for upcoming model registry features as they validate model inputs and denote the expected schema of model outputs. Please visit https://www.mlflow.org/docs/2.7.2/models.html#set-signature-on-logged-model for instructions on setting a model signature on your logged model.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8714e76f6c9b40a2949a26cb871dfc36",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1faffb78f9694b61b8c80759f474993d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023/10/19 22:42:01 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.\n",
+      "2023/10/19 22:42:22 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: token_count\n",
+      "2023/10/19 22:42:22 INFO mlflow.metrics.metric_definitions: Computing token count metric:\n",
+      "2023/10/19 22:42:22 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: toxicity\n",
+      "2023/10/19 22:42:22 INFO mlflow.metrics.metric_definitions: Loading toxicity metric:\n",
+      "Using default facebook/roberta-hate-speech-dynabench-r4-target checkpoint\n",
+      "2023/10/19 22:42:22 WARNING mlflow.metrics.metric_definitions: Failed to load 'toxicity' metric (error: RuntimeError(\"Failed to import transformers.models.roberta.modeling_tf_roberta because of the following error (look up to see its traceback):\\nNo module named 'keras.engine'\")), skipping metric logging.\n",
+      "2023/10/19 22:42:22 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: perplexity\n",
+      "2023/10/19 22:42:22 INFO mlflow.metrics.metric_definitions: Loading perplexity metric:\n",
+      "2023/10/19 22:42:22 INFO mlflow.metrics.metric_definitions: Computing perplexity metric:\n",
+      "Using pad_token, but it is not set yet.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1dcccbb05cef45a8b43bf484a856a337",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023/10/19 22:42:27 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: flesch_kincaid_grade_level\n",
+      "2023/10/19 22:42:27 INFO mlflow.metrics.metric_definitions: Computing flesch kincaid metric:\n",
+      "2023/10/19 22:42:27 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: ari_grade_level\n",
+      "2023/10/19 22:42:27 INFO mlflow.metrics.metric_definitions: Computing automated readability index metric:\n",
+      "2023/10/19 22:42:27 INFO mlflow.models.evaluation.default_evaluator: Evaluating builtin metrics: exact_match\n",
+      "2023/10/19 22:42:27 INFO mlflow.models.evaluation.default_evaluator: Evaluating metrics: professionalism\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'perplexity/v1/mean': 25.524279594421387, 'perplexity/v1/variance': 72.94819473072403, 'perplexity/v1/p90': 32.810652542114255, 'flesch_kincaid_grade_level/v1/mean': 16.525, 'flesch_kincaid_grade_level/v1/variance': 0.7818749999999987, 'flesch_kincaid_grade_level/v1/p90': 17.509999999999998, 'ari_grade_level/v1/mean': 19.125, 'ari_grade_level/v1/variance': 1.056875000000001, 'ari_grade_level/v1/p90': 20.23, 'professionalism/v1/mean': 4.5, 'professionalism/v1/variance': 0.25, 'professionalism/v1/p90': 5.0}\n"
+     ]
+    }
+   ],
+   "source": [
+    "with mlflow.start_run() as run:\n",
+    "    system_prompt = \"Answer the following question using extreme formality.\"\n",
+    "    professional_qa_model = mlflow.openai.log_model(\n",
+    "        model=\"gpt-3.5-turbo\",\n",
+    "        task=openai.ChatCompletion,\n",
+    "        artifact_path=\"model\",\n",
+    "        messages=[\n",
+    "            {\"role\": \"system\", \"content\": system_prompt},\n",
+    "            {\"role\": \"user\", \"content\": \"{question}\"},\n",
+    "        ],\n",
+    "    )\n",
+    "    results = mlflow.evaluate(\n",
+    "        professional_qa_model.model_uri,\n",
+    "        eval_df,\n",
+    "        model_type=\"question-answering\",\n",
+    "        evaluators=\"default\",\n",
+    "        extra_metrics=[professionalism_metric],\n",
+    "    )\n",
+    "print(results.metrics)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "12027ba1-9d10-4f80-bb44-0857372a2e30",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1d5de8aa36b642128f68e89e214f4752",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<style scoped>\n",
+       "  .table-result-container {\n",
+       "    max-height: 300px;\n",
+       "    overflow: auto;\n",
+       "  }\n",
+       "  table, th, td {\n",
+       "    border: 1px solid black;\n",
+       "    border-collapse: collapse;\n",
+       "  }\n",
+       "  th, td {\n",
+       "    padding: 5px;\n",
+       "  }\n",
+       "  th {\n",
+       "    text-align: left;\n",
+       "  }\n",
+       "</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>inputs</th><th>ground_truth</th><th>outputs</th><th>token_count</th><th>perplexity/v1/score</th><th>flesch_kincaid_grade_level/v1/score</th><th>ari_grade_level/v1/score</th><th>professionalism/v1/score</th><th>professionalism/v1/justification</th></tr></thead><tbody><tr><td>How does useEffect() work?</td><td>The useEffect() hook tells React that your component needs to do something after render. React will remember the function you passed (we’ll refer to it as our “effect”), and call it later after performing the DOM updates.</td><td>I would be delighted to expound upon the mechanics of the useEffect() function. useEffect() is a hook provided by the React library that facilitates the synchronization of side effects with the React components lifecycle method. It serves as a tool for performing actions that necessitate interaction with external entities, such as fetching data from an API or subscribing to events. The principal construct of useEffect() hinges on the concept of dependencies. By specifying an array of dependencies, which can consist of state variables, the useEffect() hook will execute the specified effect only when these dependencies undergo changes.\n",
+       "\n",
+       "Once the component mounts, useEffect() initially triggers the specified effect, consequently performing the designated side effect. During subsequent renders, useEffect() will carry out the cleanup operation, accommodating for the possibility of multiple effects being executed successively. Should any of the supplied dependencies undergo alteration, useEffect() will then reexecute the effect, ensuring synchronization with the altered state of the component.\n",
+       "\n",
+       "In essence, useEffect() effectively accommodates for the execution of side effects with accuracy and efficiency, aligning them with the React component lifecycle and facilitating the necessary synchronization with state updates.</td><td>220</td><td>25.9580955505</td><td>16.1</td><td>18.0</td><td>4</td><td>The language used in the output is formal and respectful, with no casual elements or slang. The explanation is clear, concise, and detailed, demonstrating a high level of professionalism. However, it does not reach the level of excessive formality that would be expected in the most formal settings such as textbooks, hence the score is not 5.</td></tr><tr><td>What does the static keyword in a function mean?</td><td>Static members belongs to the class, rather than a specific instance. This means that only one instance of a static member exists, even if you create multiple objects of the class, or if you don't create any. It will be shared by all objects.</td><td>The static keyword, when applied to a function within the realm of computer programming, denotes a special attribute that imparts a significant alteration to the behavior of the said function. Specifically, the static keyword, operating in this context, restricts the visibility and scope of the function to the current translation unit, consequently prohibiting its accessibility from other external units of code. Consequently, the function annotated with the static modifier may solely be invoked within the same translation unit, and any attempts to access or execute it from an external module shall be rendered futile.</td><td>107</td><td>31.0698623657</td><td>17.9</td><td>20.8</td><td>5</td><td>The language used in the output is excessively formal and respectful. It avoids casual elements and is appropriate for the most formal settings such as textbooks. The response is clear, concise, and uses technical language appropriate for the subject matter. 
It avoids any slang or colloquialisms, and is tailored to an audience that understands the context of the question.</td></tr><tr><td>What does the 'finally' block in Python do?</td><td>'Finally' defines a block of code to run when the try... except...else block is final. The finally block will be executed no matter if the try block raises an error or not.</td><td>The 'finally' block in Python encompasses a set of statements that are executed regardless of whether an exception occurred or not in the preceding 'try' and 'except' blocks. It serves as a reliable means to perform essential tasks, such as closing files or releasing resources, in order to uphold code integrity and adherent to best coding practices. Consequently, the 'finally' block guarantees the execution of these imperative actions, regardless of how the preceding logic unfolds, thus enhancing the robustness and fault tolerance of Python programs.</td><td>103</td><td>33.5567054749</td><td>15.5</td><td>18.9</td><td>4</td><td>The language used in the output is formal and respectful. It avoids casual elements and is appropriate for business or academic settings. The explanation is clear, concise, and uses technical terms appropriately. However, it is not excessively formal to the point of being suitable for the most formal settings such as textbooks, hence it does not warrant a score of 5.</td></tr><tr><td>What is the difference between multiprocessing and multithreading?</td><td>Multithreading refers to the ability of a processor to execute multiple threads concurrently, where each thread runs a process. Whereas multiprocessing refers to the ability of a system to run multiple processors in parallel, where each processor can run one or more threads.</td><td>Greetings,\n",
+       "\n",
+       "I hereby address your esteemed query regarding the distinction between multiprocessing and multithreading.\n",
+       "\n",
+       "In essence, multiprocessing and multithreading serve the purpose of enhancing the efficiency and effectiveness of simultaneous execution within a computer system. However, they differ considerably in terms of their operational principles and implementation.\n",
+       "\n",
+       "To begin with, multiprocessing refers to the deployment of multiple processors or central processing units (CPUs) to execute multiple tasks or processes concurrently. Each processor operates independently and can execute distinct tasks simultaneously, thereby realizing parallel processing. This architectural approach is particularly beneficial when it comes to computationally intensive applications, as it allows for increased speed and performance.\n",
+       "\n",
+       "On the other hand, multithreading entails the notion of creating multiple threads within a single process. A thread, as a lightweight unit of execution, constitutes a sequence of instructions that can be executed independently. By employing multiple threads, a program can concurrently carry out multiple tasks within a shared memory space. Notably, these threads share the same resources and context, including memory and file systems, which can lead to potential synchronization and communication challenges.\n",
+       "\n",
+       "In summary, multiprocessing leverages multiple processors or CPUs to execute distinct tasks in parallel, whereas multithreading enables the concurrent execution of multiple threads within a single process, sharing resources and memory space. The choice between multiprocessing and multithreading depends on the specific requirements, resources, and objectives of the system or application at hand.\n",
+       "\n",
+       "I trust that this elucidation satisfactorily addresses your inquiry with regards to the disparity between multiprocessing and multithreading. Should you have any further queries or necessitate additional clarification, I remain at your disposal.\n",
+       "\n",
+       "Yours respectfully,</td><td>324</td><td>11.5124549866</td><td>16.6</td><td>18.8</td><td>5</td><td>The response is written in an excessively formal and respectful tone. It uses formal greetings and sign-offs, avoids casual elements, and uses complex sentence structures. The language is appropriate for the most formal settings such as textbooks.</td></tr></tbody></table></div>"
+      ]
+     },
+     "metadata": {
+      "application/vnd.databricks.v1+output": {
+       "addedWidgets": {},
+       "aggData": [],
+       "aggError": "",
+       "aggOverflow": false,
+       "aggSchema": [],
+       "aggSeriesLimitReached": false,
+       "aggType": "",
+       "arguments": {},
+       "columnCustomDisplayInfos": {},
+       "data": [
+        [
+         "How does useEffect() work?",
+         "The useEffect() hook tells React that your component needs to do something after render. React will remember the function you passed (we’ll refer to it as our “effect”), and call it later after performing the DOM updates.",
+         "I would be delighted to expound upon the mechanics of the useEffect() function. useEffect() is a hook provided by the React library that facilitates the synchronization of side effects with the React components lifecycle method. It serves as a tool for performing actions that necessitate interaction with external entities, such as fetching data from an API or subscribing to events. The principal construct of useEffect() hinges on the concept of dependencies. By specifying an array of dependencies, which can consist of state variables, the useEffect() hook will execute the specified effect only when these dependencies undergo changes.\n\nOnce the component mounts, useEffect() initially triggers the specified effect, consequently performing the designated side effect. During subsequent renders, useEffect() will carry out the cleanup operation, accommodating for the possibility of multiple effects being executed successively. Should any of the supplied dependencies undergo alteration, useEffect() will then reexecute the effect, ensuring synchronization with the altered state of the component.\n\nIn essence, useEffect() effectively accommodates for the execution of side effects with accuracy and efficiency, aligning them with the React component lifecycle and facilitating the necessary synchronization with state updates.",
+         220,
+         25.9580955505,
+         16.1,
+         18,
+         4,
+         "The language used in the output is formal and respectful, with no casual elements or slang. The explanation is clear, concise, and detailed, demonstrating a high level of professionalism. However, it does not reach the level of excessive formality that would be expected in the most formal settings such as textbooks, hence the score is not 5."
+        ],
+        [
+         "What does the static keyword in a function mean?",
+         "Static members belongs to the class, rather than a specific instance. This means that only one instance of a static member exists, even if you create multiple objects of the class, or if you don't create any. It will be shared by all objects.",
+         "The static keyword, when applied to a function within the realm of computer programming, denotes a special attribute that imparts a significant alteration to the behavior of the said function. Specifically, the static keyword, operating in this context, restricts the visibility and scope of the function to the current translation unit, consequently prohibiting its accessibility from other external units of code. Consequently, the function annotated with the static modifier may solely be invoked within the same translation unit, and any attempts to access or execute it from an external module shall be rendered futile.",
+         107,
+         31.0698623657,
+         17.9,
+         20.8,
+         5,
+         "The language used in the output is excessively formal and respectful. It avoids casual elements and is appropriate for the most formal settings such as textbooks. The response is clear, concise, and uses technical language appropriate for the subject matter. It avoids any slang or colloquialisms, and is tailored to an audience that understands the context of the question."
+        ],
+        [
+         "What does the 'finally' block in Python do?",
+         "'Finally' defines a block of code to run when the try... except...else block is final. The finally block will be executed no matter if the try block raises an error or not.",
+         "The 'finally' block in Python encompasses a set of statements that are executed regardless of whether an exception occurred or not in the preceding 'try' and 'except' blocks. It serves as a reliable means to perform essential tasks, such as closing files or releasing resources, in order to uphold code integrity and adherent to best coding practices. Consequently, the 'finally' block guarantees the execution of these imperative actions, regardless of how the preceding logic unfolds, thus enhancing the robustness and fault tolerance of Python programs.",
+         103,
+         33.5567054749,
+         15.5,
+         18.9,
+         4,
+         "The language used in the output is formal and respectful. It avoids casual elements and is appropriate for business or academic settings. The explanation is clear, concise, and uses technical terms appropriately. However, it is not excessively formal to the point of being suitable for the most formal settings such as textbooks, hence it does not warrant a score of 5."
+        ],
+        [
+         "What is the difference between multiprocessing and multithreading?",
+         "Multithreading refers to the ability of a processor to execute multiple threads concurrently, where each thread runs a process. Whereas multiprocessing refers to the ability of a system to run multiple processors in parallel, where each processor can run one or more threads.",
+         "Greetings,\n\nI hereby address your esteemed query regarding the distinction between multiprocessing and multithreading.\n\nIn essence, multiprocessing and multithreading serve the purpose of enhancing the efficiency and effectiveness of simultaneous execution within a computer system. However, they differ considerably in terms of their operational principles and implementation.\n\nTo begin with, multiprocessing refers to the deployment of multiple processors or central processing units (CPUs) to execute multiple tasks or processes concurrently. Each processor operates independently and can execute distinct tasks simultaneously, thereby realizing parallel processing. This architectural approach is particularly beneficial when it comes to computationally intensive applications, as it allows for increased speed and performance.\n\nOn the other hand, multithreading entails the notion of creating multiple threads within a single process. A thread, as a lightweight unit of execution, constitutes a sequence of instructions that can be executed independently. By employing multiple threads, a program can concurrently carry out multiple tasks within a shared memory space. Notably, these threads share the same resources and context, including memory and file systems, which can lead to potential synchronization and communication challenges.\n\nIn summary, multiprocessing leverages multiple processors or CPUs to execute distinct tasks in parallel, whereas multithreading enables the concurrent execution of multiple threads within a single process, sharing resources and memory space. The choice between multiprocessing and multithreading depends on the specific requirements, resources, and objectives of the system or application at hand.\n\nI trust that this elucidation satisfactorily addresses your inquiry with regards to the disparity between multiprocessing and multithreading. 
Should you have any further queries or necessitate additional clarification, I remain at your disposal.\n\nYours respectfully,",
+         324,
+         11.5124549866,
+         16.6,
+         18.8,
+         5,
+         "The response is written in an excessively formal and respectful tone. It uses formal greetings and sign-offs, avoids casual elements, and uses complex sentence structures. The language is appropriate for the most formal settings such as textbooks."
+        ]
+       ],
+       "datasetInfos": [],
+       "dbfsResultPath": null,
+       "isJsonSchema": true,
+       "metadata": {},
+       "overflow": false,
+       "plotOptions": {
+        "customPlotOptions": {},
+        "displayType": "table",
+        "pivotAggregation": null,
+        "pivotColumns": null,
+        "xColumns": null,
+        "yColumns": null
+       },
+       "removedWidgets": [],
+       "schema": [
+        {
+         "metadata": "{}",
+         "name": "inputs",
+         "type": "\"string\""
+        },
+        {
+         "metadata": "{}",
+         "name": "ground_truth",
+         "type": "\"string\""
+        },
+        {
+         "metadata": "{}",
+         "name": "outputs",
+         "type": "\"string\""
+        },
+        {
+         "metadata": "{}",
+         "name": "token_count",
+         "type": "\"long\""
+        },
+        {
+         "metadata": "{}",
+         "name": "perplexity/v1/score",
+         "type": "\"double\""
+        },
+        {
+         "metadata": "{}",
+         "name": "flesch_kincaid_grade_level/v1/score",
+         "type": "\"double\""
+        },
+        {
+         "metadata": "{}",
+         "name": "ari_grade_level/v1/score",
+         "type": "\"double\""
+        },
+        {
+         "metadata": "{}",
+         "name": "professionalism/v1/score",
+         "type": "\"long\""
+        },
+        {
+         "metadata": "{}",
+         "name": "professionalism/v1/justification",
+         "type": "\"string\""
+        }
+       ],
+       "type": "table"
+      }
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "results.tables[\"eval_results_table\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "e44bbe77-433a-4e03-a44e-d17eb6c06820",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "application/vnd.databricks.v1+notebook": {
+   "dashboards": [],
+   "language": "python",
+   "notebookMetadata": {
+    "pythonIndentUnit": 2
+   },
+   "notebookName": "LLM Evaluation Examples -- QA",
+   "widgets": {}
+  },
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}