chore(weave): Public docs for MVP Guardrails #3344

Merged · 7 commits · Jan 14, 2025
44 changes: 44 additions & 0 deletions docs/docs/guides/evaluation/guardrails_and_monitors.md
@@ -0,0 +1,44 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# Guardrails and Monitors

To prevent your LLM from generating harmful or inappropriate content, you can set up guardrails and monitors using Weave.

## Core Concepts

Everything is built around the concept of a `Scorer`. A `Scorer` is an instance of a subclass of the `Scorer` class, specifically one that exposes a `score` method. See [Evaluation Metrics](./scorers.md) for more information.
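To make this concrete, here is a plain-Python sketch of the kind of logic a simple scorer's `score` method might implement. A real scorer would subclass `weave.Scorer` and decorate `score` with `@weave.op`; the `WordMatcher` name and keyword-matching rule here are illustrative, not part of the Weave API:

```python
# Plain-Python stand-in for a simple scorer: flag outputs that
# contain any of a list of words. A real Weave scorer would
# subclass `weave.Scorer`; only the matching logic is shown here.
class WordMatcher:
    def __init__(self, words: list[str], case_sensitive: bool = False):
        self.words = words
        self.case_sensitive = case_sensitive

    def score(self, output: str) -> bool:
        # Return True when any flagged word appears in the output.
        for word in self.words:
            haystack = output if self.case_sensitive else output.lower()
            needle = word if self.case_sensitive else word.lower()
            if needle in haystack:
                return True
        return False
```

A boolean result like this works well for guardrails; scorers can just as easily return a float for graded judgments.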

### Monitors
Often you will want to apply a Scorer directly after calling an Op. You can do this with the `apply_scorer` method on the `Call` object.
Collaborator: nit: Often you...


```python
import random

res, call = op.call(user_input)
# Optionally subsample, scoring only ~25% of calls
if random.random() < 0.25:
    await call.apply_scorer(scorer)
```

This logs the score to Weave, where it can be viewed and analyzed in the UI.
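The subsampling check can be factored into a small helper so the rate lives in one place. This is an illustrative sketch; `should_score` and the injectable `rng` parameter are not part of the Weave API:

```python
import random

# Illustrative helper for the subsampling decision: score only a
# fraction of calls to bound monitoring cost. `rng` is injectable
# so the decision can be made deterministic in tests.
def should_score(sample_rate: float, rng=random.random) -> bool:
    return rng() < sample_rate
```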

:::info

Note that this style of monitor runs the scoring function on the same machine as the call, which may not be desirable in all production environments. The ability to apply scorers as monitors that run on W&B Weave's servers is coming soon.

:::

### Guardrails

Guardrails are a way to prevent the LLM from generating harmful or inappropriate content. In Weave, a guardrail applies a Scorer using the same technique as a monitor, but in addition to logging the score, it also modifies the application logic based on the scorer output:

```python
res, call = op.call(user_input)
scorer_res = await call.apply_scorer(guardrail)
if scorer_res.result < 0.5:
    ...  # e.g. block or rewrite the response
else:
    ...  # return the response unchanged
```
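The control flow above can be sketched end-to-end in plain Python. Here a simple keyword check stands in for the scorer (in a real app the score would come from `call.apply_scorer(...)`), and `guarded_predict`, `contains_apology`, and the placeholder message are illustrative names, not Weave APIs:

```python
# Standalone sketch of the guardrail pattern: score the output,
# then branch the application flow on the result.
def contains_apology(output: str) -> bool:
    # Stand-in scorer: flag outputs that contain an apology.
    return "sorry" in output.lower()

def guarded_predict(user_input: str, predict, is_unsafe) -> str:
    output = predict(user_input)
    if is_unsafe(output):
        # Block the unsafe response and return a placeholder instead.
        return "[response withheld by guardrail]"
    return output
```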



160 changes: 160 additions & 0 deletions docs/notebooks/scorers_as_guardrails.ipynb
Collaborator: may want to strip the outputs of this nb

@@ -0,0 +1,160 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<!-- docusaurus_head_meta::start\n",
"---\n",
"title: Scorers as Guardrails\n",
"---\n",
"docusaurus_head_meta::end -->\n",
"\n",
"<!--- @wandbcode{prompt-optim-notebook} -->"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "-EauaASOOUsB"
},
"source": [
"# Scorers as Guardrails\n",
"\n",
"Weave Scorers are special classes with a `score` method that can evaluate the performance of a call. They range from simple rule checks to LLM-as-a-judge evaluators.\n",
"\n",
"In this notebook, we will explore how to use Scorers as guardrails to prevent your LLM from generating harmful or inappropriate content.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "RJcDCJWWShcZ",
"outputId": "cace015b-dcaf-4bef-a105-2a62a2361e29"
},
"outputs": [],
"source": [
"%pip install weave --quiet"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "oKawLdN3SmJG"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Logged in as Weights & Biases user: timssweeney.\n",
"View Weave data at https://wandb.ai/timssweeney/scorers-as-guardrails/weave\n",
"🍩 https://wandb.ai/timssweeney/scorers-as-guardrails/r/call/019441d9-b322-7fe1-befc-03f10623326c\n",
"The prediction for 'Please make a prediction' is: Certainly!\n",
"Call(_op_name=<Future at 0x332d6bd30 state=running>, trace_id='019441d9-b321-7d62-bbad-faab27aca2b5', project_id='timssweeney/scorers-as-guardrails', parent_id=None, inputs={'input': 'Please make a prediction'}, id='019441d9-b322-7fe1-befc-03f10623326c', output='Certainly!', exception=None, summary={}, _display_name=None, attributes=AttributesDict({'weave': {'client_version': '0.51.28-dev0', 'source': 'python-sdk', 'os_name': 'Darwin', 'os_version': 'Darwin Kernel Version 23.6.0: Fri Nov 15 15:13:15 PST 2024; root:xnu-10063.141.1.702.7~1/RELEASE_ARM64_T6000', 'os_release': '23.6.0', 'sys_version': '3.10.8 (main, Dec 5 2022, 18:10:41) [Clang 14.0.0 (clang-1400.0.29.202)]'}}), started_at=None, ended_at=datetime.datetime(2025, 1, 7, 17, 39, 33, 794325, tzinfo=datetime.timezone.utc), deleted_at=None, _children=[], _feedback=None)\n",
"Certainly!\n",
"🍩 https://wandb.ai/timssweeney/scorers-as-guardrails/r/call/019441d9-b329-7d21-9dfb-6996730f9209\n",
"The results of the score are: ApplyScorerSuccess(result=False, score_call=Call(_op_name=<Future at 0x332dd2410 state=finished returned str>, trace_id='019441d9-b329-7d21-9dfb-698b4c4bb903', project_id='timssweeney/scorers-as-guardrails', parent_id=None, inputs={'self': ObjectRef(entity='timssweeney', project='scorers-as-guardrails', name='Apology-Checker', _digest='7HojDvIDVc788WyJAr7b0nQppAkGNLhzsT3DIV9d74A', _extra=()), 'output': 'Certainly!'}, id='019441d9-b329-7d21-9dfb-6996730f9209', output=False, exception=None, summary={}, _display_name=None, attributes=AttributesDict({'weave': {'client_version': '0.51.28-dev0', 'source': 'python-sdk', 'os_name': 'Darwin', 'os_version': 'Darwin Kernel Version 23.6.0: Fri Nov 15 15:13:15 PST 2024; root:xnu-10063.141.1.702.7~1/RELEASE_ARM64_T6000', 'os_release': '23.6.0', 'sys_version': '3.10.8 (main, Dec 5 2022, 18:10:41) [Clang 14.0.0 (clang-1400.0.29.202)]'}}), started_at=None, ended_at=datetime.datetime(2025, 1, 7, 17, 39, 33, 802125, tzinfo=datetime.timezone.utc), deleted_at=None, _children=[], _feedback=None))\n",
"🍩 https://wandb.ai/timssweeney/scorers-as-guardrails/r/call/019441d9-b764-7961-8dd2-4d7c936e6f60\n",
"Certainly!\n",
"🍩 https://wandb.ai/timssweeney/scorers-as-guardrails/r/call/019441d9-b766-7c72-b74f-4c9ed74bb921\n",
"The prediction for 'Please make a prediction' (Certainly!) is safe\n",
"🍩 https://wandb.ai/timssweeney/scorers-as-guardrails/r/call/019441d9-b767-7843-9349-31e585535893\n",
"I'm sorry, I can't do that.\n",
"🍩 https://wandb.ai/timssweeney/scorers-as-guardrails/r/call/019441d9-b769-7ff1-8c8a-d5bbdb1de3e0\n",
"The prediction for 'Please make a prediction with a test' (I'm sorry, I can't do that.) is NOT safe\n"
]
}
],
"source": [
"import weave\n",
"\n",
"weave.init(\"scorers-as-guardrails\")\n",
"\n",
"\n",
"# First, we define a very simple scorer that checks if the model output contains any specific words.\n",
"class WordMatcher(weave.Scorer):\n",
" words: list[str]\n",
" case_sensitive: bool = False\n",
"\n",
" @weave.op\n",
"    def score(self, output: str) -> bool:\n",
" print(output)\n",
" for word in self.words:\n",
" if self.case_sensitive:\n",
" if word in output:\n",
" return True\n",
" else:\n",
" if word.lower() in output.lower():\n",
" return True\n",
" return False\n",
"\n",
"\n",
"# Next we define a function that makes a prediction\n",
"@weave.op\n",
"def make_prediction(input: str) -> str:\n",
" \"\"\"Dummy function that makes a prediction\"\"\"\n",
" if \"test\" in input:\n",
" return \"I'm sorry, I can't do that.\"\n",
" else:\n",
" return \"Certainly!\"\n",
"\n",
"\n",
"# Next we call the op using the `call` method in order to return\n",
"# a Call object.\n",
"prediction, weave_call = make_prediction.call(\"Please make a prediction\")\n",
"print(f\"The prediction for 'Please make a prediction' is: {prediction}\")\n",
"\n",
"# Next, let's construct a scorer that checks if the prediction contains the word \"sorry\"\n",
"# We will name the scorer \"Apology Checker\" which will show up as the name of the score\n",
"# associated with the call.\n",
"scorer = WordMatcher(name=\"Apology Checker\", words=[\"sorry\"])\n",
"\n",
"# Now we can apply the scorer to the prediction\n",
"score_results = await weave_call.apply_scorer(scorer)\n",
"print(f\"The results of the score are: {score_results}\")\n",
"\n",
"# In a real-world scenario, we would use the score results to determine if the prediction is safe\n",
"# and possibly modify the control flow of the program based on the score.\n",
"for example_input in [\n",
" \"Please make a prediction\",\n",
" \"Please make a prediction with a test\",\n",
"]:\n",
" prediction, weave_call = make_prediction.call(example_input)\n",
" score_results = await weave_call.apply_scorer(scorer)\n",
" if score_results.result:\n",
" print(f\"The prediction for '{example_input}' ({prediction}) is NOT safe\")\n",
" else:\n",
" print(f\"The prediction for '{example_input}' ({prediction}) is safe\")"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
2 changes: 1 addition & 1 deletion docs/sidebars.ts
@@ -60,7 +60,7 @@ const sidebars: SidebarsConfig = {
collapsed: false,
label: "Evaluation",
link: { type: "doc", id: "guides/core-types/evaluations" },
items: ["guides/evaluation/scorers"],
items: ["guides/evaluation/scorers", "guides/evaluation/guardrails_and_monitors"],
},
"guides/core-types/prompts",
"guides/core-types/models",