From 189602bdaa48fcbae4f209c101cfb7d4d424ca3a Mon Sep 17 00:00:00 2001
From: jjallaire-aisi <joseph.allaire@dsit.gov.uk>
Date: Tue, 6 Aug 2024 08:00:18 -0400
Subject: [PATCH] improved metrics value_to_float string conversion (#196)

Co-authored-by: aisi-inspect <166920645+aisi-inspect@users.noreply.github.com>
---
 CHANGELOG.md                        |  1 +
 src/inspect_ai/scorer/_metric.py    | 31 ++++++++++++++++++++---------
 tests/scorer/test_value_to_float.py | 31 +++++++++++++++++++++++++++++
 3 files changed, 54 insertions(+), 9 deletions(-)
 create mode 100644 tests/scorer/test_value_to_float.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fdc380779..a856f4711 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,7 @@
 
 ## Unreleased
 
+- Improved metrics `value_to_float` string conversion (handle numbers, "true", "false", etc.)
 - Log viewer: Ctrl/Cmd+F to find text when running in VS Code.
 - Set Claude default `max_tokens` to 4096
 
diff --git a/src/inspect_ai/scorer/_metric.py b/src/inspect_ai/scorer/_metric.py
index 856b9fb2f..99e6dd119 100644
--- a/src/inspect_ai/scorer/_metric.py
+++ b/src/inspect_ai/scorer/_metric.py
@@ -120,12 +120,16 @@ def value_to_float(
 ) -> ValueToFloat:
     """Create a ValueToFloat function.
 
-    Create a ValueToFloat function that maps string values of
-    the form "C", "I", "P", and "N" to 1, 0, 0.5, and 0
-    (respectively). Note that those are the default literal
-    values, but they can be customized. Numeric values are
-    cast to float. Arrays and dictionaries give a warning
-    and return 0.
+    Create a ValueToFloat function that maps scalar values of
+    different types into floats. For strings, common boolean
+    representations (e.g. 'yes', 'no', 'true', 'false') are
+    mapped to 1 and 0. In addition, the specified correct,
+    incorrect, partial, and noanswer values (by default "C",
+    "I", "P", and "N") are mapped to 1, 0, 0.5, and 0. Note that
+    those are the default literal values, but they can be
+    customized. Strings with only numbers are converted, and
+    numeric values are cast to float. Arrays and dictionaries
+    give a warning and return 0.
 
     Args:
        correct (Value): Value that represents a correct answer (1)
@@ -146,9 +150,18 @@ def to_float(value: Value) -> float:
             return 0.5
         elif value == incorrect or value == noanswer:
             return 0
-        else:
-            logger.warning(f"Unable to convert value to float: {value}")
-            return 0
+        elif isinstance(value, str):
+            value = value.lower()
+            if value in ["yes", "true"]:
+                return 1.0
+            elif value in ["no", "false"]:
+                return 0.0
+            elif value.replace(".", "").isnumeric():
+                return float(value)
+
+        # couldn't extract a value
+        logger.warning(f"Unable to convert value to float: {value}")
+        return 0.0
 
     return to_float
 
diff --git a/tests/scorer/test_value_to_float.py b/tests/scorer/test_value_to_float.py
new file mode 100644
index 000000000..0644b2f28
--- /dev/null
+++ b/tests/scorer/test_value_to_float.py
@@ -0,0 +1,31 @@
+from inspect_ai.scorer import CORRECT, PARTIAL, value_to_float
+
+
+def test_value_to_float_numbers():
+    fn = value_to_float()
+    assert fn(1) == 1.0
+    assert fn(0.5) == 0.5
+    assert fn(True) == 1.0
+    assert fn(False) == 0
+
+
+def test_value_to_float_strings():
+    fn = value_to_float()
+    assert fn("1.0") == 1.0
+    assert fn("0.5") == 0.5
+    assert fn("0") == 0
+    assert fn("yes") == 1.0
+    assert fn("No") == 0.0
+    assert fn(CORRECT) == 1.0
+    assert fn(PARTIAL) == 0.5
+
+
+def test_value_to_float_custom():
+    fn = value_to_float(correct="correct", incorrect="incorrect")
+    assert fn("correct") == 1.0
+    assert fn("incorrect") == 0
+
+
+def test_value_to_float_invalid():
+    fn = value_to_float()
+    assert fn("foo") == 0.0