Skip to content

Commit

Permalink
feat: add Cohen's Kappa scores to 'accuracy' output
Browse files Browse the repository at this point in the history
In addition to the F1 score, we now also calculate Kappa.

- Also, upgrade black to 24.x
  • Loading branch information
mikix committed Apr 23, 2024
1 parent 827dda8 commit f32baf2
Show file tree
Hide file tree
Showing 8 changed files with 94 additions and 51 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:
# black is synced with the .pre-commit-hooks version
run: |
python -m pip install --upgrade pip
pip install bandit[toml] pycodestyle black==23.11.0
pip install .[dev]
- name: Run pycodestyle
# E203: pycodestyle is a little too rigid about slices & whitespace
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
repos:
- repo: https://github.com/psf/black
# this version is synced with the black version pinned in pyproject.toml's dev dependencies
rev: 23.11.0
rev: 24.4.0
hooks:
- id: black
entry: bash -c 'black "$@"; git add -u' --
Expand Down
58 changes: 43 additions & 15 deletions chart_review/agree.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,40 +86,68 @@ def append_matrix(first: dict, second: dict) -> dict:
return added


def score_kappa(matrix: dict) -> float:
    """
    Computes Cohen kappa for pair-wise annotators.
    https://en.wikipedia.org/wiki/Cohen%27s_kappa

    :param matrix: confusion matrix with TN/TP/FN/FP lists of matches
        (only the length of each list is used)
    :return: Cohen kappa statistic (0.0 for an empty matrix)
    """
    tp = len(matrix["TP"])  # true positive
    tn = len(matrix["TN"])  # true negative
    fp = len(matrix["FP"])  # false positive
    fn = len(matrix["FN"])  # false negative
    total = tp + tn + fp + fn

    if not total:
        # No annotations at all: agreement is unmeasurable, report none.
        return 0.0

    # observed agreement A (Po)
    observed = (tp + tn) / total

    # expected agreement E (Pe)
    expected_pos = ((tp + fp) / total) * ((tp + fn) / total)
    expected_neg = ((tn + fp) / total) * ((tn + fn) / total)
    expected = expected_pos + expected_neg

    if expected == 1:
        # Pe can only reach 1 when both annotators were unanimous on the same
        # label (all TP or all TN), which is perfect agreement -- avoid 0/0.
        return 1.0

    return (observed - expected) / (1 - expected)


def score_matrix(matrix: dict, sig_digits=3) -> dict:
    """
    Score F1 and Kappa measures with precision (PPV) and recall (sensitivity).

    F1 deliberately ignores "True Negatives" because TN inflates scoring (AUROC)

    :param matrix: confusion matrix with TP/TN/FP/FN lists of matches
    :param sig_digits: how many digits to round each score to (default 3)
    :return: dict with rounded score keys {"F1", "Sens", "Spec", "PPV", "NPV",
        "κ"} plus the raw counts {"TP", "FP", "FN", "TN"}
    """
    true_pos = len(matrix["TP"])
    true_neg = len(matrix["TN"])
    false_pos = len(matrix["FP"])
    false_neg = len(matrix["FN"])

    if 0 == true_pos or 0 == true_neg:
        # Degenerate case: without at least one positive and one negative
        # agreement, the ratios below could divide by zero -- report zeros.
        sens = 0
        spec = 0
        ppv = 0
        npv = 0
        f1 = 0
        kappa = 0
    else:
        sens = true_pos / (true_pos + false_neg)  # recall
        spec = true_neg / (true_neg + false_pos)
        ppv = true_pos / (true_pos + false_pos)  # precision
        npv = true_neg / (true_neg + false_neg)
        f1 = (2 * ppv * sens) / (ppv + sens)  # harmonic mean of PPV & recall
        kappa = score_kappa(matrix)

    return {
        "F1": round(f1, sig_digits),
        "Sens": round(sens, sig_digits),
        "Spec": round(spec, sig_digits),
        "PPV": round(ppv, sig_digits),
        "NPV": round(npv, sig_digits),
        "κ": round(kappa, sig_digits),
        "TP": true_pos,
        "FP": false_pos,
        "FN": false_neg,
        "TN": true_neg,
    }


Expand Down Expand Up @@ -172,7 +200,7 @@ def csv_header(pick_label=False, as_string=False):
:param pick_label: default= None
:return: header
"""
as_list = ["F1", "Sens", "Spec", "PPV", "NPV", "TP", "FN", "TN", "FP"]
as_list = ["F1", "Sens", "Spec", "PPV", "NPV", "κ", "TP", "FN", "TN", "FP"]

if not as_string:
return as_list
Expand Down
1 change: 1 addition & 0 deletions chart_review/common.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Utility methods"""

from enum import Enum, EnumMeta
from typing import Optional, Union
from collections.abc import Iterable
Expand Down
28 changes: 0 additions & 28 deletions chart_review/kappa.py

This file was deleted.

4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,9 @@ tests = [
"pytest",
]
dev = [
"black == 23.11.0",
"bandit[toml]",
"black >= 24, < 25",
"pycodestyle",
"pylint",
]

Expand Down
36 changes: 36 additions & 0 deletions tests/test_agree.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,39 @@ def test_confusion_matrix_counts(self, truth, annotator, labels, expected_matrix

matrix = agree.confusion_matrix(annotations, truth, annotator, notes, labels=labels)
self.assertEqual(expected_matrix, matrix)

    @ddt.data(
        # Examples pulled from https://en.wikipedia.org/wiki/Cohen's_kappa#Examples
        # Each matrix entry is a list of fake matches -- score_kappa() only
        # looks at the length of each list, so the dict contents are dummies.
        (
            {
                "FN": [{x: "Label"} for x in range(5)],
                "FP": [{x: "Label"} for x in range(10)],
                "TN": [{x: "Label"} for x in range(15)],
                "TP": [{x: "Label"} for x in range(20)],
            },
            0.4,
        ),
        (
            {
                "FN": [{x: "Label"} for x in range(15)],
                "FP": [{x: "Label"} for x in range(25)],
                "TN": [{x: "Label"} for x in range(15)],
                "TP": [{x: "Label"} for x in range(45)],
            },
            0.1304,
        ),
        (
            {
                "FN": [{x: "Label"} for x in range(35)],
                "FP": [{x: "Label"} for x in range(5)],
                "TN": [{x: "Label"} for x in range(35)],
                "TP": [{x: "Label"} for x in range(25)],
            },
            0.2593,
        ),
    )
    @ddt.unpack
    def test_kappa_score(self, matrix, expected_kappa):
        """Verify that we can score a matrix for kappa."""
        # The expected values above are given to 4 digits, so round to match.
        kappa = round(agree.score_kappa(matrix), 4)
        self.assertEqual(expected_kappa, kappa)
14 changes: 9 additions & 5 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def test_accuracy(self):
"Spec": 0.6,
"PPV": 0.6,
"NPV": 0.75,
"κ": 0.341,
"TP": 3,
"FN": 1,
"TN": 3,
Expand All @@ -44,6 +45,7 @@ def test_accuracy(self):
"Spec": 1.0,
"TN": 1,
"TP": 1,
"κ": 0.4,
},
"Fatigue": {
"F1": 1.0,
Expand All @@ -55,6 +57,7 @@ def test_accuracy(self):
"Spec": 1.0,
"TN": 1,
"TP": 2,
"κ": 1.0,
},
"Headache": {
"F1": 0,
Expand All @@ -66,18 +69,19 @@ def test_accuracy(self):
"Spec": 0,
"TN": 1,
"TP": 0,
"κ": 0,
},
},
accuracy_json,
)

accuracy_csv = common.read_text(f"{tmpdir}/accuracy-jill-jane.csv")
self.assertEqual(
"""F1 Sens Spec PPV NPV TP FN TN FP Label
0.667 0.75 0.6 0.6 0.75 3 1 3 2 *
0.667 0.5 1.0 1.0 0.5 1 1 1 0 Cough
1.0 1.0 1.0 1.0 1.0 2 0 1 0 Fatigue
0 0 0 0 0 0 0 1 2 Headache
"""F1 Sens Spec PPV NPV κ TP FN TN FP Label
0.667 0.75 0.6 0.6 0.75 0.341 3 1 3 2 *
0.667 0.5 1.0 1.0 0.5 0.4 1 1 1 0 Cough
1.0 1.0 1.0 1.0 1.0 1.0 2 0 1 0 Fatigue
0 0 0 0 0 0 0 0 1 2 Headache
""",
accuracy_csv,
)
Expand Down

0 comments on commit f32baf2

Please sign in to comment.