From f32baf2d6a89987d1cbb4bd52fb1cbe6a0a7b30b Mon Sep 17 00:00:00 2001
From: Michael Terry
Date: Tue, 23 Apr 2024 11:13:39 -0400
Subject: [PATCH] feat: add Cohen's Kappa scores to 'accuracy' output

In addition to the F1 score, we now also calculate Kappa.

- Also, upgrade black to 24.x
---
 .github/workflows/ci.yaml |  2 +-
 .pre-commit-config.yaml   |  2 +-
 chart_review/agree.py     | 58 +++++++++++++++++++++++++++++----------
 chart_review/common.py    |  1 +
 chart_review/kappa.py     | 28 -------------------
 pyproject.toml            |  4 ++-
 tests/test_agree.py       | 36 ++++++++++++++++++++++++
 tests/test_cli.py         | 14 ++++++----
 8 files changed, 94 insertions(+), 51 deletions(-)
 delete mode 100644 chart_review/kappa.py

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 0946508..3ad3cba 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -45,7 +45,7 @@ jobs:
         # black is synced with the .pre-commit-hooks version
         run: |
           python -m pip install --upgrade pip
-          pip install bandit[toml] pycodestyle black==23.11.0
+          pip install .[dev]
 
       - name: Run pycodestyle
         # E203: pycodestyle is a little too rigid about slices & whitespace
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index e04e8f0..d32addc 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,7 +1,7 @@
 repos:
   - repo: https://github.com/psf/black
     # this version is synced with the black mentioned in .github/workflows/ci.yml
-    rev: 23.11.0
+    rev: 24.4.0
     hooks:
       - id: black
         entry: bash -c 'black "$@"; git add -u' --
diff --git a/chart_review/agree.py b/chart_review/agree.py
index 590a032..c288b9b 100644
--- a/chart_review/agree.py
+++ b/chart_review/agree.py
@@ -86,29 +86,56 @@ def append_matrix(first: dict, second: dict) -> dict:
     return added
 
 
+def score_kappa(matrix: dict) -> float:
+    """
+    Computes Cohen's kappa for pairwise annotators.
+    https://en.wikipedia.org/wiki/Cohen%27s_kappa
+
+    :param matrix: confusion matrix with TN/TP/FN/FP values
+    :return: Cohen's kappa statistic
+    """
+    tp = len(matrix["TP"])  # true positive
+    tn = len(matrix["TN"])  # true negative
+    fp = len(matrix["FP"])  # false positive
+    fn = len(matrix["FN"])  # false negative
+    total = tp + tn + fp + fn
+
+    # observed agreement A (Po)
+    observed = (tp + tn) / total
+
+    # expected agreement E (Pe)
+    expected_pos = ((tp + fp) / total) * ((tp + fn) / total)
+    expected_neg = ((tn + fp) / total) * ((tn + fn) / total)
+    expected = expected_pos + expected_neg
+
+    return (observed - expected) / (1 - expected)
+
+
 def score_matrix(matrix: dict, sig_digits=3) -> dict:
     """
-    Score F1 measure with precision (PPV) and recall (sensitivity).
+    Score F1 and Kappa measures with precision (PPV) and recall (sensitivity).
     F1 deliberately ignores "True Negatives" because TN inflates scoring (AUROC)
 
     @return: dict with keys {'f1', 'precision', 'recall'} vals are %score
     """
-    true_pos = matrix["TP"]
-    true_neg = matrix["TN"]
-    false_pos = matrix["FP"]
-    false_neg = matrix["FN"]
+    true_pos = len(matrix["TP"])
+    true_neg = len(matrix["TN"])
+    false_pos = len(matrix["FP"])
+    false_neg = len(matrix["FN"])
 
-    if 0 == len(true_pos) or 0 == len(true_neg):
+    if 0 == true_pos or 0 == true_neg:
         sens = 0
         spec = 0
         ppv = 0
         npv = 0
         f1 = 0
+        kappa = 0
     else:
-        sens = len(true_pos) / (len(true_pos) + len(false_neg))
-        spec = len(true_neg) / (len(true_neg) + len(false_pos))
-        ppv = len(true_pos) / (len(true_pos) + len(false_pos))
-        npv = len(true_neg) / (len(true_neg) + len(false_neg))
+        sens = true_pos / (true_pos + false_neg)
+        spec = true_neg / (true_neg + false_pos)
+        ppv = true_pos / (true_pos + false_pos)
+        npv = true_neg / (true_neg + false_neg)
         f1 = (2 * ppv * sens) / (ppv + sens)
+        kappa = score_kappa(matrix)
 
     return {
         "F1": round(f1, sig_digits),
@@ -116,10 +143,11 @@ def score_matrix(matrix: dict, sig_digits=3) -> dict:
         "Spec": round(spec, sig_digits),
         "PPV": round(ppv, sig_digits),
         "NPV": round(npv, sig_digits),
-        "TP": len(true_pos),
-        "FP": len(false_pos),
-        "FN": len(false_neg),
-        "TN": len(true_neg),
+        "κ": round(kappa, sig_digits),
+        "TP": true_pos,
+        "FP": false_pos,
+        "FN": false_neg,
+        "TN": true_neg,
     }
 
 
@@ -172,7 +200,7 @@ def csv_header(pick_label=False, as_string=False):
     :param pick_label: default= None
     :return: header
     """
-    as_list = ["F1", "Sens", "Spec", "PPV", "NPV", "TP", "FN", "TN", "FP"]
+    as_list = ["F1", "Sens", "Spec", "PPV", "NPV", "κ", "TP", "FN", "TN", "FP"]
 
     if not as_string:
         return as_list
diff --git a/chart_review/common.py b/chart_review/common.py
index 17a3a30..7ed202c 100644
--- a/chart_review/common.py
+++ b/chart_review/common.py
@@ -1,4 +1,5 @@
 """Utility methods"""
+
 from enum import Enum, EnumMeta
 from typing import Optional, Union
 from collections.abc import Iterable
diff --git a/chart_review/kappa.py b/chart_review/kappa.py
deleted file mode 100644
index 4fd3073..0000000
--- a/chart_review/kappa.py
+++ /dev/null
@@ -1,28 +0,0 @@
-def score_kappa(truth: list, annotator: list):
-    """
-    Computes Cohen kappa for pair-wise annotators.
-    https://gist.github.com/LouisdeBruijn/1db0283dc69916516e2948f0eefc3a6e#file-cohen_kappa-py
-
-    TODO: refactor This method is NOT actively used, however remains here for comparison.
-        (Low priority)
-
-    :param truth: annotations provided by truth annotator
-    :param annotator: annotations provided by other annotator
-    :rtype: float
-    :return: Cohen kappa statistic
-    """
-    count = 0
-    for an1, an2 in zip(truth, annotator):
-        if an1 == an2:
-            count += 1
-    observed = count / len(truth)  # observed agreement A (Po)
-
-    uniq = set(truth + annotator)
-    expected = 0  # expected agreement E (Pe)
-    for item in uniq:
-        cnt1 = truth.count(item)
-        cnt2 = annotator.count(item)
-        count = (cnt1 / len(truth)) * (cnt2 / len(annotator))
-        expected += count
-
-    return round((observed - expected) / (1 - expected), 4)
diff --git a/pyproject.toml b/pyproject.toml
index f408720..ae1d7f8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,7 +41,9 @@ tests = [
     "pytest",
 ]
 dev = [
-    "black == 23.11.0",
+    "bandit[toml]",
+    "black >= 24, < 25",
+    "pycodestyle",
     "pylint",
 ]
 
diff --git a/tests/test_agree.py b/tests/test_agree.py
index 0028985..fb31e0b 100644
--- a/tests/test_agree.py
+++ b/tests/test_agree.py
@@ -60,3 +60,39 @@ def test_confusion_matrix_counts(self, truth, annotator, labels, expected_matrix
         matrix = agree.confusion_matrix(annotations, truth, annotator, notes, labels=labels)
 
         self.assertEqual(expected_matrix, matrix)
+
+    @ddt.data(
+        # Examples pulled from https://en.wikipedia.org/wiki/Cohen's_kappa#Examples
+        (
+            {
+                "FN": [{x: "Label"} for x in range(5)],
+                "FP": [{x: "Label"} for x in range(10)],
+                "TN": [{x: "Label"} for x in range(15)],
+                "TP": [{x: "Label"} for x in range(20)],
+            },
+            0.4,
+        ),
+        (
+            {
+                "FN": [{x: "Label"} for x in range(15)],
+                "FP": [{x: "Label"} for x in range(25)],
+                "TN": [{x: "Label"} for x in range(15)],
+                "TP": [{x: "Label"} for x in range(45)],
+            },
+            0.1304,
+        ),
+        (
+            {
+                "FN": [{x: "Label"} for x in range(35)],
+                "FP": [{x: "Label"} for x in range(5)],
+                "TN": [{x: "Label"} for x in range(35)],
+                "TP": [{x: "Label"} for x in range(25)],
+            },
+            0.2593,
+        ),
+    )
+    @ddt.unpack
+    def test_kappa_score(self, matrix, expected_kappa):
+        """Verify that we can score a matrix for kappa."""
+        kappa = round(agree.score_kappa(matrix), 4)
+        self.assertEqual(expected_kappa, kappa)
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 24baa4a..a272338 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -30,6 +30,7 @@ def test_accuracy(self):
                     "Spec": 0.6,
                     "PPV": 0.6,
                     "NPV": 0.75,
+                    "κ": 0.341,
                     "TP": 3,
                     "FN": 1,
                     "TN": 3,
@@ -44,6 +45,7 @@ def test_accuracy(self):
                     "Spec": 1.0,
                     "TN": 1,
                     "TP": 1,
+                    "κ": 0.4,
                 },
                 "Fatigue": {
                     "F1": 1.0,
@@ -55,6 +57,7 @@ def test_accuracy(self):
                     "Spec": 1.0,
                     "TN": 1,
                     "TP": 2,
+                    "κ": 1.0,
                 },
                 "Headache": {
                     "F1": 0,
@@ -66,6 +69,7 @@ def test_accuracy(self):
                     "Spec": 0,
                     "TN": 1,
                     "TP": 0,
+                    "κ": 0,
                 },
             },
             accuracy_json,
@@ -73,11 +77,11 @@ def test_accuracy(self):
 
         accuracy_csv = common.read_text(f"{tmpdir}/accuracy-jill-jane.csv")
         self.assertEqual(
-            """F1 Sens Spec PPV NPV TP FN TN FP Label
-0.667 0.75 0.6 0.6 0.75 3 1 3 2 *
-0.667 0.5 1.0 1.0 0.5 1 1 1 0 Cough
-1.0 1.0 1.0 1.0 1.0 2 0 1 0 Fatigue
-0 0 0 0 0 0 0 1 2 Headache
+            """F1 Sens Spec PPV NPV κ TP FN TN FP Label
+0.667 0.75 0.6 0.6 0.75 0.341 3 1 3 2 *
+0.667 0.5 1.0 1.0 0.5 0.4 1 1 1 0 Cough
+1.0 1.0 1.0 1.0 1.0 1.0 2 0 1 0 Fatigue
+0 0 0 0 0 0 0 0 1 2 Headache
 """,
             accuracy_csv,
         )
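
A quick way to sanity-check the new score_kappa helper is to feed it the first Wikipedia example from tests/test_agree.py above. The snippet below is a minimal sketch that assumes the chart_review package is importable (e.g. after `pip install .`); the placeholder dict entries only matter by count, since score_kappa just takes the len() of each bucket.

    from chart_review import agree

    # Wikipedia's first example: 20 TP, 5 FN, 10 FP, 15 TN.
    # Observed agreement: Po = (20 + 15) / 50 = 0.7
    # Expected agreement: Pe = (30/50)(25/50) + (25/50)(20/50) = 0.5
    # Kappa: (0.7 - 0.5) / (1 - 0.5) = 0.4
    matrix = {
        "TP": [{x: "Label"} for x in range(20)],
        "FN": [{x: "Label"} for x in range(5)],
        "FP": [{x: "Label"} for x in range(10)],
        "TN": [{x: "Label"} for x in range(15)],
    }
    print(round(agree.score_kappa(matrix), 4))  # 0.4

Note that score_matrix still zeroes out κ (along with F1 and the other scores) whenever the TP or TN bucket is empty, so that branch never reaches the division in score_kappa.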