From f32baf2d6a89987d1cbb4bd52fb1cbe6a0a7b30b Mon Sep 17 00:00:00 2001
From: Michael Terry
Date: Tue, 23 Apr 2024 11:13:39 -0400
Subject: [PATCH] feat: add Cohen's Kappa scores to 'accuracy' output

In addition to the F1 score, we now also calculate Kappa.

- Also, upgrade black to 24.x
---
 .github/workflows/ci.yaml |  2 +-
 .pre-commit-config.yaml   |  2 +-
 chart_review/agree.py     | 58 +++++++++++++++++++++++++++++----------
 chart_review/common.py    |  1 +
 chart_review/kappa.py     | 28 -------------------
 pyproject.toml            |  4 ++-
 tests/test_agree.py       | 36 ++++++++++++++++++++++++
 tests/test_cli.py         | 14 ++++++----
 8 files changed, 94 insertions(+), 51 deletions(-)
 delete mode 100644 chart_review/kappa.py

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 0946508..3ad3cba 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -45,7 +45,7 @@ jobs:
         # black is synced with the .pre-commit-hooks version
         run: |
           python -m pip install --upgrade pip
-          pip install bandit[toml] pycodestyle black==23.11.0
+          pip install .[dev]
 
       - name: Run pycodestyle
         # E203: pycodestyle is a little too rigid about slices & whitespace
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index e04e8f0..d32addc 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,7 +1,7 @@
 repos:
   - repo: https://github.com/psf/black
     # this version is synced with the black mentioned in .github/workflows/ci.yml
-    rev: 23.11.0
+    rev: 24.4.0
     hooks:
       - id: black
         entry: bash -c 'black "$@"; git add -u' --
diff --git a/chart_review/agree.py b/chart_review/agree.py
index 590a032..c288b9b 100644
--- a/chart_review/agree.py
+++ b/chart_review/agree.py
@@ -86,29 +86,56 @@ def append_matrix(first: dict, second: dict) -> dict:
     return added
 
 
+def score_kappa(matrix: dict) -> float:
+    """
+    Computes Cohen's kappa for pairwise annotators.
+    https://en.wikipedia.org/wiki/Cohen%27s_kappa
+
+    :param matrix: confusion matrix with TN/TP/FN/FP values
+    :return: Cohen's kappa statistic
+    """
+    tp = len(matrix["TP"])  # true positive
+    tn = len(matrix["TN"])  # true negative
+    fp = len(matrix["FP"])  # false positive
+    fn = len(matrix["FN"])  # false negative
+    total = tp + tn + fp + fn
+
+    # observed agreement A (Po)
+    observed = (tp + tn) / total
+
+    # expected agreement E (Pe)
+    expected_pos = ((tp + fp) / total) * ((tp + fn) / total)
+    expected_neg = ((tn + fp) / total) * ((tn + fn) / total)
+    expected = expected_pos + expected_neg
+
+    return (observed - expected) / (1 - expected)
+
+
 def score_matrix(matrix: dict, sig_digits=3) -> dict:
     """
-    Score F1 measure with precision (PPV) and recall (sensitivity).
+    Score F1 and Kappa measures with precision (PPV) and recall (sensitivity).
     F1 deliberately ignores "True Negatives" because TN inflates scoring (AUROC)
 
     @return: dict with keys {'f1', 'precision', 'recall'} vals are %score
     """
-    true_pos = matrix["TP"]
-    true_neg = matrix["TN"]
-    false_pos = matrix["FP"]
-    false_neg = matrix["FN"]
+    true_pos = len(matrix["TP"])
+    true_neg = len(matrix["TN"])
+    false_pos = len(matrix["FP"])
+    false_neg = len(matrix["FN"])
 
-    if 0 == len(true_pos) or 0 == len(true_neg):
+    if 0 == true_pos or 0 == true_neg:
         sens = 0
         spec = 0
         ppv = 0
         npv = 0
         f1 = 0
+        kappa = 0
     else:
-        sens = len(true_pos) / (len(true_pos) + len(false_neg))
-        spec = len(true_neg) / (len(true_neg) + len(false_pos))
-        ppv = len(true_pos) / (len(true_pos) + len(false_pos))
-        npv = len(true_neg) / (len(true_neg) + len(false_neg))
+        sens = true_pos / (true_pos + false_neg)
+        spec = true_neg / (true_neg + false_pos)
+        ppv = true_pos / (true_pos + false_pos)
+        npv = true_neg / (true_neg + false_neg)
         f1 = (2 * ppv * sens) / (ppv + sens)
+        kappa = score_kappa(matrix)
 
     return {
         "F1": round(f1, sig_digits),
@@ -116,10 +143,11 @@ def score_matrix(matrix: dict, sig_digits=3) -> dict:
         "Spec": round(spec, sig_digits),
         "PPV": round(ppv, sig_digits),
         "NPV": round(npv, sig_digits),
-        "TP": len(true_pos),
-        "FP": len(false_pos),
-        "FN": len(false_neg),
-        "TN": len(true_neg),
+        "κ": round(kappa, sig_digits),
+        "TP": true_pos,
+        "FP": false_pos,
+        "FN": false_neg,
+        "TN": true_neg,
     }
 
 
@@ -172,7 +200,7 @@ def csv_header(pick_label=False, as_string=False):
     :param pick_label: default= None
     :return: header
     """
-    as_list = ["F1", "Sens", "Spec", "PPV", "NPV", "TP", "FN", "TN", "FP"]
+    as_list = ["F1", "Sens", "Spec", "PPV", "NPV", "κ", "TP", "FN", "TN", "FP"]
 
     if not as_string:
         return as_list
diff --git a/chart_review/common.py b/chart_review/common.py
index 17a3a30..7ed202c 100644
--- a/chart_review/common.py
+++ b/chart_review/common.py
@@ -1,4 +1,5 @@
 """Utility methods"""
+
 from enum import Enum, EnumMeta
 from typing import Optional, Union
 from collections.abc import Iterable
diff --git a/chart_review/kappa.py b/chart_review/kappa.py
deleted file mode 100644
index 4fd3073..0000000
--- a/chart_review/kappa.py
+++ /dev/null
@@ -1,28 +0,0 @@
-def score_kappa(truth: list, annotator: list):
-    """
-    Computes Cohen kappa for pair-wise annotators.
-    https://gist.github.com/LouisdeBruijn/1db0283dc69916516e2948f0eefc3a6e#file-cohen_kappa-py
-
-    TODO: refactor This method is NOT actively used, however remains here for comparison.
-        (Low priority)
-
-    :param truth: annotations provided by truth annotator
-    :param annotator: annotations provided by other annotator
-    :rtype: float
-    :return: Cohen kappa statistic
-    """
-    count = 0
-    for an1, an2 in zip(truth, annotator):
-        if an1 == an2:
-            count += 1
-    observed = count / len(truth)  # observed agreement A (Po)
-
-    uniq = set(truth + annotator)
-    expected = 0  # expected agreement E (Pe)
-    for item in uniq:
-        cnt1 = truth.count(item)
-        cnt2 = annotator.count(item)
-        count = (cnt1 / len(truth)) * (cnt2 / len(annotator))
-        expected += count
-
-    return round((observed - expected) / (1 - expected), 4)
diff --git a/pyproject.toml b/pyproject.toml
index f408720..ae1d7f8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,7 +41,9 @@ tests = [
     "pytest",
 ]
 dev = [
-    "black == 23.11.0",
+    "bandit[toml]",
+    "black >= 24, < 25",
+    "pycodestyle",
     "pylint",
 ]
 
diff --git a/tests/test_agree.py b/tests/test_agree.py
index 0028985..fb31e0b 100644
--- a/tests/test_agree.py
+++ b/tests/test_agree.py
@@ -60,3 +60,39 @@ def test_confusion_matrix_counts(self, truth, annotator, labels, expected_matrix
         matrix = agree.confusion_matrix(annotations, truth, annotator, notes, labels=labels)
 
         self.assertEqual(expected_matrix, matrix)
+
+    @ddt.data(
+        # Examples pulled from https://en.wikipedia.org/wiki/Cohen's_kappa#Examples
+        (
+            {
+                "FN": [{x: "Label"} for x in range(5)],
+                "FP": [{x: "Label"} for x in range(10)],
+                "TN": [{x: "Label"} for x in range(15)],
+                "TP": [{x: "Label"} for x in range(20)],
+            },
+            0.4,
+        ),
+        (
+            {
+                "FN": [{x: "Label"} for x in range(15)],
+                "FP": [{x: "Label"} for x in range(25)],
+                "TN": [{x: "Label"} for x in range(15)],
+                "TP": [{x: "Label"} for x in range(45)],
+            },
+            0.1304,
+        ),
+        (
+            {
+                "FN": [{x: "Label"} for x in range(35)],
+                "FP": [{x: "Label"} for x in range(5)],
+                "TN": [{x: "Label"} for x in range(35)],
+                "TP": [{x: "Label"} for x in range(25)],
+            },
+            0.2593,
+        ),
+    )
+    @ddt.unpack
+    def test_kappa_score(self, matrix, expected_kappa):
+        """Verify that we can score a matrix for kappa."""
+        kappa = round(agree.score_kappa(matrix), 4)
+        self.assertEqual(expected_kappa, kappa)
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 24baa4a..a272338 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -30,6 +30,7 @@ def test_accuracy(self):
                     "Spec": 0.6,
                     "PPV": 0.6,
                     "NPV": 0.75,
+                    "κ": 0.341,
                     "TP": 3,
                     "FN": 1,
                     "TN": 3,
@@ -44,6 +45,7 @@ def test_accuracy(self):
                     "Spec": 1.0,
                     "TN": 1,
                     "TP": 1,
+                    "κ": 0.4,
                 },
                 "Fatigue": {
                     "F1": 1.0,
@@ -55,6 +57,7 @@ def test_accuracy(self):
                     "Spec": 1.0,
                     "TN": 1,
                     "TP": 2,
+                    "κ": 1.0,
                 },
                 "Headache": {
                     "F1": 0,
@@ -66,6 +69,7 @@ def test_accuracy(self):
                     "Spec": 0,
                     "TN": 1,
                     "TP": 0,
+                    "κ": 0,
                 },
             },
             accuracy_json,
@@ -73,11 +77,11 @@ def test_accuracy(self):
 
         accuracy_csv = common.read_text(f"{tmpdir}/accuracy-jill-jane.csv")
         self.assertEqual(
-            """F1 Sens Spec PPV NPV TP FN TN FP Label
-0.667 0.75 0.6 0.6 0.75 3 1 3 2 *
-0.667 0.5 1.0 1.0 0.5 1 1 1 0 Cough
-1.0 1.0 1.0 1.0 1.0 2 0 1 0 Fatigue
-0 0 0 0 0 0 0 1 2 Headache
+            """F1 Sens Spec PPV NPV κ TP FN TN FP Label
+0.667 0.75 0.6 0.6 0.75 0.341 3 1 3 2 *
+0.667 0.5 1.0 1.0 0.5 0.4 1 1 1 0 Cough
+1.0 1.0 1.0 1.0 1.0 1.0 2 0 1 0 Fatigue
+0 0 0 0 0 0 0 0 1 2 Headache
 """,
             accuracy_csv,
         )
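
A quick way to sanity-check the new score_kappa helper is to feed it the first Wikipedia example from tests/test_agree.py above. The snippet below is a minimal sketch that assumes the chart_review package is importable (e.g. after `pip install .`); the placeholder dict entries only matter by count, since score_kappa just takes the len() of each bucket.

    from chart_review import agree

    # Wikipedia's first example: 20 TP, 5 FN, 10 FP, 15 TN.
    # Observed agreement: Po = (20 + 15) / 50 = 0.7
    # Expected agreement: Pe = (30/50)(25/50) + (25/50)(20/50) = 0.5
    # Kappa: (0.7 - 0.5) / (1 - 0.5) = 0.4
    matrix = {
        "TP": [{x: "Label"} for x in range(20)],
        "FN": [{x: "Label"} for x in range(5)],
        "FP": [{x: "Label"} for x in range(10)],
        "TN": [{x: "Label"} for x in range(15)],
    }
    print(round(agree.score_kappa(matrix), 4))  # 0.4

Note that score_matrix still zeroes out κ (along with F1 and the other scores) whenever the TP or TN bucket is empty, so that branch never reaches the division in score_kappa.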