Skip to content

Commit

Permalink
feat: add Cohen's Kappa scores to 'accuracy' output
Browse files Browse the repository at this point in the history
In addition to the F1 score, we now also calculate Kappa.

- Also, upgrade black to 24.x
  • Loading branch information
mikix committed Apr 23, 2024
1 parent 827dda8 commit f32baf2
Show file tree
Hide file tree
Showing 8 changed files with 94 additions and 51 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:
# black is synced with the .pre-commit-hooks version
run: |
python -m pip install --upgrade pip
pip install bandit[toml] pycodestyle black==23.11.0
pip install .[dev]
- name: Run pycodestyle
# E203: pycodestyle is a little too rigid about slices & whitespace
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
repos:
- repo: https://github.com/psf/black
# this version is synced with the black version pinned in pyproject.toml's dev dependencies
rev: 23.11.0
rev: 24.4.0
hooks:
- id: black
entry: bash -c 'black "$@"; git add -u' --
Expand Down
58 changes: 43 additions & 15 deletions chart_review/agree.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,40 +86,68 @@ def append_matrix(first: dict, second: dict) -> dict:
return added


def score_kappa(matrix: dict) -> float:
    """
    Computes Cohen kappa for pair-wise annotators.
    https://en.wikipedia.org/wiki/Cohen%27s_kappa

    :param matrix: confusion matrix with TN/TP/FN/FP lists of matches
        (only the length of each list is used)
    :return: Cohen kappa statistic (0.0 for an empty matrix)
    """
    tp = len(matrix["TP"])  # true positive
    tn = len(matrix["TN"])  # true negative
    fp = len(matrix["FP"])  # false positive
    fn = len(matrix["FN"])  # false negative
    total = tp + tn + fp + fn

    if not total:
        # No annotations at all: agreement is unmeasurable, report none.
        return 0.0

    # observed agreement A (Po)
    observed = (tp + tn) / total

    # expected agreement E (Pe)
    expected_pos = ((tp + fp) / total) * ((tp + fn) / total)
    expected_neg = ((tn + fp) / total) * ((tn + fn) / total)
    expected = expected_pos + expected_neg

    if expected == 1:
        # Pe can only reach 1 when both annotators were unanimous on the same
        # label (all TP or all TN), which is perfect agreement -- avoid 0/0.
        return 1.0

    return (observed - expected) / (1 - expected)


def score_matrix(matrix: dict, sig_digits=3) -> dict:
    """
    Score F1 and Kappa measures with precision (PPV) and recall (sensitivity).

    F1 deliberately ignores "True Negatives" because TN inflates scoring (AUROC)

    :param matrix: confusion matrix with TP/TN/FP/FN lists of matches
    :param sig_digits: how many digits to round each score to (default 3)
    :return: dict with rounded score keys {"F1", "Sens", "Spec", "PPV", "NPV",
        "κ"} plus the raw counts {"TP", "FP", "FN", "TN"}
    """
    true_pos = len(matrix["TP"])
    true_neg = len(matrix["TN"])
    false_pos = len(matrix["FP"])
    false_neg = len(matrix["FN"])

    if 0 == true_pos or 0 == true_neg:
        # Degenerate case: without at least one positive and one negative
        # agreement, the ratios below could divide by zero -- report zeros.
        sens = 0
        spec = 0
        ppv = 0
        npv = 0
        f1 = 0
        kappa = 0
    else:
        sens = true_pos / (true_pos + false_neg)  # recall
        spec = true_neg / (true_neg + false_pos)
        ppv = true_pos / (true_pos + false_pos)  # precision
        npv = true_neg / (true_neg + false_neg)
        f1 = (2 * ppv * sens) / (ppv + sens)  # harmonic mean of PPV & recall
        kappa = score_kappa(matrix)

    return {
        "F1": round(f1, sig_digits),
        "Sens": round(sens, sig_digits),
        "Spec": round(spec, sig_digits),
        "PPV": round(ppv, sig_digits),
        "NPV": round(npv, sig_digits),
        "κ": round(kappa, sig_digits),
        "TP": true_pos,
        "FP": false_pos,
        "FN": false_neg,
        "TN": true_neg,
    }


Expand Down Expand Up @@ -172,7 +200,7 @@ def csv_header(pick_label=False, as_string=False):
:param pick_label: default= None
:return: header
"""
as_list = ["F1", "Sens", "Spec", "PPV", "NPV", "TP", "FN", "TN", "FP"]
as_list = ["F1", "Sens", "Spec", "PPV", "NPV", "κ", "TP", "FN", "TN", "FP"]

if not as_string:
return as_list
Expand Down
1 change: 1 addition & 0 deletions chart_review/common.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Utility methods"""

from enum import Enum, EnumMeta
from typing import Optional, Union
from collections.abc import Iterable
Expand Down
28 changes: 0 additions & 28 deletions chart_review/kappa.py

This file was deleted.

4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,9 @@ tests = [
"pytest",
]
dev = [
"black == 23.11.0",
"bandit[toml]",
"black >= 24, < 25",
"pycodestyle",
"pylint",
]

Expand Down
36 changes: 36 additions & 0 deletions tests/test_agree.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,39 @@ def test_confusion_matrix_counts(self, truth, annotator, labels, expected_matrix

matrix = agree.confusion_matrix(annotations, truth, annotator, notes, labels=labels)
self.assertEqual(expected_matrix, matrix)

    @ddt.data(
        # Examples pulled from https://en.wikipedia.org/wiki/Cohen's_kappa#Examples
        # Each matrix entry is a list of fake matches -- score_kappa() only
        # looks at the length of each list, so the dict contents are dummies.
        (
            {
                "FN": [{x: "Label"} for x in range(5)],
                "FP": [{x: "Label"} for x in range(10)],
                "TN": [{x: "Label"} for x in range(15)],
                "TP": [{x: "Label"} for x in range(20)],
            },
            0.4,
        ),
        (
            {
                "FN": [{x: "Label"} for x in range(15)],
                "FP": [{x: "Label"} for x in range(25)],
                "TN": [{x: "Label"} for x in range(15)],
                "TP": [{x: "Label"} for x in range(45)],
            },
            0.1304,
        ),
        (
            {
                "FN": [{x: "Label"} for x in range(35)],
                "FP": [{x: "Label"} for x in range(5)],
                "TN": [{x: "Label"} for x in range(35)],
                "TP": [{x: "Label"} for x in range(25)],
            },
            0.2593,
        ),
    )
    @ddt.unpack
    def test_kappa_score(self, matrix, expected_kappa):
        """Verify that we can score a matrix for kappa."""
        # The expected values above are given to 4 digits, so round to match.
        kappa = round(agree.score_kappa(matrix), 4)
        self.assertEqual(expected_kappa, kappa)
14 changes: 9 additions & 5 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def test_accuracy(self):
"Spec": 0.6,
"PPV": 0.6,
"NPV": 0.75,
"κ": 0.341,
"TP": 3,
"FN": 1,
"TN": 3,
Expand All @@ -44,6 +45,7 @@ def test_accuracy(self):
"Spec": 1.0,
"TN": 1,
"TP": 1,
"κ": 0.4,
},
"Fatigue": {
"F1": 1.0,
Expand All @@ -55,6 +57,7 @@ def test_accuracy(self):
"Spec": 1.0,
"TN": 1,
"TP": 2,
"κ": 1.0,
},
"Headache": {
"F1": 0,
Expand All @@ -66,18 +69,19 @@ def test_accuracy(self):
"Spec": 0,
"TN": 1,
"TP": 0,
"κ": 0,
},
},
accuracy_json,
)

accuracy_csv = common.read_text(f"{tmpdir}/accuracy-jill-jane.csv")
self.assertEqual(
"""F1 Sens Spec PPV NPV TP FN TN FP Label
0.667 0.75 0.6 0.6 0.75 3 1 3 2 *
0.667 0.5 1.0 1.0 0.5 1 1 1 0 Cough
1.0 1.0 1.0 1.0 1.0 2 0 1 0 Fatigue
0 0 0 0 0 0 0 1 2 Headache
"""F1 Sens Spec PPV NPV κ TP FN TN FP Label
0.667 0.75 0.6 0.6 0.75 0.341 3 1 3 2 *
0.667 0.5 1.0 1.0 0.5 0.4 1 1 1 0 Cough
1.0 1.0 1.0 1.0 1.0 1.0 2 0 1 0 Fatigue
0 0 0 0 0 0 0 0 1 2 Headache
""",
accuracy_csv,
)
Expand Down

0 comments on commit f32baf2

Please sign in to comment.