Merge pull request #28 from smart-on-fhir/mikix/kappa
Kappa

Validated against a widely cited example of the kappa score in https://pubmed.ncbi.nlm.nih.gov/12474424. The code is also well organized and cleanly refactored.
comorbidity authored Apr 29, 2024
2 parents 827dda8 + 9528fb2 commit 492540b
Showing 9 changed files with 111 additions and 59 deletions.
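
For context, Cohen's kappa compares the observed agreement between two annotators, $p_o$, with the agreement expected by chance, $p_e$ (see the Wikipedia page cited in `agree.py` below):

$$\kappa = \frac{p_o - p_e}{1 - p_e}, \qquad p_e = p_{\mathrm{yes}}^{(1)} \, p_{\mathrm{yes}}^{(2)} + p_{\mathrm{no}}^{(1)} \, p_{\mathrm{no}}^{(2)}$$

where each $p$ term on the right is one annotator's marginal rate for a class.
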
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
@@ -45,7 +45,7 @@ jobs:
      # black is synced with the .pre-commit-hooks version
      run: |
        python -m pip install --upgrade pip
-        pip install bandit[toml] pycodestyle black==23.11.0
+        pip install .[dev]
    - name: Run pycodestyle
      # E203: pycodestyle is a little too rigid about slices & whitespace
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -1,7 +1,7 @@
repos:
  - repo: https://github.com/psf/black
    # this version is synced with the black mentioned in .github/workflows/ci.yml
-    rev: 23.11.0
+    rev: 24.4.0
    hooks:
      - id: black
        entry: bash -c 'black "$@"; git add -u' --
58 changes: 43 additions & 15 deletions chart_review/agree.py
@@ -86,40 +86,68 @@ def append_matrix(first: dict, second: dict) -> dict:
    return added


+def score_kappa(matrix: dict) -> float:
+    """
+    Computes Cohen kappa for pair-wise annotators.
+    https://en.wikipedia.org/wiki/Cohen%27s_kappa
+    :param matrix: confusion matrix with TN/TP/FN/FP values
+    :return: Cohen kappa statistic
+    """
+    tp = len(matrix["TP"])  # true positive
+    tn = len(matrix["TN"])  # true negative
+    fp = len(matrix["FP"])  # false positive
+    fn = len(matrix["FN"])  # false negative
+    total = tp + tn + fp + fn
+
+    # observed agreement A (Po)
+    observed = (tp + tn) / total
+
+    # expected agreement E (Pe)
+    expected_pos = ((tp + fp) / total) * ((tp + fn) / total)
+    expected_neg = ((tn + fp) / total) * ((tn + fn) / total)
+    expected = expected_pos + expected_neg
+
+    return (observed - expected) / (1 - expected)


def score_matrix(matrix: dict, sig_digits=3) -> dict:
    """
-    Score F1 measure with precision (PPV) and recall (sensitivity).
+    Score F1 and Kappa measures with precision (PPV) and recall (sensitivity).
    F1 deliberately ignores "True Negatives" because TN inflates scoring (AUROC)
    @return: dict with keys {'f1', 'precision', 'recall'} vals are %score
    """
-    true_pos = matrix["TP"]
-    true_neg = matrix["TN"]
-    false_pos = matrix["FP"]
-    false_neg = matrix["FN"]
+    true_pos = len(matrix["TP"])
+    true_neg = len(matrix["TN"])
+    false_pos = len(matrix["FP"])
+    false_neg = len(matrix["FN"])

-    if 0 == len(true_pos) or 0 == len(true_neg):
+    if 0 == true_pos or 0 == true_neg:
        sens = 0
        spec = 0
        ppv = 0
        npv = 0
        f1 = 0
+        kappa = 0
    else:
-        sens = len(true_pos) / (len(true_pos) + len(false_neg))
-        spec = len(true_neg) / (len(true_neg) + len(false_pos))
-        ppv = len(true_pos) / (len(true_pos) + len(false_pos))
-        npv = len(true_neg) / (len(true_neg) + len(false_neg))
+        sens = true_pos / (true_pos + false_neg)
+        spec = true_neg / (true_neg + false_pos)
+        ppv = true_pos / (true_pos + false_pos)
+        npv = true_neg / (true_neg + false_neg)
        f1 = (2 * ppv * sens) / (ppv + sens)
+        kappa = score_kappa(matrix)

    return {
        "F1": round(f1, sig_digits),
        "Sens": round(sens, sig_digits),
        "Spec": round(spec, sig_digits),
        "PPV": round(ppv, sig_digits),
        "NPV": round(npv, sig_digits),
-        "TP": len(true_pos),
-        "FP": len(false_pos),
-        "FN": len(false_neg),
-        "TN": len(true_neg),
+        "κ": round(kappa, sig_digits),
+        "TP": true_pos,
+        "FP": false_pos,
+        "FN": false_neg,
+        "TN": true_neg,
    }


@@ -172,7 +200,7 @@ def csv_header(pick_label=False, as_string=False):
    :param pick_label: default= None
    :return: header
    """
-    as_list = ["F1", "Sens", "Spec", "PPV", "NPV", "TP", "FN", "TN", "FP"]
+    as_list = ["F1", "Sens", "Spec", "PPV", "NPV", "κ", "TP", "FN", "TN", "FP"]

    if not as_string:
        return as_list
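
As a quick sanity check of `score_kappa` above, here is a minimal sketch (not part of the commit) that reproduces the first Wikipedia example used in the tests below. The dict entries are hypothetical placeholders, since the function only takes the `len()` of each list:

```python
from chart_review import agree

# Hypothetical confusion-matrix entries; only the list lengths matter here.
matrix = {
    "TP": [{n: "Label"} for n in range(20)],
    "TN": [{n: "Label"} for n in range(15)],
    "FP": [{n: "Label"} for n in range(10)],
    "FN": [{n: "Label"} for n in range(5)],
}

# observed: (20 + 15) / 50 = 0.7
# expected: (30/50 * 25/50) + (25/50 * 20/50) = 0.3 + 0.2 = 0.5
# kappa:    (0.7 - 0.5) / (1 - 0.5) = 0.4
print(agree.score_kappa(matrix))  # ≈ 0.4, matching the first test case below
```
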
1 change: 1 addition & 0 deletions chart_review/common.py
@@ -1,4 +1,5 @@
"""Utility methods"""

from enum import Enum, EnumMeta
from typing import Optional, Union
from collections.abc import Iterable
28 changes: 0 additions & 28 deletions chart_review/kappa.py

This file was deleted.

15 changes: 7 additions & 8 deletions docs/accuracy.md
@@ -17,14 +17,13 @@ your accuracy scores will be printed to the console.
## Example

```shell
-$ chart-review accuracy jane john
-accuracy-jane-john:
-F1     Sens   Spec   PPV    NPV    TP  FN  TN  FP  Label
-0.929  0.958  0.908  0.901  0.961  91  4   99  10  *
-0.895  0.895  0.938  0.895  0.938  17  2   30  2   cough
-0.815  0.917  0.897  0.733  0.972  11  1   35  4   fever
-0.959  1.0    0.812  0.921  1.0    35  0   13  3   headache
-0.966  0.966  0.955  0.966  0.955  28  1   21  1   stuffy-nose
+$ chart-review accuracy jill jane
+accuracy-jill-jane:
+F1     Sens  Spec  PPV  NPV   κ      TP  FN  TN  FP  Label
+0.667  0.75  0.6   0.6  0.75  0.341  3   1   3   2   *
+0.667  0.5   1.0   1.0  0.5   0.4    1   1   1   0   Cough
+1.0    1.0   1.0   1.0  1.0   1.0    2   0   1   0   Fatigue
+0      0     0     0    0     0      0   0   1   2   Headache
```

## Options
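
For reference, the top (`*`) row above can be reproduced with `score_matrix` from the diff earlier in this commit. This is a hedged sketch with hypothetical placeholder entries, not part of the commit:

```python
from chart_review import agree

# Hypothetical entries reproducing the "*" row: TP=3, FN=1, TN=3, FP=2.
matrix = {
    "TP": [{n: "Label"} for n in range(3)],
    "TN": [{n: "Label"} for n in range(3)],
    "FP": [{n: "Label"} for n in range(2)],
    "FN": [{n: "Label"} for n in range(1)],
}

scores = agree.score_matrix(matrix)
# Sens = 3/4 = 0.75, Spec = 3/5 = 0.6, PPV = 3/5 = 0.6, NPV = 3/4 = 0.75
# F1   = 2 * 0.6 * 0.75 / (0.6 + 0.75) ≈ 0.667
# κ    = (6/9 - 40/81) / (1 - 40/81) ≈ 0.341
print(scores["F1"], scores["κ"])  # 0.667 0.341
```
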
4 changes: 3 additions & 1 deletion pyproject.toml
@@ -41,7 +41,9 @@ tests = [
    "pytest",
]
dev = [
-    "black == 23.11.0",
+    "bandit[toml]",
+    "black >= 24, < 25",
+    "pycodestyle",
    "pylint",
]

46 changes: 46 additions & 0 deletions tests/test_agree.py
@@ -60,3 +60,49 @@ def test_confusion_matrix_counts(self, truth, annotator, labels, expected_matrix

        matrix = agree.confusion_matrix(annotations, truth, annotator, notes, labels=labels)
        self.assertEqual(expected_matrix, matrix)
+
+    @ddt.data(
+        # Examples pulled from https://en.wikipedia.org/wiki/Cohen's_kappa#Examples
+        (
+            {
+                "FN": [{x: "Label"} for x in range(5)],
+                "FP": [{x: "Label"} for x in range(10)],
+                "TN": [{x: "Label"} for x in range(15)],
+                "TP": [{x: "Label"} for x in range(20)],
+            },
+            0.4,
+        ),
+        (
+            {
+                "FN": [{x: "Label"} for x in range(15)],
+                "FP": [{x: "Label"} for x in range(25)],
+                "TN": [{x: "Label"} for x in range(15)],
+                "TP": [{x: "Label"} for x in range(45)],
+            },
+            0.1304,
+        ),
+        (
+            {
+                "FN": [{x: "Label"} for x in range(35)],
+                "FP": [{x: "Label"} for x in range(5)],
+                "TN": [{x: "Label"} for x in range(35)],
+                "TP": [{x: "Label"} for x in range(25)],
+            },
+            0.2593,
+        ),
+        # This example is from table 2 in https://pubmed.ncbi.nlm.nih.gov/12474424/
+        (
+            {
+                "FN": [{x: "Label"} for x in range(6)],
+                "FP": [{x: "Label"} for x in range(9)],
+                "TN": [{x: "Label"} for x in range(26)],
+                "TP": [{x: "Label"} for x in range(15)],
+            },
+            0.4444,
+        ),
+    )
+    @ddt.unpack
+    def test_kappa_score(self, matrix, expected_kappa):
+        """Verify that we can score a matrix for kappa."""
+        kappa = round(agree.score_kappa(matrix), 4)
+        self.assertEqual(expected_kappa, kappa)
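
As a cross-check of the PubMed case above (TP=15, TN=26, FP=9, FN=6, total 56): $p_o = 41/56 \approx 0.732$ and $p_e = (24/56)(21/56) + (35/56)(32/56) \approx 0.518$, so $\kappa \approx 0.214 / 0.482 \approx 0.4444$, matching the expected value in the last test case.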
14 changes: 9 additions & 5 deletions tests/test_cli.py
@@ -30,6 +30,7 @@ def test_accuracy(self):
"Spec": 0.6,
"PPV": 0.6,
"NPV": 0.75,
"κ": 0.341,
"TP": 3,
"FN": 1,
"TN": 3,
Expand All @@ -44,6 +45,7 @@ def test_accuracy(self):
"Spec": 1.0,
"TN": 1,
"TP": 1,
"κ": 0.4,
},
"Fatigue": {
"F1": 1.0,
Expand All @@ -55,6 +57,7 @@ def test_accuracy(self):
"Spec": 1.0,
"TN": 1,
"TP": 2,
"κ": 1.0,
},
"Headache": {
"F1": 0,
Expand All @@ -66,18 +69,19 @@ def test_accuracy(self):
"Spec": 0,
"TN": 1,
"TP": 0,
"κ": 0,
},
},
accuracy_json,
)

        accuracy_csv = common.read_text(f"{tmpdir}/accuracy-jill-jane.csv")
        self.assertEqual(
-            """F1     Sens  Spec  PPV  NPV   TP  FN  TN  FP  Label
-0.667  0.75  0.6   0.6  0.75  3   1   3   2   *
-0.667  0.5   1.0   1.0  0.5   1   1   1   0   Cough
-1.0    1.0   1.0   1.0  1.0   2   0   1   0   Fatigue
-0      0     0     0    0     0   0   1   2   Headache
+            """F1     Sens  Spec  PPV  NPV   κ      TP  FN  TN  FP  Label
+0.667  0.75  0.6   0.6  0.75  0.341  3   1   3   2   *
+0.667  0.5   1.0   1.0  0.5   0.4    1   1   1   0   Cough
+1.0    1.0   1.0   1.0  1.0   1.0    2   0   1   0   Fatigue
+0      0     0     0    0     0      0   0   1   2   Headache
""",
            accuracy_csv,
        )
