evaluate.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
# Evaluation script for the Cross-Domain Authorship Verification task @PAN2020.

## Measures
The following evaluation measures are provided:
    - F1-score [Pedregosa et al. 2011]
    - Area-Under-the-Curve [Pedregosa et al. 2011]
    - c@1 [Peñas and Rodrigo 2011; Stamatatos 2014]
    - f_05_u_score [Bevendorff et al. 2019]

Systems will be evaluated, taking all of the measures into account.

## Formats
The script requires two files, one for the ground truth (gold standard)
and one for the system predictions. These files should be formatted using
the `jsonl`-convention, whereby each line should contain a valid
json-string: e.g.

``` json
    {"id": "1", "value": 0.123}
    {"id": "2", "value": 0.5}
    {"id": "3", "value": 0.888}
```

Only files will be considered that:
- have the `.jsonl` extension
- are properly encoded as UTF-8.

Please note:
    * For the c@1, all scores are will binarized using
      the conventional thresholds:
        * score < 0.5 -> 0
        * score > 0.5 -> 1
    * A score of *exactly* 0.5, will be considered a non-decision.
    * All problems which are present in the ground truth, but which
      are *not* provided an answer to by the system, will automatically
      be set to 0.5.
    * Non-answers are removed for the F1 score calculation below, but they
      are taken into account by the AUC score.

## Dependencies:
- Python 3.6+ (we recommend the Anaconda Python distribution)
- scikit-learn

## Usage

From the command line:

>>> python pan20-verif-evaluator.py -i COLLECTION -a ANSWERS -o OUTPUT

where
    COLLECTION is the path to the file with the ground truth
    ANSWERS is the path to the answers file for a submitted method
    OUTPUT is the path to the folder where the results of the evaluation will be saved

Example:

>>> python pan20_verif_evaluator.py -i "datasets/test_truth/truth.jsonl" \
        -a "out/answers.jsonl" \
        -o "pan20-evaluation"

## References
- E. Stamatatos, et al. Overview of the Author Identification
  Task at PAN 2014. CLEF Working Notes (2014): 877-897.
- Pedregosa, F. et al. Scikit-learn: Machine Learning in Python,
  Journal of Machine Learning Research 12 (2011), 2825--2830.
- A. Peñas and A. Rodrigo. A Simple Measure to Assess Nonresponse.
  In Proc. of the 49th Annual Meeting of the Association for
  Computational Linguistics, Vol. 1, pages 1415-1424, 2011.
- Bevendorff et al. Generalizing Unmasking for Short Texts,
  Proceedings of NAACL (2019), 654-659.

"""

import argparse
import json
import os

import numpy as np
from sklearn.metrics import roc_auc_score, f1_score


def binarize(y, threshold=0.5):
    y = np.array(y)
    y = np.ma.fix_invalid(y, fill_value=threshold)
    y[y >= threshold] = 1
    y[y < threshold] = 0
    return y


def auc(true_y, pred_y):
    """
    Calculates the AUC score (Area Under the Curve), a well-known
    scalar evaluation score for binary classifiers. This score
    also considers "unanswered" problem, where score = 0.5.

    Parameters
    ----------
    prediction_scores : array [n_problems]

        The predictions outputted by a verification system.
        Assumes `0 >= prediction <=1`.

    ground_truth_scores : array [n_problems]

        The gold annotations provided for each problem.
        Will typically be `0` or `1`.

    Returns
    ----------
    auc = the Area Under the Curve.

    References
    ----------
        E. Stamatatos, et al. Overview of the Author Identification
        Task at PAN 2014. CLEF (Working Notes) 2014: 877-897.

    """
    try:
        return roc_auc_score(true_y, pred_y)
    except ValueError:
        return 0.0


def c_at_1(true_y, pred_y, threshold=0.5):
    """
    Calculates the c@1 score, an evaluation method specific to the
    PAN competition. This method rewards predictions which leave
    some problems unanswered (score = 0.5). See:

        A. Peñas and A. Rodrigo. A Simple Measure to Assess Nonresponse.
        In Proc. of the 49th Annual Meeting of the Association for
        Computational Linguistics, Vol. 1, pages 1415-1424, 2011.

    Parameters
    ----------
    prediction_scores : array [n_problems]

        The predictions outputted by a verification system.
        Assumes `0 >= prediction <=1`.

    ground_truth_scores : array [n_problems]

        The gold annotations provided for each problem.
        Will always be `0` or `1`.

    Returns
    ----------
    c@1 = the c@1 measure (which accounts for unanswered
        problems.)


    References
    ----------
        - E. Stamatatos, et al. Overview of the Author Identification
        Task at PAN 2014. CLEF (Working Notes) 2014: 877-897.
        - A. Peñas and A. Rodrigo. A Simple Measure to Assess Nonresponse.
        In Proc. of the 49th Annual Meeting of the Association for
        Computational Linguistics, Vol. 1, pages 1415-1424, 2011.

    """

    n = float(len(pred_y))
    nc, nu = 0.0, 0.0

    for gt_score, pred_score in zip(true_y, pred_y):
        if pred_score == 0.5:
            nu += 1
        elif (pred_score > 0.5) == (gt_score > 0.5):
            nc += 1.0

    return (1 / n) * (nc + (nu * nc / n))


def f1(true_y, pred_y):
    """
    Assesses verification performance, assuming that every
    `score > 0.5` represents a same-author pair decision.
    Note that all non-decisions (scores == 0.5) are ignored
    by this metric.

    Parameters
    ----------
    prediction_scores : array [n_problems]

        The predictions outputted by a verification system.
        Assumes `0 >= prediction <=1`.

    ground_truth_scores : array [n_problems]

        The gold annotations provided for each problem.
        Will typically be `0` or `1`.

    Returns
    ----------
    acc = The number of correct attributions.

    References
    ----------
        E. Stamatatos, et al. Overview of the Author Identification
        Task at PAN 2014. CLEF (Working Notes) 2014: 877-897.
    """
    true_y_filtered, pred_y_filtered = [], []

    for true, pred in zip(true_y, pred_y):
        if pred != 0.5:
            true_y_filtered.append(true)
            pred_y_filtered.append(pred)

    pred_y_filtered = binarize(pred_y_filtered)

    return f1_score(true_y_filtered, pred_y_filtered)


def f_05_u_score(true_y, pred_y, pos_label=1, threshold=0.5):
    """
    Return F0.5u score of prediction.

    :param true_y: true labels
    :param pred_y: predicted labels
    :param threshold: indication for non-decisions (default = 0.5)
    :param pos_label: positive class label (default = 1)
    :return: F0.5u score
    """

    pred_y = binarize(pred_y)

    n_tp = 0
    n_fn = 0
    n_fp = 0
    n_u = 0

    for i, pred in enumerate(pred_y):
        if pred == threshold:
            n_u += 1
        elif pred == pos_label and pred == true_y[i]:
            n_tp += 1
        elif pred == pos_label and pred != true_y[i]:
            n_fp += 1
        elif true_y[i] == pos_label and pred != true_y[i]:
            n_fn += 1

    return (1.25 * n_tp) / (1.25 * n_tp + 0.25 * (n_fn + n_u) + n_fp)


def load_file(fn):
    problems = {}
    for line in open(fn):
        d = json.loads(line.strip())
        if 'value' in d:
            problems[d['id']] = d['value']
        else:
            problems[d['id']] = int(d['same'])
    return problems


def evaluate_all(true_y, pred_y):
    """
    Convenience function: calculates all PAN20 evaluation measures
    and returns them as a dict, including the 'overall' score, which
    is the mean of the individual metrics (0 >= metric >= 1). All
    scores get rounded to three digits.
    """

    results = {'auc': auc(true_y, pred_y),
               'c@1': c_at_1(true_y, pred_y),
               'f_05_u': f_05_u_score(true_y, pred_y),
               'F1': f1(true_y, pred_y)}

    results['overall'] = np.mean(list(results.values()))

    for k, v in results.items():
        results[k] = round(v, 3)

    return results


def main():
    parser = argparse.ArgumentParser(description='Evaluation script AA@PAN2020')
    parser.add_argument('-i', type=str,
                        help='Path to the jsonl-file with ground truth scores')
    parser.add_argument('-a', type=str,
                        help='Path to the jsonl-file with the answers (system prediction)')
    parser.add_argument('-o', type=str,
                        help='Path to output files')
    args = parser.parse_args()

    # validate:
    if not args.i:
        raise ValueError('The ground truth path is required')
    if not args.a:
        raise ValueError('The answers path is required')
    if not args.o:
        raise ValueError('The output folder path is required')

    # load:
    gt = load_file(args.i)
    pred = load_file(args.a)

    print('->', len(gt), 'problems in ground truth')
    print('->', len(pred), 'solutions explicitly proposed')

    # default missing problems to 0.5
    for probl_id in sorted(gt):
        if probl_id not in pred:
            pred[probl_id] = 0.5

    # sanity check:
    assert len(gt) == len(pred)
    assert set(gt.keys()).union(set(pred)) == set(gt.keys())

    # align the scores:
    scores = [(gt[k], pred[k]) for k in sorted(gt)]
    gt, pred = zip(*scores)
    gt = np.array(gt, dtype=np.float64)
    pred = np.array(pred, dtype=np.float64)

    assert len(gt) == len(pred)

    # evaluate:
    results = evaluate_all(gt, pred)
    print(results)

    with open(args.o + os.sep + 'out.json', 'w') as f:
        json.dump(results, f, indent=4, sort_keys=True)


if __name__ == '__main__':
    main()