Add language parameter to evaluate_corpus (#59)
This aligns the command-line arguments of evaluate_corpus.py with those of evaluate_run.py.
jantrienes authored Jun 3, 2021
1 parent a3f26f8 commit f75b747
Showing 2 changed files with 5 additions and 9 deletions.
deidentify/evaluation/evaluate_corpus.py: 12 changes (4 additions & 8 deletions)

@@ -13,19 +13,13 @@
 from loguru import logger
 
 from deidentify.evaluation.evaluate_run import evaluate
+from deidentify.evaluation.evaluator import Evaluator
 
 PREDICTIONS_PATH = join(dirname(__file__), '../../output/predictions/')
 OUTPUT_PATH = join(dirname(__file__), '../../output/evaluation')
 CORPUS_PATH = join(dirname(__file__), '../../data/corpus/')
 
 
-def _language_for_corpus(corpus: str):
-    if corpus.startswith('ons'):
-        return 'nl'
-
-    return 'en'
-
-
 def main(args):
     runs = glob.glob(join(PREDICTIONS_PATH, args.corpus, '*'))
     logger.info('Number of runs: {}'.format(len(runs)))
@@ -46,7 +40,7 @@ def main(args):
         evaluator = evaluate(documents_path=corpus_path,
                              gold_path=corpus_path,
                              pred_path=join(run, part),
-                             language=_language_for_corpus(args.corpus))
+                             language=args.language)
 
         entity = evaluator.entity_level()
         token = evaluator.token_level()
@@ -99,6 +93,8 @@ def main(args):
 def arg_parser():
     parser = argparse.ArgumentParser()
     parser.add_argument("corpus", help="Name of corpus (e.g., 'dummy', 'ons')", type=str)
+    parser.add_argument("language", help="Language to use for tokenizer",
+                        choices=Evaluator.supported_languages())
     return parser.parse_args()


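The new positional argument is validated by argparse itself: `choices=Evaluator.supported_languages()` rejects an unsupported tokenizer language at the command line instead of failing later inside `evaluate()`. The real `Evaluator` class in `deidentify/evaluation/evaluator.py` is not shown in this diff, so the sketch below uses a hypothetical stand-in whose language list merely mirrors the `'nl'`/`'en'` values from the removed `_language_for_corpus` helper:

```python
import argparse


class Evaluator:
    """Hypothetical stand-in for deidentify.evaluation.evaluator.Evaluator."""

    @staticmethod
    def supported_languages():
        # Assumption: the real class exposes its tokenizer languages like this;
        # 'nl' and 'en' are borrowed from the removed _language_for_corpus helper.
        return ['nl', 'en']


parser = argparse.ArgumentParser()
parser.add_argument("corpus", help="Name of corpus (e.g., 'dummy', 'ons')", type=str)
parser.add_argument("language", help="Language to use for tokenizer",
                    choices=Evaluator.supported_languages())

# A supported language parses normally:
args = parser.parse_args(['ons', 'nl'])
print(args.corpus, args.language)  # -> ons nl

# An unsupported language makes argparse exit before main() ever runs:
# parser.parse_args(['ons', 'de'])
#   error: argument language: invalid choice: 'de' (choose from 'nl', 'en')
```

Letting argparse enforce the whitelist also means the corpus-name prefix heuristic (`corpus.startswith('ons')`) can be deleted outright rather than kept as a fallback.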
docs/02_train_evaluate_models.md: 2 changes (1 addition & 1 deletion)

@@ -94,7 +94,7 @@ ENT tp: 5016 - fp: 187 - fn: 379 - tn: 115532 - precision:
 You can use the `evaluate_corpus.py` script to evaluate all runs for a given corpus. The script produces a CSV file with the evaluation measures for each corpus part (i.e., train/dev/test) that you can use for further analysis.

 ```sh
-> python deidentify/evaluation/evaluate_corpus.py <corpus_name>
+> python deidentify/evaluation/evaluate_corpus.py <corpus_name> <language>
 [...]
 > tree output/evaluation/<corpus_name>
 output/evaluation/<corpus_name>
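With the explicit second argument, evaluating the Dutch 'ons' corpus (the case the removed `_language_for_corpus` helper mapped to `'nl'`) becomes, for example, `python deidentify/evaluation/evaluate_corpus.py ons nl`, while an English corpus would pass `en` or any other value accepted by `Evaluator.supported_languages()`.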
