Cleanup the scripts
avaucher committed Oct 11, 2023
1 parent 8d49ce8 commit 279a3e0
Showing 9 changed files with 102 additions and 68 deletions.
24 changes: 9 additions & 15 deletions README.md
@@ -84,20 +84,15 @@ src-test.txt src-train.txt src-valid.txt tgt-test.txt tgt-train.txt
We train a SentencePiece tokenizer on the train split:
```bash
export VOCAB_SIZE=200 # for the production model, a size of 16000 is used
-python $CODE_DIR/src/paragraph2actions/scripts/create_sentencepiece_tokenizer.py \
-  -i $DATA_DIR/src-train.txt -i $DATA_DIR/tgt-train.txt -m $DATA_DIR/sp_model -v $VOCAB_SIZE
+paragraph2actions-create-tokenizer -i $DATA_DIR/src-train.txt -i $DATA_DIR/tgt-train.txt -m $DATA_DIR/sp_model -v $VOCAB_SIZE
```

We then tokenize the data:
```bash
-python $CODE_DIR/src/paragraph2actions/scripts/tokenize_with_sentencepiece.py \
-  -m $DATA_DIR/sp_model.model -i $DATA_DIR/src-train.txt -o $DATA_DIR/tok-src-train.txt
-python $CODE_DIR/src/paragraph2actions/scripts/tokenize_with_sentencepiece.py \
-  -m $DATA_DIR/sp_model.model -i $DATA_DIR/src-valid.txt -o $DATA_DIR/tok-src-valid.txt
-python $CODE_DIR/src/paragraph2actions/scripts/tokenize_with_sentencepiece.py \
-  -m $DATA_DIR/sp_model.model -i $DATA_DIR/tgt-train.txt -o $DATA_DIR/tok-tgt-train.txt
-python $CODE_DIR/src/paragraph2actions/scripts/tokenize_with_sentencepiece.py \
-  -m $DATA_DIR/sp_model.model -i $DATA_DIR/tgt-valid.txt -o $DATA_DIR/tok-tgt-valid.txt
+paragraph2actions-tokenize -m $DATA_DIR/sp_model.model -i $DATA_DIR/src-train.txt -o $DATA_DIR/tok-src-train.txt
+paragraph2actions-tokenize -m $DATA_DIR/sp_model.model -i $DATA_DIR/src-valid.txt -o $DATA_DIR/tok-src-valid.txt
+paragraph2actions-tokenize -m $DATA_DIR/sp_model.model -i $DATA_DIR/tgt-train.txt -o $DATA_DIR/tok-tgt-train.txt
+paragraph2actions-tokenize -m $DATA_DIR/sp_model.model -i $DATA_DIR/tgt-valid.txt -o $DATA_DIR/tok-tgt-valid.txt
```
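For reference, the same tokenization can be reproduced directly with the `sentencepiece` package. A minimal sketch (not part of this commit), assuming the tokenized files contain space-joined pieces:

```python
import sentencepiece as spm

# Load the model written by paragraph2actions-create-tokenizer
# (SentencePiece writes sp_model.model and sp_model.vocab).
sp = spm.SentencePieceProcessor(model_file="sp_model.model")

pieces = sp.encode("The mixture was stirred at room temperature.", out_type=str)
print(" ".join(pieces))   # one tokenized line, as written to tok-src-train.txt
print(sp.decode(pieces))  # detokenization, i.e. the CLI's --reverse / -r flag
```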

## Training
@@ -157,15 +152,14 @@ Experimental procedure sentences can then be translated to action sequences with
# Update the path to the OpenNMT model as required
export MODEL="$DATA_DIR/models/model_step_520000.pt"

-python $CODE_DIR/src/paragraph2actions/scripts/translate_actions.py \
-  -t $MODEL -p $DATA_DIR/sp_model.model -s $DATA_DIR/src-test.txt -o $DATA_DIR/pred.txt
+paragraph2actions-translate -t $MODEL -p $DATA_DIR/sp_model.model -s $DATA_DIR/src-test.txt -o $DATA_DIR/pred.txt
```
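The same translation can also be run from Python. A hedged sketch (not part of the commit), based on the `Translator` API visible in `src/paragraph2actions/translator.py` below; the checkpoint path is a placeholder:

```python
from paragraph2actions.translator import Translator

translator = Translator(
    translation_model=["models/model_step_520000.pt"],  # several paths form an ensemble
    sentencepiece_model="sp_model.model",
)
translations = translator.translate_sentences(
    ["The mixture was stirred at room temperature overnight."]
)
print(translations[0])
```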

## Evaluation

To print the metrics on the predictions, the following command can be used:
```bash
-python $CODE_DIR/src/paragraph2actions/scripts/calculate_metrics.py -g $DATA_DIR/tgt-test.txt -p $DATA_DIR/pred.txt
+paragraph2actions-calculate-metrics -g $DATA_DIR/tgt-test.txt -p $DATA_DIR/pred.txt
```


@@ -174,12 +168,12 @@ python $CODE_DIR/src/paragraph2actions/scripts/calculate_metrics.py -g $DATA_DIR
The following code illustrates how to augment the data for existing sentences and associated action sequences.

```python
-from paragraph2actions.action_string_converter import ReadableConverter
from paragraph2actions.augmentation.compound_name_augmenter import CompoundNameAugmenter
from paragraph2actions.augmentation.compound_quantity_augmenter import CompoundQuantityAugmenter
from paragraph2actions.augmentation.duration_augmenter import DurationAugmenter
from paragraph2actions.augmentation.temperature_augmenter import TemperatureAugmenter
from paragraph2actions.misc import load_samples, TextWithActions
+from paragraph2actions.readable_converter import ReadableConverter

converter = ReadableConverter()
samples = load_samples('test_data/src-test.txt', 'test_data/tgt-test.txt', converter)
@@ -228,11 +222,11 @@ STIR for overnight at room temperature.
The following code illustrates the postprocessing of actions.

```python
-from paragraph2actions.action_string_converter import ReadableConverter
from paragraph2actions.postprocessing.filter_postprocessor import FilterPostprocessor
from paragraph2actions.postprocessing.noaction_postprocessor import NoActionPostprocessor
from paragraph2actions.postprocessing.postprocessor_combiner import PostprocessorCombiner
from paragraph2actions.postprocessing.wait_postprocessor import WaitPostprocessor
+from paragraph2actions.readable_converter import ReadableConverter

converter = ReadableConverter()
postprocessor = PostprocessorCombiner([
8 changes: 8 additions & 0 deletions setup.cfg
@@ -50,5 +50,13 @@ dev =
cde =
ChemDataExtractor>=1.3.0

+[options.entry_points]
+console_scripts =
+    paragraph2actions-calculate-metrics = paragraph2actions.scripts.calculate_metrics:main
+    paragraph2actions-create-tokenizer = paragraph2actions.scripts.create_sentencepiece_tokenizer:main
+    paragraph2actions-generate-annotation-samples = paragraph2actions.scripts.generate_samples_to_annotate:main
+    paragraph2actions-tokenize = paragraph2actions.scripts.tokenize_with_sentencepiece:main
+    paragraph2actions-translate = paragraph2actions.scripts.translate_actions:main

[flake8]
extend-ignore = E203, E501
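Each `console_scripts` entry maps a command name to a `module:function` pair, so after `pip install .` the commands become available on the PATH. Because each target is a click command, it can also be invoked programmatically; a hedged sketch (file names are placeholders):

```python
from paragraph2actions.scripts.calculate_metrics import main

# Click commands accept an explicit argument list; standalone_mode=False
# keeps click from calling sys.exit(), so this can run inside a larger program.
main(["-g", "tgt-test.txt", "-p", "pred.txt"], standalone_mode=False)
```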
7 changes: 4 additions & 3 deletions src/paragraph2actions/misc.py
@@ -1,6 +1,7 @@
from typing import Iterable, List

import attr
+from rxn.utilities.files import PathLike

from .actions import Action
from .converter_interface import ActionStringConverter
@@ -17,7 +18,7 @@ class TextWithActions:


def load_samples(
-    text_file: str, actions_file: str, converter: ActionStringConverter
+    text_file: PathLike, actions_file: PathLike, converter: ActionStringConverter
) -> List[TextWithActions]:
"""
Loads samples of sentences with corresponding actions from files.
@@ -43,8 +44,8 @@ def load_samples(
def save_samples(
samples: Iterable[TextWithActions],
converter: ActionStringConverter,
-    text_file: str,
-    actions_file: str,
+    text_file: PathLike,
+    actions_file: PathLike,
) -> None:
"""
Saves samples of sentences with corresponding actions to files.
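With `PathLike` (presumably a union of `str` and `pathlib.Path` in `rxn.utilities`), callers can now pass either type. A small sketch under that assumption, with placeholder file names:

```python
from pathlib import Path

from paragraph2actions.misc import load_samples, save_samples
from paragraph2actions.readable_converter import ReadableConverter

converter = ReadableConverter()
# str and Path can now be mixed freely.
samples = load_samples(Path("src-train.txt"), "tgt-train.txt", converter)
save_samples(samples, converter, Path("src-out.txt"), Path("tgt-out.txt"))
```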
22 changes: 12 additions & 10 deletions src/paragraph2actions/scripts/calculate_metrics.py
@@ -1,6 +1,8 @@
+from pathlib import Path
from typing import Tuple

import click
+from rxn.utilities.files import load_list_from_file

from paragraph2actions.analysis import (
action_string_validity,
@@ -13,27 +15,27 @@

@click.command()
@click.option(
"--ground_truth_file", "-g", required=True, help="File containing the ground truth"
"--ground_truth_file",
"-g",
required=True,
type=click.Path(exists=True, dir_okay=False, path_type=Path),
help="File containing the ground truth",
)
@click.option(
"--prediction_files",
"-p",
multiple=True,
+    type=click.Path(exists=True, dir_okay=False, path_type=Path),
help="File containing the translations to compare with the" "ground truth",
)
-def calculate_metrics(
-    ground_truth_file: str, prediction_files: Tuple[str, ...]
-) -> None:
+def main(ground_truth_file: Path, prediction_files: Tuple[Path, ...]) -> None:
"""Calculate metrics for predictions generated by one or several translation models"""

-    with open(ground_truth_file, "rt") as f:
-        ground_truth = [s.strip() for s in f]
+    ground_truth = load_list_from_file(ground_truth_file)

predictions = []
for prediction_file in prediction_files:
-        with open(prediction_file, "rt") as f:
-            p = [s.strip() for s in f]
-        predictions.append(p)
+        predictions.append(load_list_from_file(prediction_file))

for filename, p in zip(prediction_files, predictions):
print(filename)
@@ -48,4 +50,4 @@ def calculate_metrics(


if __name__ == "__main__":
-    calculate_metrics()
+    main()
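`load_list_from_file` replaces the manual `open`/`strip` loops throughout these scripts. A sketch of the intended equivalence, assuming the helper returns the file's lines with trailing newlines stripped:

```python
from rxn.utilities.files import load_list_from_file

# Equivalent, under that assumption, to:
#     with open("tgt-test.txt", "rt") as f:
#         ground_truth = [s.strip() for s in f]
ground_truth = load_list_from_file("tgt-test.txt")
```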
14 changes: 10 additions & 4 deletions src/paragraph2actions/scripts/create_sentencepiece_tokenizer.py
@@ -1,3 +1,4 @@
+from pathlib import Path
from typing import Tuple

import click
@@ -9,15 +10,20 @@
"--inputs",
"-i",
multiple=True,
+    type=click.Path(exists=True, dir_okay=False, path_type=Path),
help="Input file(s) on which to train sentencepiece",
)
@click.option(
"--model", "-m", required=True, help="Where to save the sentencepiece model"
"--model",
"-m",
required=True,
type=click.Path(writable=True, path_type=Path),
help="Where to save the sentencepiece model",
)
@click.option("--vocab_size", "-v", default=16000, type=int, help="Vocabulary size")
-def save_tokenizer(inputs: Tuple[str, ...], model: str, vocab_size: int) -> None:
+def main(inputs: Tuple[Path, ...], model: Path, vocab_size: int) -> None:
"""Learn sentencepiece model"""
input_files = ",".join(inputs)
input_files = ",".join(str(p) for p in inputs)
spm.SentencePieceTrainer.Train(
f"--input={input_files} --model_prefix={model} "
f"--vocab_size={vocab_size} --character_coverage=1.0 "
@@ -26,4 +32,4 @@ def save_tokenizer(inputs: Tuple[str, ...], model: str, vocab_size: int) -> None


if __name__ == "__main__":
-    save_tokenizer()
+    main()
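Note that `--model_prefix` is a prefix, not a file name: SentencePiece writes `<prefix>.model` and `<prefix>.vocab`, which is why the README passes `sp_model` to this script but `sp_model.model` to the later commands. A standalone sketch of the same training call (the script passes a few more flags than shown here):

```python
import sentencepiece as spm

# Trains on the comma-separated input files and writes
# sp_model.model and sp_model.vocab next to the given prefix.
spm.SentencePieceTrainer.Train(
    "--input=src-train.txt,tgt-train.txt --model_prefix=sp_model "
    "--vocab_size=200 --character_coverage=1.0"
)
```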
31 changes: 23 additions & 8 deletions src/paragraph2actions/scripts/generate_samples_to_annotate.py
@@ -1,4 +1,5 @@
import random
+from pathlib import Path
from typing import Callable, List, Set

import click
@@ -168,17 +169,31 @@ def select_samples(


@click.command()
@click.option("--src_in", required=True, help="File containing original sentences")
@click.option("--tgt_in", required=True, help="File containing original sequences")
@click.option(
"--src_out", required=True, help="Where to save sentences selected for annotation"
"--src_in",
required=True,
type=click.Path(exists=True, dir_okay=False, path_type=Path),
help="File containing original sentences",
)
@click.option(
"--tgt_out", required=True, help="Where to save sequences selected for annotation"
"--tgt_in",
required=True,
type=click.Path(exists=True, dir_okay=False, path_type=Path),
help="File containing original sequences",
)
-def generate_samples_to_annotate(
-    src_in: str, tgt_in: str, src_out: str, tgt_out: str
-) -> None:
+@click.option(
+    "--src_out",
+    required=True,
+    type=click.Path(writable=True, path_type=Path),
+    help="Where to save sentences selected for annotation",
+)
+@click.option(
+    "--tgt_out",
+    required=True,
+    type=click.Path(writable=True, path_type=Path),
+    help="Where to save sequences selected for annotation",
+)
+def main(src_in: Path, tgt_in: Path, src_out: Path, tgt_out: Path) -> None:
"""Generate samples for annotation"""
action_string_converter = ReadableConverter()

@@ -198,4 +213,4 @@


if __name__ == "__main__":
-    generate_samples_to_annotate()
+    main()
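The recurring `type=click.Path(...)` pattern moves validation out of the command body: `exists=True, dir_okay=False` rejects missing paths and directories before the function runs, and `path_type=Path` hands it `pathlib.Path` objects instead of strings. A self-contained sketch (assuming click 8.x, where `path_type` is available):

```python
from pathlib import Path

import click


@click.command()
@click.option(
    "--src_in",
    required=True,
    type=click.Path(exists=True, dir_okay=False, path_type=Path),
    help="Any existing file",
)
def demo(src_in: Path) -> None:
    # click has already verified existence and converted the value to Path.
    click.echo(f"{src_in.name}: {len(src_in.read_text().splitlines())} lines")


if __name__ == "__main__":
    demo()
```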
6 changes: 2 additions & 4 deletions src/paragraph2actions/scripts/tokenize_with_sentencepiece.py
@@ -10,9 +10,7 @@
"--output_filename", "-o", required=True, help="Where to save (de)tokenized text"
)
@click.option("--reverse", "-r", is_flag=True, help="If given, will do detokenization")
-def tokenize_with_sentencepiece(
-    model: str, input_filename: str, output_filename: str, reverse: bool
-) -> None:
+def main(model: str, input_filename: str, output_filename: str, reverse: bool) -> None:
"""Tokenize / detokenize with sentencepiece"""

sp = SentencePieceTokenizer(model)
@@ -31,4 +29,4 @@ def tokenize_with_sentencepiece(


if __name__ == "__main__":
-    tokenize_with_sentencepiece()
+    main()
51 changes: 33 additions & 18 deletions src/paragraph2actions/scripts/translate_actions.py
@@ -1,6 +1,8 @@
+from pathlib import Path
from typing import Tuple

import click
+from rxn.utilities.files import dump_list_to_file, load_list_from_file

from paragraph2actions.translator import Translator

@@ -10,18 +12,35 @@
"--translation_models",
"-t",
multiple=True,
help="Translation model file. If multiple are given, will " "be an ensemble model.",
type=click.Path(exists=True, dir_okay=False, path_type=Path),
help="Translation model file. If multiple are given, will be an ensemble model.",
)
@click.option(
"--sentencepiece_model", "-p", required=True, help="SentencePiece model file"
"--sentencepiece_model",
"-p",
required=True,
type=click.Path(exists=True, dir_okay=False, path_type=Path),
help="SentencePiece model file",
)
@click.option("--src_file", "-s", required=True, help="File to translate")
@click.option("--output_file", "-o", required=True, help="Where to save translation")
def translate_actions(
translation_models: Tuple[str, ...],
sentencepiece_model: str,
src_file: str,
output_file: str,
@click.option(
"--src_file",
"-s",
required=True,
type=click.Path(exists=True, dir_okay=False, path_type=Path),
help="File to translate",
)
@click.option(
"--output_file",
"-o",
required=True,
type=click.Path(writable=True, path_type=Path),
help="Where to save translation",
)
def main(
translation_models: Tuple[Path, ...],
sentencepiece_model: Path,
src_file: Path,
output_file: Path,
) -> None:
"""
Translate a text with an OpenNMT model.
@@ -30,18 +49,14 @@ def translate_actions(
in the form of tokenization and de-tokenization with sentencepiece.
"""
translator = Translator(
-        translation_model=translation_models, sentencepiece_model=sentencepiece_model
+        translation_model=[str(m) for m in translation_models],
+        sentencepiece_model=str(sentencepiece_model),
)

-    with open(src_file, "rt") as f:
-        sentences = [line.strip() for line in f]
-
+    sentences = load_list_from_file(src_file)
translations = translator.translate_sentences(sentences)

-    with open(output_file, "wt") as f:
-        for t in translations:
-            f.write(f"{t}\n")
+    dump_list_to_file(translations, output_file)


if __name__ == "__main__":
-    translate_actions()
+    main()
7 changes: 1 addition & 6 deletions src/paragraph2actions/translator.py
@@ -20,12 +20,7 @@ def __init__(
sentencepiece_model: path to the sentencepiece model file
"""
self.sp = SentencePieceTokenizer(sentencepiece_model)

-        if isinstance(translation_model, str):
-            translation_model = [translation_model]
-        self.translation_model = list(translation_model)
-
-        self.onmt_translator = RawTranslator.from_model_path(self.translation_model)
+        self.onmt_translator = RawTranslator.from_model_path(translation_model)

def translate_single(self, sentence: str) -> str:
"""
