Cleanup the scripts
avaucher committed Oct 11, 2023
1 parent 8d49ce8 commit 279a3e0
Showing 9 changed files with 102 additions and 68 deletions.
24 changes: 9 additions & 15 deletions README.md
@@ -84,20 +84,15 @@ src-test.txt src-train.txt src-valid.txt tgt-test.txt tgt-train.txt
We train a SentencePiece tokenizer on the train split:
```bash
export VOCAB_SIZE=200 # for the production model, a size of 16000 is used
-python $CODE_DIR/src/paragraph2actions/scripts/create_sentencepiece_tokenizer.py \
-  -i $DATA_DIR/src-train.txt -i $DATA_DIR/tgt-train.txt -m $DATA_DIR/sp_model -v $VOCAB_SIZE
+paragraph2actions-create-tokenizer -i $DATA_DIR/src-train.txt -i $DATA_DIR/tgt-train.txt -m $DATA_DIR/sp_model -v $VOCAB_SIZE
```

We then tokenize the data:
```bash
-python $CODE_DIR/src/paragraph2actions/scripts/tokenize_with_sentencepiece.py \
-  -m $DATA_DIR/sp_model.model -i $DATA_DIR/src-train.txt -o $DATA_DIR/tok-src-train.txt
-python $CODE_DIR/src/paragraph2actions/scripts/tokenize_with_sentencepiece.py \
-  -m $DATA_DIR/sp_model.model -i $DATA_DIR/src-valid.txt -o $DATA_DIR/tok-src-valid.txt
-python $CODE_DIR/src/paragraph2actions/scripts/tokenize_with_sentencepiece.py \
-  -m $DATA_DIR/sp_model.model -i $DATA_DIR/tgt-train.txt -o $DATA_DIR/tok-tgt-train.txt
-python $CODE_DIR/src/paragraph2actions/scripts/tokenize_with_sentencepiece.py \
-  -m $DATA_DIR/sp_model.model -i $DATA_DIR/tgt-valid.txt -o $DATA_DIR/tok-tgt-valid.txt
+paragraph2actions-tokenize -m $DATA_DIR/sp_model.model -i $DATA_DIR/src-train.txt -o $DATA_DIR/tok-src-train.txt
+paragraph2actions-tokenize -m $DATA_DIR/sp_model.model -i $DATA_DIR/src-valid.txt -o $DATA_DIR/tok-src-valid.txt
+paragraph2actions-tokenize -m $DATA_DIR/sp_model.model -i $DATA_DIR/tgt-train.txt -o $DATA_DIR/tok-tgt-train.txt
+paragraph2actions-tokenize -m $DATA_DIR/sp_model.model -i $DATA_DIR/tgt-valid.txt -o $DATA_DIR/tok-tgt-valid.txt
```
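For reference, the same tokenization can be reproduced directly with the `sentencepiece` package. A minimal sketch (not part of this commit), assuming the tokenized files contain space-joined pieces:

```python
import sentencepiece as spm

# Load the model written by paragraph2actions-create-tokenizer
# (SentencePiece writes sp_model.model and sp_model.vocab).
sp = spm.SentencePieceProcessor(model_file="sp_model.model")

pieces = sp.encode("The mixture was stirred at room temperature.", out_type=str)
print(" ".join(pieces))   # one tokenized line, as written to tok-src-train.txt
print(sp.decode(pieces))  # detokenization, i.e. the CLI's --reverse / -r flag
```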

## Training
@@ -157,15 +152,14 @@ Experimental procedure sentences can then be translated to action sequences with
# Update the path to the OpenNMT model as required
export MODEL="$DATA_DIR/models/model_step_520000.pt"

-python $CODE_DIR/src/paragraph2actions/scripts/translate_actions.py \
-  -t $MODEL -p $DATA_DIR/sp_model.model -s $DATA_DIR/src-test.txt -o $DATA_DIR/pred.txt
+paragraph2actions-translate -t $MODEL -p $DATA_DIR/sp_model.model -s $DATA_DIR/src-test.txt -o $DATA_DIR/pred.txt
```
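The same translation can also be run from Python. A hedged sketch (not part of the commit), based on the `Translator` API visible in `src/paragraph2actions/translator.py` below; the checkpoint path is a placeholder:

```python
from paragraph2actions.translator import Translator

translator = Translator(
    translation_model=["models/model_step_520000.pt"],  # several paths form an ensemble
    sentencepiece_model="sp_model.model",
)
translations = translator.translate_sentences(
    ["The mixture was stirred at room temperature overnight."]
)
print(translations[0])
```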

## Evaluation

To print the metrics on the predictions, the following command can be used:
```bash
-python $CODE_DIR/src/paragraph2actions/scripts/calculate_metrics.py -g $DATA_DIR/tgt-test.txt -p $DATA_DIR/pred.txt
+paragraph2actions-calculate-metrics -g $DATA_DIR/tgt-test.txt -p $DATA_DIR/pred.txt
```


@@ -174,12 +168,12 @@ python $CODE_DIR/src/paragraph2actions/scripts/calculate_metrics.py -g $DATA_DIR
The following code illustrates how to augment the data for existing sentences and associated action sequences.

```python
-from paragraph2actions.action_string_converter import ReadableConverter
from paragraph2actions.augmentation.compound_name_augmenter import CompoundNameAugmenter
from paragraph2actions.augmentation.compound_quantity_augmenter import CompoundQuantityAugmenter
from paragraph2actions.augmentation.duration_augmenter import DurationAugmenter
from paragraph2actions.augmentation.temperature_augmenter import TemperatureAugmenter
from paragraph2actions.misc import load_samples, TextWithActions
+from paragraph2actions.readable_converter import ReadableConverter

converter = ReadableConverter()
samples = load_samples('test_data/src-test.txt', 'test_data/tgt-test.txt', converter)
@@ -228,11 +222,11 @@ STIR for overnight at room temperature.
The following code illustrates the postprocessing of actions.

```python
-from paragraph2actions.action_string_converter import ReadableConverter
from paragraph2actions.postprocessing.filter_postprocessor import FilterPostprocessor
from paragraph2actions.postprocessing.noaction_postprocessor import NoActionPostprocessor
from paragraph2actions.postprocessing.postprocessor_combiner import PostprocessorCombiner
from paragraph2actions.postprocessing.wait_postprocessor import WaitPostprocessor
+from paragraph2actions.readable_converter import ReadableConverter

converter = ReadableConverter()
postprocessor = PostprocessorCombiner([
8 changes: 8 additions & 0 deletions setup.cfg
@@ -50,5 +50,13 @@ dev =
cde =
ChemDataExtractor>=1.3.0

+[options.entry_points]
+console_scripts =
+    paragraph2actions-calculate-metrics = paragraph2actions.scripts.calculate_metrics:main
+    paragraph2actions-create-tokenizer = paragraph2actions.scripts.create_sentencepiece_tokenizer:main
+    paragraph2actions-generate-annotation-samples = paragraph2actions.scripts.generate_samples_to_annotate:main
+    paragraph2actions-tokenize = paragraph2actions.scripts.tokenize_with_sentencepiece:main
+    paragraph2actions-translate = paragraph2actions.scripts.translate_actions:main

[flake8]
extend-ignore = E203, E501
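Each `console_scripts` entry maps a command name to a `module:function` pair, so after `pip install .` the commands become available on the PATH. Because each target is a click command, it can also be invoked programmatically; a hedged sketch (file names are placeholders):

```python
from paragraph2actions.scripts.calculate_metrics import main

# Click commands accept an explicit argument list; standalone_mode=False
# keeps click from calling sys.exit(), so this can run inside a larger program.
main(["-g", "tgt-test.txt", "-p", "pred.txt"], standalone_mode=False)
```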
7 changes: 4 additions & 3 deletions src/paragraph2actions/misc.py
@@ -1,6 +1,7 @@
from typing import Iterable, List

import attr
+from rxn.utilities.files import PathLike

from .actions import Action
from .converter_interface import ActionStringConverter
@@ -17,7 +18,7 @@ class TextWithActions:


def load_samples(
-    text_file: str, actions_file: str, converter: ActionStringConverter
+    text_file: PathLike, actions_file: PathLike, converter: ActionStringConverter
) -> List[TextWithActions]:
"""
Loads samples of sentences with corresponding actions from files.
@@ -43,8 +44,8 @@ def load_samples(
def save_samples(
samples: Iterable[TextWithActions],
converter: ActionStringConverter,
-    text_file: str,
-    actions_file: str,
+    text_file: PathLike,
+    actions_file: PathLike,
) -> None:
"""
Saves samples of sentences with corresponding actions to files.
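With `PathLike` (presumably a union of `str` and `pathlib.Path` in `rxn.utilities`), callers can now pass either type. A small sketch under that assumption, with placeholder file names:

```python
from pathlib import Path

from paragraph2actions.misc import load_samples, save_samples
from paragraph2actions.readable_converter import ReadableConverter

converter = ReadableConverter()
# str and Path can now be mixed freely.
samples = load_samples(Path("src-train.txt"), "tgt-train.txt", converter)
save_samples(samples, converter, Path("src-out.txt"), Path("tgt-out.txt"))
```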
22 changes: 12 additions & 10 deletions src/paragraph2actions/scripts/calculate_metrics.py
@@ -1,6 +1,8 @@
+from pathlib import Path
from typing import Tuple

import click
+from rxn.utilities.files import load_list_from_file

from paragraph2actions.analysis import (
action_string_validity,
@@ -13,27 +15,27 @@

@click.command()
@click.option(
"--ground_truth_file", "-g", required=True, help="File containing the ground truth"
"--ground_truth_file",
"-g",
required=True,
type=click.Path(exists=True, dir_okay=False, path_type=Path),
help="File containing the ground truth",
)
@click.option(
"--prediction_files",
"-p",
multiple=True,
+    type=click.Path(exists=True, dir_okay=False, path_type=Path),
help="File containing the translations to compare with the" "ground truth",
)
-def calculate_metrics(
-    ground_truth_file: str, prediction_files: Tuple[str, ...]
-) -> None:
+def main(ground_truth_file: Path, prediction_files: Tuple[Path, ...]) -> None:
"""Calculate metrics for predictions generated by one or several translation models"""

-    with open(ground_truth_file, "rt") as f:
-        ground_truth = [s.strip() for s in f]
+    ground_truth = load_list_from_file(ground_truth_file)

predictions = []
for prediction_file in prediction_files:
-        with open(prediction_file, "rt") as f:
-            p = [s.strip() for s in f]
-        predictions.append(p)
+        predictions.append(load_list_from_file(prediction_file))

for filename, p in zip(prediction_files, predictions):
print(filename)
@@ -48,4 +50,4 @@ def calculate_metrics(


if __name__ == "__main__":
-    calculate_metrics()
+    main()
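`load_list_from_file` replaces the manual `open`/`strip` loops throughout these scripts. A sketch of the intended equivalence, assuming the helper returns the file's lines with trailing newlines stripped:

```python
from rxn.utilities.files import load_list_from_file

# Equivalent, under that assumption, to:
#     with open("tgt-test.txt", "rt") as f:
#         ground_truth = [s.strip() for s in f]
ground_truth = load_list_from_file("tgt-test.txt")
```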
14 changes: 10 additions & 4 deletions src/paragraph2actions/scripts/create_sentencepiece_tokenizer.py
@@ -1,3 +1,4 @@
+from pathlib import Path
from typing import Tuple

import click
@@ -9,15 +10,20 @@
"--inputs",
"-i",
multiple=True,
+    type=click.Path(exists=True, dir_okay=False, path_type=Path),
help="Input file(s) on which to train sentencepiece",
)
@click.option(
"--model", "-m", required=True, help="Where to save the sentencepiece model"
"--model",
"-m",
required=True,
type=click.Path(writable=True, path_type=Path),
help="Where to save the sentencepiece model",
)
@click.option("--vocab_size", "-v", default=16000, type=int, help="Vocabulary size")
-def save_tokenizer(inputs: Tuple[str, ...], model: str, vocab_size: int) -> None:
+def main(inputs: Tuple[Path, ...], model: Path, vocab_size: int) -> None:
"""Learn sentencepiece model"""
input_files = ",".join(inputs)
input_files = ",".join(str(p) for p in inputs)
spm.SentencePieceTrainer.Train(
f"--input={input_files} --model_prefix={model} "
f"--vocab_size={vocab_size} --character_coverage=1.0 "
@@ -26,4 +32,4 @@ def save_tokenizer(inputs: Tuple[str, ...], model: str, vocab_size: int) -> None


if __name__ == "__main__":
-    save_tokenizer()
+    main()
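Note that `--model_prefix` is a prefix, not a file name: SentencePiece writes `<prefix>.model` and `<prefix>.vocab`, which is why the README passes `sp_model` to this script but `sp_model.model` to the later commands. A standalone sketch of the same training call (the script passes a few more flags than shown here):

```python
import sentencepiece as spm

# Trains on the comma-separated input files and writes
# sp_model.model and sp_model.vocab next to the given prefix.
spm.SentencePieceTrainer.Train(
    "--input=src-train.txt,tgt-train.txt --model_prefix=sp_model "
    "--vocab_size=200 --character_coverage=1.0"
)
```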
31 changes: 23 additions & 8 deletions src/paragraph2actions/scripts/generate_samples_to_annotate.py
@@ -1,4 +1,5 @@
import random
+from pathlib import Path
from typing import Callable, List, Set

import click
@@ -168,17 +169,31 @@ def select_samples(


@click.command()
@click.option("--src_in", required=True, help="File containing original sentences")
@click.option("--tgt_in", required=True, help="File containing original sequences")
@click.option(
"--src_out", required=True, help="Where to save sentences selected for annotation"
"--src_in",
required=True,
type=click.Path(exists=True, dir_okay=False, path_type=Path),
help="File containing original sentences",
)
@click.option(
"--tgt_out", required=True, help="Where to save sequences selected for annotation"
"--tgt_in",
required=True,
type=click.Path(exists=True, dir_okay=False, path_type=Path),
help="File containing original sequences",
)
-def generate_samples_to_annotate(
-    src_in: str, tgt_in: str, src_out: str, tgt_out: str
-) -> None:
+@click.option(
+    "--src_out",
+    required=True,
+    type=click.Path(writable=True, path_type=Path),
+    help="Where to save sentences selected for annotation",
+)
+@click.option(
+    "--tgt_out",
+    required=True,
+    type=click.Path(writable=True, path_type=Path),
+    help="Where to save sequences selected for annotation",
+)
+def main(src_in: Path, tgt_in: Path, src_out: Path, tgt_out: Path) -> None:
"""Generate samples for annotation"""
action_string_converter = ReadableConverter()

@@ -198,4 +213,4 @@


if __name__ == "__main__":
-    generate_samples_to_annotate()
+    main()
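The recurring `type=click.Path(...)` pattern moves validation out of the command body: `exists=True, dir_okay=False` rejects missing paths and directories before the function runs, and `path_type=Path` hands it `pathlib.Path` objects instead of strings. A self-contained sketch (assuming click 8.x, where `path_type` is available):

```python
from pathlib import Path

import click


@click.command()
@click.option(
    "--src_in",
    required=True,
    type=click.Path(exists=True, dir_okay=False, path_type=Path),
    help="Any existing file",
)
def demo(src_in: Path) -> None:
    # click has already verified existence and converted the value to Path.
    click.echo(f"{src_in.name}: {len(src_in.read_text().splitlines())} lines")


if __name__ == "__main__":
    demo()
```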
6 changes: 2 additions & 4 deletions src/paragraph2actions/scripts/tokenize_with_sentencepiece.py
@@ -10,9 +10,7 @@
"--output_filename", "-o", required=True, help="Where to save (de)tokenized text"
)
@click.option("--reverse", "-r", is_flag=True, help="If given, will do detokenization")
-def tokenize_with_sentencepiece(
-    model: str, input_filename: str, output_filename: str, reverse: bool
-) -> None:
+def main(model: str, input_filename: str, output_filename: str, reverse: bool) -> None:
"""Tokenize / detokenize with sentencepiece"""

sp = SentencePieceTokenizer(model)
@@ -31,4 +29,4 @@ def tokenize_with_sentencepiece(


if __name__ == "__main__":
-    tokenize_with_sentencepiece()
+    main()
51 changes: 33 additions & 18 deletions src/paragraph2actions/scripts/translate_actions.py
@@ -1,6 +1,8 @@
+from pathlib import Path
from typing import Tuple

import click
+from rxn.utilities.files import dump_list_to_file, load_list_from_file

from paragraph2actions.translator import Translator

@@ -10,18 +12,35 @@
"--translation_models",
"-t",
multiple=True,
help="Translation model file. If multiple are given, will " "be an ensemble model.",
type=click.Path(exists=True, dir_okay=False, path_type=Path),
help="Translation model file. If multiple are given, will be an ensemble model.",
)
@click.option(
"--sentencepiece_model", "-p", required=True, help="SentencePiece model file"
"--sentencepiece_model",
"-p",
required=True,
type=click.Path(exists=True, dir_okay=False, path_type=Path),
help="SentencePiece model file",
)
@click.option("--src_file", "-s", required=True, help="File to translate")
@click.option("--output_file", "-o", required=True, help="Where to save translation")
def translate_actions(
translation_models: Tuple[str, ...],
sentencepiece_model: str,
src_file: str,
output_file: str,
@click.option(
"--src_file",
"-s",
required=True,
type=click.Path(exists=True, dir_okay=False, path_type=Path),
help="File to translate",
)
@click.option(
"--output_file",
"-o",
required=True,
type=click.Path(writable=True, path_type=Path),
help="Where to save translation",
)
def main(
translation_models: Tuple[Path, ...],
sentencepiece_model: Path,
src_file: Path,
output_file: Path,
) -> None:
"""
Translate a text with an OpenNMT model.
@@ -30,18 +49,14 @@ def translate_actions(
in the form of tokenization and de-tokenization with sentencepiece.
"""
translator = Translator(
-        translation_model=translation_models, sentencepiece_model=sentencepiece_model
+        translation_model=[str(m) for m in translation_models],
+        sentencepiece_model=str(sentencepiece_model),
)

-    with open(src_file, "rt") as f:
-        sentences = [line.strip() for line in f]
-
+    sentences = load_list_from_file(src_file)
translations = translator.translate_sentences(sentences)

-    with open(output_file, "wt") as f:
-        for t in translations:
-            f.write(f"{t}\n")
+    dump_list_to_file(translations, output_file)


if __name__ == "__main__":
-    translate_actions()
+    main()
7 changes: 1 addition & 6 deletions src/paragraph2actions/translator.py
@@ -20,12 +20,7 @@ def __init__(
sentencepiece_model: path to the sentencepiece model file
"""
self.sp = SentencePieceTokenizer(sentencepiece_model)

-        if isinstance(translation_model, str):
-            translation_model = [translation_model]
-        self.translation_model = list(translation_model)
-
-        self.onmt_translator = RawTranslator.from_model_path(self.translation_model)
+        self.onmt_translator = RawTranslator.from_model_path(translation_model)

def translate_single(self, sentence: str) -> str:
"""
