diff --git a/README.md b/README.md index 8368e96..a529cbc 100644 --- a/README.md +++ b/README.md @@ -84,20 +84,15 @@ src-test.txt src-train.txt src-valid.txt tgt-test.txt tgt-train.txt We train a SentencePiece tokenizer on the train split: ```bash export VOCAB_SIZE=200 # for the production model, a size of 16000 is used -python $CODE_DIR/src/paragraph2actions/scripts/create_sentencepiece_tokenizer.py \ - -i $DATA_DIR/src-train.txt -i $DATA_DIR/tgt-train.txt -m $DATA_DIR/sp_model -v $VOCAB_SIZE +paragraph2actions-create-tokenizer -i $DATA_DIR/src-train.txt -i $DATA_DIR/tgt-train.txt -m $DATA_DIR/sp_model -v $VOCAB_SIZE ``` We then tokenize the data: ```bash -python $CODE_DIR/src/paragraph2actions/scripts/tokenize_with_sentencepiece.py \ - -m $DATA_DIR/sp_model.model -i $DATA_DIR/src-train.txt -o $DATA_DIR/tok-src-train.txt -python $CODE_DIR/src/paragraph2actions/scripts/tokenize_with_sentencepiece.py \ - -m $DATA_DIR/sp_model.model -i $DATA_DIR/src-valid.txt -o $DATA_DIR/tok-src-valid.txt -python $CODE_DIR/src/paragraph2actions/scripts/tokenize_with_sentencepiece.py \ - -m $DATA_DIR/sp_model.model -i $DATA_DIR/tgt-train.txt -o $DATA_DIR/tok-tgt-train.txt -python $CODE_DIR/src/paragraph2actions/scripts/tokenize_with_sentencepiece.py \ - -m $DATA_DIR/sp_model.model -i $DATA_DIR/tgt-valid.txt -o $DATA_DIR/tok-tgt-valid.txt +paragraph2actions-tokenize -m $DATA_DIR/sp_model.model -i $DATA_DIR/src-train.txt -o $DATA_DIR/tok-src-train.txt +paragraph2actions-tokenize -m $DATA_DIR/sp_model.model -i $DATA_DIR/src-valid.txt -o $DATA_DIR/tok-src-valid.txt +paragraph2actions-tokenize -m $DATA_DIR/sp_model.model -i $DATA_DIR/tgt-train.txt -o $DATA_DIR/tok-tgt-train.txt +paragraph2actions-tokenize -m $DATA_DIR/sp_model.model -i $DATA_DIR/tgt-valid.txt -o $DATA_DIR/tok-tgt-valid.txt ``` ## Training @@ -157,15 +152,14 @@ Experimental procedure sentences can then be translated to action sequences with # Update the path to the OpenNMT model as required export MODEL="$DATA_DIR/models/model_step_520000.pt" -python $CODE_DIR/src/paragraph2actions/scripts/translate_actions.py \ - -t $MODEL -p $DATA_DIR/sp_model.model -s $DATA_DIR/src-test.txt -o $DATA_DIR/pred.txt +paragraph2actions-translate -t $MODEL -p $DATA_DIR/sp_model.model -s $DATA_DIR/src-test.txt -o $DATA_DIR/pred.txt ``` ## Evaluation To print the metrics on the predictions, the following command can be used: ```bash -python $CODE_DIR/src/paragraph2actions/scripts/calculate_metrics.py -g $DATA_DIR/tgt-test.txt -p $DATA_DIR/pred.txt +paragraph2actions-calculate-metrics -g $DATA_DIR/tgt-test.txt -p $DATA_DIR/pred.txt ``` @@ -174,12 +168,12 @@ python $CODE_DIR/src/paragraph2actions/scripts/calculate_metrics.py -g $DATA_DIR The following code illustrate how to augment the data for existing sentences and associated action sequences. ```python -from paragraph2actions.action_string_converter import ReadableConverter from paragraph2actions.augmentation.compound_name_augmenter import CompoundNameAugmenter from paragraph2actions.augmentation.compound_quantity_augmenter import CompoundQuantityAugmenter from paragraph2actions.augmentation.duration_augmenter import DurationAugmenter from paragraph2actions.augmentation.temperature_augmenter import TemperatureAugmenter from paragraph2actions.misc import load_samples, TextWithActions +from paragraph2actions.readable_converter import ReadableConverter converter = ReadableConverter() samples = load_samples('test_data/src-test.txt', 'test_data/tgt-test.txt', converter) @@ -228,11 +222,11 @@ STIR for overnight at room temperature. The following code illustrate the postprocessing of actions. ```python -from paragraph2actions.action_string_converter import ReadableConverter from paragraph2actions.postprocessing.filter_postprocessor import FilterPostprocessor from paragraph2actions.postprocessing.noaction_postprocessor import NoActionPostprocessor from paragraph2actions.postprocessing.postprocessor_combiner import PostprocessorCombiner from paragraph2actions.postprocessing.wait_postprocessor import WaitPostprocessor +from paragraph2actions.readable_converter import ReadableConverter converter = ReadableConverter() postprocessor = PostprocessorCombiner([ diff --git a/setup.cfg b/setup.cfg index bedf86e..e0ffa18 100644 --- a/setup.cfg +++ b/setup.cfg @@ -50,5 +50,13 @@ dev = cde = ChemDataExtractor>=1.3.0 +[options.entry_points] +console_scripts = + paragraph2actions-calculate-metrics = paragraph2actions.scripts.calculate_metrics:main + paragraph2actions-create-tokenizer = paragraph2actions.scripts.create_sentencepiece_tokenizer:main + paragraph2actions-generate-annotation-samples = paragraph2actions.scripts.generate_samples_to_annotate:main + paragraph2actions-tokenize = paragraph2actions.scripts.tokenize_with_sentencepiece:main + paragraph2actions-translate = paragraph2actions.scripts.translate_actions:main + [flake8] extend-ignore = E203, E501 diff --git a/src/paragraph2actions/misc.py b/src/paragraph2actions/misc.py index 934afff..9c2aea0 100644 --- a/src/paragraph2actions/misc.py +++ b/src/paragraph2actions/misc.py @@ -1,6 +1,7 @@ from typing import Iterable, List import attr +from rxn.utilities.files import PathLike from .actions import Action from .converter_interface import ActionStringConverter @@ -17,7 +18,7 @@ class TextWithActions: def load_samples( - text_file: str, actions_file: str, converter: ActionStringConverter + text_file: PathLike, actions_file: PathLike, converter: ActionStringConverter ) -> List[TextWithActions]: """ Loads samples of sentences with corresponding actions from files. @@ -43,8 +44,8 @@ def load_samples( def save_samples( samples: Iterable[TextWithActions], converter: ActionStringConverter, - text_file: str, - actions_file: str, + text_file: PathLike, + actions_file: PathLike, ) -> None: """ Saves samples of sentences with corresponding actions to files. diff --git a/src/paragraph2actions/scripts/calculate_metrics.py b/src/paragraph2actions/scripts/calculate_metrics.py index b1bc981..36d06a6 100644 --- a/src/paragraph2actions/scripts/calculate_metrics.py +++ b/src/paragraph2actions/scripts/calculate_metrics.py @@ -1,6 +1,8 @@ +from pathlib import Path from typing import Tuple import click +from rxn.utilities.files import load_list_from_file from paragraph2actions.analysis import ( action_string_validity, @@ -13,27 +15,27 @@ @click.command() @click.option( - "--ground_truth_file", "-g", required=True, help="File containing the ground truth" + "--ground_truth_file", + "-g", + required=True, + type=click.Path(exists=True, dir_okay=False, path_type=Path), + help="File containing the ground truth", ) @click.option( "--prediction_files", "-p", multiple=True, + type=click.Path(exists=True, dir_okay=False, path_type=Path), help="File containing the translations to compare with the" "ground truth", ) -def calculate_metrics( - ground_truth_file: str, prediction_files: Tuple[str, ...] -) -> None: +def main(ground_truth_file: Path, prediction_files: Tuple[Path, ...]) -> None: """Calculate metrics for predictions generated by one or several translation models""" - with open(ground_truth_file, "rt") as f: - ground_truth = [s.strip() for s in f] + ground_truth = load_list_from_file(ground_truth_file) predictions = [] for prediction_file in prediction_files: - with open(prediction_file, "rt") as f: - p = [s.strip() for s in f] - predictions.append(p) + predictions.append(load_list_from_file(prediction_file)) for filename, p in zip(prediction_files, predictions): print(filename) @@ -48,4 +50,4 @@ def calculate_metrics( if __name__ == "__main__": - calculate_metrics() + main() diff --git a/src/paragraph2actions/scripts/create_sentencepiece_tokenizer.py b/src/paragraph2actions/scripts/create_sentencepiece_tokenizer.py index 878d36c..5f1d301 100644 --- a/src/paragraph2actions/scripts/create_sentencepiece_tokenizer.py +++ b/src/paragraph2actions/scripts/create_sentencepiece_tokenizer.py @@ -1,3 +1,4 @@ +from pathlib import Path from typing import Tuple import click @@ -9,15 +10,20 @@ "--inputs", "-i", multiple=True, + type=click.Path(exists=True, dir_okay=False, path_type=Path), help="Input file(s) on which to train sentencepiece", ) @click.option( - "--model", "-m", required=True, help="Where to save the sentencepiece model" + "--model", + "-m", + required=True, + type=click.Path(writable=True, path_type=Path), + help="Where to save the sentencepiece model", ) @click.option("--vocab_size", "-v", default=16000, type=int, help="Vocabulary size") -def save_tokenizer(inputs: Tuple[str, ...], model: str, vocab_size: int) -> None: +def main(inputs: Tuple[Path, ...], model: Path, vocab_size: int) -> None: """Learn sentencepiece model""" - input_files = ",".join(inputs) + input_files = ",".join(str(p) for p in inputs) spm.SentencePieceTrainer.Train( f"--input={input_files} --model_prefix={model} " f"--vocab_size={vocab_size} --character_coverage=1.0 " @@ -26,4 +32,4 @@ def save_tokenizer(inputs: Tuple[str, ...], model: str, vocab_size: int) -> None if __name__ == "__main__": - save_tokenizer() + main() diff --git a/src/paragraph2actions/scripts/generate_samples_to_annotate.py b/src/paragraph2actions/scripts/generate_samples_to_annotate.py index f41cd2b..41c378b 100644 --- a/src/paragraph2actions/scripts/generate_samples_to_annotate.py +++ b/src/paragraph2actions/scripts/generate_samples_to_annotate.py @@ -1,4 +1,5 @@ import random +from pathlib import Path from typing import Callable, List, Set import click @@ -168,17 +169,31 @@ def select_samples( @click.command() -@click.option("--src_in", required=True, help="File containing original sentences") -@click.option("--tgt_in", required=True, help="File containing original sequences") @click.option( - "--src_out", required=True, help="Where to save sentences selected for annotation" + "--src_in", + required=True, + type=click.Path(exists=True, dir_okay=False, path_type=Path), + help="File containing original sentences", ) @click.option( - "--tgt_out", required=True, help="Where to save sequences selected for annotation" + "--tgt_in", + required=True, + type=click.Path(exists=True, dir_okay=False, path_type=Path), + help="File containing original sequences", ) -def generate_samples_to_annotate( - src_in: str, tgt_in: str, src_out: str, tgt_out: str -) -> None: +@click.option( + "--src_out", + required=True, + type=click.Path(writable=True, path_type=Path), + help="Where to save sentences selected for annotation", +) +@click.option( + "--tgt_out", + required=True, + type=click.Path(writable=True, path_type=Path), + help="Where to save sequences selected for annotation", +) +def main(src_in: Path, tgt_in: Path, src_out: Path, tgt_out: Path) -> None: """Generate samples for annotation""" action_string_converter = ReadableConverter() @@ -198,4 +213,4 @@ def generate_samples_to_annotate( if __name__ == "__main__": - generate_samples_to_annotate() + main() diff --git a/src/paragraph2actions/scripts/tokenize_with_sentencepiece.py b/src/paragraph2actions/scripts/tokenize_with_sentencepiece.py index 2572f46..7bd77e0 100644 --- a/src/paragraph2actions/scripts/tokenize_with_sentencepiece.py +++ b/src/paragraph2actions/scripts/tokenize_with_sentencepiece.py @@ -10,9 +10,7 @@ "--output_filename", "-o", required=True, help="Where to save (de)tokenized text" ) @click.option("--reverse", "-r", is_flag=True, help="If given, will do detokenization") -def tokenize_with_sentencepiece( - model: str, input_filename: str, output_filename: str, reverse: bool -) -> None: +def main(model: str, input_filename: str, output_filename: str, reverse: bool) -> None: """Tokenize / detokenize with sentencepiece""" sp = SentencePieceTokenizer(model) @@ -31,4 +29,4 @@ def tokenize_with_sentencepiece( if __name__ == "__main__": - tokenize_with_sentencepiece() + main() diff --git a/src/paragraph2actions/scripts/translate_actions.py b/src/paragraph2actions/scripts/translate_actions.py index f56d86c..422baee 100644 --- a/src/paragraph2actions/scripts/translate_actions.py +++ b/src/paragraph2actions/scripts/translate_actions.py @@ -1,6 +1,8 @@ +from pathlib import Path from typing import Tuple import click +from rxn.utilities.files import dump_list_to_file, load_list_from_file from paragraph2actions.translator import Translator @@ -10,18 +12,35 @@ "--translation_models", "-t", multiple=True, - help="Translation model file. If multiple are given, will " "be an ensemble model.", + type=click.Path(exists=True, dir_okay=False, path_type=Path), + help="Translation model file. If multiple are given, will be an ensemble model.", ) @click.option( - "--sentencepiece_model", "-p", required=True, help="SentencePiece model file" + "--sentencepiece_model", + "-p", + required=True, + type=click.Path(exists=True, dir_okay=False, path_type=Path), + help="SentencePiece model file", ) -@click.option("--src_file", "-s", required=True, help="File to translate") -@click.option("--output_file", "-o", required=True, help="Where to save translation") -def translate_actions( - translation_models: Tuple[str, ...], - sentencepiece_model: str, - src_file: str, - output_file: str, +@click.option( + "--src_file", + "-s", + required=True, + type=click.Path(exists=True, dir_okay=False, path_type=Path), + help="File to translate", +) +@click.option( + "--output_file", + "-o", + required=True, + type=click.Path(writable=True, path_type=Path), + help="Where to save translation", +) +def main( + translation_models: Tuple[Path, ...], + sentencepiece_model: Path, + src_file: Path, + output_file: Path, ) -> None: """ Translate a text with an OpenNMT model. @@ -30,18 +49,14 @@ def translate_actions( in the form of tokenization and de-tokenization with sentencepiece. """ translator = Translator( - translation_model=translation_models, sentencepiece_model=sentencepiece_model + translation_model=[str(m) for m in translation_models], + sentencepiece_model=str(sentencepiece_model), ) - with open(src_file, "rt") as f: - sentences = [line.strip() for line in f] - + sentences = load_list_from_file(src_file) translations = translator.translate_sentences(sentences) - - with open(output_file, "wt") as f: - for t in translations: - f.write(f"{t}\n") + dump_list_to_file(translations, output_file) if __name__ == "__main__": - translate_actions() + main() diff --git a/src/paragraph2actions/translator.py b/src/paragraph2actions/translator.py index 09d63b5..54babd0 100644 --- a/src/paragraph2actions/translator.py +++ b/src/paragraph2actions/translator.py @@ -20,12 +20,7 @@ def __init__( sentencepiece_model: path to the sentencepiece model file """ self.sp = SentencePieceTokenizer(sentencepiece_model) - - if isinstance(translation_model, str): - translation_model = [translation_model] - self.translation_model = list(translation_model) - - self.onmt_translator = RawTranslator.from_model_path(self.translation_model) + self.onmt_translator = RawTranslator.from_model_path(translation_model) def translate_single(self, sentence: str) -> str: """