From 5aca48ceb0597bc5322cdc4c1647e02bad92ba91 Mon Sep 17 00:00:00 2001 From: Irina Espejo Morales Date: Tue, 26 Mar 2024 14:55:02 +0100 Subject: [PATCH 01/17] depedencies: get rid of fork opennmt dependency --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 1922784..f4108c7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -25,7 +25,7 @@ include_package_data = True install_requires = attrs>=21.2.0 click>=8.0 - rxn-opennmt-py>=1.1.1 + #rxn-opennmt-py#>=1.1.1 rxn-utils>=1.6.0 [options.packages.find] From 6319d5dd8c2cd2e7735c4c029b8ee8c5d2cebfc0 Mon Sep 17 00:00:00 2001 From: Irina Espejo Morales Date: Tue, 26 Mar 2024 14:56:05 +0100 Subject: [PATCH 02/17] fix: official omnt drops split_corpus(), I add it back as part of source --- .../onmt_utils/internal_translation_utils.py | 33 +++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/src/rxn/onmt_utils/internal_translation_utils.py b/src/rxn/onmt_utils/internal_translation_utils.py index 6eba198..3d2afbd 100644 --- a/src/rxn/onmt_utils/internal_translation_utils.py +++ b/src/rxn/onmt_utils/internal_translation_utils.py @@ -7,10 +7,39 @@ import attr import onmt.opts as opts from onmt.translate.translator import build_translator -from onmt.utils.misc import split_corpus +#from onmt.utils.misc import split_corpus from onmt.utils.parse import ArgumentParser from rxn.utilities.files import named_temporary_path - +from itertools import islice, repeat + +# Introduced back _split_corpus and split_corpus originally in onmt.utils.misc +# This commit gets rid of it: https://github.com/OpenNMT/OpenNMT-py/commit/4dcb2b9478eba32a480364e595f5fff7bd8ca887 +# Since dependencies of split_corpus and _split_corpus are only itertools, it's easier to add them to source code +def _split_corpus(path, shard_size): + """Yield a `list` containing `shard_size` line of `path`. + """ + with open(path, "rb") as f: + if shard_size <= 0: + yield f.readlines() + else: + while True: + shard = list(islice(f, shard_size)) + if not shard: + break + yield shard + + +def split_corpus(path, shard_size, default=None): + """yield a `list` containing `shard_size` line of `path`, + or repeatly generate `default` if `path` is None. 
+ """ + if path is not None: + return _split_corpus(path, shard_size) + else: + return repeat(default) + + + @attr.s(auto_attribs=True) class TranslationResult: From c8591b42bf04400e6c6c5847d5edb3326481ba34 Mon Sep 17 00:00:00 2001 From: Irina Espejo Morales Date: Tue, 9 Apr 2024 17:03:47 +0200 Subject: [PATCH 03/17] temptative removal of rxn-openmt-py dependency --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index f4108c7..65d33f0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -25,7 +25,7 @@ include_package_data = True install_requires = attrs>=21.2.0 click>=8.0 - #rxn-opennmt-py#>=1.1.1 + #rxn-opennmt-py>=1.1.1 # Remove opennmt-py fork dependence rxn-utils>=1.6.0 [options.packages.find] From bda7384ecc2c9a0c965ea3ecd5f8d3298ac5ea8d Mon Sep 17 00:00:00 2001 From: Irina Espejo Morales Date: Mon, 15 Apr 2024 11:44:42 +0200 Subject: [PATCH 04/17] tmp fix: remove some rxn-onmt-py imports --- src/rxn/onmt_utils/train_command.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/rxn/onmt_utils/train_command.py b/src/rxn/onmt_utils/train_command.py index 0a1e3dd..3d628f9 100644 --- a/src/rxn/onmt_utils/train_command.py +++ b/src/rxn/onmt_utils/train_command.py @@ -2,9 +2,10 @@ from enum import Flag from typing import Any, List, Optional, Tuple +from onmt.trainer import Trainer from rxn.utilities.files import PathLike -from .model_introspection import get_model_rnn_size +#from .model_introspection import get_model_rnn_size logger = logging.getLogger(__name__) logger.addHandler(logging.NullHandler()) @@ -88,7 +89,7 @@ def __init__(self, key: str, default: Any, needed_for: RxnCommand): ] -class OnmtTrainCommand: +class OnmtTrainCommand(Trainer): """ Class to build the onmt_command for training models, continuing the training, or finetuning. @@ -272,7 +273,7 @@ def finetune( # In principle, the rnn_size should not be needed for finetuning. However, # when resetting the decay algorithm for the learning rate, this value # is necessary - and does not get it from the model checkpoint (OpenNMT bug). 
- rnn_size = get_model_rnn_size(train_from) + #rnn_size = get_model_rnn_size(train_from) logger.info(f"Loaded the value of rnn_size from the model: {rnn_size}.") return cls( @@ -285,7 +286,7 @@ def finetune( keep_checkpoint=keep_checkpoint, learning_rate=learning_rate, reset_optim="all", - rnn_size=rnn_size, + #rnn_size=rnn_size, save_model=save_model, seed=seed, train_from=train_from, From 1ab01a9519c42abed8052ae26d4e43c11076d685 Mon Sep 17 00:00:00 2001 From: Irina Espejo Morales Date: Tue, 16 Apr 2024 09:28:03 +0200 Subject: [PATCH 05/17] fix: change OnmtTrainCommand cli config argument pass for explicit yaml --- src/rxn/onmt_utils/train_command.py | 42 +++++++++++++++++++---------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/src/rxn/onmt_utils/train_command.py b/src/rxn/onmt_utils/train_command.py index 3d628f9..701c3bc 100644 --- a/src/rxn/onmt_utils/train_command.py +++ b/src/rxn/onmt_utils/train_command.py @@ -1,8 +1,9 @@ import logging from enum import Flag +from pathlib import Path from typing import Any, List, Optional, Tuple -from onmt.trainer import Trainer +import yaml from rxn.utilities.files import PathLike #from .model_introspection import get_model_rnn_size @@ -46,7 +47,7 @@ def __init__(self, key: str, default: Any, needed_for: RxnCommand): self.default = default self.needed_for = needed_for - +# See https://opennmt.net/OpenNMT-py/options/train.html ONMT_TRAIN_ARGS: List[Arg] = [ Arg("accum_count", "4", RxnCommand.TCF), Arg("adam_beta1", "0.9", RxnCommand.TF), @@ -74,12 +75,14 @@ def __init__(self, key: str, default: Any, needed_for: RxnCommand): Arg("position_encoding", "", RxnCommand.T), # note: empty means "nothing" Arg("report_every", "1000", RxnCommand.TCF), Arg("reset_optim", None, RxnCommand.CF), - Arg("rnn_size", None, RxnCommand.TF), + Arg("hidden_size", None, RxnCommand.TF), Arg("save_checkpoint_steps", "5000", RxnCommand.TCF), Arg("save_model", None, RxnCommand.TCF), Arg("seed", None, RxnCommand.TCF), Arg("self_attn_type", "scaled-dot", RxnCommand.T), Arg("share_embeddings", "", RxnCommand.T), # note: empty means "nothing" + Arg("src_vocab", None, RxnCommand.T), + Arg("tgt_vocab", None, RxnCommand.T), Arg("train_from", None, RxnCommand.CF), Arg("train_steps", None, RxnCommand.TCF), Arg("transformer_ff", None, RxnCommand.T), @@ -89,7 +92,7 @@ def __init__(self, key: str, default: Any, needed_for: RxnCommand): ] -class OnmtTrainCommand(Trainer): +class OnmtTrainCommand: """ Class to build the onmt_command for training models, continuing the training, or finetuning. @@ -112,6 +115,7 @@ def _build_cmd(self) -> List[str]: Build the base command. """ command = ["onmt_train"] + for arg in ONMT_TRAIN_ARGS: arg_given = arg.key in self._kwargs @@ -168,11 +172,18 @@ def cmd(self) -> List[str]: """ return self._build_cmd() - def save_to_config_cmd(self, config_file: PathLike) -> List[str]: + def save_to_config_cmd(self, config_file_path: PathLike) -> None: """ - Return the command for saving the config to a file. + Save the training config to a file. 
+ See https://opennmt.net/OpenNMT-py/quickstart.html part 2 """ - return self._build_cmd() + ["-save_config", str(config_file)] + # Build dictionary with build vocab config content + # See structure https://opennmt.net/OpenNMT-py/quickstart.html (Step 1: Prepare the data) + train_config: Dict[str, Any] = {} + + + with open(config_file_path, "w+") as file: + yaml.dump(train_config, file) @staticmethod def execute_from_config_cmd(config_file: PathLike) -> List[str]: @@ -186,11 +197,13 @@ def train( cls, batch_size: int, data: PathLike, + src_vocab: Path, + tgt_vocab: Path, dropout: float, heads: int, layers: int, learning_rate: float, - rnn_size: int, + hidden_size: int, save_model: PathLike, seed: int, train_steps: int, @@ -207,12 +220,14 @@ def train( data_weights=data_weights, batch_size=batch_size, data=data, + src_vocab=src_vocab, + tgt_vocab=tgt_vocab, dropout=dropout, heads=heads, keep_checkpoint=keep_checkpoint, layers=layers, learning_rate=learning_rate, - rnn_size=rnn_size, + hidden_size=hidden_size, save_model=save_model, seed=seed, train_steps=train_steps, @@ -263,18 +278,17 @@ def finetune( train_steps: int, warmup_steps: int, no_gpu: bool, - data_weights: Tuple[int, ...], report_every: int, save_checkpoint_steps: int, keep_checkpoint: int = -1, - rnn_size: Optional[int] = None, + hidden_size: Optional[int] = None, ) -> "OnmtTrainCommand": - if rnn_size is None: + if hidden_size is None: # In principle, the rnn_size should not be needed for finetuning. However, # when resetting the decay algorithm for the learning rate, this value # is necessary - and does not get it from the model checkpoint (OpenNMT bug). #rnn_size = get_model_rnn_size(train_from) - logger.info(f"Loaded the value of rnn_size from the model: {rnn_size}.") + logger.info(f"Loaded the value of hidden_size from the model: {hidden_size}.") return cls( command_type=RxnCommand.F, @@ -286,7 +300,7 @@ def finetune( keep_checkpoint=keep_checkpoint, learning_rate=learning_rate, reset_optim="all", - #rnn_size=rnn_size, + hidden_size=hidden_size, save_model=save_model, seed=seed, train_from=train_from, From 3509f07eed36cc6c2e6a3da0dd1e3440c3eec7dd Mon Sep 17 00:00:00 2001 From: Irina Espejo Morales Date: Tue, 16 Apr 2024 09:42:26 +0200 Subject: [PATCH 06/17] chore: black isort flake8 --- src/rxn/onmt_utils/train_command.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/rxn/onmt_utils/train_command.py b/src/rxn/onmt_utils/train_command.py index 701c3bc..5ddcaea 100644 --- a/src/rxn/onmt_utils/train_command.py +++ b/src/rxn/onmt_utils/train_command.py @@ -1,12 +1,12 @@ import logging from enum import Flag from pathlib import Path -from typing import Any, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import yaml from rxn.utilities.files import PathLike -#from .model_introspection import get_model_rnn_size +# from .model_introspection import get_model_rnn_size logger = logging.getLogger(__name__) logger.addHandler(logging.NullHandler()) @@ -47,6 +47,7 @@ def __init__(self, key: str, default: Any, needed_for: RxnCommand): self.default = default self.needed_for = needed_for + # See https://opennmt.net/OpenNMT-py/options/train.html ONMT_TRAIN_ARGS: List[Arg] = [ Arg("accum_count", "4", RxnCommand.TCF), @@ -181,7 +182,6 @@ def save_to_config_cmd(self, config_file_path: PathLike) -> None: # See structure https://opennmt.net/OpenNMT-py/quickstart.html (Step 1: Prepare the data) train_config: Dict[str, Any] = {} - with open(config_file_path, "w+") as file: 
yaml.dump(train_config, file) @@ -278,6 +278,7 @@ def finetune( train_steps: int, warmup_steps: int, no_gpu: bool, + data_weights: Tuple[int, ...], report_every: int, save_checkpoint_steps: int, keep_checkpoint: int = -1, @@ -287,8 +288,10 @@ def finetune( # In principle, the rnn_size should not be needed for finetuning. However, # when resetting the decay algorithm for the learning rate, this value # is necessary - and does not get it from the model checkpoint (OpenNMT bug). - #rnn_size = get_model_rnn_size(train_from) - logger.info(f"Loaded the value of hidden_size from the model: {hidden_size}.") + # rnn_size = get_model_rnn_size(train_from) + logger.info( + f"Loaded the value of hidden_size from the model: {hidden_size}." + ) return cls( command_type=RxnCommand.F, From 153f682fc639ef8e0e04707b4ce12c43218e9e93 Mon Sep 17 00:00:00 2001 From: Irina Espejo Morales Date: Tue, 16 Apr 2024 10:17:00 +0200 Subject: [PATCH 07/17] chore: more mypy flake8 isort --- pyproject.toml | 1 + .../onmt_utils/internal_translation_utils.py | 44 +++++++++---------- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 740ff57..b77ad08 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,7 @@ check_untyped_defs = true [[tool.mypy.overrides]] module = [ "onmt.*", + "yaml.*", ] ignore_missing_imports = true diff --git a/src/rxn/onmt_utils/internal_translation_utils.py b/src/rxn/onmt_utils/internal_translation_utils.py index 3d2afbd..513908d 100644 --- a/src/rxn/onmt_utils/internal_translation_utils.py +++ b/src/rxn/onmt_utils/internal_translation_utils.py @@ -1,45 +1,43 @@ import copy import os from argparse import Namespace -from itertools import repeat +from itertools import islice, repeat from typing import Any, Iterable, Iterator, List, Optional import attr import onmt.opts as opts from onmt.translate.translator import build_translator -#from onmt.utils.misc import split_corpus + +# from onmt.utils.misc import split_corpus from onmt.utils.parse import ArgumentParser from rxn.utilities.files import named_temporary_path -from itertools import islice, repeat + # Introduced back _split_corpus and split_corpus originally in onmt.utils.misc # This commit gets rid of it: https://github.com/OpenNMT/OpenNMT-py/commit/4dcb2b9478eba32a480364e595f5fff7bd8ca887 # Since dependencies of split_corpus and _split_corpus are only itertools, it's easier to add them to source code def _split_corpus(path, shard_size): - """Yield a `list` containing `shard_size` line of `path`. - """ - with open(path, "rb") as f: - if shard_size <= 0: - yield f.readlines() - else: - while True: - shard = list(islice(f, shard_size)) - if not shard: - break - yield shard + """Yield a `list` containing `shard_size` line of `path`.""" + with open(path, "rb") as f: + if shard_size <= 0: + yield f.readlines() + else: + while True: + shard = list(islice(f, shard_size)) + if not shard: + break + yield shard def split_corpus(path, shard_size, default=None): - """yield a `list` containing `shard_size` line of `path`, - or repeatly generate `default` if `path` is None. - """ - if path is not None: - return _split_corpus(path, shard_size) - else: - return repeat(default) - + """yield a `list` containing `shard_size` line of `path`, + or repeatly generate `default` if `path` is None. 
+ """ + if path is not None: + return _split_corpus(path, shard_size) + else: + return repeat(default) - @attr.s(auto_attribs=True) class TranslationResult: From 80a2187060c1c4733bf0ac96d8bf3af0c5eedd3e Mon Sep 17 00:00:00 2001 From: Irina Espejo Morales Date: Tue, 16 Apr 2024 10:32:09 +0200 Subject: [PATCH 08/17] upgraded tests for python 3.11 --- .github/workflows/docs.yaml | 7 +++++-- .github/workflows/pypi.yaml | 8 +++++--- .github/workflows/tests.yaml | 7 +++++-- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 0774f3a..db0b0ac 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -9,12 +9,15 @@ jobs: build: runs-on: ubuntu-latest name: Build the Sphinx docs + strategy: + matrix: + python-version: ["3.11"] steps: - uses: actions/checkout@v3 - - name: Set up Python 3.8 + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v3 with: - python-version: 3.8 + python-version: ${{ matrix.python-version }} - name: Install package dependencies run: pip install -e . - name: Install sphinx dependencies diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml index e5415f1..aa4f2e4 100644 --- a/.github/workflows/pypi.yaml +++ b/.github/workflows/pypi.yaml @@ -9,13 +9,15 @@ jobs: build-and-publish: name: Build and publish rxn-onmt-utils on PyPI runs-on: ubuntu-latest - + strategy: + matrix: + python-version: ["3.11"] steps: - uses: actions/checkout@master - - name: Python setup 3.9 + - name: Python setup ${{ matrix.python-version }} uses: actions/setup-python@v1 with: - python-version: 3.9 + python-version: ${{ matrix.python-version }} - name: Install build package (for packaging) run: pip install --upgrade build - name: Build dist diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 2c92ef9..af548a2 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -6,12 +6,15 @@ jobs: tests: runs-on: ubuntu-latest name: Style, mypy + strategy: + matrix: + python-version: ["3.11"] steps: - uses: actions/checkout@v3 - - name: Set up Python 3.7 + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v3 with: - python-version: 3.7 + python-version: ${{ matrix.python-version }} - name: Install Dependencies run: pip install -e .[dev] - name: Check black From 7dc2c858074e401058f937ab5bf41a36bc5673e1 Mon Sep 17 00:00:00 2001 From: Irina Espejo Morales Date: Tue, 16 Apr 2024 10:38:49 +0200 Subject: [PATCH 09/17] chore: add torch to mypy stub ignore --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index b77ad08..9087f7a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ check_untyped_defs = true module = [ "onmt.*", "yaml.*", + "torch.*", ] ignore_missing_imports = true From d568d0fbeb11dace1de289d0d95fe1e00b22069d Mon Sep 17 00:00:00 2001 From: Irina Espejo Morales Date: Tue, 16 Apr 2024 14:56:38 +0200 Subject: [PATCH 10/17] fix: correct format of config.yaml to train --- src/rxn/onmt_utils/train_command.py | 73 ++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 2 deletions(-) diff --git a/src/rxn/onmt_utils/train_command.py b/src/rxn/onmt_utils/train_command.py index 5ddcaea..415275e 100644 --- a/src/rxn/onmt_utils/train_command.py +++ b/src/rxn/onmt_utils/train_command.py @@ -91,6 +91,7 @@ def __init__(self, key: str, default: Any, needed_for: RxnCommand): Arg("warmup_steps", None, RxnCommand.TF), Arg("word_vec_size", None, 
RxnCommand.T), ] +# TODO: (Irina) Add new v.3.5.1 arguments like lora_layers, quant_layers if necessary class OnmtTrainCommand: @@ -173,15 +174,83 @@ def cmd(self) -> List[str]: """ return self._build_cmd() + def is_valid_kwarg_value(self, kwarg, value) -> bool: + # NOTE: upgrade to v.3.5.1 + # A lot of the code below is from self._build_cmd() + # In theory, self._build_cmd() could be deprecated but to avoid breaking something, + # it will stay until 100% sure + # Here we jsut need the checks and not construct a command + # TODO: assess deprecation of self._build_cmd() + + # Check if argument is in ONMT_TRAIN_ARGS + for arg in ONMT_TRAIN_ARGS: + if arg.key == kwarg: + onmt_train_kwarg = arg + + try: + onmt_train_kwarg + except NameError: + NameError(f"Argument {kwarg} doesn't exist in ONMT_TRAIN_ARGS.") + + # Check argument is needed for command + if self._command_type not in onmt_train_kwarg.needed_for: + raise ValueError( + f'"{value}" value given for arg {kwarg}, but not necessary for command {self._command_type}' + ) + # Check if argument has no default and needs a value + if onmt_train_kwarg.default is None and value is None: + raise ValueError(f"No value given for {kwarg} and needs one.") + + return True + def save_to_config_cmd(self, config_file_path: PathLike) -> None: """ Save the training config to a file. See https://opennmt.net/OpenNMT-py/quickstart.html part 2 """ - # Build dictionary with build vocab config content - # See structure https://opennmt.net/OpenNMT-py/quickstart.html (Step 1: Prepare the data) + # Build train config content, it will not include defaults not specified in cli + # See structure https://opennmt.net/OpenNMT-py/quickstart.html (Step 2: Train) train_config: Dict[str, Any] = {} + # Dump all cli arguments to dict + for kwarg, value in self._kwargs.items(): + if self.is_valid_kwarg_value(kwarg, value): + train_config[kwarg] = value + else: + raise ValueError(f'"Value {value}" for argument {kwarg} is invalid') + + # Reformat "data" argument as in ONMT-py v.3.5.0 + path_save_prepr_data = train_config["data"] + train_config["save_data"] = str(path_save_prepr_data) + # TODO: update to > 1 corpus + train_config["data"] = {"corpus_1": {}, "valid": {}} + train_config["data"]["corpus_1"]["path_src"] = str( + path_save_prepr_data.parent.parent + / "data.processed.train.precursors_tokens" + ) + train_config["data"]["corpus_1"]["path_tgt"] = str( + path_save_prepr_data.parent.parent / "data.processed.train.products_tokens" + ) + train_config["data"]["valid"]["path_src"] = str( + path_save_prepr_data.parent.parent + / "data.processed.validation.precursors_tokens" + ) + train_config["data"]["valid"]["path_tgt"] = str( + path_save_prepr_data.parent.parent + / "data.processed.validation.products_tokens" + ) + + train_config["src_vocab"] = str( + train_config["src_vocab"] + ) # avoid posix bad format in yaml + train_config["tgt_vocab"] = str( + train_config["tgt_vocab"] + ) # avoid posix bad format in yaml + train_config["save_model"] = str( + train_config["save_model"] + ) # avoid posix bad format in yaml + + # Dump to config.yaml with open(config_file_path, "w+") as file: yaml.dump(train_config, file) From 765d97e1371bc2ed00b4954defa8c4e14b5f86c5 Mon Sep 17 00:00:00 2001 From: Irina Espejo Morales Date: Tue, 30 Apr 2024 10:11:45 +0200 Subject: [PATCH 11/17] fix: correct gpu argument format onmt-py --- src/rxn/onmt_utils/train_command.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/rxn/onmt_utils/train_command.py b/src/rxn/onmt_utils/train_command.py index 
415275e..8039f5e 100644 --- a/src/rxn/onmt_utils/train_command.py +++ b/src/rxn/onmt_utils/train_command.py @@ -3,6 +3,7 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Tuple +import torch import yaml from rxn.utilities.files import PathLike @@ -212,6 +213,10 @@ def save_to_config_cmd(self, config_file_path: PathLike) -> None: # See structure https://opennmt.net/OpenNMT-py/quickstart.html (Step 2: Train) train_config: Dict[str, Any] = {} + # GPUs + if torch.cuda.is_available() and self._no_gpu is False: + train_config["gpu_ranks"] = [0] + # Dump all cli arguments to dict for kwarg, value in self._kwargs.items(): if self.is_valid_kwarg_value(kwarg, value): From f1a0b970411aac308a3cba36c942297933a4dd91 Mon Sep 17 00:00:00 2001 From: Irina Espejo Morales Date: Tue, 30 Apr 2024 14:59:22 +0200 Subject: [PATCH 12/17] fix: added updated defaults arguments --- src/rxn/onmt_utils/train_command.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/src/rxn/onmt_utils/train_command.py b/src/rxn/onmt_utils/train_command.py index 8039f5e..bfebdee 100644 --- a/src/rxn/onmt_utils/train_command.py +++ b/src/rxn/onmt_utils/train_command.py @@ -73,8 +73,8 @@ def __init__(self, key: str, default: Any, needed_for: RxnCommand): Arg("normalization", "tokens", RxnCommand.TCF), Arg("optim", "adam", RxnCommand.TF), Arg("param_init", "0", RxnCommand.T), - Arg("param_init_glorot", "", RxnCommand.T), # note: empty means "nothing" - Arg("position_encoding", "", RxnCommand.T), # note: empty means "nothing" + Arg("param_init_glorot", "true", RxnCommand.T), # note: empty means "nothing" + Arg("position_encoding", "true", RxnCommand.T), # note: empty means "nothing" Arg("report_every", "1000", RxnCommand.TCF), Arg("reset_optim", None, RxnCommand.CF), Arg("hidden_size", None, RxnCommand.TF), @@ -82,7 +82,7 @@ def __init__(self, key: str, default: Any, needed_for: RxnCommand): Arg("save_model", None, RxnCommand.TCF), Arg("seed", None, RxnCommand.TCF), Arg("self_attn_type", "scaled-dot", RxnCommand.T), - Arg("share_embeddings", "", RxnCommand.T), # note: empty means "nothing" + Arg("share_embeddings", "true", RxnCommand.T), # note: empty means "nothing" Arg("src_vocab", None, RxnCommand.T), Arg("tgt_vocab", None, RxnCommand.T), Arg("train_from", None, RxnCommand.CF), @@ -217,12 +217,20 @@ def save_to_config_cmd(self, config_file_path: PathLike) -> None: if torch.cuda.is_available() and self._no_gpu is False: train_config["gpu_ranks"] = [0] - # Dump all cli arguments to dict - for kwarg, value in self._kwargs.items(): - if self.is_valid_kwarg_value(kwarg, value): - train_config[kwarg] = value + for arg in ONMT_TRAIN_ARGS: + arg_given = arg.key in self._kwargs + + if arg_given: + train_config[arg.key] = self._kwargs[arg.key] else: - raise ValueError(f'"Value {value}" for argument {kwarg} is invalid') + train_config[arg.key] = arg.default + + # Dump all cli arguments to dict + # for kwarg, value in self._kwargs.items(): + # if self.is_valid_kwarg_value(kwarg, value): + # train_config[kwarg] = value + # else: + # raise ValueError(f'"Value {value}" for argument {kwarg} is invalid') # Reformat "data" argument as in ONMT-py v.3.5.0 path_save_prepr_data = train_config["data"] @@ -255,6 +263,9 @@ def save_to_config_cmd(self, config_file_path: PathLike) -> None: train_config["save_model"] ) # avoid posix bad format in yaml + # share_vocab + train_config["share_vocab"] = str(True) + # Dump to config.yaml with open(config_file_path, "w+") as file: 
yaml.dump(train_config, file) From 6725e6dad0c3fc563aa3eba498289c207fe9b8e2 Mon Sep 17 00:00:00 2001 From: Irina Espejo Morales Date: Fri, 24 May 2024 12:41:30 +0200 Subject: [PATCH 13/17] fix: config file paths for src, tgt also for retro --- src/rxn/onmt_utils/train_command.py | 55 +++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 14 deletions(-) diff --git a/src/rxn/onmt_utils/train_command.py b/src/rxn/onmt_utils/train_command.py index bfebdee..b92e7b5 100644 --- a/src/rxn/onmt_utils/train_command.py +++ b/src/rxn/onmt_utils/train_command.py @@ -13,6 +13,28 @@ logger.addHandler(logging.NullHandler()) +def get_paths_src_tgt(dir: PathLike, model_task: str) -> Tuple[str, str, str, str]: + # reaction is A -> B irregardless of retro or forward + if model_task == "forward": + A = "precursors" + B = "products" + elif model_task == "retro": + A = "products" + B = "precursors" + pass + else: + raise ValueError( + f"Argument model_task can only be 'forward' or 'retro' but received {model_task}" + ) + + corpus_path_src = f"{dir}/data.processed.train.{A}_tokens" + corpus_path_tgt = f"{dir}/data.processed.train.{B}_tokens" + valid_path_src = f"{dir}/data.processed.validation.{A}_tokens" + valid_path_tgt = f"{dir}/data.processed.validation.{B}_tokens" + + return corpus_path_src, corpus_path_tgt, valid_path_src, valid_path_tgt + + class RxnCommand(Flag): """ Flag indicating which command(s) the parameters relate to. @@ -106,12 +128,14 @@ def __init__( command_type: RxnCommand, no_gpu: bool, data_weights: Tuple[int, ...], + model_task: str, **kwargs: Any, ): self._command_type = command_type self._no_gpu = no_gpu self._data_weights = data_weights self._kwargs = kwargs + self.model_task = model_task def _build_cmd(self) -> List[str]: """ @@ -237,22 +261,19 @@ def save_to_config_cmd(self, config_file_path: PathLike) -> None: train_config["save_data"] = str(path_save_prepr_data) # TODO: update to > 1 corpus train_config["data"] = {"corpus_1": {}, "valid": {}} - train_config["data"]["corpus_1"]["path_src"] = str( - path_save_prepr_data.parent.parent - / "data.processed.train.precursors_tokens" - ) - train_config["data"]["corpus_1"]["path_tgt"] = str( - path_save_prepr_data.parent.parent / "data.processed.train.products_tokens" - ) - train_config["data"]["valid"]["path_src"] = str( - path_save_prepr_data.parent.parent - / "data.processed.validation.precursors_tokens" - ) - train_config["data"]["valid"]["path_tgt"] = str( - path_save_prepr_data.parent.parent - / "data.processed.validation.products_tokens" + + # get data files path, caution depends on task because ONMT preprocessed files in v.3.5.1 aren't fully processed as with earlier versions + corpus_path_src, corpus_path_tgt, valid_path_src, valid_path_tgt = ( + get_paths_src_tgt( + dir=path_save_prepr_data.parent.parent, model_task=self.model_task + ) ) + train_config["data"]["corpus_1"]["path_src"] = corpus_path_src + train_config["data"]["corpus_1"]["path_tgt"] = corpus_path_tgt + train_config["data"]["valid"]["path_src"] = valid_path_src + train_config["data"]["valid"]["path_tgt"] = valid_path_tgt + train_config["src_vocab"] = str( train_config["src_vocab"] ) # avoid posix bad format in yaml @@ -297,6 +318,7 @@ def train( word_vec_size: int, no_gpu: bool, data_weights: Tuple[int, ...], + model_task: str, keep_checkpoint: int = -1, ) -> "OnmtTrainCommand": return cls( @@ -319,6 +341,7 @@ def train( transformer_ff=transformer_ff, warmup_steps=warmup_steps, word_vec_size=word_vec_size, + model_task=model_task, ) @classmethod @@ -333,6 
+356,7 @@ def continue_training( train_steps: int, no_gpu: bool, data_weights: Tuple[int, ...], + model_task: str, keep_checkpoint: int = -1, ) -> "OnmtTrainCommand": return cls( @@ -348,6 +372,7 @@ def continue_training( seed=seed, train_from=train_from, train_steps=train_steps, + model_task=model_task, ) @classmethod @@ -366,6 +391,7 @@ def finetune( data_weights: Tuple[int, ...], report_every: int, save_checkpoint_steps: int, + model_task: str, keep_checkpoint: int = -1, hidden_size: Optional[int] = None, ) -> "OnmtTrainCommand": @@ -396,6 +422,7 @@ def finetune( warmup_steps=warmup_steps, report_every=report_every, save_checkpoint_steps=save_checkpoint_steps, + model_task=model_task, ) From 6c7b563e17cae6d34cd4e9ce69cc85a791b7dc16 Mon Sep 17 00:00:00 2001 From: Irina Espejo Morales Date: Mon, 27 May 2024 09:46:29 +0200 Subject: [PATCH 14/17] fix: remove -log_probs argument from rxn-translate --- src/rxn/onmt_utils/translate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rxn/onmt_utils/translate.py b/src/rxn/onmt_utils/translate.py index 3c3a689..8c0d1ad 100644 --- a/src/rxn/onmt_utils/translate.py +++ b/src/rxn/onmt_utils/translate.py @@ -158,7 +158,7 @@ def translate_as_external_command( str(src), "-output", str(output), - "-log_probs", + #"-log_probs", "-n_best", str(n_best), "-beam_size", From bb9a2168aeabbfc265c4da4c3ef118a510cf130e Mon Sep 17 00:00:00 2001 From: Irina Espejo Morales Date: Wed, 29 May 2024 15:05:23 +0200 Subject: [PATCH 15/17] temptative fix: translation utils upgrade --- .../onmt_utils/internal_translation_utils.py | 86 ++++++++++++++----- 1 file changed, 63 insertions(+), 23 deletions(-) diff --git a/src/rxn/onmt_utils/internal_translation_utils.py b/src/rxn/onmt_utils/internal_translation_utils.py index 513908d..8675cd5 100644 --- a/src/rxn/onmt_utils/internal_translation_utils.py +++ b/src/rxn/onmt_utils/internal_translation_utils.py @@ -6,6 +6,8 @@ import attr import onmt.opts as opts +from onmt.constants import CorpusTask +from onmt.inputters.dynamic_iterator import build_dynamic_dataset_iter from onmt.translate.translator import build_translator # from onmt.utils.misc import split_corpus @@ -29,7 +31,7 @@ def _split_corpus(path, shard_size): yield shard -def split_corpus(path, shard_size, default=None): +def split_corpus(path, shard_size=0, default=None): """yield a `list` containing `shard_size` line of `path`, or repeatly generate `default` if `path` is None. 
""" @@ -128,29 +130,64 @@ def translate_with_onmt(self, opt) -> Iterator[List[TranslationResult]]: """ # for some versions, it seems that n_best is not updated, we therefore do it manually here self.internal_translator.n_best = opt.n_best - - src_shards = split_corpus(opt.src, opt.shard_size) - tgt_shards = ( - split_corpus(opt.tgt, opt.shard_size) - if opt.tgt is not None - else repeat(None) + opt.shard_size = 0 # IRINA + + opt.src_dir = opt.src.parent + #import ipdb + #ipdb.set_trace() + #src_shards = split_corpus(opt.src, opt.shard_size) + #tgt_shards = ( + # split_corpus(opt.tgt, opt.shard_size) + # if opt.tgt is not None + # else repeat(None) + #) + #shard_pairs = zip(src_shards, tgt_shards) + + infer_iter = build_dynamic_dataset_iter( + opt=opt, + transforms_cls=opt.transforms, + vocabs=self.internal_translator.vocabs, + task=CorpusTask.INFER, + device_id=opt.gpu, ) - shard_pairs = zip(src_shards, tgt_shards) - - for i, (src_shard, tgt_shard) in enumerate(shard_pairs): - l1, l2 = self.internal_translator.translate( - src=src_shard, - tgt=tgt_shard, - src_dir=opt.src_dir, - batch_size=opt.batch_size, - batch_type=opt.batch_type, + l1_total, l2_total = self.internal_translator._translate( # IRINA + infer_iter=infer_iter, attn_debug=opt.attn_debug, - ) - for score_list, translation_list in zip(l1, l2): - yield [ - TranslationResult(text=t, score=s.item()) - for s, t in zip(score_list, translation_list) - ] + + ) + for score_list, translation_list in zip(l1_total, l2_total): + yield [ + TranslationResult(text=t, score=s) + for s, t in zip(score_list, translation_list) + ] + + # for i, (src_shard, tgt_shard) in enumerate(shard_pairs): + # #import ipdb + # #ipdb.set_trace() + # infer_iter = build_dynamic_dataset_iter( + # opt=opt, + # transforms_cls=opt.transforms, + # vocabs=self.internal_translator.vocabs, + # task=CorpusTask.INFER, + # #device_id=self.device_id, + # src=src_shard, + # tgt=tgt_shard, + # ) + # l1, l2 = self.internal_translator._translate( # IRINA + # infer_iter=infer_iter, + # attn_debug=opt.attn_debug, + # # src=src_shard, + # # tgt=tgt_shard, + # # src_dir=opt.src_dir, + # # batch_size=opt.batch_size, + # # batch_type=opt.batch_type, + # # attn_debug=opt.attn_debug, + # ) + # for score_list, translation_list in zip(l1, l2): + # yield [ + # TranslationResult(text=t, score=s.item()) + # for s, t in zip(score_list, translation_list) + # ] def get_onmt_opt( @@ -192,7 +229,10 @@ def onmt_parser() -> ArgumentParser: parser = ArgumentParser(description="translate.py") - opts.config_opts(parser) + #opts.config_opts(parser) # IRINA + + #import ipdb + #ipdb.set_trace() opts.translate_opts(parser) return parser From 3561002cb13158fb91fa5690e0d1cd406a6b0f5f Mon Sep 17 00:00:00 2001 From: Irina Espejo Morales Date: Tue, 22 Oct 2024 12:30:58 +0200 Subject: [PATCH 16/17] fix: delete intermediate variables to avoid OOM when generating --- .../onmt_utils/internal_translation_utils.py | 28 +++++++++---------- src/rxn/onmt_utils/model_introspection.py | 3 +- src/rxn/onmt_utils/train_command.py | 1 + src/rxn/onmt_utils/translator.py | 5 +++- 4 files changed, 21 insertions(+), 16 deletions(-) diff --git a/src/rxn/onmt_utils/internal_translation_utils.py b/src/rxn/onmt_utils/internal_translation_utils.py index 8675cd5..cc106ba 100644 --- a/src/rxn/onmt_utils/internal_translation_utils.py +++ b/src/rxn/onmt_utils/internal_translation_utils.py @@ -6,6 +6,7 @@ import attr import onmt.opts as opts +import torch from onmt.constants import CorpusTask from onmt.inputters.dynamic_iterator import 
build_dynamic_dataset_iter from onmt.translate.translator import build_translator @@ -117,6 +118,7 @@ def translate_sentences_with_onmt( else: yield translation_results + @torch.no_grad() def translate_with_onmt(self, opt) -> Iterator[List[TranslationResult]]: """ Do the translation (in tokenized format) with OpenNMT. @@ -130,18 +132,11 @@ def translate_with_onmt(self, opt) -> Iterator[List[TranslationResult]]: """ # for some versions, it seems that n_best is not updated, we therefore do it manually here self.internal_translator.n_best = opt.n_best - opt.shard_size = 0 # IRINA opt.src_dir = opt.src.parent - #import ipdb - #ipdb.set_trace() - #src_shards = split_corpus(opt.src, opt.shard_size) - #tgt_shards = ( - # split_corpus(opt.tgt, opt.shard_size) - # if opt.tgt is not None - # else repeat(None) - #) - #shard_pairs = zip(src_shards, tgt_shards) + + #pprint.pprint(opt) + infer_iter = build_dynamic_dataset_iter( opt=opt, @@ -150,16 +145,19 @@ def translate_with_onmt(self, opt) -> Iterator[List[TranslationResult]]: task=CorpusTask.INFER, device_id=opt.gpu, ) + l1_total, l2_total = self.internal_translator._translate( # IRINA infer_iter=infer_iter, attn_debug=opt.attn_debug, - ) + + del infer_iter for score_list, translation_list in zip(l1_total, l2_total): yield [ TranslationResult(text=t, score=s) for s, t in zip(score_list, translation_list) ] + del l1_total, l2_total # for i, (src_shard, tgt_shard) in enumerate(shard_pairs): # #import ipdb @@ -219,6 +217,10 @@ def get_onmt_opt( setattr(opt, key, value) ArgumentParser.validate_translate_opts(opt) + #opt.random_sampling_topk = 1.0 + #opt.length_penalty = "none" + #opt.alpha = 0 + return opt @@ -231,8 +233,6 @@ def onmt_parser() -> ArgumentParser: #opts.config_opts(parser) # IRINA - #import ipdb - #ipdb.set_trace() opts.translate_opts(parser) - return parser + return parser \ No newline at end of file diff --git a/src/rxn/onmt_utils/model_introspection.py b/src/rxn/onmt_utils/model_introspection.py index e6e9cd2..558355c 100644 --- a/src/rxn/onmt_utils/model_introspection.py +++ b/src/rxn/onmt_utils/model_introspection.py @@ -2,7 +2,8 @@ from typing import Any, Dict, List import torch -from onmt.inputters.text_dataset import TextMultiField + +#from onmt.inputters.text_dataset import TextMultiField from rxn.utilities.files import PathLike diff --git a/src/rxn/onmt_utils/train_command.py b/src/rxn/onmt_utils/train_command.py index b92e7b5..98694c8 100644 --- a/src/rxn/onmt_utils/train_command.py +++ b/src/rxn/onmt_utils/train_command.py @@ -113,6 +113,7 @@ def __init__(self, key: str, default: Any, needed_for: RxnCommand): Arg("valid_batch_size", "8", RxnCommand.TCF), Arg("warmup_steps", None, RxnCommand.TF), Arg("word_vec_size", None, RxnCommand.T), + Arg("pos_ffn_activation_fn", "relu", RxnCommand.T), ] # TODO: (Irina) Add new v.3.5.1 arguments like lora_layers, quant_layers if necessary diff --git a/src/rxn/onmt_utils/translator.py b/src/rxn/onmt_utils/translator.py index cc97173..415043d 100644 --- a/src/rxn/onmt_utils/translator.py +++ b/src/rxn/onmt_utils/translator.py @@ -1,6 +1,8 @@ from argparse import Namespace from typing import Any, Iterable, Iterator, List, Optional, Union +import torch + from .internal_translation_utils import RawTranslator, TranslationResult, get_onmt_opt @@ -27,6 +29,7 @@ def translate_single(self, sentence: str) -> str: assert len(translations) == 1 return translations[0] + @torch.no_grad() def translate_sentences(self, sentences: Iterable[str]) -> List[str]: """ Translate multiple sentences. 
@@ -34,6 +37,7 @@ def translate_sentences(self, sentences: Iterable[str]) -> List[str]: translations = self.translate_multiple_with_scores(sentences) return [t[0].text for t in translations] + @torch.no_grad() def translate_multiple_with_scores( self, sentences: Iterable[str], n_best: Optional[int] = None ) -> Iterator[List[TranslationResult]]: @@ -51,7 +55,6 @@ def translate_multiple_with_scores( translations = self.onmt_translator.translate_sentences_with_onmt( sentences, **additional_opt_kwargs ) - yield from translations @classmethod From 0058c723c7371c6ff3b88647247c9e44cf1ffaa7 Mon Sep 17 00:00:00 2001 From: Helder Lopes Date: Wed, 30 Oct 2024 10:46:29 +0000 Subject: [PATCH 17/17] fix: updates to continue training for newest OpenNMT-py --- src/rxn/onmt_utils/model_introspection.py | 18 +++++++++++++----- src/rxn/onmt_utils/train_command.py | 4 ++++ 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/rxn/onmt_utils/model_introspection.py b/src/rxn/onmt_utils/model_introspection.py index 558355c..d07b6dc 100644 --- a/src/rxn/onmt_utils/model_introspection.py +++ b/src/rxn/onmt_utils/model_introspection.py @@ -30,6 +30,14 @@ def get_preprocessed_vocab(vocab_path: PathLike) -> List[str]: return _torch_vocab_to_list(vocab) +def read_vocab_file(file_path): + vocab = [] + with open(file_path, 'r') as file: + for line in file: + vocab.append(line.split()[0]) # Split each line and take the first element + return vocab + + def model_vocab_is_compatible(model_pt: PathLike, vocab_pt: PathLike) -> bool: """ Determine whether the vocabulary contained in a model checkpoint contains @@ -40,20 +48,20 @@ def model_vocab_is_compatible(model_pt: PathLike, vocab_pt: PathLike) -> bool: vocab_pt: vocab file, such as `preprocessed.vocab.pt`. """ model_vocab = set(get_model_vocab(model_pt)) - data_vocab = set(get_preprocessed_vocab(vocab_pt)) + data_vocab = set(read_vocab_file(vocab_pt)) return data_vocab.issubset(model_vocab) def _torch_vocab_to_list(vocab: Dict[str, Any]) -> List[str]: - src_vocab = _multifield_vocab_to_list(vocab["src"]) - tgt_vocab = _multifield_vocab_to_list(vocab["tgt"]) + src_vocab = vocab["src"] #_multifield_vocab_to_list(vocab["src"]) + tgt_vocab = vocab["tgt"] #_multifield_vocab_to_list(vocab["tgt"]) if src_vocab != tgt_vocab: raise RuntimeError("Handling of different src/tgt vocab not implemented") return src_vocab -def _multifield_vocab_to_list(multifield: TextMultiField) -> List[str]: - return multifield.base_field.vocab.itos[:] +#def _multifield_vocab_to_list(multifield: TextMultiField) -> List[str]: +# return multifield.base_field.vocab.itos[:] def get_model_opt(model_path: PathLike) -> Namespace: diff --git a/src/rxn/onmt_utils/train_command.py b/src/rxn/onmt_utils/train_command.py index 98694c8..4298543 100644 --- a/src/rxn/onmt_utils/train_command.py +++ b/src/rxn/onmt_utils/train_command.py @@ -350,6 +350,8 @@ def continue_training( cls, batch_size: int, data: PathLike, + src_vocab: Path, + tgt_vocab: Path, dropout: float, save_model: PathLike, seed: int, @@ -366,6 +368,8 @@ def continue_training( data_weights=data_weights, batch_size=batch_size, data=data, + src_vocab=src_vocab, + tgt_vocab=tgt_vocab, dropout=dropout, keep_checkpoint=keep_checkpoint, reset_optim="none",
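
A minimal usage sketch for the split_corpus helper reintroduced in patch 02. The input file name is a placeholder; the helper yields lists of raw byte lines, one shard per shard_size lines, or the whole file at once when shard_size <= 0.

# Usage sketch for the split_corpus helper re-added to internal_translation_utils.
# "precursors_tokens.txt" is a hypothetical tokenized input file.
from rxn.onmt_utils.internal_translation_utils import split_corpus

for shard in split_corpus("precursors_tokens.txt", shard_size=1000):
    # Each shard is a list of at most 1000 raw lines (bytes objects), because
    # the helper opens the file in binary mode.
    print(len(shard))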
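
For reference, the save_to_config_cmd() rework (patches 05, 10, 12 and 13) no longer appends "-save_config" to the command; it writes an OpenNMT-py v3 style YAML file directly. The sketch below shows the data-related part of the resulting config for a forward model; all paths are illustrative placeholders, and the real file also contains the remaining ONMT_TRAIN_ARGS values.

# Hedged sketch of the YAML training config produced by save_to_config_cmd().
import yaml

train_config = {
    "save_data": "/runs/forward/preprocessed/data",  # derived from the "data" argument
    "src_vocab": "/runs/forward/data.vocab.src",     # stringified to avoid PosixPath objects in YAML
    "tgt_vocab": "/runs/forward/data.vocab.tgt",
    "share_vocab": "True",
    "save_model": "/runs/forward/model",
    "gpu_ranks": [0],  # only set when CUDA is available and no_gpu is False
    "data": {
        "corpus_1": {
            "path_src": "/runs/forward/data.processed.train.precursors_tokens",
            "path_tgt": "/runs/forward/data.processed.train.products_tokens",
        },
        "valid": {
            "path_src": "/runs/forward/data.processed.validation.precursors_tokens",
            "path_tgt": "/runs/forward/data.processed.validation.products_tokens",
        },
    },
}

with open("config.yaml", "w") as f:
    yaml.dump(train_config, f)

Training is then launched from this file (onmt_train with the -config option, as described in the OpenNMT-py quickstart linked in the patches).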
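
The get_paths_src_tgt() helper added in patch 13 decides the direction of the data files. A small illustration, with a placeholder directory:

# Forward/retro source-target swap performed by get_paths_src_tgt().
from rxn.onmt_utils.train_command import get_paths_src_tgt

corpus_src, corpus_tgt, valid_src, valid_tgt = get_paths_src_tgt(
    dir="/runs/retro", model_task="retro"
)
# For "retro" the products become the source and the precursors the target:
#   corpus_src == "/runs/retro/data.processed.train.products_tokens"
#   corpus_tgt == "/runs/retro/data.processed.train.precursors_tokens"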
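
Patch 15 replaces the old shard-based translation loop with the OpenNMT-py v3 dynamic dataset iterator. Stripped of the commented-out experiments, the new inference path boils down to the sketch below; the argument names are taken from the patch, `translator` stands for the object returned by build_translator, and `_translate` is an OpenNMT-py internal, so treat this as an outline rather than a stable API.

# Condensed sketch of the upgraded inference path (OpenNMT-py v3).
import torch
from onmt.constants import CorpusTask
from onmt.inputters.dynamic_iterator import build_dynamic_dataset_iter


@torch.no_grad()
def translate_with_dynamic_iter(translator, opt):
    infer_iter = build_dynamic_dataset_iter(
        opt=opt,
        transforms_cls=opt.transforms,
        vocabs=translator.vocabs,
        task=CorpusTask.INFER,
        device_id=opt.gpu,
    )
    scores, predictions = translator._translate(
        infer_iter=infer_iter, attn_debug=opt.attn_debug
    )
    # Release the iterator eagerly, as in patch 16, to avoid out-of-memory
    # issues when generating many predictions.
    del infer_iter
    return scores, predictions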
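
Finally, patch 17 switches model_vocab_is_compatible() from the old torch-serialized vocabulary to a plain-text vocabulary file (one token per line, optionally followed by a count, as produced for example by onmt_build_vocab). A short sketch, with a placeholder file name:

# Vocabulary format now read by read_vocab_file().
# Example content of "data.vocab.src":
#   C 120394
#   ( 98012
#   ) 98012
from rxn.onmt_utils.model_introspection import read_vocab_file

tokens = read_vocab_file("data.vocab.src")
print(tokens[:3])  # -> ['C', '(', ')']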