Add model-last saving mechanism to pretraining (#12459)

* Adjust pretrain command * change naming and add finally block * Add unit test * Add unit test assertions * Update spacy/training/pretrain.py Co-authored-by: Adriane Boyd <[email protected]> * change finally block * Add to docs * Update website/docs/usage/embeddings-transformers.mdx * Add flag to skip saving model-last --------- Co-authored-by: Adriane Boyd <[email protected]>
explosion · Apr 3, 2023 · 2fbd080 · 2fbd080
1 parent bbf232e
commit 2fbd080
Show file tree

Hide file tree

Showing 5 changed files with 54 additions and 33 deletions.
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
@@ -23,6 +23,7 @@ def pretrain_cli(
     resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
     epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
+    skip_last: bool = Opt(False, "--skip-last", "-L", help="Skip saving model-last.bin"),
     # fmt: on
 ):
     """
@@ -74,6 +75,7 @@ def pretrain_cli(
         epoch_resume=epoch_resume,
         use_gpu=use_gpu,
         silent=False,
+        skip_last=skip_last,
     )
     msg.good("Successfully finished pretrain")
 

diff --git a/spacy/tests/training/test_pretraining.py b/spacy/tests/training/test_pretraining.py
@@ -165,7 +165,8 @@ def test_pretraining_default():
 
 
 @pytest.mark.parametrize("objective", CHAR_OBJECTIVES)
-def test_pretraining_tok2vec_characters(objective):
+@pytest.mark.parametrize("skip_last", (True, False))
+def test_pretraining_tok2vec_characters(objective, skip_last):
     """Test that pretraining works with the character objective"""
     config = Config().from_str(pretrain_string_listener)
     config["pretraining"]["objective"] = objective
@@ -178,10 +179,14 @@ def test_pretraining_tok2vec_characters(objective):
         filled["paths"]["raw_text"] = file_path
         filled = filled.interpolate()
         assert filled["pretraining"]["component"] == "tok2vec"
-        pretrain(filled, tmp_dir)
+        pretrain(filled, tmp_dir, skip_last=skip_last)
         assert Path(tmp_dir / "model0.bin").exists()
         assert Path(tmp_dir / "model4.bin").exists()
         assert not Path(tmp_dir / "model5.bin").exists()
+        if skip_last:
+            assert not Path(tmp_dir / "model-last.bin").exists()
+        else:
+            assert Path(tmp_dir / "model-last.bin").exists()
 
 
 @pytest.mark.parametrize("objective", VECTOR_OBJECTIVES)
@@ -237,6 +242,7 @@ def test_pretraining_tagger_tok2vec(config):
         pretrain(filled, tmp_dir)
         assert Path(tmp_dir / "model0.bin").exists()
         assert Path(tmp_dir / "model4.bin").exists()
+        assert Path(tmp_dir / "model-last.bin").exists()
         assert not Path(tmp_dir / "model5.bin").exists()
 
 

diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py
@@ -24,6 +24,7 @@ def pretrain(
     epoch_resume: Optional[int] = None,
     use_gpu: int = -1,
     silent: bool = True,
+    skip_last: bool = False,
 ):
     msg = Printer(no_print=silent)
     if config["training"]["seed"] is not None:
@@ -60,10 +61,14 @@ def pretrain(
     row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
     msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
 
-    def _save_model(epoch, is_temp=False):
+    def _save_model(epoch, is_temp=False, is_last=False):
         is_temp_str = ".temp" if is_temp else ""
         with model.use_params(optimizer.averages):
-            with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
+            if is_last:
+                save_path = output_dir / f"model-last.bin"
+            else:
+                save_path = output_dir / f"model{epoch}{is_temp_str}.bin"
+            with (save_path).open("wb") as file_:
                 file_.write(model.get_ref("tok2vec").to_bytes())
             log = {
                 "nr_word": tracker.nr_word,
@@ -76,22 +81,26 @@ def _save_model(epoch, is_temp=False):
 
     # TODO: I think we probably want this to look more like the
     # 'create_train_batches' function?
-    for epoch in range(epoch_resume, P["max_epochs"]):
-        for batch_id, batch in enumerate(batcher(corpus(nlp))):
-            docs = ensure_docs(batch)
-            loss = make_update(model, docs, optimizer, objective)
-            progress = tracker.update(epoch, loss, docs)
-            if progress:
-                msg.row(progress, **row_settings)
-            if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
-                _save_model(epoch, is_temp=True)
-
-        if P["n_save_epoch"]:
-            if epoch % P["n_save_epoch"] == 0 or epoch == P["max_epochs"] - 1:
+    try:
+        for epoch in range(epoch_resume, P["max_epochs"]):
+            for batch_id, batch in enumerate(batcher(corpus(nlp))):
+                docs = ensure_docs(batch)
+                loss = make_update(model, docs, optimizer, objective)
+                progress = tracker.update(epoch, loss, docs)
+                if progress:
+                    msg.row(progress, **row_settings)
+                if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
+                    _save_model(epoch, is_temp=True)
+
+            if P["n_save_epoch"]:
+                if epoch % P["n_save_epoch"] == 0 or epoch == P["max_epochs"] - 1:
+                    _save_model(epoch)
+            else:
                 _save_model(epoch)
-        else:
-            _save_model(epoch)
-        tracker.epoch_loss = 0.0
+            tracker.epoch_loss = 0.0
+    finally:
+        if not skip_last:
+            _save_model(P["max_epochs"], is_last=True)
 
 
 def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:

diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
@@ -1122,17 +1122,18 @@ auto-generated by setting `--pretraining` on
 $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [--epoch-resume] [--gpu-id] [overrides]
 ```
 
-| Name                    | Description                                                                                                                                                                                                        |
-| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `config_path`           | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
-| `output_dir`            | Directory to save binary weights to on each epoch. ~~Path (positional)~~                                                                                                                                           |
-| `--code`, `-c`          | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                               |
-| `--resume-path`, `-r`   | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~                                                                                                                          |
-| `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~                                                                |
-| `--gpu-id`, `-g`        | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                                         |
-| `--help`, `-h`          | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                         |
-| overrides               | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~                              |
-| **CREATES**             | The pretrained weights that can be used to initialize `spacy train`.                                                                                                                                               |
+| Name                                               | Description                                                                                                                                                                                                        |
+| -------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `config_path`                                      | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
+| `output_dir`                                       | Directory to save binary weights to on each epoch. ~~Path (positional)~~                                                                                                                                           |
+| `--code`, `-c`                                     | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                               |
+| `--resume-path`, `-r`                              | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~                                                                                                                          |
+| `--epoch-resume`, `-er`                            | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~                                                                |
+| `--gpu-id`, `-g`                                   | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                                         |
+| `--skip-last`, `-L` <Tag variant="new">3.5.2</Tag> | Skip saving `model-last.bin`. Defaults to `False`. ~~bool (flag)~~                                                                                                                                                 |
+| `--help`, `-h`                                     | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                         |
+| overrides                                          | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~                              |
+| **CREATES**                                        | The pretrained weights that can be used to initialize `spacy train`.                                                                                                                                               |
 
 ## evaluate {id="evaluate",version="2",tag="command"}
 

diff --git a/website/docs/usage/embeddings-transformers.mdx b/website/docs/usage/embeddings-transformers.mdx
@@ -746,13 +746,16 @@ this by setting `initialize.init_tok2vec` to the filename of the `.bin` file
 that you want to use from pretraining.
 
 A pretraining step that runs for 5 epochs with an output path of `pretrain/`, as
-an example, produces `pretrain/model0.bin` through `pretrain/model4.bin`. To
-make use of the final output, you could fill in this value in your config file:
+an example, produces `pretrain/model0.bin` through `pretrain/model4.bin` plus a
+copy of the last iteration as `pretrain/model-last.bin`. Additionally, you can
+configure `n_save_epoch` to tell pretraining in which epoch interval it should
+save the current training progress. To use the final output to initialize your
+`tok2vec` layer, you could fill in this value in your config file:
 
 ```ini {title="config.cfg"}
 
 [paths]
-init_tok2vec = "pretrain/model4.bin"
+init_tok2vec = "pretrain/model-last.bin"
 
 [initialize]
 init_tok2vec = ${paths.init_tok2vec}