
Commit

gradient checkpointing
Jemoka committed Jan 16, 2024
1 parent 145eedf commit 6be26ba
Showing 2 changed files with 4 additions and 0 deletions.
3 changes: 3 additions & 0 deletions stanza/models/ner/trainer.py
@@ -72,6 +72,9 @@ def __init__(self, args=None, vocab=None, pretrain=None, model_file=None, device
self.vocab = vocab
self.model = NERTagger(args, vocab, emb_matrix=pretrain.emb, foundation_cache=foundation_cache)

if self.args.get("gradient_checkpointing", False) and self.args.get("bert_finetune", False):
self.bert_model.gradient_checkpointing_enable()

# if this wasn't set anywhere, we use a default of the 0th tagset
# we don't set this as a default in the options so that
# we can distinguish "intentionally set to 0" and "not set at all"
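For context, the trainer change turns on gradient checkpointing for the underlying transformer only when that transformer is actually being finetuned. A minimal sketch of the same pattern with a plain HuggingFace model (the args dict and bert_model variable below are illustrative stand-ins, not Stanza's exact attributes):

from transformers import AutoModel

# Illustrative configuration mirroring the trainer's args dict.
args = {"bert_model": "bert-base-cased", "bert_finetune": True, "gradient_checkpointing": True}
bert_model = AutoModel.from_pretrained(args["bert_model"])

# Checkpointing only matters when the transformer's weights receive gradients,
# so it is gated on bert_finetune, mirroring the trainer change above.
if args.get("gradient_checkpointing", False) and args.get("bert_finetune", False):
    # Recompute intermediate activations during the backward pass instead of
    # storing them, trading extra forward compute for lower peak memory.
    bert_model.gradient_checkpointing_enable()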
1 change: 1 addition & 0 deletions stanza/models/ner_tagger.py
@@ -80,6 +80,7 @@ def build_argparse():
parser.add_argument('--no_bert_model', dest='bert_model', action="store_const", const=None, help="Don't use bert")
parser.add_argument('--bert_hidden_layers', type=int, default=None, help="How many layers of hidden state to use from the transformer")
parser.add_argument('--bert_finetune', default=False, action='store_true', help='Finetune the bert (or other transformer)')
parser.add_argument('--gradient_checkpointing', default=False, action='store_true', help='Checkpoint intermediate gradients between layers to save memory at the cost of training steps')
parser.add_argument('--no_bert_finetune', dest='bert_finetune', action='store_false', help="Don't finetune the bert (or other transformer)")
parser.add_argument('--bert_learning_rate', default=1.0, type=float, help='Scale the learning rate for transformer finetuning by this much')

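The new flag is opt-in (store_true, default False) and only takes effect together with --bert_finetune, since the trainer checks both before enabling checkpointing. A hypothetical invocation, assuming the tagger is launched directly as a module, with the remaining arguments elided:

python -m stanza.models.ner_tagger --bert_finetune --gradient_checkpointing ...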
