ADD: fixes and scripts for real-kie runs
benleetownsend committed Jan 29, 2025
1 parent 666d036 commit d277f5e
Showing 12 changed files with 300 additions and 159 deletions.
1 change: 1 addition & 0 deletions docker-compose.yml
@@ -7,6 +7,7 @@ services:
dockerfile: docker/Dockerfile.gpu
volumes:
- .:/Finetune
+ - ../project_fruitfly/datasets:/datasets
deploy:
resources:
reservations:
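The new volume line mounts the sibling project_fruitfly/datasets directory into the container at /datasets, so the real-kie scripts can load data from a fixed absolute path regardless of where the repo is checked out on the host. A minimal sketch of resolving files against that mount from inside the container (the helper and any filenames are hypothetical, not part of the commit):

from pathlib import Path

# /datasets is the container-side mount of ../project_fruitfly/datasets.
DATASETS_ROOT = Path("/datasets")

def dataset_path(name: str) -> Path:
    # Hypothetical helper: resolve a dataset file under the mounted volume
    # and fail early if the volume was not mounted.
    path = DATASETS_ROOT / name
    if not path.exists():
        raise FileNotFoundError(f"{path} not found; is the volume mounted?")
    return path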
2 changes: 1 addition & 1 deletion finetune/base.py
@@ -346,7 +346,7 @@ def _distribute_strategy(self, visible_gpus):
resolved_gpus = all_gpus()

resolved_gpus_string = ["/gpu:{}".format(gpu) for gpu in resolved_gpus]
- if len(resolved_gpus_string) == 1:
+ if len(resolved_gpus_string) <= 1:
distribute_strategy = None
else:
if self.config.per_process_gpu_memory_fraction is not None:
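Loosening the guard from == 1 to <= 1 means a distribution strategy is only constructed when at least two GPUs resolve; a CPU-only run (zero visible GPUs) now falls back to distribute_strategy = None instead of taking the multi-GPU branch. A sketch of the intended behaviour, with the repo's GPU resolver and the per_process_gpu_memory_fraction handling elided:

import tensorflow as tf

def pick_strategy(resolved_gpus):
    # Only construct a strategy for genuinely multi-GPU runs; zero or one
    # device uses TensorFlow's default placement (strategy = None).
    devices = ["/gpu:{}".format(gpu) for gpu in resolved_gpus]
    if len(devices) <= 1:
        return None
    return tf.distribute.MirroredStrategy(devices=devices)

assert pick_strategy([]) is None   # CPU-only run
assert pick_strategy([0]) is None  # single GPU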
4 changes: 3 additions & 1 deletion finetune/base_models/__init__.py
@@ -69,7 +69,7 @@ def translate_base_model_format(cls):
)
from finetune.base_models.tcn.model import TCNModel
from finetune.base_models.oscar.model import GPCModel

+ from finetune.base_models.modern_bert.model import ModernBertModel, ModernBertLargeModel
# Aliases
GPT = GPTModel
GPT2 = GPT2Small = GPT2Model
@@ -85,3 +85,5 @@ def translate_base_model_format(cls):
DistilROBERTA = DistilRoBERTa
TCN = TCNModel
OSCAR = GPCModel
+ ModernBert = ModernBertModel
+ ModernBertLarge = ModernBertLargeModel
8 changes: 6 additions & 2 deletions finetune/base_models/modern_bert/encoding.py
@@ -10,10 +10,14 @@

LOGGER = logging.getLogger("finetune")


class ModernBertEncoder(BaseEncoder):
def __init__(self):
self.tokenizer = Tokenizer.from_file(TOKENIZER_PATH)
- special_tokens_map = {tok.content: k for k, tok in self.tokenizer.get_added_tokens_decoder().items()}
+ special_tokens_map = {
+ tok.content: k
+ for k, tok in self.tokenizer.get_added_tokens_decoder().items()
+ }
self.start_token = special_tokens_map["[CLS]"]
self.delimiter_token = special_tokens_map["[SEP]"]
self.mask_token = special_tokens_map["[MASK]"]
@@ -59,4 +63,4 @@ def _encode(self, texts):

def decode(self, ids):
output = self.tokenizer.decode(ids, skip_special_tokens=True)
- return output
\ No newline at end of file
+ return output
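get_added_tokens_decoder() returns a mapping from token id to AddedToken, so inverting it gives the content-to-id lookup the encoder needs for [CLS], [SEP], and [MASK]. A small round-trip sketch against a standalone tokenizers file; it assumes a recent tokenizers release (which provides get_added_tokens_decoder), and the path stands in for the repo's TOKENIZER_PATH:

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")  # stand-in for TOKENIZER_PATH

# Invert id -> AddedToken into content -> id, as in ModernBertEncoder.
special_tokens_map = {
    tok.content: k for k, tok in tokenizer.get_added_tokens_decoder().items()
}
cls_id = special_tokens_map["[CLS]"]
sep_id = special_tokens_map["[SEP]"]

ids = tokenizer.encode("hello world").ids
# skip_special_tokens=True strips [CLS]/[SEP]/[MASK] again on the way out,
# mirroring the decode() method above.
text = tokenizer.decode(ids, skip_special_tokens=True)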
52 changes: 44 additions & 8 deletions finetune/base_models/modern_bert/model.py
@@ -1,13 +1,11 @@
import os
import tensorflow as tf
- from finetune.base_models.modern_bert.modelling import ModernBert, ModernBertConfig
+ from finetune.base_models.modern_bert.modelling import ModernBert
from finetune.base_models.modern_bert.encoding import ModernBertEncoder
from finetune.base_models import SourceModel


- def featurizer(
- X, encoder, config, train=False, reuse=None, lengths=None, **kwargs
- ):
+ def featurizer(X, encoder, config, train=False, reuse=None, lengths=None, **kwargs):
initial_shape = tf.shape(input=X)
X = tf.reshape(X, shape=tf.concat(([-1], initial_shape[-1:]), 0))
X.set_shape([None, None])
@@ -16,10 +14,11 @@ def featurizer(
seq_length = tf.shape(input=delimiters)[1]
mask = tf.sequence_mask(lengths, maxlen=seq_length, dtype=tf.float32)
with tf.compat.v1.variable_scope("model/featurizer", reuse=reuse):
- # TODO: plumb in the config to the finetune config.
- model = ModernBert(config=ModernBertConfig())
+ model = ModernBert(config=config, vocab_size=encoder.vocab_size)
embedding = model.embeddings
- sequence_out = model(input_ids=X, attention_mask=mask, training=train, seq_len=seq_length)
+ sequence_out = model(
+ input_ids=X, attention_mask=mask, training=train, seq_len=seq_length
+ )
pooled_out = sequence_out[:, 0, :]
pooled_out.set_shape([None, config.n_embed])
n_embed = pooled_out.shape[-1]
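The featurizer builds a padding mask from the true sequence lengths, passes it to the model as attention_mask, and pools by taking the hidden state at position 0 (the [CLS] token). The two steps in isolation, with illustrative shapes and a stand-in tensor for the model output:

import tensorflow as tf

# Padding mask as built before the model call above: 1.0 over real tokens,
# 0.0 over padding, given per-example lengths and the padded seq_length.
lengths = tf.constant([3, 5])
seq_length = 5
mask = tf.sequence_mask(lengths, maxlen=seq_length, dtype=tf.float32)

# CLS pooling as done on the model output: position 0 of [batch, seq, emb].
sequence_out = tf.random.normal([2, seq_length, 8])  # stand-in model output
pooled_out = sequence_out[:, 0, :]                   # shape [2, 8]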
@@ -43,11 +42,21 @@

return output_state

+ SETTINGS = {
+ "lr": 1.5e-4,
+ "n_epochs": 6,
+ "predict_batch_size": 4,
+ "max_grad_norm": 3.0,
+ "l2_reg": 5e-4,
+ "lr_warmup": 0.25, # this seems high but is what the sweep ended up with.
+ "lr_schedule": "warmup_linear",
+ "low_memory_mode": True,
+ }

class ModernBertModel(SourceModel):
encoder = ModernBertEncoder
featurizer = featurizer
- max_length = 2048
+ max_length = 512

settings = {
"base_model_path": os.path.join("modern_bert", "modern_bert.jl"),
@@ -57,6 +66,33 @@ class ModernBertModel(SourceModel):
"n_embed": 768,
"max_length": max_length,
"include_bos_eos": True,
"n_heads": 12,
"batch_size": 8,
# "accum_steps": 8,
"bert_intermediate_size": 1152,
**SETTINGS
}
required_files = []

+
+
+ class ModernBertLargeModel(SourceModel):
+ encoder = ModernBertEncoder
+ featurizer = featurizer
+ max_length = 512
+
+ settings = {
+ "base_model_path": os.path.join("modern_bert", "modern_bert_large.jl"),
+ "n_layer": 28,
+ "train_embeddings": True,
+ "num_layers_trained": 28,
+ "n_embed": 1024,
+ "max_length": max_length,
+ "include_bos_eos": True,
+ "n_heads": 16,
+ "bert_intermediate_size": 2624,
+ "batch_size": 4,
+ # "accum_steps": 16,
+ **SETTINGS
+ }
+ required_files = []
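Both model classes unpack the shared SETTINGS dict last, so on any key collision the swept values would win over per-model entries, and the per-model dicts are left differing only in architecture (layers, width, heads, intermediate size) and batch size. A minimal illustration of that merge order; the colliding n_epochs key here is purely illustrative, since the committed dicts have no keys overlapping SETTINGS:

SETTINGS = {"lr": 1.5e-4, "n_epochs": 6}

settings = {
    "n_layer": 22,
    "n_epochs": 1,  # overridden: **SETTINGS unpacks after this key
    **SETTINGS,
}
assert settings["n_epochs"] == 6 and settings["lr"] == 1.5e-4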
[Diffs for the remaining 7 of the 12 changed files were not rendered on this page.]
