ADD: optimize for settings
benleetownsend committed Feb 7, 2025
1 parent d277f5e commit 005a846
Showing 4 changed files with 93 additions and 40 deletions.
69 changes: 55 additions & 14 deletions finetune/base_models/modern_bert/model.py
Expand Up @@ -14,7 +14,10 @@ def featurizer(X, encoder, config, train=False, reuse=None, lengths=None, **kwar
seq_length = tf.shape(input=delimiters)[1]
mask = tf.sequence_mask(lengths, maxlen=seq_length, dtype=tf.float32)
with tf.compat.v1.variable_scope("model/featurizer", reuse=reuse):
model = ModernBert(config=config, vocab_size=encoder.vocab_size)
model = ModernBert(
config=config,
vocab_size=encoder.vocab_size,
)
embedding = model.embeddings
sequence_out = model(
input_ids=X, attention_mask=mask, training=train, seq_len=seq_length
Expand Down Expand Up @@ -42,21 +45,13 @@ def featurizer(X, encoder, config, train=False, reuse=None, lengths=None, **kwar

return output_state

SETTINGS = {
"lr": 1.5e-4,
"n_epochs": 6,
"predict_batch_size": 4,
"max_grad_norm": 3.0,
"l2_reg": 5e-4,
"lr_warmup": 0.25, # this seems high but is what the sweep ended up with.
"lr_schedule": "warmup_linear",
"low_memory_mode": True,
}

class ModernBertModel(SourceModel):
encoder = ModernBertEncoder
featurizer = featurizer
max_length = 512
is_bidirectional = True


settings = {
"base_model_path": os.path.join("modern_bert", "modern_bert.jl"),
Expand All @@ -68,15 +63,55 @@ class ModernBertModel(SourceModel):
"include_bos_eos": True,
"n_heads": 12,
"batch_size": 8,
# "accum_steps": 8,
"bert_intermediate_size": 1152,
**SETTINGS
"lr": 3e-4, # previously 1.5e-4
"n_epochs": 12, # previously 6
"max_grad_norm": 3.0,
"l2_reg": 5e-4,
"lr_warmup": 0.25, # this seems high but is what the sweep ended up with.
"lr_schedule": "warmup_linear",
"low_memory_mode": True,
}
required_files = []

@classmethod
def get_optimal_params(cls, config):
base_n_epochs = config.base_model.settings["n_epochs"]
base_learning_rate = config.base_model.settings["lr"]
if config.optimize_for.lower() in ["accuracy", "accuracy_fp16"]:
overrides = {
"max_length": 2048,
"n_epochs": base_n_epochs,
"batch_size": 8,
"chunk_context": None,
"predict_batch_size": 8,
"mixed_precision": True,
"float_16_predict": True,
"lr": base_learning_rate,
}

elif config.optimize_for.lower() in ["predict_speed", "predict_speed_fp16"]:
overrides = {
"max_length": 512,
"n_epochs": base_n_epochs,
"batch_size": 24,
"chunk_context": 16,
"predict_batch_size": 8, # Lower actually seems to be faster - should run some benchmarks at some point
"mixed_precision": True,
"float_16_predict": True,
"lr": base_learning_rate,
}
else:
raise ValueError(
"Cannot optimise hyperparams for {}, must be either 'speed', 'predict_speed' or 'accuracy'".format(
config.optimize_for
)
)
return overrides


class ModernBertLargeModel(SourceModel):
is_bidirectional = True
encoder = ModernBertEncoder
featurizer = featurizer
max_length = 512
Expand All @@ -93,6 +128,12 @@ class ModernBertLargeModel(SourceModel):
"bert_intermediate_size": 2624,
"batch_size": 4,
# "accum_steps": 16,
**SETTINGS
"lr": 5e-4, # previously 1.5e-4
"n_epochs": 8, # previously 6
"max_grad_norm": 3.0,
"l2_reg": 5e-4,
"lr_warmup": 0.25, # this seems high but is what the sweep ended up with.
"lr_schedule": "warmup_linear",
"low_memory_mode": True,
}
required_files = []
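
The optimize_for presets defined above are meant to be selected when a model is constructed. A minimal usage sketch, assuming the finetune package is importable and that SequenceLabeler forwards an optimize_for keyword into the config, as the real_kie.py changes later in this commit suggest; the training data and its span-label format are assumed placeholders:

from finetune import SequenceLabeler
from finetune.base_models.modern_bert.model import ModernBertModel

# "accuracy" selects the long-context (max_length=2048) mixed-precision preset;
# "predict_speed" selects the short-context (max_length=512), larger-batch preset.
train_texts = ["Total amount due: 1,000 USD"]
train_labels = [[{"start": 18, "end": 23, "label": "amount", "text": "1,000"}]]  # assumed label format

model = SequenceLabeler(base_model=ModernBertModel, optimize_for="predict_speed")
model.fit(train_texts, train_labels)
predictions = model.predict(train_texts)
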
8 changes: 5 additions & 3 deletions finetune/base_models/modern_bert/modelling.py
Expand Up @@ -74,12 +74,12 @@ def call(self, position_ids):
# 1, 1, seq_len
position_ids_expanded = position_ids[:, None, :]
freqs = tf.transpose(
tf.matmul(inv_freq_expanded, position_ids_expanded), perm=[0, 2, 1]
tf.matmul(inv_freq_expanded, tf.cast(position_ids_expanded, tf.float32)), perm=[0, 2, 1]
)
emb = tf.concat([freqs, freqs], axis=-1)
cos = tf.cos(emb)
sin = tf.sin(emb)
return cos, sin
return tf.cast(cos, self.compute_dtype), tf.cast(sin, self.compute_dtype)


def rotate_half(x):
Expand Down Expand Up @@ -304,6 +304,7 @@ def call(

hidden_states = self.embeddings(input_ids=input_ids, training=training)
for encoder_layer in self.layers:
print("Hidden States: ", hidden_states)
if self.config.low_memory_mode and training:
encoder_layer.call = recompute_grads_w_kwargs(
encoder_layer.call,
Expand All @@ -323,11 +324,12 @@ def call(
return hidden_states

def _update_attention_mask(self, attention_mask: tf.Tensor) -> tf.Tensor:
attention_mask = tf.cast(attention_mask, self.compute_dtype)
expanded_mask = tf.tile(
attention_mask[:, None, None, :], [1, 1, tf.shape(attention_mask)[1], 1]
)
inverted_mask = 1.0 - expanded_mask
ignore_value = tf.fill(tf.shape(inverted_mask), tf.float32.min)
ignore_value = tf.fill(tf.shape(inverted_mask), tf.cast(tf.float16.min, self.compute_dtype))

global_attention_mask = tf.where(
inverted_mask > 0.5, ignore_value, inverted_mask
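
Taken together, the two dtype changes in modelling.py do the following: the rotary cos/sin tables are computed in float32 from integer position ids and only cast to the layer's compute dtype at the end, and the additive attention mask uses tf.float16.min as its "ignore" value so the mask stays finite under mixed_float16. An illustrative standalone sketch of both, not the library code itself; shapes follow the comments in the hunks above:

import tensorflow as tf

def rotary_cos_sin(position_ids, inv_freq, compute_dtype=tf.float16):
    # position_ids: [batch, seq_len] int32; inv_freq: [head_dim // 2] float32.
    inv_freq_expanded = inv_freq[None, :, None]                              # [1, dim/2, 1]
    position_ids_expanded = tf.cast(position_ids[:, None, :], tf.float32)    # [batch, 1, seq]
    freqs = tf.transpose(
        tf.matmul(inv_freq_expanded, position_ids_expanded), perm=[0, 2, 1]  # [batch, seq, dim/2]
    )
    emb = tf.concat([freqs, freqs], axis=-1)
    # Compute in float32, cast to the compute dtype only at the end.
    return tf.cast(tf.cos(emb), compute_dtype), tf.cast(tf.sin(emb), compute_dtype)

def additive_attention_mask(attention_mask, compute_dtype=tf.float16):
    # attention_mask: [batch, seq_len] with 1 for real tokens, 0 for padding.
    attention_mask = tf.cast(attention_mask, compute_dtype)
    expanded = tf.tile(
        attention_mask[:, None, None, :], [1, 1, tf.shape(attention_mask)[1], 1]
    )
    inverted = 1.0 - expanded
    # float16.min keeps the masked value representable even when compute_dtype is float16.
    ignore = tf.fill(tf.shape(inverted), tf.cast(tf.float16.min, compute_dtype))
    return tf.where(inverted > 0.5, ignore, inverted)
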
51 changes: 28 additions & 23 deletions finetune/datasets/real_kie.py
Expand Up @@ -14,13 +14,14 @@
"bert_large": BERTLarge,
}

def get_model(model_name):
def get_model(model_name, **model_kwargs):
return SequenceLabeler(
base_model=MODELS[model_name],
auto_negative_sampling=True,
low_memory_mode=True,
class_weights="sqrt",
collapse_whitespace=True,
**model_kwargs
)


Expand All @@ -34,8 +35,8 @@ def get_dataset_split(dataset_path, split_name):
return x, y


def train_model(model_name, dataset_path):
model = get_model(model_name)
def train_model(model_name, dataset_path, **model_kwargs):
model = get_model(model_name, **model_kwargs)
x, y = get_dataset_split(dataset_path, "train")
model.fit(x, y)
return model
Expand All @@ -48,23 +49,27 @@ def evaluate_model(model: SequenceLabeler, dataset_path):
if __name__ == "__main__":
for dataset in ["/datasets/charities", "/datasets/fcc_invoices", "/datasets/nda"]:
print(f"Evaluating {dataset}")
for model_name in ["modern_bert", "roberta"]: #, "modern_bert_large", "bert_large"]:
results_path = f"results/{dataset.split('/')[-1]}_{model_name}.json"
if os.path.exists(results_path):
print(f"Skipping {model_name} for {dataset} because results already exist")
continue
print(f"Model: {model_name}")
start_time = time.time()
model = train_model(model_name, dataset)
end_time = time.time()
training_time = end_time - start_time
print(f"Training time: {training_time} seconds")
start_time = time.time()
results = evaluate_model(model, dataset)
end_time = time.time()
evaluation_time = end_time - start_time
print(f"Evaluation time: {evaluation_time} seconds")
results["training_time"] = training_time
results["evaluation_time"] = evaluation_time
with open(results_path, "w") as f:
json.dump(results, f, indent=1)
for model_name, version in [("modern_bert", 4), ("roberta", 3)]: #, "modern_bert_large", "bert_large"]:
for optimize_for in ["predict_speed", "accuracy"]:
results_path = f"results/{dataset.split('/')[-1]}_{model_name}_{optimize_for}_v{version}.json"
if model_name == "roberta" and optimize_for == "accuracy":
# This has already been run.
continue
if os.path.exists(results_path):
print(f"Skipping {model_name} for {dataset} because results already exist")
continue
print(f"Model: {model_name}")
start_time = time.time()
model = train_model(model_name, dataset, optimize_for=optimize_for)
end_time = time.time()
training_time = end_time - start_time
print(f"Training time: {training_time} seconds")
start_time = time.time()
results = evaluate_model(model, dataset)
end_time = time.time()
evaluation_time = end_time - start_time
print(f"Evaluation time: {evaluation_time} seconds")
results["training_time"] = training_time
results["evaluation_time"] = evaluation_time
with open(results_path, "w") as f:
json.dump(results, f, indent=1)
5 changes: 5 additions & 0 deletions finetune/model.py
Expand Up @@ -132,6 +132,8 @@ def get_variable_getter(estimator_mode, features, fp16_predict, mixed_precision)
else feat,
features,
)
# becomes the more correctly named tf.keras.config.set_dtype_policy in 2.17
tf.keras.mixed_precision.set_global_policy("mixed_float16")
elif estimator_mode == tf.estimator.ModeKeys.PREDICT and fp16_predict:
custom_getter = fp16_variable_getter
features = tf.nest.map_structure(
Expand All @@ -140,7 +142,9 @@ def get_variable_getter(estimator_mode, features, fp16_predict, mixed_precision)
else feat,
features,
)
tf.keras.mixed_precision.set_global_policy("float16")
else:
tf.keras.mixed_precision.set_global_policy("float32")
custom_getter = None
return custom_getter, features

Expand Down Expand Up @@ -234,6 +238,7 @@ def _model_fn(features, labels, mode, params):
]

if build_target_model:
print("Featurizer state: ", featurizer_state)
target_model_state = target_model_op(
featurizer_state=featurizer_state, Y=Y, params=params, mode=mode
)
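
Condensed, the dtype-policy handling this hunk adds to get_variable_getter looks roughly like the sketch below. The guard on the first branch is collapsed in the diff, so it is assumed here to fire for mixed-precision training; the elif/else structure follows the visible lines, and the explicit reset to float32 keeps one estimator run's policy from leaking into the next.

import tensorflow as tf

def select_dtype_policy(estimator_mode, fp16_predict, mixed_precision):
    if mixed_precision and estimator_mode == tf.estimator.ModeKeys.TRAIN:
        # Mixed-precision training: float16 compute, float32 variables.
        tf.keras.mixed_precision.set_global_policy("mixed_float16")
    elif estimator_mode == tf.estimator.ModeKeys.PREDICT and fp16_predict:
        # Pure float16 weights and activations at inference time.
        tf.keras.mixed_precision.set_global_policy("float16")
    else:
        # Explicit reset so a previous run's policy does not carry over.
        tf.keras.mixed_precision.set_global_policy("float32")
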
