Merge pull request #173 from macrocosm-os/dev
Release 4.3.0
cryptal-mc authored Sep 2, 2024
2 parents 7fa4bf4 + 66345d6 commit 3bebb14
Showing 7 changed files with 682 additions and 234 deletions.
82 changes: 69 additions & 13 deletions constants/__init__.py
@@ -24,7 +24,10 @@
ModelConstraints,
NormValidationConstraints,
)
from taoverse.model.competition.epsilon import FixedEpsilon
from taoverse.model.competition.epsilon import (
FixedEpsilon,
LinearDecay
)
from competitions.data import CompetitionId

from typing import Dict, List, Tuple
@@ -34,7 +34,7 @@
# ---------------------------------

# Release
__version__ = "4.2.0"
__version__ = "4.3.0"

# Validator schema version
__validator_version__ = "3.1.0"
@@ -150,7 +153,62 @@
epsilon_func=FixedEpsilon(0.005),
max_bytes=29 * 1024 * 1024 * 1024,
),
}

# Defined model constraints by competition id with decaying epsilon
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY: Dict[CompetitionId, ModelConstraints] = {
CompetitionId.M772_MODEL: ModelConstraints(
max_model_parameter_size=772_000_000,
min_model_parameter_size=572_000_000,
sequence_length=1024,
allowed_architectures=ALLOWED_MODEL_TYPES_1,
tokenizer="distilgpt2",
eval_block_delay=0,
epsilon_func=LinearDecay(0.005, 0.001, 50400),
max_bytes=5 * 1024 * 1024 * 1024,
),
CompetitionId.B7_MODEL: ModelConstraints(
max_model_parameter_size=6_900_000_000,
min_model_parameter_size=6_700_000_000,
sequence_length=4096,
allowed_architectures=ALLOWED_MODEL_TYPES_2,
tokenizer="Xenova/gpt-4",
kwargs={
"torch_dtype": torch.bfloat16,
"attn_implementation": "flash_attention_2",
},
eval_block_delay=0,
epsilon_func=LinearDecay(0.005, 0.001, 50400),
max_bytes=15 * 1024 * 1024 * 1024,
),
CompetitionId.B3_MODEL: ModelConstraints(
max_model_parameter_size=3_400_000_000,
min_model_parameter_size=3_200_000_000,
sequence_length=4096,
allowed_architectures=ALLOWED_MODEL_TYPES_2,
tokenizer="Xenova/gpt-4",
kwargs={
"torch_dtype": torch.bfloat16,
"attn_implementation": "flash_attention_2",
},
eval_block_delay=0,
epsilon_func=LinearDecay(0.005, 0.001, 50400),
max_bytes=15 * 1024 * 1024 * 1024,
),
CompetitionId.B14_MODEL: ModelConstraints(
max_model_parameter_size=13_900_000_000,
min_model_parameter_size=13_700_000_000,
sequence_length=4096,
allowed_architectures=ALLOWED_MODEL_TYPES_2,
tokenizer="Xenova/gpt-4",
kwargs={
"torch_dtype": torch.bfloat16,
"attn_implementation": "flash_attention_2",
},
eval_block_delay=0,
epsilon_func=LinearDecay(0.005, 0.001, 100800),
max_bytes=29 * 1024 * 1024 * 1024,
),
}


@@ -206,28 +264,26 @@
[
Competition(
CompetitionId.M772_MODEL,
MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.M772_MODEL],
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY[CompetitionId.M772_MODEL],
0.14,
),
Competition(
CompetitionId.B3_MODEL,
MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B3_MODEL],
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY[CompetitionId.B3_MODEL],
0.29,
),
Competition(
CompetitionId.B7_MODEL,
MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B7_MODEL],
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY[CompetitionId.B7_MODEL],
0.15,
),
Competition(
CompetitionId.B14_MODEL,
MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B14_MODEL],
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY[CompetitionId.B14_MODEL],
0.42,
)

),
],
),

]

for block_and_competitions in COMPETITION_SCHEDULE_BY_BLOCK:
@@ -251,8 +307,6 @@
# validator scoring exponential temperature
# 0.01 gives ~96% to best model with only ~3 receiving any weights.
temperature = 0.01
# validator score boosting for earlier models.
timestamp_epsilon = 0.005

# block to activate sample unpacking
sample_unpack_block = BLOCK_3B_7BSTAR_UNPACK
@@ -275,5 +329,7 @@
updated_models_limit = sample_min * len(MODEL_CONSTRAINTS_BY_COMPETITION_ID) + 10
# time required between updates to the chain.
chain_update_cadence = dt.timedelta(minutes=20)
# time required between retrying evaluation of a stale model. (First retry will be immediate).
model_retry_cadence = dt.timedelta(hours=4)
# Number of blocks required between retrying evaluation of a model.
model_retry_cadence = 300 # Roughly 1 hour
# How frequently to check the models given weights by other large validators.
scan_top_model_cadence = dt.timedelta(minutes=30)
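
The headline change in this file is the switch from `FixedEpsilon(0.005)` to `LinearDecay(0.005, 0.001, 50400)` (`100800` for the B14 competition). `LinearDecay` itself lives in taoverse and its implementation is not part of this diff; the sketch below shows the interpolation the name and parameters suggest, assuming a `LinearDecay(start, end, decay_blocks)` signature and ~12-second blocks (50,400 blocks ≈ 7 days; 100,800 ≈ 14 days).

```python
from dataclasses import dataclass


@dataclass
class LinearDecaySketch:
    """Illustrative stand-in for taoverse's LinearDecay, not the real implementation."""

    start_epsilon: float  # e.g. 0.005: a 0.5% advantage while the incumbent is fresh
    end_epsilon: float    # e.g. 0.001: the floor the advantage decays to
    decay_blocks: int     # e.g. 50_400: roughly 7 days at ~12 s per block

    def compute_epsilon(self, current_block: int, model_block: int) -> float:
        # Interpolate linearly from start_epsilon down to end_epsilon over
        # decay_blocks blocks since the incumbent's commit, then hold the floor.
        elapsed = max(0, current_block - model_block)
        fraction = min(1.0, elapsed / self.decay_blocks)
        return self.start_epsilon - (self.start_epsilon - self.end_epsilon) * fraction
```

Under this reading, an incumbent's advantage over challengers starts at 0.5% of loss and shrinks to 0.1% once it is about a week old, instead of staying fixed at 0.5% indefinitely.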
10 changes: 5 additions & 5 deletions docs/validator.md
@@ -38,12 +38,12 @@ You can view the entire validation system by reading the code in `neurons/valida
set_weights( weight )
```

The behaviour of the `iswin(loss_a, loss_b, block_a, block_b, epsilon)` function intentionally skews the win condition to reward models that were hosted earlier, such that a newer model only beats an older one iff its loss is `epsilon` percent lower, according to the following function. Currently `epsilon` is set to 1% and is a hyperparameter of the mechanism.
The behaviour of the `iswin(loss_a, loss_b, block_a, block_b, epsilon_func, curr_block)` function intentionally skews the win condition to reward models that were hosted earlier, such that a newer model only beats an older one iff its loss is `epsilon` percent lower, according to the following function. `epsilon` is computed by a per-competition function of the distance from the earlier model's block to the current block.

```python
def iswin( loss_a, loss_b, block_a, block_b, epsilon):
loss_a = (1 - epsilon) * loss_a if block_a < block_b else loss_a
loss_b = (1 - epsilon) * loss_b if block_b < block_a else loss_b
def iswin(loss_a, loss_b, block_a, block_b, epsilon_func, curr_block):
loss_a = (1 - epsilon_func(curr_block, block_a)) * loss_a if block_a < block_b else loss_a
loss_b = (1 - epsilon_func(curr_block, block_b)) * loss_b if block_b < block_a else loss_b
return loss_a < loss_b
```
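
For concreteness, here is a worked call with hypothetical losses, substituting a constant epsilon of 0.5% for the per-competition `epsilon_func`:

```python
def fixed_epsilon(curr_block, model_block):
    # Illustrative only: a constant 0.5% in place of a real epsilon_func.
    return 0.005

# Model A was committed earlier (block 100 < block 200), so its loss is
# discounted before comparison: 2.000 * (1 - 0.005) = 1.990 < 1.995.
iswin(loss_a=2.000, loss_b=1.995, block_a=100, block_b=200,
      epsilon_func=fixed_epsilon, curr_block=300)  # -> True: A wins
```

The newer model B has the lower raw loss here, but not by the required `epsilon` margin, so the earlier model A still wins.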

@@ -53,7 +53,7 @@ It is important to note that this affects the game theoretics of the incentive l

Validators will need enough disk space to store the models of miners being evaluated. Each model has a max size by block defined in [constants/__init__.py](https://github.com/macrocosm-os/pretraining/blob/main/constants/__init__.py#L57) and the validator has cleanup logic to remove old models. It is recommended to have at least 2 TB of disk space and 80GB of system memory.

Validators will need enough processing power to evaluate their model. As of Apr 1st, 2024 it is required to have a GPU that supports [flash attention 2](https://github.com/Dao-AILab/flash-attention) with at least 48 GB of VRAM and at least 38 TFLOPs for half precision (bfloat 16) operations.
Validators will need enough processing power to evaluate their model. As of Sept 2nd, 2024, an upgrade to an Nvidia A100 GPU with 80GB of VRAM is required. This GPU's high throughput and FLOPs enable running 14B models without impacting the speed of the validation cycle. Although only 40GB of VRAM is strictly necessary, we have observed that 80GB A100s are more readily available and offered at a comparable price to the 40GB variants. The additional VRAM also allows more flexibility for optimization in future releases, enabling larger validation batch sizes that improve the stability of validation by reducing scoring variance.

# Getting Started

53 changes: 53 additions & 0 deletions model/retry.py
@@ -0,0 +1,53 @@
import math
from typing import List

from taoverse.model.competition.data import EpsilonFunc
from taoverse.model.data import EvalResult


def should_retry_model(
epsilon_func: EpsilonFunc, curr_block: int, eval_history: List[EvalResult]
) -> bool:
"""Determines if a model should be retried based on its evaluation history and the current state.
A model is retryable if any of the following apply:
- It has never been evaluated.
- When it was last evaluated it had a better loss than the top model but couldn't overcome the epsilon disadvantage.
However, now epsilon has lowered to the point that it may be able to overcome the epsilon disadvantage.
- The model has only been evaluated once and it hit an error. In this case, we allow a single retry.
Args:
epsilon_func (EpsilonFunc): The function to compute the current epsilon.
curr_block (int): The current block
eval_history (List[EvalResult]): The (potentially empty) evaluation history of the model.
"""
# If the model has never been evaluated, we should retry it.
if not eval_history:
return True

# Find the most recent successful eval.
last_successful_eval = None
for eval_result in reversed(eval_history):
if eval_result.score != math.inf:
last_successful_eval = eval_result
break

if last_successful_eval:
# If this model had worse loss than the top model during the last eval, no need to retry.
# NOTE: "score" = avg_loss so lower is better.
if last_successful_eval.score > last_successful_eval.winning_model_score:
return False

# Otherwise, this model is potentially better than the top model but at the time it was evaluated
# it couldn't overcome the epsilon disadvantage. Check if epsilon has changed to the point where
# we should retry this model now.
curr_epsilon = epsilon_func.compute_epsilon(
current_block=curr_block,
model_block=last_successful_eval.winning_model_block,
)
# Compute the adjusted loss of the top model based on the current epsilon.
top_model_score = last_successful_eval.winning_model_score * (1 - curr_epsilon)
return last_successful_eval.score < top_model_score

# This model has been evaluated but has errored every time. Allow a single retry in this case.
return len(eval_history) < 2
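
A hypothetical walk-through of how the decaying epsilon interacts with retries; the `EvalResult` constructor arguments are assumptions inferred only from the attributes read above, and `LinearDecay` is the taoverse function imported in `constants/__init__.py`:

```python
# Hypothetical usage; EvalResult field names are inferred from the reads above.
history = [
    EvalResult(
        score=1.985,                    # this model's avg loss at its last eval
        winning_model_score=1.990,      # the top model's raw avg loss
        winning_model_block=4_000_000,  # block the top model was committed at
    )
]
decay = LinearDecay(0.005, 0.001, 50_400)  # as configured in constants

# At eval time (epsilon = 0.5%) the bar was 1.990 * 0.995 = 1.98005, so 1.985
# lost despite beating the top model's raw loss. ~50k blocks later epsilon has
# decayed to ~0.1%, the bar is roughly 1.990 * 0.999 = 1.98801, and the model
# clears it, so it is worth re-evaluating.
should_retry_model(decay, curr_block=4_050_000, eval_history=history)  # -> True
```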