Merge pull request #173 from macrocosm-os/dev
Release 4.3.0
cryptal-mc authored Sep 2, 2024
2 parents 7fa4bf4 + 66345d6 commit 3bebb14
Showing 7 changed files with 682 additions and 234 deletions.
82 changes: 69 additions & 13 deletions constants/__init__.py
@@ -24,7 +24,10 @@
ModelConstraints,
NormValidationConstraints,
)
from taoverse.model.competition.epsilon import FixedEpsilon
from taoverse.model.competition.epsilon import (
FixedEpsilon,
LinearDecay
)
from competitions.data import CompetitionId

from typing import Dict, List, Tuple
@@ -34,7 +34,7 @@
# ---------------------------------

# Release
__version__ = "4.2.0"
__version__ = "4.3.0"

# Validator schema version
__validator_version__ = "3.1.0"
@@ -150,7 +153,62 @@
epsilon_func=FixedEpsilon(0.005),
max_bytes=29 * 1024 * 1024 * 1024,
),
}

# Defined model constraints by competition id with decaying epsilon
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY: Dict[CompetitionId, ModelConstraints] = {
CompetitionId.M772_MODEL: ModelConstraints(
max_model_parameter_size=772_000_000,
min_model_parameter_size=572_000_000,
sequence_length=1024,
allowed_architectures=ALLOWED_MODEL_TYPES_1,
tokenizer="distilgpt2",
eval_block_delay=0,
epsilon_func=LinearDecay(0.005, 0.001, 50400),
max_bytes=5 * 1024 * 1024 * 1024,
),
CompetitionId.B7_MODEL: ModelConstraints(
max_model_parameter_size=6_900_000_000,
min_model_parameter_size=6_700_000_000,
sequence_length=4096,
allowed_architectures=ALLOWED_MODEL_TYPES_2,
tokenizer="Xenova/gpt-4",
kwargs={
"torch_dtype": torch.bfloat16,
"attn_implementation": "flash_attention_2",
},
eval_block_delay=0,
epsilon_func=LinearDecay(0.005, 0.001, 50400),
max_bytes=15 * 1024 * 1024 * 1024,
),
CompetitionId.B3_MODEL: ModelConstraints(
max_model_parameter_size=3_400_000_000,
min_model_parameter_size=3_200_000_000,
sequence_length=4096,
allowed_architectures=ALLOWED_MODEL_TYPES_2,
tokenizer="Xenova/gpt-4",
kwargs={
"torch_dtype": torch.bfloat16,
"attn_implementation": "flash_attention_2",
},
eval_block_delay=0,
epsilon_func=LinearDecay(0.005, 0.001, 50400),
max_bytes=15 * 1024 * 1024 * 1024,
),
CompetitionId.B14_MODEL: ModelConstraints(
max_model_parameter_size=13_900_000_000,
min_model_parameter_size=13_700_000_000,
sequence_length=4096,
allowed_architectures=ALLOWED_MODEL_TYPES_2,
tokenizer="Xenova/gpt-4",
kwargs={
"torch_dtype": torch.bfloat16,
"attn_implementation": "flash_attention_2",
},
eval_block_delay=0,
epsilon_func=LinearDecay(0.005, 0.001, 100800),
max_bytes=29 * 1024 * 1024 * 1024,
),
}


@@ -206,28 +264,26 @@
[
Competition(
CompetitionId.M772_MODEL,
MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.M772_MODEL],
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY[CompetitionId.M772_MODEL],
0.14,
),
Competition(
CompetitionId.B3_MODEL,
MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B3_MODEL],
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY[CompetitionId.B3_MODEL],
0.29,
),
Competition(
CompetitionId.B7_MODEL,
MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B7_MODEL],
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY[CompetitionId.B7_MODEL],
0.15,
),
Competition(
CompetitionId.B14_MODEL,
MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B14_MODEL],
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY[CompetitionId.B14_MODEL],
0.42,
)

),
],
),

]

for block_and_competitions in COMPETITION_SCHEDULE_BY_BLOCK:
@@ -251,8 +307,6 @@
# validator scoring exponential temperature
# 0.01 gives ~96% to best model with only ~3 receiving any weights.
temperature = 0.01
# validator score boosting for earlier models.
timestamp_epsilon = 0.005

# block to activate sample unpacking
sample_unpack_block = BLOCK_3B_7BSTAR_UNPACK
@@ -275,5 +329,7 @@
updated_models_limit = sample_min * len(MODEL_CONSTRAINTS_BY_COMPETITION_ID) + 10
# time required between updates to the chain.
chain_update_cadence = dt.timedelta(minutes=20)
# time required between retrying evaluation of a stale model. (First retry will be immediate).
model_retry_cadence = dt.timedelta(hours=4)
# Number of blocks required between retrying evaluation of a model.
model_retry_cadence = 300 # Roughly 1 hour
# How frequently to check the models given weights by other large validators.
scan_top_model_cadence = dt.timedelta(minutes=30)
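
The headline change in this file is the switch from `FixedEpsilon(0.005)` to `LinearDecay(0.005, 0.001, 50400)` (`100800` for the B14 competition). `LinearDecay` itself lives in taoverse and its implementation is not part of this diff; the sketch below shows the interpolation the name and parameters suggest, assuming a `LinearDecay(start, end, decay_blocks)` signature and ~12-second blocks (50,400 blocks ≈ 7 days; 100,800 ≈ 14 days).

```python
from dataclasses import dataclass


@dataclass
class LinearDecaySketch:
    """Illustrative stand-in for taoverse's LinearDecay, not the real implementation."""

    start_epsilon: float  # e.g. 0.005: a 0.5% advantage while the incumbent is fresh
    end_epsilon: float    # e.g. 0.001: the floor the advantage decays to
    decay_blocks: int     # e.g. 50_400: roughly 7 days at ~12 s per block

    def compute_epsilon(self, current_block: int, model_block: int) -> float:
        # Interpolate linearly from start_epsilon down to end_epsilon over
        # decay_blocks blocks since the incumbent's commit, then hold the floor.
        elapsed = max(0, current_block - model_block)
        fraction = min(1.0, elapsed / self.decay_blocks)
        return self.start_epsilon - (self.start_epsilon - self.end_epsilon) * fraction
```

Under this reading, an incumbent's advantage over challengers starts at 0.5% of loss and shrinks to 0.1% once it is about a week old, instead of staying fixed at 0.5% indefinitely.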
10 changes: 5 additions & 5 deletions docs/validator.md
@@ -38,12 +38,12 @@ You can view the entire validation system by reading the code in `neurons/valida
set_weights( weight )
```

The behaviour of the `iswin(loss_a, loss_b, block_a, block_b, epsilon)` function intentionally skews the win condition to reward models that were hosted earlier, such that a newer model only beats an older one iff its loss is `epsilon` percent lower, according to the following function. Currently `epsilon` is set to 1% and is a hyperparameter of the mechanism.
The behaviour of the `iswin(loss_a, loss_b, block_a, block_b, epsilon_func, curr_block)` function intentionally skews the win condition to reward models that were hosted earlier, such that a newer model only beats an older one iff its loss is `epsilon` percent lower, according to the following function. `epsilon` is computed by a per-competition function of the distance from the earlier model's block to the current block.

```python
def iswin( loss_a, loss_b, block_a, block_b, epsilon):
loss_a = (1 - epsilon) * loss_a if block_a < block_b else loss_a
loss_b = (1 - epsilon) * loss_b if block_b < block_a else loss_b
def iswin(loss_a, loss_b, block_a, block_b, epsilon_func, curr_block):
loss_a = (1 - epsilon_func(curr_block, block_a)) * loss_a if block_a < block_b else loss_a
loss_b = (1 - epsilon_func(curr_block, block_b)) * loss_b if block_b < block_a else loss_b
return loss_a < loss_b
```
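
For concreteness, here is a worked call with hypothetical losses, substituting a constant epsilon of 0.5% for the per-competition `epsilon_func`:

```python
def fixed_epsilon(curr_block, model_block):
    # Illustrative only: a constant 0.5% in place of a real epsilon_func.
    return 0.005

# Model A was committed earlier (block 100 < block 200), so its loss is
# discounted before comparison: 2.000 * (1 - 0.005) = 1.990 < 1.995.
iswin(loss_a=2.000, loss_b=1.995, block_a=100, block_b=200,
      epsilon_func=fixed_epsilon, curr_block=300)  # -> True: A wins
```

The newer model B has the lower raw loss here, but not by the required `epsilon` margin, so the earlier model A still wins.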

@@ -53,7 +53,7 @@ It is important to note that this affects the game theoretics of the incentive l

Validators will need enough disk space to store the models of miners being evaluated. Each model has a max size by block defined in [constants/__init__.py](https://github.com/macrocosm-os/pretraining/blob/main/constants/__init__.py#L57) and the validator has cleanup logic to remove old models. It is recommended to have at least 2 TB of disk space and 80GB of system memory.

Validators will need enough processing power to evaluate their model. As of Apr 1st, 2024 it is required to have a GPU that supports [flash attention 2](https://github.com/Dao-AILab/flash-attention) with at least 48 GB of VRAM and at least 38 TFLOPs for half precision (bfloat 16) operations.
Validators will need enough processing power to evaluate their model. As of Sept 2nd, 2024, an upgrade to an Nvidia A100 GPU with 80GB of VRAM is required. This GPU's high throughput and FLOPs enable running 14B models without impacting the speed of the validation cycle. Although only 40GB of VRAM is strictly necessary, we have observed that 80GB A100s are more readily available and offered at a comparable price to the 40GB variants. The additional VRAM also allows more flexibility for optimization in future releases, enabling larger validation batch sizes that improve the stability of validation by reducing scoring variance.

# Getting Started

53 changes: 53 additions & 0 deletions model/retry.py
@@ -0,0 +1,53 @@
import math
from typing import List

from taoverse.model.competition.data import EpsilonFunc
from taoverse.model.data import EvalResult


def should_retry_model(
epsilon_func: EpsilonFunc, curr_block: int, eval_history: List[EvalResult]
) -> bool:
"""Determines if a model should be retried based on its evaluation history and the current state.
A model is retryable if any of the following apply:
- It has never been evaluated.
- When it was last evaluated it had a better loss than the top model but couldn't overcome the epsilon disadvantage.
However, now epsilon has lowered to the point that it may be able to overcome the epsilon disadvantage.
- The model has only been evaluated once and it hit an error. In this case, we allow a single retry.
Args:
epsilon_func (EpsilonFunc): The function to compute the current epsilon.
curr_block (int): The current block
eval_history (List[EvalResult]): The (potentially empty) evaluation history of the model.
"""
# If the model has never been evaluated, we should retry it.
if not eval_history:
return True

# Find the most recent successful eval.
last_successful_eval = None
for eval_result in reversed(eval_history):
if eval_result.score != math.inf:
last_successful_eval = eval_result
break

if last_successful_eval:
# If this model had worse loss than the top model during the last eval, no need to retry.
# NOTE: "score" = avg_loss so lower is better.
if last_successful_eval.score > last_successful_eval.winning_model_score:
return False

# Otherwise, this model is potentially better than the top model but at the time it was evaluated
# it couldn't overcome the epsilon disadvantage. Check if epsilon has changed to the point where
# we should retry this model now.
curr_epsilon = epsilon_func.compute_epsilon(
current_block=curr_block,
model_block=last_successful_eval.winning_model_block,
)
# Compute the adjusted loss of the top model based on the current epsilon.
top_model_score = last_successful_eval.winning_model_score * (1 - curr_epsilon)
return last_successful_eval.score < top_model_score

# This model has been evaluated but has errored every time. Allow a single retry in this case.
return len(eval_history) < 2
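
A hypothetical walk-through of how the decaying epsilon interacts with retries; the `EvalResult` constructor arguments are assumptions inferred only from the attributes read above, and `LinearDecay` is the taoverse function imported in `constants/__init__.py`:

```python
# Hypothetical usage; EvalResult field names are inferred from the reads above.
history = [
    EvalResult(
        score=1.985,                    # this model's avg loss at its last eval
        winning_model_score=1.990,      # the top model's raw avg loss
        winning_model_block=4_000_000,  # block the top model was committed at
    )
]
decay = LinearDecay(0.005, 0.001, 50_400)  # as configured in constants

# At eval time (epsilon = 0.5%) the bar was 1.990 * 0.995 = 1.98005, so 1.985
# lost despite beating the top model's raw loss. ~50k blocks later epsilon has
# decayed to ~0.1%, the bar is roughly 1.990 * 0.999 = 1.98801, and the model
# clears it, so it is worth re-evaluating.
should_retry_model(decay, curr_block=4_050_000, eval_history=history)  # -> True
```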