From 2e9d6ddaacd6ac57392726947cfdd7cddd71e6e6 Mon Sep 17 00:00:00 2001
From: Sid
Date: Thu, 21 Mar 2024 20:27:31 -0700
Subject: [PATCH 1/6] Check generated outputs before calculating losses.

---
 pretrain/validation.py | 44 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/pretrain/validation.py b/pretrain/validation.py
index bb63c73e..9936a319 100644
--- a/pretrain/validation.py
+++ b/pretrain/validation.py
@@ -99,6 +99,50 @@ def compute_losses(
     model.to(device)
     model.eval()

+    # First do a sanity check that the model outputs look reasonable.
+    # Grab 100 tokens from the first two batches as 'prompts'. (1 x Seq Length tensors.)
+    prompt_length = 100
+    falcon_token_inputs_1 = batches[0][:, :prompt_length]
+    falcon_token_inputs_2 = batches[1][:, :prompt_length]
+
+    # Generate 30 tokens of output from the model for each prompt.
+    output_length = 30
+    # Only take the last 30 tokens since otherwise we also get the prompt ids.
+    generate_id1s = model.generate(
+        falcon_token_inputs_1.cuda(),
+        min_new_tokens=output_length,
+        max_new_tokens=output_length,
+    )[:, -output_length:]
+    generate_id2s = model.generate(
+        falcon_token_inputs_2.cuda(),
+        min_new_tokens=output_length,
+        max_new_tokens=output_length,
+    )[:, -output_length:]
+
+    # Check if too many of the generated ids are the same between the two outputs.
+    if torch.sum(torch.eq(generate_id1s, generate_id2s)).item() >= output_length / 3:
+        bt.logging.info(
+            f"Model with config {model.config} had too much overlap between generated outputs."
+        )
+        return [math.inf for _ in batches]
+
+    # Check if internally either response is too repetitive.
+    for tensor in [generate_id1s, generate_id2s]:
+        # Find unique elements and their counts
+        _, counts = torch.unique(tensor, return_counts=True)
+        # Find the index of the maximum count
+        max_count_index = torch.argmax(counts)
+        # Extract the count of the most common element
+        most_common_count = counts[max_count_index].item()
+
+        if most_common_count > output_length / 3:
+            bt.logging.info(
+                f"Model with config {model.config} had too much repetition in generated output."
+            )
+            return [math.inf for _ in batches]
+
+    # Everything looks good! Continue to computing actual losses.
+
     # Iterate over each page and corresponding batches
     losses = []
     for batch in batches:
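
The overlap and repetition heuristics this patch introduces are plain tensor operations, so they are easy to try in isolation. Below is a rough, self-contained sketch of both checks, with toy tensors standing in for real generated token ids (all values are illustrative, not from the subnet):

    import torch

    output_length = 30
    # One degenerate "generation" (a single repeated id) and one varied one.
    generate_id1s = torch.full((1, output_length), 42)
    generate_id2s = torch.randint(0, 50_000, (1, output_length))

    # Overlap check: count positions where both outputs produced the same id.
    overlap = torch.sum(torch.eq(generate_id1s, generate_id2s)).item()
    print(overlap >= output_length / 3)   # True would flag the model.

    # Repetition check: count of the most common id within a single output.
    _, counts = torch.unique(generate_id1s, return_counts=True)
    most_common_count = counts[torch.argmax(counts)].item()
    print(most_common_count > output_length / 3)  # True here: 30 > 10.
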
From 82e74a3f551beddb94ab6ae9d24a8ccdbbdc4370 Mon Sep 17 00:00:00 2001
From: Sid
Date: Thu, 21 Mar 2024 20:36:16 -0700
Subject: [PATCH 2/6] Send inputs to the same device as the model.

---
 pretrain/validation.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pretrain/validation.py b/pretrain/validation.py
index 9936a319..acb9eb2d 100644
--- a/pretrain/validation.py
+++ b/pretrain/validation.py
@@ -102,19 +102,19 @@ def compute_losses(
     # First do a sanity check that the model outputs look reasonable.
     # Grab 100 tokens from the first two batches as 'prompts'. (1 x Seq Length tensors.)
     prompt_length = 100
-    falcon_token_inputs_1 = batches[0][:, :prompt_length]
-    falcon_token_inputs_2 = batches[1][:, :prompt_length]
+    falcon_token_inputs_1 = (batches[0][:, :prompt_length]).to(device)
+    falcon_token_inputs_2 = (batches[1][:, :prompt_length]).to(device)

     # Generate 30 tokens of output from the model for each prompt.
     output_length = 30
     # Only take the last 30 tokens since otherwise we also get the prompt ids.
     generate_id1s = model.generate(
-        falcon_token_inputs_1.cuda(),
+        falcon_token_inputs_1,
         min_new_tokens=output_length,
         max_new_tokens=output_length,
     )[:, -output_length:]
     generate_id2s = model.generate(
-        falcon_token_inputs_2.cuda(),
+        falcon_token_inputs_2,
         min_new_tokens=output_length,
         max_new_tokens=output_length,
     )[:, -output_length:]
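
The fix above matters because .cuda() assumes a GPU is present, while .to(device) follows whatever device compute_losses was asked to use. A minimal sketch of the difference, using a stand-in torch.nn.Linear rather than the subnet's real models:

    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = torch.nn.Linear(8, 8).to(device)

    batch = torch.randn(1, 8)       # tensors are created on the CPU by default
    out = model(batch.to(device))   # works on CPU-only and GPU validators alike
    # out = model(batch.cuda())     # would raise on a machine without CUDA
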
From 7eb4b4e0fbc816ef6c900d4bfa6c978b8bc4b3bd Mon Sep 17 00:00:00 2001
From: Sid
Date: Thu, 21 Mar 2024 20:54:50 -0700
Subject: [PATCH 3/6] Refactor the output check into a helper function.

---
 pretrain/validation.py | 68 ++++++++++++++++++++++++++++--------------
 1 file changed, 45 insertions(+), 23 deletions(-)

diff --git a/pretrain/validation.py b/pretrain/validation.py
index acb9eb2d..0b30af8b 100644
--- a/pretrain/validation.py
+++ b/pretrain/validation.py
@@ -82,39 +82,29 @@ def compute_wins(
     return wins, win_rate


-def compute_losses(
-    model, batches: typing.List[torch.Tensor], device: str
-) -> typing.List[float]:
-    """
-    Computes the losses for a given model on provided batches.
+def check_for_reasonable_output(
+    model, input1: torch.Tensor, input2: torch.Tensor
+) -> bool:
+    """Checks that a model generates reasonable outputs for two given inputs.

-    Parameters:
-        model (torch.nn.Module): The model for which losses are to be computed.
-        batches (dict): A list of batches.
-        device (str): The device to use for computation (e.g., 'cpu', 'gpu').
+    Args:
+        model (torch.nn.Module): The model for which outputs are to be checked. Already loaded to device.
+        input1 (torch.Tensor): Tokenized input1 to check. Already loaded to device.
+        input2 (torch.Tensor): Tokenized input2 to check. Already loaded to device.

     Returns:
-        dict: A dictionary with page indices as keys and lists of loss values as values.
+        bool: True if the model generates reasonable outputs, False otherwise.
     """
-    model.to(device)
-    model.eval()
-
-    # First do a sanity check that the model outputs look reasonable.
-    # Grab 100 tokens from the first two batches as 'prompts'. (1 x Seq Length tensors.)
-    prompt_length = 100
-    falcon_token_inputs_1 = (batches[0][:, :prompt_length]).to(device)
-    falcon_token_inputs_2 = (batches[1][:, :prompt_length]).to(device)
-
     # Generate 30 tokens of output from the model for each prompt.
     output_length = 30
     # Only take the last 30 tokens since otherwise we also get the prompt ids.
     generate_id1s = model.generate(
-        falcon_token_inputs_1,
+        input1,
         min_new_tokens=output_length,
         max_new_tokens=output_length,
     )[:, -output_length:]
     generate_id2s = model.generate(
-        falcon_token_inputs_2,
+        input2,
         min_new_tokens=output_length,
         max_new_tokens=output_length,
     )[:, -output_length:]
@@ -124,7 +114,7 @@ def compute_losses(
         bt.logging.info(
             f"Model with config {model.config} had too much overlap between generated outputs."
         )
-        return [math.inf for _ in batches]
+        return False

     # Check if internally either response is too repetitive.
     for tensor in [generate_id1s, generate_id2s]:
@@ -139,7 +129,39 @@ def compute_losses(
         bt.logging.info(
             f"Model with config {model.config} had too much repetition in generated output."
         )
-        return [math.inf for _ in batches]
+        return False
+
+    # Passed all the checks, return True.
+    return True
+
+
+def compute_losses(
+    model, batches: typing.List[torch.Tensor], device: str
+) -> typing.List[float]:
+    """
+    Computes the losses for a given model on provided batches.
+
+    Parameters:
+        model (torch.nn.Module): The model for which losses are to be computed.
+        batches (typing.List[torch.Tensor]): A list of token batches.
+        device (str): The device to use for computation (e.g., 'cpu', 'cuda').
+
+    Returns:
+        typing.List[float]: A list of loss values, one per batch.
+    """
+    model.to(device)
+    model.eval()
+
+    # First check that the model generates reasonable looking outputs.
+    # Grab 100 tokens from the first two batches as 'prompts'. (1 x Seq Length tensors.)
+    prompt_length = 100
+    falcon_token_inputs_1 = (batches[0][:, :prompt_length]).to(device)
+    falcon_token_inputs_2 = (batches[1][:, :prompt_length]).to(device)
+
+    if not check_for_reasonable_output(
+        model, falcon_token_inputs_1, falcon_token_inputs_2
+    ):
+        return [math.inf for _ in batches]

     # Everything looks good! Continue to computing actual losses.


From 1177610e07ac36a9613fc968a0acd197b66025b2 Mon Sep 17 00:00:00 2001
From: Sid
Date: Thu, 21 Mar 2024 20:59:10 -0700
Subject: [PATCH 4/6] Bump spec version to force reload of models.

---
 constants/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/constants/__init__.py b/constants/__init__.py
index 5aaaabb4..e38d2f5a 100644
--- a/constants/__init__.py
+++ b/constants/__init__.py
@@ -13,7 +13,7 @@
 # Project Constants.
 # ---------------------------------

-__version__ = "2.2.1"
+__version__ = "2.2.2"
 version_split = __version__.split(".")
 __spec_version__ = (
     (1000 * int(version_split[0]))

From 6160d49fa611f8b7706184ffda3a3f95e1460680 Mon Sep 17 00:00:00 2001
From: Sid
Date: Thu, 21 Mar 2024 21:31:13 -0700
Subject: [PATCH 5/6] Pass tokenizer eos token id to remove warning message.

---
 pretrain/validation.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/pretrain/validation.py b/pretrain/validation.py
index 0b30af8b..cfe0cc64 100644
--- a/pretrain/validation.py
+++ b/pretrain/validation.py
@@ -24,6 +24,7 @@
 import constants
 import traceback
 import bittensor as bt
+import pretrain as pt


 def iswin(loss_i, loss_j, block_i, block_j):
@@ -97,16 +98,19 @@ def check_for_reasonable_output(
     """
     # Generate 30 tokens of output from the model for each prompt.
     output_length = 30
+    tokenizer = pt.model.get_tokenizer()
     # Only take the last 30 tokens since otherwise we also get the prompt ids.
     generate_id1s = model.generate(
         input1,
         min_new_tokens=output_length,
         max_new_tokens=output_length,
+        pad_token_id=tokenizer.eos_token_id,
     )[:, -output_length:]
     generate_id2s = model.generate(
         input2,
         min_new_tokens=output_length,
         max_new_tokens=output_length,
+        pad_token_id=tokenizer.eos_token_id,
     )[:, -output_length:]

     # Check if too many of the generated ids are the same between the two outputs.
@@ -136,7 +140,9 @@


 def compute_losses(
-    model, batches: typing.List[torch.Tensor], device: str
+    model,
+    batches: typing.List[torch.Tensor],
+    device: str,
 ) -> typing.List[float]:
     """
     Computes the losses for a given model on provided batches.
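
The warning removed by PATCH 5/6 is the one transformers prints on every generate() call when a model has no pad token configured (roughly: "Setting pad_token_id to eos_token_id for open-end generation."). A minimal sketch of the same fix outside the subnet; gpt2 is used here purely as a stand-in model, since the subnet obtains its tokenizer via pt.model.get_tokenizer():

    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # stand-in tokenizer
    model = AutoModelForCausalLM.from_pretrained("gpt2")  # stand-in model

    input_ids = tokenizer("The quick brown fox", return_tensors="pt").input_ids
    ids = model.generate(
        input_ids,
        min_new_tokens=30,
        max_new_tokens=30,
        pad_token_id=tokenizer.eos_token_id,  # silences the per-call warning
    )
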
From d80f965c3eb7c59db04ac539191e6674afa49a9d Mon Sep 17 00:00:00 2001
From: Sid
Date: Thu, 21 Mar 2024 21:38:40 -0700
Subject: [PATCH 6/6] Start iterator at 200 for a fresh start.

---
 utilities/miner_iterator.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/utilities/miner_iterator.py b/utilities/miner_iterator.py
index 12b8a27f..0a17cdf2 100644
--- a/utilities/miner_iterator.py
+++ b/utilities/miner_iterator.py
@@ -17,7 +17,8 @@ def __init__(self, miner_uids: List[int]):
         self.miner_uids = sorted(copy.deepcopy(miner_uids))
         # Start the index at a random position. This helps ensure that miners with high UIDs aren't penalized if
         # the validator restarts frequently.
-        self.index = random.randint(0, len(self.miner_uids) - 1)
+        # Temporarily hard code to start at 200 to more quickly restart on the relevant models.
+        self.index = 200
         self.lock = threading.Lock()

     def __iter__(self):
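
One caveat with the hard-coded start: if __next__ indexes miner_uids directly, an index of 200 assumes at least 201 registered UIDs. The rest of MinerIterator is not shown in this diff, so the sketch below is only an assumption about how such a cyclic iterator typically advances; the modulo wrap is what makes any start value safe:

    import threading
    from typing import List

    class CyclicMinerIterator:
        """Illustrative stand-in for MinerIterator; not the subnet's code."""

        def __init__(self, miner_uids: List[int], start_index: int = 200):
            self.miner_uids = sorted(miner_uids)
            # Wrap the start so short UID lists cannot raise an IndexError.
            self.index = start_index % len(self.miner_uids)
            self.lock = threading.Lock()

        def __iter__(self):
            return self

        def __next__(self) -> int:
            with self.lock:
                uid = self.miner_uids[self.index]
                self.index = (self.index + 1) % len(self.miner_uids)
                return uid
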