Merge pull request #49 from RaoFoundation/dev
Release 2.1.4
RusticLuftig authored Feb 2, 2024
2 parents 06eecdd + 563dfdb commit 4402b91
Showing 7 changed files with 289 additions and 187 deletions.
19 changes: 9 additions & 10 deletions docs/miner.md
@@ -133,21 +133,20 @@ import pretrain as pt
import bittensor as bt
from transformers import PreTrainedModel

config = bt.config(...)
wallet = bt.wallet()
metagraph = bt.metagraph(netuid=9)

actions = pt.mining.actions.Actions.create(config, wallet)

# Load a model from another miner.
model: PreTrainedModel = actions.load_remote_model(uid=123, metagraph=metagraph, download_dir="mydir")
model: PreTrainedModel = await pt.mining.load_remote_model(uid=123, download_dir="mydir")

# Save the model to local file.
actions.save(model, "model-foo/")
pt.mining.save(model, "model-foo/")

# Load the model from disk.
actions.load_local_model("model-foo/")
pt.mining.load_local_model("model-foo/")

# Publish the model for validator evaluation.
actions.push(model)
wallet = bt.wallet()
await pt.mining.push(model, repo="jdoe/my-repo", wallet=wallet)

# Get the URL to the best model
best_uid = pt.graph.best_uid()
print(await pt.mining.get_repo(best_uid))
```
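
The snippet above exercises the new module-level `pt.mining` API that replaces the old `Actions` class. As a rough end-to-end sketch of how those calls can be driven from a plain script (the `asyncio.run` wrapper, the download directory, and the repo name are illustrative placeholders; the store arguments are assumed to default sensibly, as in the doc example above):

```python
import asyncio

import bittensor as bt
import pretrain as pt


async def main():
    wallet = bt.wallet()

    # Download the model currently ranked best by incentive.
    best_uid = pt.graph.best_uid()
    model = await pt.mining.load_remote_model(uid=best_uid, download_dir="mydir")

    # Keep a local copy, then publish it under your own Hugging Face repo.
    pt.mining.save(model, "model-foo/")
    await pt.mining.push(model, repo="jdoe/my-repo", wallet=wallet)


if __name__ == "__main__":
    asyncio.run(main())
```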
48 changes: 33 additions & 15 deletions neurons/miner.py
@@ -26,10 +26,11 @@
import constants
from model.storage.chain.chain_model_metadata_store import ChainModelMetadataStore
from model.storage.hugging_face.hugging_face_model_store import HuggingFaceModelStore
from model.storage.model_metadata_store import ModelMetadataStore
from model.storage.remote_model_store import RemoteModelStore
import pretrain as pt
import bittensor as bt
from transformers import PreTrainedModel
from pretrain.mining import Actions
from utilities import utils
import datetime as dt

@@ -154,15 +155,20 @@ def get_config():


async def load_starting_model(
actions: Actions, config: bt.config, metagraph: bt.metagraph
config: bt.config,
metagraph: bt.metagraph,
metadata_store: ModelMetadataStore,
remote_model_store: RemoteModelStore,
) -> PreTrainedModel:
"""Loads the model to train based on the provided config."""

# Initialize the model based on the best on the network.
if config.load_best:
# Get the best UID by incentive and load it.
best_uid = pt.graph.best_uid(metagraph)
model = await actions.load_remote_model(best_uid, metagraph, config.model_dir)
model = await pt.mining.load_remote_model(
best_uid, config.model_dir, metagraph, metadata_store, remote_model_store
)
bt.logging.success(
f"Training with model from best uid: {best_uid}. Model={str(model)}"
)
@@ -171,8 +177,12 @@ async def load_starting_model(
# Initialize the model based on a passed uid.
if config.load_uid is not None:
# Sync the state from the passed uid.
model = await actions.load_remote_model(
config.load_uid, metagraph, config.model_dir
model = await pt.mining.load_remote_model(
config.load_uid,
config.model_dir,
metagraph,
metadata_store,
remote_model_store,
)
bt.logging.success(
f"Training with model from uid: {config.load_uid}. Model={str(model)}"
@@ -181,13 +191,13 @@

# Check if we should load a model from a local directory.
if config.load_model_dir:
model = actions.load_local_model(config.load_model_dir)
model = pt.mining.load_local_model(config.load_model_dir)
bt.logging.success(f"Training with model from disk. Model={str(model)}")
return model

# Check if we should load a model from a local file.
if config.load_model:
model = actions.load_gpt2_model(config.load_model)
model = pt.mining.load_gpt2_model(config.load_model)
bt.logging.success(f"Training with model from disk. Model={str(model)}")
return model

@@ -211,9 +221,7 @@ async def main(config: bt.config):
if not config.offline:
my_uid = utils.assert_registered(wallet, metagraph)
HuggingFaceModelStore.assert_access_token_exists()

# Configure the stores and miner actions.
miner_actions = pt.mining.Actions.create(config, wallet, subtensor)
utils.validate_hf_repo_id(config.hf_repo_id)

# Create a unique run id for this run.
run_id = dt.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
@@ -230,12 +238,16 @@
use_wandb = True

# Init model.
model: PreTrainedModel = await load_starting_model(miner_actions, config, metagraph)
metadata_store = ChainModelMetadataStore(subtensor, wallet, config.netuid)
remote_store = HuggingFaceModelStore()
model: PreTrainedModel = await load_starting_model(
config, metagraph, metadata_store, remote_store
)
model = model.train()
model = model.to(config.device)

bt.logging.success(f"Saving model to path: {model_dir}.")
miner_actions.save(model, model_dir)
pt.mining.save(model, model_dir)

# Build optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=config.lr, weight_decay=0.01)
@@ -345,7 +357,7 @@ async def main(config: bt.config):

# Save the model to your mining dir.
bt.logging.success(f"Saving model to path: {model_dir}.")
miner_actions.save(model, model_dir)
pt.mining.save(model, model_dir)

bt.logging.success("Finished training")
# Push the model to your run.
@@ -356,8 +368,14 @@
)

# First, reload the best model from the training run.
model_to_upload = miner_actions.load_local_model(model_dir)
await miner_actions.push(model_to_upload)
model_to_upload = pt.mining.load_local_model(model_dir)
await pt.mining.push(
model_to_upload,
config.hf_repo_id,
wallet,
metadata_store=metadata_store,
remote_model_store=remote_store,
)
else:
bt.logging.success(
f"This training run achieved a best_avg_loss={best_avg_loss}, which did not meet the upload threshold. Not uploading to hugging face."
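
For reference, the wiring change in `main()` above amounts to the caller constructing the two stores once and injecting them wherever they are needed, instead of hiding them inside an `Actions` object. A condensed, hedged sketch of that pattern (the `publish_model` name, the directory argument, and the wallet/subtensor construction are assumptions; the store constructors and the `pt.mining.push` keyword arguments are taken from the diff above):

```python
import bittensor as bt
import pretrain as pt

from model.storage.chain.chain_model_metadata_store import ChainModelMetadataStore
from model.storage.hugging_face.hugging_face_model_store import HuggingFaceModelStore


async def publish_model(config: bt.config, model_dir: str):
    wallet = bt.wallet(config=config)
    subtensor = bt.subtensor(config=config)

    # Stores are created by the caller and passed in explicitly,
    # rather than being hidden inside an Actions wrapper.
    metadata_store = ChainModelMetadataStore(subtensor, wallet, config.netuid)
    remote_store = HuggingFaceModelStore()

    # Reload the locally saved model and push it for validator evaluation.
    model = pt.mining.load_local_model(model_dir)
    await pt.mining.push(
        model,
        config.hf_repo_id,
        wallet,
        metadata_store=metadata_store,
        remote_model_store=remote_store,
    )
```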
37 changes: 28 additions & 9 deletions neurons/validator.py
@@ -18,6 +18,7 @@

from collections import defaultdict
import datetime as dt
import functools
import os
import json
import math
@@ -326,17 +327,17 @@ def update_models(self):
time.sleep(time_to_sleep)

uid_last_checked[next_uid] = dt.datetime.now()
bt.logging.trace(f"Updating model for UID={next_uid}")

# Get their hotkey from the metagraph.
hotkey = self.metagraph.hotkeys[next_uid]

# Compare metadata and tracker, syncing new model from remote store to local if necessary.
updated = asyncio.run(self.model_updater.sync_model(hotkey))

bt.logging.trace(
f"Updated model for UID={next_uid}. Was new = {updated}"
)
if updated:
bt.logging.trace(
f"Updated model for UID={next_uid}. Was new = {updated}"
)

# Ensure we eval the new model on the next loop.
if updated:
@@ -487,14 +488,17 @@ async def run_step(self):
)
)

bt.logging.debug(f"Computing losses on {uids} with pages {pages}")

# Compute model losses on batches.
bt.logging.debug(f"Computing losses on {uids}")
losses_per_uid = {muid: None for muid in uids}

load_model_perf = PerfMonitor("Eval: Load model")
compute_loss_perf = PerfMonitor("Eval: Compute loss")

for uid_i in uids:
bt.logging.trace(f"Computing model losses for uid:{uid_i}.")

# Check that the model is in the tracker.
hotkey = self.metagraph.hotkeys[uid_i]
model_i_metadata = self.model_tracker.get_model_metadata_for_miner_hotkey(
@@ -516,10 +520,17 @@
)

with compute_loss_perf.sample():
losses = pt.validation.compute_losses(
model_i.pt_model, batches, device=self.config.device
# Run each computation in a subprocess so that the GPU is reset between each model.
losses = utils.run_in_subprocess(
functools.partial(
pt.validation.compute_losses,
model_i.pt_model,
batches,
self.config.device,
),
ttl=60,
mode="spawn",
)

del model_i
except Exception as e:
bt.logging.error(
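
`utils.run_in_subprocess` itself is not part of this diff. As a rough sketch of the behaviour the call above relies on — run the `functools.partial` in a freshly spawned process so CUDA state is torn down between models, enforce a time-to-live, and re-raise child exceptions — one plausible shape might be the following (the queue-based result passing and the specific exceptions raised are assumptions, not the repository's actual implementation):

```python
import functools
import multiprocessing


def _wrapped(func, queue):
    """Child-process entry point: run func and report (success, result)."""
    try:
        queue.put((True, func()))
    except Exception as e:
        queue.put((False, e))


def run_in_subprocess(func: functools.partial, ttl: int, mode: str = "spawn"):
    """Run func in a child process and return its result within ttl seconds."""
    ctx = multiprocessing.get_context(mode)
    queue = ctx.Queue()
    proc = ctx.Process(target=_wrapped, args=(func, queue))
    proc.start()
    proc.join(ttl)

    if proc.is_alive():
        # The child overran its budget: kill it so the GPU is freed.
        proc.terminate()
        proc.join()
        raise TimeoutError(f"Subprocess did not finish within {ttl} seconds")

    if queue.empty():
        raise RuntimeError("Subprocess exited without producing a result")

    success, result = queue.get()
    if not success:
        raise result
    return result
```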
@@ -558,8 +569,16 @@ async def run_step(self):
self.weights = self.weights.nan_to_num(0.0)

# Filter based on win rate, removing all but the sample_min best models for evaluation.
# First remove any models that have an infinite loss.
filtered_win_rate = {
uid: wr
for uid, wr in win_rate.items()
if not all(math.isinf(x) for x in losses_per_uid.get(uid, [math.inf]))
}
self.uids_to_eval = set(
sorted(win_rate, key=win_rate.get, reverse=True)[: self.config.sample_min]
sorted(filtered_win_rate, key=filtered_win_rate.get, reverse=True)[
: self.config.sample_min
]
)

# Save state
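
The new filtering step above drops any UID whose every recorded loss is infinite before keeping the top `sample_min` models by win rate. A toy illustration of the same logic (the data values are invented):

```python
import math

win_rate = {1: 0.9, 2: 0.7, 3: 0.4}
losses_per_uid = {1: [math.inf, math.inf], 2: [2.1, 2.3], 3: [2.5, 2.4]}
sample_min = 2

# Drop UIDs whose losses are all infinite, then keep the best by win rate.
filtered_win_rate = {
    uid: wr
    for uid, wr in win_rate.items()
    if not all(math.isinf(x) for x in losses_per_uid.get(uid, [math.inf]))
}
uids_to_eval = set(
    sorted(filtered_win_rate, key=filtered_win_rate.get, reverse=True)[:sample_min]
)
print(uids_to_eval)  # {2, 3}: uid 1 is excluded despite the highest win rate
```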