Merge pull request #49 from RaoFoundation/dev
Release 2.1.4
RusticLuftig authored Feb 2, 2024
2 parents 06eecdd + 563dfdb commit 4402b91
Showing 7 changed files with 289 additions and 187 deletions.
19 changes: 9 additions & 10 deletions docs/miner.md
@@ -133,21 +133,20 @@ import pretrain as pt
import bittensor as bt
from transformers import PreTrainedModel

config = bt.config(...)
wallet = bt.wallet()
metagraph = bt.metagraph(netuid=9)

actions = pt.mining.actions.Actions.create(config, wallet)

# Load a model from another miner.
model: PreTrainedModel = actions.load_remote_model(uid=123, metagraph=metagraph, download_dir="mydir")
model: PreTrainedModel = await pt.mining.load_remote_model(uid=123, download_dir="mydir")

# Save the model to local file.
actions.save(model, "model-foo/")
pt.mining.save(model, "model-foo/")

# Load the model from disk.
actions.load_local_model("model-foo/")
pt.mining.load_local_model("model-foo/")

# Publish the model for validator evaluation.
actions.push(model)
wallet = bt.wallet()
await pt.mining.push(model, repo="jdoe/my-repo", wallet=wallet)

# Get the URL to the best model
best_uid = pt.graph.best_uid()
print(await pt.mining.get_repo(best_uid))
```
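
The snippet above exercises the new module-level `pt.mining` API that replaces the old `Actions` class. As a rough end-to-end sketch of how those calls can be driven from a plain script (the `asyncio.run` wrapper, the download directory, and the repo name are illustrative placeholders; the store arguments are assumed to default sensibly, as in the doc example above):

```python
import asyncio

import bittensor as bt
import pretrain as pt


async def main():
    wallet = bt.wallet()

    # Download the model currently ranked best by incentive.
    best_uid = pt.graph.best_uid()
    model = await pt.mining.load_remote_model(uid=best_uid, download_dir="mydir")

    # Keep a local copy, then publish it under your own Hugging Face repo.
    pt.mining.save(model, "model-foo/")
    await pt.mining.push(model, repo="jdoe/my-repo", wallet=wallet)


if __name__ == "__main__":
    asyncio.run(main())
```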
48 changes: 33 additions & 15 deletions neurons/miner.py
@@ -26,10 +26,11 @@
import constants
from model.storage.chain.chain_model_metadata_store import ChainModelMetadataStore
from model.storage.hugging_face.hugging_face_model_store import HuggingFaceModelStore
from model.storage.model_metadata_store import ModelMetadataStore
from model.storage.remote_model_store import RemoteModelStore
import pretrain as pt
import bittensor as bt
from transformers import PreTrainedModel
from pretrain.mining import Actions
from utilities import utils
import datetime as dt

@@ -154,15 +155,20 @@ def get_config():


async def load_starting_model(
actions: Actions, config: bt.config, metagraph: bt.metagraph
config: bt.config,
metagraph: bt.metagraph,
metadata_store: ModelMetadataStore,
remote_model_store: RemoteModelStore,
) -> PreTrainedModel:
"""Loads the model to train based on the provided config."""

# Initialize the model based on the best on the network.
if config.load_best:
# Get the best UID by incentive and load it.
best_uid = pt.graph.best_uid(metagraph)
model = await actions.load_remote_model(best_uid, metagraph, config.model_dir)
model = await pt.mining.load_remote_model(
best_uid, config.model_dir, metagraph, metadata_store, remote_model_store
)
bt.logging.success(
f"Training with model from best uid: {best_uid}. Model={str(model)}"
)
@@ -171,8 +177,12 @@ async def load_starting_model(
# Initialize the model based on a passed uid.
if config.load_uid is not None:
# Sync the state from the passed uid.
model = await actions.load_remote_model(
config.load_uid, metagraph, config.model_dir
model = await pt.mining.load_remote_model(
config.load_uid,
config.model_dir,
metagraph,
metadata_store,
remote_model_store,
)
bt.logging.success(
f"Training with model from uid: {config.load_uid}. Model={str(model)}"
@@ -181,13 +191,13 @@

# Check if we should load a model from a local directory.
if config.load_model_dir:
model = actions.load_local_model(config.load_model_dir)
model = pt.mining.load_local_model(config.load_model_dir)
bt.logging.success(f"Training with model from disk. Model={str(model)}")
return model

# Check if we should load a model from a local file.
if config.load_model:
model = actions.load_gpt2_model(config.load_model)
model = pt.mining.load_gpt2_model(config.load_model)
bt.logging.success(f"Training with model from disk. Model={str(model)}")
return model

@@ -211,9 +221,7 @@ async def main(config: bt.config):
if not config.offline:
my_uid = utils.assert_registered(wallet, metagraph)
HuggingFaceModelStore.assert_access_token_exists()

# Configure the stores and miner actions.
miner_actions = pt.mining.Actions.create(config, wallet, subtensor)
utils.validate_hf_repo_id(config.hf_repo_id)

# Create a unique run id for this run.
run_id = dt.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
@@ -230,12 +238,16 @@
use_wandb = True

# Init model.
model: PreTrainedModel = await load_starting_model(miner_actions, config, metagraph)
metadata_store = ChainModelMetadataStore(subtensor, wallet, config.netuid)
remote_store = HuggingFaceModelStore()
model: PreTrainedModel = await load_starting_model(
config, metagraph, metadata_store, remote_store
)
model = model.train()
model = model.to(config.device)

bt.logging.success(f"Saving model to path: {model_dir}.")
miner_actions.save(model, model_dir)
pt.mining.save(model, model_dir)

# Build optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=config.lr, weight_decay=0.01)
@@ -345,7 +357,7 @@ async def main(config: bt.config):

# Save the model to your mining dir.
bt.logging.success(f"Saving model to path: {model_dir}.")
miner_actions.save(model, model_dir)
pt.mining.save(model, model_dir)

bt.logging.success("Finished training")
# Push the model to your run.
@@ -356,8 +368,14 @@
)

# First, reload the best model from the training run.
model_to_upload = miner_actions.load_local_model(model_dir)
await miner_actions.push(model_to_upload)
model_to_upload = pt.mining.load_local_model(model_dir)
await pt.mining.push(
model_to_upload,
config.hf_repo_id,
wallet,
metadata_store=metadata_store,
remote_model_store=remote_store,
)
else:
bt.logging.success(
f"This training run achieved a best_avg_loss={best_avg_loss}, which did not meet the upload threshold. Not uploading to hugging face."
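
For reference, the wiring change in `main()` above amounts to the caller constructing the two stores once and injecting them wherever they are needed, instead of hiding them inside an `Actions` object. A condensed, hedged sketch of that pattern (the `publish_model` name, the directory argument, and the wallet/subtensor construction are assumptions; the store constructors and the `pt.mining.push` keyword arguments are taken from the diff above):

```python
import bittensor as bt
import pretrain as pt

from model.storage.chain.chain_model_metadata_store import ChainModelMetadataStore
from model.storage.hugging_face.hugging_face_model_store import HuggingFaceModelStore


async def publish_model(config: bt.config, model_dir: str):
    wallet = bt.wallet(config=config)
    subtensor = bt.subtensor(config=config)

    # Stores are created by the caller and passed in explicitly,
    # rather than being hidden inside an Actions wrapper.
    metadata_store = ChainModelMetadataStore(subtensor, wallet, config.netuid)
    remote_store = HuggingFaceModelStore()

    # Reload the locally saved model and push it for validator evaluation.
    model = pt.mining.load_local_model(model_dir)
    await pt.mining.push(
        model,
        config.hf_repo_id,
        wallet,
        metadata_store=metadata_store,
        remote_model_store=remote_store,
    )
```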
37 changes: 28 additions & 9 deletions neurons/validator.py
@@ -18,6 +18,7 @@

from collections import defaultdict
import datetime as dt
import functools
import os
import json
import math
@@ -326,17 +327,17 @@ def update_models(self):
time.sleep(time_to_sleep)

uid_last_checked[next_uid] = dt.datetime.now()
bt.logging.trace(f"Updating model for UID={next_uid}")

# Get their hotkey from the metagraph.
hotkey = self.metagraph.hotkeys[next_uid]

# Compare metadata and tracker, syncing new model from remote store to local if necessary.
updated = asyncio.run(self.model_updater.sync_model(hotkey))

bt.logging.trace(
f"Updated model for UID={next_uid}. Was new = {updated}"
)
if updated:
bt.logging.trace(
f"Updated model for UID={next_uid}. Was new = {updated}"
)

# Ensure we eval the new model on the next loop.
if updated:
@@ -487,14 +488,17 @@ async def run_step(self):
)
)

bt.logging.debug(f"Computing losses on {uids} with pages {pages}")

# Compute model losses on batches.
bt.logging.debug(f"Computing losses on {uids}")
losses_per_uid = {muid: None for muid in uids}

load_model_perf = PerfMonitor("Eval: Load model")
compute_loss_perf = PerfMonitor("Eval: Compute loss")

for uid_i in uids:
bt.logging.trace(f"Computing model losses for uid:{uid_i}.")

# Check that the model is in the tracker.
hotkey = self.metagraph.hotkeys[uid_i]
model_i_metadata = self.model_tracker.get_model_metadata_for_miner_hotkey(
@@ -516,10 +520,17 @@
)

with compute_loss_perf.sample():
losses = pt.validation.compute_losses(
model_i.pt_model, batches, device=self.config.device
# Run each computation in a subprocess so that the GPU is reset between each model.
losses = utils.run_in_subprocess(
functools.partial(
pt.validation.compute_losses,
model_i.pt_model,
batches,
self.config.device,
),
ttl=60,
mode="spawn",
)

del model_i
except Exception as e:
bt.logging.error(
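
`utils.run_in_subprocess` itself is not part of this diff. As a rough sketch of the behaviour the call above relies on — run the `functools.partial` in a freshly spawned process so CUDA state is torn down between models, enforce a time-to-live, and re-raise child exceptions — one plausible shape might be the following (the queue-based result passing and the specific exceptions raised are assumptions, not the repository's actual implementation):

```python
import functools
import multiprocessing


def _wrapped(func, queue):
    """Child-process entry point: run func and report (success, result)."""
    try:
        queue.put((True, func()))
    except Exception as e:
        queue.put((False, e))


def run_in_subprocess(func: functools.partial, ttl: int, mode: str = "spawn"):
    """Run func in a child process and return its result within ttl seconds."""
    ctx = multiprocessing.get_context(mode)
    queue = ctx.Queue()
    proc = ctx.Process(target=_wrapped, args=(func, queue))
    proc.start()
    proc.join(ttl)

    if proc.is_alive():
        # The child overran its budget: kill it so the GPU is freed.
        proc.terminate()
        proc.join()
        raise TimeoutError(f"Subprocess did not finish within {ttl} seconds")

    if queue.empty():
        raise RuntimeError("Subprocess exited without producing a result")

    success, result = queue.get()
    if not success:
        raise result
    return result
```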
@@ -558,8 +569,16 @@ async def run_step(self):
self.weights = self.weights.nan_to_num(0.0)

# Filter based on win rate, removing all but the sample_min best models for evaluation.
# First remove any models that have an infinite loss.
filtered_win_rate = {
uid: wr
for uid, wr in win_rate.items()
if not all(math.isinf(x) for x in losses_per_uid.get(uid, [math.inf]))
}
self.uids_to_eval = set(
sorted(win_rate, key=win_rate.get, reverse=True)[: self.config.sample_min]
sorted(filtered_win_rate, key=filtered_win_rate.get, reverse=True)[
: self.config.sample_min
]
)

# Save state
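
The new filtering step above drops any UID whose every recorded loss is infinite before keeping the top `sample_min` models by win rate. A toy illustration of the same logic (the data values are invented):

```python
import math

win_rate = {1: 0.9, 2: 0.7, 3: 0.4}
losses_per_uid = {1: [math.inf, math.inf], 2: [2.1, 2.3], 3: [2.5, 2.4]}
sample_min = 2

# Drop UIDs whose losses are all infinite, then keep the best by win rate.
filtered_win_rate = {
    uid: wr
    for uid, wr in win_rate.items()
    if not all(math.isinf(x) for x in losses_per_uid.get(uid, [math.inf]))
}
uids_to_eval = set(
    sorted(filtered_win_rate, key=filtered_win_rate.get, reverse=True)[:sample_min]
)
print(uids_to_eval)  # {2, 3}: uid 1 is excluded despite the highest win rate
```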