Add ELO as completion criteria option for curriculum lessons #5646

Open
wants to merge 12 commits into base: develop
2 changes: 2 additions & 0 deletions com.unity.ml-agents/CHANGELOG.md
@@ -13,6 +13,8 @@ and this project adheres to
### Minor Changes
#### com.unity.ml-agents / com.unity.ml-agents.extensions (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
- Added support for Elo as a curriculum learning completion criterion. (#5646)

### Bug Fixes
#### com.unity.ml-agents / com.unity.ml-agents.extensions (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
46 changes: 46 additions & 0 deletions config/poca/SoccerTwosCurriculum.yaml
@@ -0,0 +1,46 @@
behaviors:
SoccerTwos:
trainer_type: poca
hyperparameters:
batch_size: 2048
buffer_size: 20480
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: constant
network_settings:
normalize: false
hidden_units: 512
num_layers: 2
vis_encode_type: simple
reward_signals:
extrinsic:
gamma: 0.99
strength: 1.0
keep_checkpoints: 5
max_steps: 50000000
time_horizon: 1000
summary_freq: 10000
self_play:
save_steps: 50000
team_change: 200000
swap_steps: 2000
window: 10
play_against_latest_model_ratio: 0.5
initial_elo: 1200.0
environment_parameters:
ball_touch:
curriculum:
- name: Lesson0 # The '-' is important as this is a list
completion_criteria:
measure: Elo
behavior: SoccerTwos
signal_smoothing: false
min_lesson_length: 100
threshold: 1250.0
value: 1.0
- name: Lesson1 # The '-' is important as this is a list
value: 0.0

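A reader's note on the config above: Lesson0 keeps ball_touch at 1.0 until the SoccerTwos trainer's Elo exceeds 1250, after which the terminal Lesson1 (no completion_criteria) sets it to 0.0. The standalone sketch below mirrors only that threshold switch; it ignores min_lesson_length and signal_smoothing and is illustrative, not part of the PR.

```python
# Illustrative sketch of the Elo-based lesson switch configured above
# (threshold and ball_touch values copied from the YAML; min_lesson_length is ignored).
from typing import Optional, Tuple

LESSON_VALUES = [1.0, 0.0]   # ball_touch for Lesson0, Lesson1
ELO_THRESHOLD = 1250.0       # Lesson0 completion threshold

def ball_touch_for(lesson_index: int, current_elo: Optional[float]) -> Tuple[int, float]:
    """Return the (possibly advanced) lesson index and the ball_touch value it selects."""
    if lesson_index == 0 and current_elo is not None and current_elo > ELO_THRESHOLD:
        lesson_index = 1     # Elo passed the threshold: advance to the terminal lesson
    return lesson_index, LESSON_VALUES[lesson_index]

assert ball_touch_for(0, 1200.0) == (0, 1.0)  # below threshold: stay in Lesson0
assert ball_touch_for(0, 1300.0) == (1, 0.0)  # above threshold: advance to Lesson1
```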
4 changes: 4 additions & 0 deletions ml-agents/mlagents/trainers/environment_parameter_manager.py
@@ -137,6 +137,7 @@ def update_lessons(
trainer_steps: Dict[str, int],
trainer_max_steps: Dict[str, int],
trainer_reward_buffer: Dict[str, List[float]],
trainer_elo_score: Dict[str, float],
) -> Tuple[bool, bool]:
"""
Given progress metrics, calculates if at least one environment parameter is
@@ -148,6 +149,8 @@ def update_lessons(
of training steps this behavior's trainer has performed.
:param trainer_reward_buffer: A dictionary from behavior_name to the list of
the most recent episode returns for this behavior's trainer.
:param trainer_elo_score: A dictionary from behavior_name to the current Elo score
of this behavior's trainer (None when self-play is not in use).
:returns: A tuple of two booleans : (True if any lesson has changed, True if
environment needs to reset)
"""
@@ -169,6 +172,7 @@ def update_lessons(
float(trainer_steps[behavior_to_consider])
/ float(trainer_max_steps[behavior_to_consider]),
trainer_reward_buffer[behavior_to_consider],
trainer_elo_score[behavior_to_consider] if trainer_elo_score else None,
self._smoothed_values[param_name],
)
self._smoothed_values[param_name] = new_smoothing
12 changes: 10 additions & 2 deletions ml-agents/mlagents/trainers/settings.py
@@ -491,6 +491,7 @@ class CompletionCriteriaSettings:
class MeasureType(Enum):
PROGRESS: str = "progress"
REWARD: str = "reward"
ELO: str = "Elo"

behavior: str
measure: MeasureType = attr.ib(default=MeasureType.REWARD)
@@ -516,7 +517,7 @@ def _check_threshold_value(self, attribute, value):
)

def need_increment(
self, progress: float, reward_buffer: List[float], smoothing: float
self, progress: float, reward_buffer: List[float], elo_score: Optional[float], smoothing: float
) -> Tuple[bool, float]:
"""
Given measures, this method returns a boolean indicating if the lesson
@@ -528,7 +529,7 @@ def need_increment(
if self.measure == CompletionCriteriaSettings.MeasureType.PROGRESS:
if progress > self.threshold:
return True, smoothing
if self.measure == CompletionCriteriaSettings.MeasureType.REWARD:
elif self.measure == CompletionCriteriaSettings.MeasureType.REWARD:
if len(reward_buffer) < 1:
return False, smoothing
measure = np.mean(reward_buffer)
@@ -539,6 +540,13 @@ def need_increment(
smoothing = measure
if measure > self.threshold:
return True, smoothing
elif self.measure == CompletionCriteriaSettings.MeasureType.ELO:
if elo_score is None:
raise TrainerConfigError(
"Elo isn't a valid completion criteria measure if not using self-play."
)
if elo_score > self.threshold:
return True, smoothing
return False, smoothing


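A rough interactive check of the extended need_increment (assuming the PR branch of ml-agents is installed; the CompletionCriteriaSettings keyword arguments beyond those visible in the hunk above are recalled from the existing class and not re-verified here):

```python
# Sketch: exercising the new Elo branch of CompletionCriteriaSettings.need_increment.
from mlagents.trainers.settings import CompletionCriteriaSettings

criteria = CompletionCriteriaSettings(
    behavior="SoccerTwos",
    measure=CompletionCriteriaSettings.MeasureType.ELO,
    threshold=1250.0,
)

# Elo above the threshold: the lesson should increment and smoothing is passed through.
incremented, smoothing = criteria.need_increment(
    progress=0.1, reward_buffer=[], elo_score=1300.0, smoothing=0.0
)
assert incremented is True

# Elo below the threshold: no increment.
incremented, _ = criteria.need_increment(
    progress=0.1, reward_buffer=[], elo_score=1200.0, smoothing=0.0
)
assert incremented is False
```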
39 changes: 34 additions & 5 deletions ml-agents/mlagents/trainers/tests/test_env_param_manager.py
@@ -131,7 +131,7 @@ def test_curriculum_conversion():
assert lesson.value.max_value == 3


test_bad_curriculum_no_competion_criteria_config_yaml = """
test_bad_curriculum_no_completion_criteria_config_yaml = """
environment_parameters:
param_1:
curriculum:
@@ -154,7 +154,7 @@ def test_curriculum_conversion():
"""


test_bad_curriculum_all_competion_criteria_config_yaml = """
test_bad_curriculum_all_completion_criteria_config_yaml = """
environment_parameters:
param_1:
curriculum:
@@ -175,6 +175,14 @@ def test_curriculum_conversion():
require_reset: true
value: 2
- name: Lesson3
completion_criteria:
measure: Elo
behavior: fake_behavior
threshold: 1300
min_lesson_length: 100
require_reset: true
value: 3
- name: Lesson4
completion_criteria:
measure: reward
behavior: fake_behavior
@@ -192,14 +200,14 @@ def test_curriculum_conversion():
def test_curriculum_raises_no_completion_criteria_conversion():
with pytest.raises(TrainerConfigError):
RunOptions.from_dict(
yaml.safe_load(test_bad_curriculum_no_competion_criteria_config_yaml)
yaml.safe_load(test_bad_curriculum_no_completion_criteria_config_yaml)
)


def test_curriculum_raises_all_completion_criteria_conversion():
with pytest.warns(TrainerConfigWarning):
run_options = RunOptions.from_dict(
yaml.safe_load(test_bad_curriculum_all_competion_criteria_config_yaml)
yaml.safe_load(test_bad_curriculum_all_completion_criteria_config_yaml)
)

param_manager = EnvironmentParameterManager(
@@ -209,19 +217,36 @@ def test_curriculum_raises_all_completion_criteria_conversion():
trainer_steps={"fake_behavior": 500},
trainer_max_steps={"fake_behavior": 1000},
trainer_reward_buffer={"fake_behavior": [1000] * 101},
trainer_elo_score={"fake_behavior": 1200.0}, #TODO: trainer_elo_scores aren't set properly for tests
) == (True, True)
assert param_manager.update_lessons(
trainer_steps={"fake_behavior": 500},
trainer_max_steps={"fake_behavior": 1000},
trainer_reward_buffer={"fake_behavior": [1000] * 101},
trainer_elo_score={"fake_behavior": 1200.0},
) == (True, True)
assert param_manager.get_current_lesson_number() == {"param_1": 2}
assert param_manager.update_lessons(
trainer_steps={"fake_behavior": 500},
trainer_max_steps={"fake_behavior": 1000},
trainer_reward_buffer={"fake_behavior": [1000] * 101},
trainer_elo_score={"fake_behavior": 1200.0},
) == (False, False)
assert param_manager.get_current_lesson_number() == {"param_1": 2}

assert param_manager.update_lessons(
trainer_steps={"fake_behavior": 500},
trainer_max_steps={"fake_behavior": 1000},
trainer_reward_buffer={"fake_behavior": [1000] * 101},
trainer_elo_score={"fake_behavior": 1500.0},
) == (True, True)
assert param_manager.get_current_lesson_number() == {"param_1": 3}
assert param_manager.update_lessons(
trainer_steps={"fake_behavior": 500},
trainer_max_steps={"fake_behavior": 1000},
trainer_reward_buffer={"fake_behavior": [1000] * 101},
trainer_elo_score={"fake_behavior": 1500.0},
) == (False, False)  # No further lesson to advance to
assert param_manager.get_current_lesson_number() == {"param_1": 3}

test_everything_config_yaml = """
environment_parameters:
@@ -279,17 +304,20 @@ def test_create_manager():
trainer_steps={"fake_behavior": 500},
trainer_max_steps={"fake_behavior": 1000},
trainer_reward_buffer={"fake_behavior": [1000] * 99},
trainer_elo_score={"fake_behavior": 1200.0},
) == (False, False)
# Not enough episodes reward
assert param_manager.update_lessons(
trainer_steps={"fake_behavior": 500},
trainer_max_steps={"fake_behavior": 1000},
trainer_reward_buffer={"fake_behavior": [1] * 101},
trainer_elo_score={"fake_behavior": 1200.0},
) == (False, False)
assert param_manager.update_lessons(
trainer_steps={"fake_behavior": 500},
trainer_max_steps={"fake_behavior": 1000},
trainer_reward_buffer={"fake_behavior": [1000] * 101},
trainer_elo_score={"fake_behavior": 1200.0},
) == (True, True)
assert param_manager.get_current_lesson_number() == {
"param_1": 1,
@@ -310,6 +338,7 @@ def test_create_manager():
trainer_steps={"fake_behavior": 700},
trainer_max_steps={"fake_behavior": 1000},
trainer_reward_buffer={"fake_behavior": [0] * 101},
trainer_elo_score={"fake_behavior": 1200.0},
) == (True, False)
assert param_manager.get_current_samplers() == {
"param_1": UniformSettings(seed=1337 + 2, min_value=1, max_value=3),
7 changes: 6 additions & 1 deletion ml-agents/mlagents/trainers/trainer_controller.py
@@ -211,10 +211,15 @@ def reset_env_if_ready(self, env: EnvManager) -> None:
reward_buff = {k: list(t.reward_buffer) for (k, t) in self.trainers.items()}
curr_step = {k: int(t.get_step) for (k, t) in self.trainers.items()}
max_step = {k: int(t.get_max_steps) for (k, t) in self.trainers.items()}
try:
curr_elo = {k: float(t.current_elo) for (k, t) in self.trainers.items()}
except AttributeError:
curr_elo = None

# Attempt to increment the lessons of the brains who
# were ready.
updated, param_must_reset = self.param_manager.update_lessons(
curr_step, max_step, reward_buff
curr_step, max_step, reward_buff, curr_elo
)
if updated:
for trainer in self.trainers.values():
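One note on the try/except introduced above: the dict comprehension raises AttributeError as soon as any trainer lacks current_elo (only self-play trainers have it), so a mixed set of trainers falls back to curr_elo = None as a whole. An equivalent, more explicit formulation, shown here only as a sketch with stand-in trainer classes, would be:

```python
# Sketch: explicit equivalent of the try/except fallback in reset_env_if_ready.
# _SelfPlayTrainer / _PlainTrainer are stand-ins; only self-play trainers expose current_elo.
class _SelfPlayTrainer:
    current_elo = 1250.0

class _PlainTrainer:
    pass

trainers = {"striker": _SelfPlayTrainer(), "goalie": _PlainTrainer()}

if all(hasattr(t, "current_elo") for t in trainers.values()):
    curr_elo = {k: float(t.current_elo) for k, t in trainers.items()}
else:
    curr_elo = None  # any non-self-play trainer disables Elo-based completion criteria

assert curr_elo is None
```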