Commit

Merge pull request #82 from microsoft/speedup_tests
Speedup tests
sordonia authored Aug 11, 2024
2 parents ebaf4e3 + 270ff37 commit e6995ff
Showing 3 changed files with 20 additions and 19 deletions.
mttl/models/containers/__init__.py (4 changes: 2 additions & 2 deletions)
@@ -2,7 +2,7 @@
 from typing import Tuple
 
 from mttl.config import Config
-from mttl.logging import logger
+from mttl.logging import logger, warn_once
 from mttl.models.containers.base import ExpertContainer
 from mttl.models.containers.kv_containers import KVExpertContainer
 from mttl.models.containers.lora_containers import (
@@ -49,7 +49,7 @@ def get_container_class(modifier: str):
         return LoRAExpertContainer
     elif modifier == "skilled_lora":
         if not os.environ.get("COALESCED_LORA_CONTAINER", "False") == "1":
-            logger.warning(
+            warn_once(
                 "COALESCED_LORA_CONTAINER is not set to 1, but still using it for SkilledLoRA"
             )
         return CoalescedLoRAExpertContainer
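
The warn_once helper imported above is not defined in this diff. A minimal sketch of such a deduplicating wrapper (an assumed implementation for illustration, not the actual mttl.logging code) could look like:

    import functools
    import logging

    logger = logging.getLogger("mttl")


    @functools.lru_cache(maxsize=None)
    def warn_once(msg: str) -> None:
        # Each distinct message is cached, so repeated calls (e.g. once per
        # layer or per test) emit a single warning instead of flooding the
        # test output.
        logger.warning(msg)
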
mttl/models/containers/selectors/base.py (2 changes: 1 addition & 1 deletion)
@@ -346,7 +346,7 @@ def add_expert(
         self, expert_name: str, expert_info: ExpertInfo = None, is_default=False
     ):
         if expert_info is None or expert_info.expert_task_name is None:
-            logger.warning(
+            logger.debug(
                 "Expert's task_name not set, assume task name corresponds to expert name!"
             )
             self._task_to_expert_name[expert_name] = expert_name
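
Context for the change above: under Python's standard logging defaults, debug-level records are filtered out unless the logger is configured for extra verbosity, so demoting this message from warning to debug keeps it out of normal test runs. A small illustration, assuming the mttl logger follows standard library logging semantics:

    import logging

    logging.basicConfig(level=logging.WARNING)  # typical default verbosity
    log = logging.getLogger("example")

    log.warning("shown: warnings pass the WARNING threshold")
    log.debug("hidden: debug records are filtered at the default level")

    # Opt back in to the detail only when it is actually needed:
    log.setLevel(logging.DEBUG)
    log.debug("now shown")
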
tests/test_routed_multi_expert_model.py (33 changes: 17 additions & 16 deletions)
@@ -92,6 +92,9 @@ def bigger_dummy_batch():
     return batch
 
 
+bs, max_seq_len = 10, 5
+
+
 class TestMultiExpertModel:
     def create_dummy_expert(self, config: ExpertConfig, exp_name) -> Expert:
         model = MultiExpertModel(model=config.model, device_map="cpu")
@@ -112,7 +115,6 @@ def test_add_expert_with_action_merge(self, tmp_exp_config):
 
         module = MultiExpertModel(**vars(config))
         module.add_experts_from_dict(module_dict, action="merge")
-        bs, max_seq_len = 10, 100
 
         assert isinstance(
             module.model.transformer.h[0].attn.attention.k_proj, LoRAExpertContainer
@@ -132,7 +134,7 @@ def test_add_expert_with_action_merge(self, tmp_exp_config):
 
         # Test Base Llama model
         output = module(batch)
-        assert np.allclose(output.item(), 10.15, atol=0.1)
+        assert np.allclose(output.item(), 15.2, atol=0.1)
 
     def nonzero_B_init(self, model):
         gen = torch.Generator()
@@ -144,8 +146,9 @@ def nonzero_B_init(self, model):
                 mod.lora_a.data = torch.rand(mod.lora_a.shape, generator=gen) * 0.5
                 mod.lora_b.data = torch.rand(mod.lora_b.shape, generator=gen) * 0.5
 
+    @pytest.mark.parametrize("is_coalesced", [(True, False)])
     def test_expert_selector_with_poly_task_routing(
-        self, tmp_exp_config
+        self, tmp_exp_config, is_coalesced
     ): # this fails, why?
         seed_everything(0)
         config: Config = tmp_exp_config
@@ -163,7 +166,6 @@ def test_expert_selector_with_poly_task_routing(
         )
         assert module.hparams.model_modifier == None
         module.add_experts_from_dict(module_dict, action="route")
-        bs, max_seq_len = 10, 100
 
         assert isinstance(
             module.model.transformer.h[0].attn.attention.k_proj, LoRAExpertContainer
@@ -180,16 +182,16 @@ def test_expert_selector_with_poly_task_routing(
         batch["attention_mask"] = attn_mask
         batch["task_names"] = ["task_1", "task_2"] * 5
 
-        is_coalesced = os.environ.get("COALESCED_LORA_CONTAINER", "0") == "1"
+        os.environ["COALESCED_LORA_CONTAINER"] = str(is_coalesced)
 
         # BASE MODEL FWD BASS (because all Bs are == 0, so functially same as backbone)
         output = module(batch)
-        assert np.allclose(output.item(), 10.08 if is_coalesced else 10.20, atol=0.1)
+        assert np.allclose(output.item(), 15.625 if is_coalesced else 10.20, atol=0.1)
 
         # Now let's change the adapter params, and also the function parameterized by the model
         self.nonzero_B_init(module)
         output = module(batch)
-        assert np.allclose(output.item(), 15.03 if is_coalesced else 14.69, atol=0.1)
+        assert np.allclose(output.item(), 18.37 if is_coalesced else 14.69, atol=0.1)
 
         """ Multi-Head Routing Test """
         # NOTE: We need to add SkilledLoRAs instead of standard LoRAs
@@ -214,7 +216,7 @@ def test_expert_selector_with_poly_task_routing(
         output = module(batch)
 
         # Because routing is initialized to uniform, should give same result
-        assert np.allclose(output.item(), 15.03 if is_coalesced else 15.27, atol=0.1)
+        assert np.allclose(output.item(), 19.125 if is_coalesced else 15.27, atol=0.1)
 
         # Now let's change the routing, to make sure the output also changes
         for mod in module.modules():
@@ -223,13 +225,15 @@ def test_expert_selector_with_poly_task_routing(
                 mod.module_logits.data[:, -1] = 999
 
         output = module(batch)
-        assert np.allclose(output.item(), 15.56 if is_coalesced else 16.22, atol=0.1)
+        assert np.allclose(output.item(), 19.875 if is_coalesced else 16.22, atol=0.1)
 
         # Finally, Test invalid tasks
         batch["task_names"][-1] = "task_10"
         with pytest.raises(AssertionError):
             output = module(batch)
 
+        os.environ["COALESCED_LORA_CONTAINER"] = "0"
+
     def test_expert_selector_with_task_name_routing(self, tmp_exp_config):
         seed_everything(0)
         config: Config = tmp_exp_config
@@ -244,8 +248,6 @@ def test_expert_selector_with_task_name_routing(self, tmp_exp_config):
         module.add_experts_from_dict(module_dict, action="route")
         module.set_default_expert("mod3")
 
-        bs, max_seq_len = 10, 100
-
         assert isinstance(
             module.model.transformer.h[0].attn.attention.k_proj, LoRAExpertContainer
         )
@@ -268,7 +270,7 @@ def test_expert_selector_with_task_name_routing(self, tmp_exp_config):
 
         # Test Base Llama model
         output = module(batch)
-        assert np.allclose(output.item(), 10.1, atol=0.1)
+        assert np.allclose(output.item(), 12.3125, atol=0.1)
 
     def test_expert_selector_with_poly_routing(self, tmp_exp_config):
         seed_everything(0)
@@ -290,7 +292,6 @@ def test_expert_selector_with_poly_routing(self, tmp_exp_config):
         # Model has been created. Now, we fix the generator to ensure that coalesced vs not coalesced gives the same as base llama
         generator = torch.Generator()
         generator.manual_seed(0)
-        bs, max_seq_len = 10, 100
         batch = {
             "input_ids": torch.randint(10, 400, (bs, max_seq_len), generator=generator),
             "labels": torch.randint(10, 400, (bs, max_seq_len), generator=generator),
@@ -306,7 +307,7 @@ def test_expert_selector_with_poly_routing(self, tmp_exp_config):
 
         # Test Base Llama model
         output = module(batch)
-        assert np.allclose(output.item(), 10.1, atol=0.1)
+        assert np.allclose(output.item(), 12.3125, atol=0.1)
 
         # check the get_router_weights function
         weights = {}
@@ -345,7 +346,7 @@ def test_expert_selector_with_poly_routing(self, tmp_exp_config):
         assert selector.module_logits_dict["mod2"].item() == 0.0
 
         output = module(batch)
-        assert np.allclose(output.item(), 10.1, atol=0.1)
+        assert np.allclose(output.item(), 12.3125, atol=0.1)
 
         weights = {}
         for _, selector_dict in module.selector_cache.items():
@@ -531,7 +532,7 @@ def test_expert_selector_with_task_predictor_selection(self, tmp_exp_config):
         module = MultiExpertModel(**vars(config))
         module.add_experts_from_dict(module_dict, action="route")
 
-        bs, max_seq_len = 2, 100
+        bs = 2
         batch = {
             "input_ids": torch.randint(bs, 400, (bs, max_seq_len)),
             "labels": torch.randint(bs, 400, (bs, max_seq_len)),
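
Taken together, the test changes replace the per-test bs, max_seq_len = 10, 100 assignments with the module-level bs, max_seq_len = 10, 5 added at the top of the file, so the dummy forward passes run on sequences of length 5 instead of 100 and the hard-coded expected losses shift accordingly. For reference, a stand-alone sketch of the dummy batch these tests build with the new constants (the attention-mask line is an assumption here, since its construction falls outside the visible hunks):

    import torch

    # Module-level constants introduced by this commit: much shorter sequences.
    bs, max_seq_len = 10, 5

    generator = torch.Generator()
    generator.manual_seed(0)

    # Same pattern as the test file: random token ids in [10, 400).
    batch = {
        "input_ids": torch.randint(10, 400, (bs, max_seq_len), generator=generator),
        "labels": torch.randint(10, 400, (bs, max_seq_len), generator=generator),
        # Assumed: a full attention mask matching the shapes above.
        "attention_mask": torch.ones(bs, max_seq_len, dtype=torch.long),
    }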
