[WIP][ci] add more CI workflow #38

Open · wants to merge 6 commits into main
39 changes: 39 additions & 0 deletions .github/workflows/sanity.yml
@@ -0,0 +1,39 @@
name: sanity

on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
paths:
- "**/*.py"
- .github/workflows/sanity.yml
pull_request:
branches:
- main
paths:
- "**/*.py"
- .github/workflows/sanity.yml

jobs:
sanity:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.10"]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
- name: Install the current repository
run: |
pip install -e .[test]
- name: Run sanity test
run: |
pytest -s -x tests/sanity
      - name: Run utility test
run: |
pytest -s -x tests/utility
2 changes: 1 addition & 1 deletion setup.py
@@ -42,7 +42,7 @@
'demo': ['hydra-core', 'transformers', ''],
'single-controller': ['ray', 'kubernetes'],
'single-controller-ray': ['ray'],
-    'test': ['fsspec', 'pytest', 'datasets']
+    'test': ['fsspec', 'pytest', 'datasets', 'ray']
}

from pathlib import Path
70 changes: 70 additions & 0 deletions tests/gpu_utility/test_memory_buffers.py
@@ -0,0 +1,70 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Test memory buffers
- We start with two models with the same weights
- We wrap one of the models with a memory buffer and then compare the parameters
"""

import torch
import gc

from transformers import LlamaModel, LlamaConfig
from verl.utils.memory_buffer import MemoryBufferModuleWrapper


def test_memory_buffers():
llama_config = LlamaConfig(vocab_size=256,
hidden_size=4096,
intermediate_size=11008,
num_hidden_layers=2,
num_attention_heads=16,
num_key_value_heads=16)

model = LlamaModel(config=llama_config).cuda()
model_copy = LlamaModel(config=llama_config).cuda()
model_copy.load_state_dict(model.state_dict())

model_named_params = dict(model.named_parameters())
model_copy_named_params = dict(model_copy.named_parameters())

norm_factor = 1024**3

t_before = torch.cuda.get_device_properties(0).total_memory / norm_factor
r_before = torch.cuda.memory_reserved(0) / norm_factor
a_before = torch.cuda.memory_allocated(0) / norm_factor

print(f'Before Total memory: {t_before} GB, reserved: {r_before} GB, allocated: {a_before} GB')

model_wrapper = MemoryBufferModuleWrapper(model)
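    # MemoryBufferModuleWrapper is expected to consolidate the model's parameters
    # into contiguous memory buffers while keeping their values intact, so both the
    # allocated GPU memory and the parameter contents are checked below.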

t = torch.cuda.get_device_properties(0).total_memory / norm_factor
r = torch.cuda.memory_reserved(0) / norm_factor
a = torch.cuda.memory_allocated(0) / norm_factor

gc.collect()
torch.cuda.empty_cache()

print(f'After Total memory: {t} GB, reserved: {r} GB, allocated: {a} GB')

change_ratio = (a - a_before) / a_before
    assert change_ratio < 0.01, f'allocated memory should change by less than 1%, got {change_ratio}'

for (name1, param1), (name2, param2) in zip(model.named_parameters(), model_copy.named_parameters()):
assert name1 == name2
assert torch.eq(param1.data, param2.data).all(), f'{param1.data}, {param2.data}, {name1}'


if __name__ == '__main__':
test_memory_buffers()
47 changes: 47 additions & 0 deletions tests/gpu_utility/test_ops.py
@@ -0,0 +1,47 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def test_flash_attn_cross_entropy():
from verl.utils.torch_functional import logprobs_from_logits_naive

from verl.utils.debug import log_gpu_memory_usage

from flash_attn.ops.triton.cross_entropy import cross_entropy_loss

import torch
from torch import nn

log_gpu_memory_usage('At start')

hidden_states = torch.randn(size=(2048, 5120), device='cuda', requires_grad=True, dtype=torch.bfloat16)

linear = nn.Linear(in_features=5120, out_features=155136, bias=False, device='cuda', dtype=torch.bfloat16)

logits = linear(hidden_states)

# logits = logits.float()
labels = torch.randint(low=0, high=155136, size=(2048,), device='cuda')

log_gpu_memory_usage('before computation')
# output = checkpoint.checkpoint(logprobs_from_logits, logits, labels, use_reentrant=True)
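    # The fused Triton kernel returns per-token cross-entropy losses as the first
    # element of its output; negating them yields the label log-probabilities,
    # which should match the naive float32 reference computed below.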
output = -cross_entropy_loss(logits, labels)[0]
# output = logprobs_from_logits(logits, labels)
log_gpu_memory_usage('After forward')
output.sum().backward()
log_gpu_memory_usage('After backward')

groundtruth = logprobs_from_logits_naive(logits.float(), labels)

torch.testing.assert_close(output, groundtruth)
81 changes: 81 additions & 0 deletions tests/gpu_utility/test_torch_functional.py
@@ -0,0 +1,81 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from verl.utils.model import create_random_mask
from flash_attn.bert_padding import unpad_input
import torch


def test_log_probs_from_logits_response_rmpad():
from verl.utils.torch_functional import log_probs_from_logits_response, log_probs_from_logits_response_rmpad
vocab_size = 32000
batch_size = 2
prompt_length = 256
response_length = 256

input_ids = torch.randint(low=0, high=vocab_size, size=(batch_size, prompt_length + response_length), device='cuda')
attention_mask = create_random_mask(input_ids=input_ids,
max_ratio_of_left_padding=0.2,
max_ratio_of_valid_token=0.8,
min_ratio_of_valid_token=0.6)

response_mask = attention_mask[:, -response_length:]

assert torch.all(response_mask[:, 0] == 1)

logits = torch.randn(batch_size, prompt_length + response_length, vocab_size, device='cuda')
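    # unpad_input packs only the valid (non-padded) tokens into a flattened
    # "rmpad" tensor, returned as the first element of its output tuple.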
logits_rmpad = unpad_input(logits, attention_mask)[0]

expected_output = log_probs_from_logits_response(input_ids=input_ids,
logits=logits,
response_length=response_length)
actual_output = log_probs_from_logits_response_rmpad(input_ids=input_ids,
attention_mask=attention_mask,
logits_rmpad=logits_rmpad,
response_length=response_length)

    # This should be bitwise identical, since the operation only contains gather operators
assert torch.all(torch.eq(actual_output * response_mask, expected_output * response_mask))


def test_lr_scheduler():
from torch import nn
model = nn.Linear(10, 10)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

from verl.utils.torch_functional import get_constant_schedule_with_warmup
constant_lr = get_constant_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=2)

lr_lst = []

for _ in range(5):
lr_lst.append(constant_lr.get_last_lr()[0])
constant_lr.step()
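    # Expect a linear warmup from 0 to the base LR (1e-3) over 2 steps,
    # after which the LR stays constant.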

torch.testing.assert_close(lr_lst, [0.0, 0.0005, 0.001, 0.001, 0.001])

from verl.utils.torch_functional import get_cosine_schedule_with_warmup
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
cosine_lr = get_cosine_schedule_with_warmup(optimizer=optimizer,
num_warmup_steps=2,
num_training_steps=5,
min_lr_ratio=0.1)

lr_lst = []

for _ in range(5):
lr_lst.append(cosine_lr.get_last_lr()[0])
cosine_lr.step()
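    # Expect the same 2-step warmup, then cosine decay from the base LR
    # towards min_lr_ratio * base LR over the remaining steps.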

torch.testing.assert_close(lr_lst, [0.0, 0.0005, 0.001, 0.0007750000000000002, 0.0003250000000000002])
130 changes: 130 additions & 0 deletions tests/rollout/test_vllm_hf_loader.py
@@ -0,0 +1,130 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import torch
import transformers

from verl.third_party.vllm import LLM, vllm_version
from verl.utils.model import update_model_config
from vllm import SamplingParams
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

from transformers import GenerationConfig

from verl.utils.torch_functional import pad_sequence_to_length


def test_vllm_with_hf():
    assert torch.cuda.device_count() >= 2, 'At least 2 GPUs are required to run tp+dp tests.'

# fill rollout config
max_prompt_length = 16
max_response_length = 32

# create model
override_config_kwargs = {
'vocab_size': 32000,
'n_positions': max_prompt_length + max_response_length,
'max_position_embeddings': max_prompt_length + max_response_length
}

    # Initialize model and tokenizer
local_cache_path = '~/.cache/verl/rlhf'
local_cache_path = os.path.expanduser(local_cache_path)
hdfs_path = 'deepseek-ai/deepseek-llm-7b-chat'
from verl.utils.fs import copy_local_path_from_hdfs
local_model_path = copy_local_path_from_hdfs(src=hdfs_path, cache_dir=local_cache_path)
tokenizer = AutoTokenizer.from_pretrained(local_model_path)

preencode_prompts = [
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
tokenizer.pad_token = tokenizer.eos_token
prompts = tokenizer(preencode_prompts, return_tensors='pt', padding=True)
input_ids = prompts['input_ids']
attention_mask = prompts['attention_mask']

input_ids = pad_sequence_to_length(input_ids, max_prompt_length, tokenizer.pad_token_id, left_pad=True)
attention_mask = pad_sequence_to_length(attention_mask, max_prompt_length, 0, left_pad=True)

actor_model = AutoModelForCausalLM.from_pretrained(local_model_path)
actor_model.to(torch.bfloat16)

actor_model_config = AutoConfig.from_pretrained(local_model_path)
    assert isinstance(actor_model_config, transformers.LlamaConfig), 'Only Llama architectures are supported for now.'
update_model_config(actor_model_config, override_config_kwargs)

temperature = 0
top_p = 1
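    # temperature=0 and top_p=1 make both vLLM and HF decode greedily, so their
    # outputs can be compared token-for-token at the end of the test.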

kwargs = dict(n=1,
temperature=temperature,
top_p=top_p,
max_tokens=max_response_length,
logprobs=1,
ignore_eos=True)

if vllm_version in ('0.4.2', '0.5.4', '0.6.3'):
kwargs['detokenize'] = False
sampling_params = SamplingParams(**kwargs)

tensor_parallel_size = 2
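    # verl's LLM wrapper takes the in-memory HF model and its config; with
    # load_format='hf' the vLLM engine weights are presumably initialized directly
    # from that HF model rather than reloaded from disk.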

llm = LLM(model=actor_model,
tokenizer=tokenizer,
model_hf_config=actor_model_config,
tensor_parallel_size=tensor_parallel_size,
dtype='bfloat16',
gpu_memory_utilization=0.1,
load_format='hf')

print('start generation')
input_ids = input_ids.cuda()
attention_mask = attention_mask.cuda()
outputs = llm.generate(prompt_token_ids=input_ids, sampling_params=sampling_params, use_tqdm=False)
vllm_output = outputs[0].cuda()
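    # Free the vLLM KV cache and drop the engine before running the HF reference
    # generation, so that enough GPU memory is left for the 7B model.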
llm.free_cache_engine()
llm = None
import gc
torch.cuda.empty_cache()
gc.collect()

generation_config = GenerationConfig(do_sample=False)
actor_model.cuda()
output = actor_model.generate(
input_ids=input_ids,
attention_mask=attention_mask,
max_new_tokens=max_response_length,
# max_length=max_length,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
generation_config=generation_config,
# renormalize_logits=True,
output_scores=False, # this is potentially very large
return_dict_in_generate=True,
use_cache=False) # may OOM when use_cache = True
seq = output.sequences
response = seq[:, max_prompt_length:]

print(f'hf response: {tokenizer.batch_decode(response)}')
print(f'vllm response: {tokenizer.batch_decode(vllm_output)}')
assert torch.allclose(response, vllm_output), f'hf_response:{response} | vllm_response:{vllm_output}'
print('Check Pass')


# if __name__ == "__main__":
# test_vllm_with_hf()