[WIP][ci] add more CI workflow #38

Open · wants to merge 6 commits into main
39 changes: 39 additions & 0 deletions .github/workflows/sanity.yml
@@ -0,0 +1,39 @@
name: sanity

on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
paths:
- "**/*.py"
- .github/workflows/sanity.yml
pull_request:
branches:
- main
paths:
- "**/*.py"
- .github/workflows/sanity.yml

jobs:
sanity:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.10"]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
- name: Install the current repository
run: |
pip install -e .[test]
- name: Run sanity test
run: |
pytest -s -x tests/sanity
      - name: Run utility test
run: |
pytest -s -x tests/utility
2 changes: 1 addition & 1 deletion setup.py
@@ -42,7 +42,7 @@
'demo': ['hydra-core', 'transformers', ''],
'single-controller': ['ray', 'kubernetes'],
'single-controller-ray': ['ray'],
-    'test': ['fsspec', 'pytest', 'datasets']
+    'test': ['fsspec', 'pytest', 'datasets', 'ray']
}

from pathlib import Path
70 changes: 70 additions & 0 deletions tests/gpu_utility/test_memory_buffers.py
@@ -0,0 +1,70 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Test memory buffers
- We start with two models with the same weights
- We wrap one of the models with a memory buffer and then compare the parameters
"""

import torch
import gc

from transformers import LlamaModel, LlamaConfig
from verl.utils.memory_buffer import MemoryBufferModuleWrapper


def test_memory_buffers():
llama_config = LlamaConfig(vocab_size=256,
hidden_size=4096,
intermediate_size=11008,
num_hidden_layers=2,
num_attention_heads=16,
num_key_value_heads=16)

model = LlamaModel(config=llama_config).cuda()
model_copy = LlamaModel(config=llama_config).cuda()
model_copy.load_state_dict(model.state_dict())

model_named_params = dict(model.named_parameters())
model_copy_named_params = dict(model_copy.named_parameters())

norm_factor = 1024**3

t_before = torch.cuda.get_device_properties(0).total_memory / norm_factor
r_before = torch.cuda.memory_reserved(0) / norm_factor
a_before = torch.cuda.memory_allocated(0) / norm_factor

print(f'Before Total memory: {t_before} GB, reserved: {r_before} GB, allocated: {a_before} GB')

model_wrapper = MemoryBufferModuleWrapper(model)
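    # MemoryBufferModuleWrapper is expected to consolidate the model's parameters
    # into contiguous memory buffers while keeping their values intact, so both the
    # allocated GPU memory and the parameter contents are checked below.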

t = torch.cuda.get_device_properties(0).total_memory / norm_factor
r = torch.cuda.memory_reserved(0) / norm_factor
a = torch.cuda.memory_allocated(0) / norm_factor

gc.collect()
torch.cuda.empty_cache()

print(f'After Total memory: {t} GB, reserved: {r} GB, allocated: {a} GB')

change_ratio = (a - a_before) / a_before
    assert change_ratio < 0.01, f'allocated memory should change by less than 1%, got {change_ratio}'

for (name1, param1), (name2, param2) in zip(model.named_parameters(), model_copy.named_parameters()):
assert name1 == name2
assert torch.eq(param1.data, param2.data).all(), f'{param1.data}, {param2.data}, {name1}'


if __name__ == '__main__':
test_memory_buffers()
47 changes: 47 additions & 0 deletions tests/gpu_utility/test_ops.py
@@ -0,0 +1,47 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def test_flash_attn_cross_entropy():
from verl.utils.torch_functional import logprobs_from_logits_naive

from verl.utils.debug import log_gpu_memory_usage

from flash_attn.ops.triton.cross_entropy import cross_entropy_loss

import torch
from torch import nn

log_gpu_memory_usage('At start')

hidden_states = torch.randn(size=(2048, 5120), device='cuda', requires_grad=True, dtype=torch.bfloat16)

linear = nn.Linear(in_features=5120, out_features=155136, bias=False, device='cuda', dtype=torch.bfloat16)

logits = linear(hidden_states)

# logits = logits.float()
labels = torch.randint(low=0, high=155136, size=(2048,), device='cuda')

log_gpu_memory_usage('before computation')
# output = checkpoint.checkpoint(logprobs_from_logits, logits, labels, use_reentrant=True)
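    # The fused Triton kernel returns per-token cross-entropy losses as the first
    # element of its output; negating them yields the label log-probabilities,
    # which should match the naive float32 reference computed below.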
output = -cross_entropy_loss(logits, labels)[0]
# output = logprobs_from_logits(logits, labels)
log_gpu_memory_usage('After forward')
output.sum().backward()
log_gpu_memory_usage('After backward')

groundtruth = logprobs_from_logits_naive(logits.float(), labels)

torch.testing.assert_close(output, groundtruth)
81 changes: 81 additions & 0 deletions tests/gpu_utility/test_torch_functional.py
@@ -0,0 +1,81 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from verl.utils.model import create_random_mask
from flash_attn.bert_padding import unpad_input
import torch


def test_log_probs_from_logits_response_rmpad():
from verl.utils.torch_functional import log_probs_from_logits_response, log_probs_from_logits_response_rmpad
vocab_size = 32000
batch_size = 2
prompt_length = 256
response_length = 256

input_ids = torch.randint(low=0, high=vocab_size, size=(batch_size, prompt_length + response_length), device='cuda')
attention_mask = create_random_mask(input_ids=input_ids,
max_ratio_of_left_padding=0.2,
max_ratio_of_valid_token=0.8,
min_ratio_of_valid_token=0.6)

response_mask = attention_mask[:, -response_length:]

assert torch.all(response_mask[:, 0] == 1)

logits = torch.randn(batch_size, prompt_length + response_length, vocab_size, device='cuda')
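    # unpad_input packs only the valid (non-padded) tokens into a flattened
    # "rmpad" tensor, returned as the first element of its output tuple.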
logits_rmpad = unpad_input(logits, attention_mask)[0]

expected_output = log_probs_from_logits_response(input_ids=input_ids,
logits=logits,
response_length=response_length)
actual_output = log_probs_from_logits_response_rmpad(input_ids=input_ids,
attention_mask=attention_mask,
logits_rmpad=logits_rmpad,
response_length=response_length)

    # This should be bitwise identical, since the operation only contains gather operators
assert torch.all(torch.eq(actual_output * response_mask, expected_output * response_mask))


def test_lr_scheduler():
from torch import nn
model = nn.Linear(10, 10)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

from verl.utils.torch_functional import get_constant_schedule_with_warmup
constant_lr = get_constant_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=2)

lr_lst = []

for _ in range(5):
lr_lst.append(constant_lr.get_last_lr()[0])
constant_lr.step()
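    # Expect a linear warmup from 0 to the base LR (1e-3) over 2 steps,
    # after which the LR stays constant.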

torch.testing.assert_close(lr_lst, [0.0, 0.0005, 0.001, 0.001, 0.001])

from verl.utils.torch_functional import get_cosine_schedule_with_warmup
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
cosine_lr = get_cosine_schedule_with_warmup(optimizer=optimizer,
num_warmup_steps=2,
num_training_steps=5,
min_lr_ratio=0.1)

lr_lst = []

for _ in range(5):
lr_lst.append(cosine_lr.get_last_lr()[0])
cosine_lr.step()
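    # Expect the same 2-step warmup, then cosine decay from the base LR
    # towards min_lr_ratio * base LR over the remaining steps.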

torch.testing.assert_close(lr_lst, [0.0, 0.0005, 0.001, 0.0007750000000000002, 0.0003250000000000002])
130 changes: 130 additions & 0 deletions tests/rollout/test_vllm_hf_loader.py
@@ -0,0 +1,130 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import torch
import transformers

from verl.third_party.vllm import LLM, vllm_version
from verl.utils.model import update_model_config
from vllm import SamplingParams
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

from transformers import GenerationConfig

from verl.utils.torch_functional import pad_sequence_to_length


def test_vllm_with_hf():
    assert torch.cuda.device_count() >= 2, 'At least 2 GPUs are required to run tp+dp tests.'

# fill rollout config
max_prompt_length = 16
max_response_length = 32

# create model
override_config_kwargs = {
'vocab_size': 32000,
'n_positions': max_prompt_length + max_response_length,
'max_position_embeddings': max_prompt_length + max_response_length
}

    # Initialize model and tokenizer
local_cache_path = '~/.cache/verl/rlhf'
local_cache_path = os.path.expanduser(local_cache_path)
hdfs_path = 'deepseek-ai/deepseek-llm-7b-chat'
from verl.utils.fs import copy_local_path_from_hdfs
local_model_path = copy_local_path_from_hdfs(src=hdfs_path, cache_dir=local_cache_path)
tokenizer = AutoTokenizer.from_pretrained(local_model_path)

preencode_prompts = [
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
tokenizer.pad_token = tokenizer.eos_token
prompts = tokenizer(preencode_prompts, return_tensors='pt', padding=True)
input_ids = prompts['input_ids']
attention_mask = prompts['attention_mask']

input_ids = pad_sequence_to_length(input_ids, max_prompt_length, tokenizer.pad_token_id, left_pad=True)
attention_mask = pad_sequence_to_length(attention_mask, max_prompt_length, 0, left_pad=True)

actor_model = AutoModelForCausalLM.from_pretrained(local_model_path)
actor_model.to(torch.bfloat16)

actor_model_config = AutoConfig.from_pretrained(local_model_path)
    assert isinstance(actor_model_config, transformers.LlamaConfig), 'Only Llama architectures are supported for now.'
update_model_config(actor_model_config, override_config_kwargs)

temperature = 0
top_p = 1
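    # temperature=0 and top_p=1 make both vLLM and HF decode greedily, so their
    # outputs can be compared token-for-token at the end of the test.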

kwargs = dict(n=1,
temperature=temperature,
top_p=top_p,
max_tokens=max_response_length,
logprobs=1,
ignore_eos=True)

if vllm_version in ('0.4.2', '0.5.4', '0.6.3'):
kwargs['detokenize'] = False
sampling_params = SamplingParams(**kwargs)

tensor_parallel_size = 2
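    # verl's LLM wrapper takes the in-memory HF model and its config; with
    # load_format='hf' the vLLM engine weights are presumably initialized directly
    # from that HF model rather than reloaded from disk.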

llm = LLM(model=actor_model,
tokenizer=tokenizer,
model_hf_config=actor_model_config,
tensor_parallel_size=tensor_parallel_size,
dtype='bfloat16',
gpu_memory_utilization=0.1,
load_format='hf')

print('start generation')
input_ids = input_ids.cuda()
attention_mask = attention_mask.cuda()
outputs = llm.generate(prompt_token_ids=input_ids, sampling_params=sampling_params, use_tqdm=False)
vllm_output = outputs[0].cuda()
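    # Free the vLLM KV cache and drop the engine before running the HF reference
    # generation, so that enough GPU memory is left for the 7B model.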
llm.free_cache_engine()
llm = None
import gc
torch.cuda.empty_cache()
gc.collect()

generation_config = GenerationConfig(do_sample=False)
actor_model.cuda()
output = actor_model.generate(
input_ids=input_ids,
attention_mask=attention_mask,
max_new_tokens=max_response_length,
# max_length=max_length,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
generation_config=generation_config,
# renormalize_logits=True,
output_scores=False, # this is potentially very large
return_dict_in_generate=True,
use_cache=False) # may OOM when use_cache = True
seq = output.sequences
response = seq[:, max_prompt_length:]

print(f'hf response: {tokenizer.batch_decode(response)}')
print(f'vllm response: {tokenizer.batch_decode(vllm_output)}')
assert torch.allclose(response, vllm_output), f'hf_response:{response} | vllm_response:{vllm_output}'
print('Check Pass')


# if __name__ == "__main__":
# test_vllm_with_hf()