Merge branch 'master' into parallel_zero_to_fp32_conversion
tjruwase authored Oct 31, 2024
2 parents ff1083c + c7f58c8 commit d8fd142
Showing 18 changed files with 249 additions and 26 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/cpu-inference.yml
@@ -27,7 +27,7 @@ jobs:
env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- id: setup-venv
uses: ./.github/workflows/setup-venv
2 changes: 1 addition & 1 deletion .github/workflows/hpu-gaudi2.yml
@@ -39,7 +39,7 @@ jobs:
# The type of runner that the job will run on
runs-on: [self-hosted, intel, gaudi2]
container:
image: vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest
image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
ports:
- 80
options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice
8 changes: 3 additions & 5 deletions .github/workflows/nv-a6000.yml
@@ -23,7 +23,7 @@ jobs:
unit-tests:
runs-on: [self-hosted, nvidia, a6000]
container:
image: nvcr.io/nvidia/pytorch:23.03-py3
image: nvcr.io/nvidia/pytorch:24.03-py3
ports:
- 80
options: --gpus all --shm-size "8G"
@@ -47,8 +47,6 @@ jobs:
- name: Install deepspeed
run: |
python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
# Update packages included in the container that do not support pydantic 2+ to versions that do
python -m pip install thinc spacy confection --upgrade
python -m pip install .[dev,1bit,autotuning,inf]
ds_report
- name: Python environment
@@ -58,8 +56,8 @@
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.0" --cuda_ver="12"
python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.0" --cuda_ver="12"
python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.3" --cuda_ver="12"
python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.3" --cuda_ver="12"
- name: MII unit tests
run: |
BRANCH="main"
4 changes: 2 additions & 2 deletions .github/workflows/nv-human-eval.yml
@@ -11,7 +11,7 @@ jobs:
unit-tests:
runs-on: [self-hosted, nvidia, a6000]
container:
image: nvcr.io/nvidia/pytorch:23.03-py3
image: nvcr.io/nvidia/pytorch:24.03-py3
ports:
- 80
options: --gpus all --shm-size "8G"
@@ -50,4 +50,4 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.0" --cuda_ver="12"
python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.3" --cuda_ver="12"
2 changes: 1 addition & 1 deletion .github/workflows/nv-lightning-v100.yml
@@ -22,7 +22,7 @@ jobs:
runs-on: [self-hosted, nvidia, cu121, v100]

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- id: setup-venv
uses: ./.github/workflows/setup-venv
6 changes: 2 additions & 4 deletions .github/workflows/nv-sd.yml
@@ -27,7 +27,7 @@ jobs:
sd-tests:
runs-on: [self-hosted, nvidia, a6000]
container:
image: nvcr.io/nvidia/pytorch:23.03-py3
image: nvcr.io/nvidia/pytorch:24.03-py3
ports:
- 80
options: --gpus all --shm-size "8G"
@@ -53,8 +53,6 @@
pip install image-similarity-measures
python -m pip install opencv-python==4.6.* --force-reinstall
python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
# Update packages included in the container that do not support pydantic 2+ to versions that do
python -m pip install thinc spacy confection --upgrade
python -m pip install .[dev,1bit,autotuning,sd]
ds_report
- name: Python environment
@@ -64,7 +62,7 @@
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
python -m pytest --color=yes --durations=0 --verbose -rF -m 'stable_diffusion' -k "TestStableDiffusion" unit/ --torch_ver="2.0" --cuda_ver="12"
python -m pytest --color=yes --durations=0 --verbose -rF -m 'stable_diffusion' -k "TestStableDiffusion" unit/ --torch_ver="2.3" --cuda_ver="12"
- name: Open GitHub issue if weekly CI fails
if: ${{ failure() && (github.event_name == 'schedule') }}
2 changes: 1 addition & 1 deletion .github/workflows/nv-torch110-p40.yml
@@ -20,7 +20,7 @@ jobs:
env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- id: setup-venv
uses: ./.github/workflows/setup-venv
2 changes: 1 addition & 1 deletion .github/workflows/nv-torch110-v100.yml
@@ -20,7 +20,7 @@ jobs:
env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- id: setup-venv
uses: ./.github/workflows/setup-venv
4 changes: 3 additions & 1 deletion deepspeed/module_inject/replace_module.py
@@ -277,8 +277,10 @@ def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None):
if hasattr(model_config, "vision_config"):
if "MllamaVisionEncoderLayer" in str(module):
num_kv_heads = _autotp.get_model_num_kv_heads(model_config.vision_config)
else:
elif hasattr(model_config, "text_config"):
num_kv_heads = _autotp.get_model_num_kv_heads(model_config.text_config)
else:
num_kv_heads = _autotp.get_model_num_kv_heads(model_config)
else:
num_kv_heads = _autotp.get_model_num_kv_heads(model_config)

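A condensed sketch of the resolution order added above: Mllama vision layers take their KV-head count from vision_config, other multimodal configs fall back to text_config, and plain text models use the top-level config. The standalone helper name is illustrative, not part of the commit:

    def resolve_kv_head_config(model_config, module_repr):
        # Mirror of the branch added in replace_wo_policy, factored out for clarity.
        if hasattr(model_config, "vision_config"):
            if "MllamaVisionEncoderLayer" in module_repr:
                return model_config.vision_config
            if hasattr(model_config, "text_config"):
                return model_config.text_config
        return model_config
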
4 changes: 3 additions & 1 deletion deepspeed/module_inject/tp_shard.py
@@ -24,7 +24,9 @@ def set_n_embd(num):

def get_num_kv_heads():
global num_kv_heads
return num_kv_heads
if 'num_kv_heads' in globals():
return num_kv_heads
return None


def get_num_attention_heads():
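With this change, get_num_kv_heads() returns None when nothing has set the global yet instead of failing on an undefined name, so callers can use it as a "configured yet?" probe. A toy sketch of that pattern (standalone module, not the DeepSpeed one):

    _num_kv_heads = None  # unset until the first caller records a head count

    def set_num_kv_heads(num):
        global _num_kv_heads
        _num_kv_heads = num

    def get_num_kv_heads():
        return _num_kv_heads  # None means "not configured yet"

    if get_num_kv_heads() is None:   # first call: record the total head count
        set_num_kv_heads(32)
    assert get_num_kv_heads() == 32  # later calls take the already-configured path
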
2 changes: 1 addition & 1 deletion deepspeed/profiling/flops_profiler/profiler.py
@@ -115,7 +115,7 @@ def start_time_hook(module, input):
get_accelerator().synchronize()
module.__start_time__ = time.time()

if not hasattr(module, "__start_time_hook_handle"):
if not hasattr(module, "__start_time_hook_handle__"):
module.__start_time_hook_handle__ = module.register_forward_pre_hook(start_time_hook)

def end_time_hook(module, input, output):
13 changes: 13 additions & 0 deletions deepspeed/runtime/compiler.py
@@ -5,6 +5,15 @@

import torch

try:
from torch.compiler import is_compiling as torch_is_compiling
except ImportError:
try:
from torch._dynamo.external_utils import is_compiling as torch_is_compiling
except ImportError:
# Torch does not have compiler support
torch_is_compiling = lambda: False


def is_compile_supported():
return hasattr(torch, "compiler") and hasattr(torch.nn.Module, "compile")
@@ -14,3 +23,7 @@ def disable(func):
if is_compile_supported():
return torch.compiler.disable(func)
return func


def is_compiling():
return torch_is_compiling()
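
The try/except chain gives every supported torch version a working torch_is_compiling, so is_compiling() can be called unconditionally. A minimal sketch of a caller that skips side effects while a graph is being traced (the helper name is illustrative, not part of the commit):

    import logging

    from deepspeed.runtime.compiler import is_compiling

    def warn_outside_compile(logger, msg):
        # Logging while torch.compile traces a graph can force a graph break, so skip it there.
        if not is_compiling():
            logger.warning(msg)

    warn_outside_compile(logging.getLogger(__name__), "running eagerly")
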
3 changes: 2 additions & 1 deletion deepspeed/runtime/pipe/engine.py
@@ -287,7 +287,8 @@ def _exec_reduce_tied_grads(self):
weight_group_list = self.module.get_tied_weights_and_groups()
for weight, group in weight_group_list:
grad = weight._hp_grad if self.using_bf16_optimizer else weight.grad
dist.all_reduce(grad, group=group)
if grad is not None:
dist.all_reduce(grad, group=group)

def _exec_reduce_grads(self):
self._force_grad_boundary = True
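The new guard only matters when a tied weight ends up with no gradient on this rank for the current step; in that case there is nothing to reduce. A hedged sketch of the guarded collective in isolation (function name and arguments are illustrative):

    import deepspeed.comm as dist

    def reduce_tied_grad(weight, group, using_bf16_optimizer=False):
        grad = weight._hp_grad if using_bf16_optimizer else weight.grad
        # grad can be None when no gradient was produced for this weight,
        # so only issue the all-reduce when there is something to reduce.
        if grad is not None:
            dist.all_reduce(grad, group=group)
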
126 changes: 126 additions & 0 deletions deepspeed/sequence/layer.py
@@ -10,6 +10,8 @@

import deepspeed.comm as dist
from deepspeed.accelerator import get_accelerator
from deepspeed.module_inject.tp_shard import get_shard_size_list, set_num_kv_heads, get_num_kv_heads
from deepspeed.utils import groups


def post_all2all(scatter_idx, batch_dim_idx, seq_world_size, bs, seq_len, num_head, head_dim):
@@ -38,8 +40,132 @@ def post_func(input):
return post_func


def uneven_heads_all2all(input, scatter_idx, gather_idx, batch_dim_idx, group):
seq_world_size = dist.get_world_size(group)
inp_shape = list(input.shape)
assert batch_dim_idx in [0, 1], "batch_dim_idx must be either 0 or 1"

if not (scatter_idx < 2):
input_splits = get_shard_size_list(inp_shape[scatter_idx], seq_world_size)
input = input.transpose(0, scatter_idx).contiguous()
local_heads = input_splits[groups._get_sequence_parallel_rank()]
output_splits = [local_heads] * seq_world_size

output_buffer_shape = [seq_world_size * local_heads] + list(input.shape[1:])
output = torch.empty(output_buffer_shape, device=input.device, dtype=input.dtype)
dist.all_to_all_single(output,input,output_split_sizes=output_splits,\
input_split_sizes=input_splits,group=group)
###[seq_ws*local_heads, ...] to [seq_ws, local_heads, ...]
output = output.view(seq_world_size, local_heads, *output.shape[1:])
###[seq_ws,local_heads,b,seq_len,...] to [seq_ws,seq_len,b,local_heads,...]

### batch_dim_idx=0 [seq_ws,local_heads,seq_len,b,...] to [b, seq_ws, seq_len, local_heads ...]
### batch_dim_idx=1 [seq_ws,local_heads,b,seq_len,...] to [seq_ws,seq_len,b,local_heads,...]
if batch_dim_idx == 0:
order = [3, 0, 2, 1] + list(range(4, len(output.shape)))
output = output.permute(order).contiguous()
###[b, seq_ws*local_seq_len, local_heads,...]
output = output.view(output.shape[0], inp_shape[gather_idx] * seq_world_size,
*output.shape[3:]).contiguous()
elif batch_dim_idx == 1:
output = output.transpose(1, 3).contiguous()
###[seq_ws*local_seq_len, b, local_heads,...]
output = output.view(inp_shape[gather_idx] * seq_world_size, *output.shape[2:]).contiguous()
else:
# The compatibility handling of 4D and 3D tensors, standardizing to 3D.
input = input.reshape(input.shape[0], input.shape[1], -1)

if batch_dim_idx == 0: #b,s,h
input = input.permute(1, 2, 0).contiguous() #s,h,b
elif batch_dim_idx == 1: #s,b,h
input = input.transpose(1, 2).contiguous() #s,h,b
seq_len, h, batch_size = input.shape
num_local_heads_list = get_shard_size_list(get_num_kv_heads(), seq_world_size)
local_heads = num_local_heads_list[groups._get_sequence_parallel_rank()]
h_dim = h // local_heads
local_seq_len = seq_len // seq_world_size

input = input.view(seq_len * h, batch_size)
local_seq_len_with_heads = int(input.shape[0] / seq_world_size) # dim size of local_seq_len*local_heads*hdim
input_splits = [local_seq_len_with_heads] * seq_world_size
coeff = local_seq_len_with_heads // local_heads #per head: dim size of local_seq_len*hdim

#uneven seq_world_size coeff, total_heads/local_heads.
heads_scale_coeff = get_num_kv_heads() / local_heads

output_splits = [num_local_heads * coeff for num_local_heads in num_local_heads_list]
output_buff_d1_size = int(heads_scale_coeff * local_seq_len_with_heads)
total_h = int(inp_shape[gather_idx] * heads_scale_coeff)
output = torch.empty(output_buff_d1_size, input.shape[1], device=input.device, dtype=input.dtype)
dist.all_to_all_single(output,input,output_split_sizes=output_splits, \
input_split_sizes=input_splits,group=group)
##################
#suppose 7 heads divide into 4 ranks [2,2,2,1]
#chunk_num_heads_small=floor(7/4)=1
#chunk_num_heads_large=ceil(7/4)=2
#num_chunk_heads_large=len([2,2,2])=3, all2all_buffer_counts
#num_chunk_heads_small=len([1])=1, all2all_buffer_counts
#total_num_large_heads=sum([2,2,2])=6
#total_num_small_heads=sum([1])=1

chunk_num_heads_small = get_num_kv_heads() // seq_world_size # even heads compatible
chunk_num_heads_large = chunk_num_heads_small + 1
num_chunk_heads_large = get_num_kv_heads() % seq_world_size
num_chunk_heads_small = seq_world_size - num_chunk_heads_large
total_num_large_heads = num_chunk_heads_large * chunk_num_heads_large
total_num_small_heads = num_chunk_heads_small * chunk_num_heads_small

heads_large_combine_size = coeff * total_num_large_heads
heads_small_combine_size = coeff * total_num_small_heads
heads_large_chunk, heads_small_chunk = output.split([heads_large_combine_size, heads_small_combine_size],
dim=0)
heads_large_chunk = heads_large_chunk.view(num_chunk_heads_large, local_seq_len, chunk_num_heads_large, h_dim,
batch_size)
heads_small_chunk = heads_small_chunk.view(num_chunk_heads_small, local_seq_len, chunk_num_heads_small, h_dim,
batch_size)
if batch_dim_idx == 0:
#[all2all_buffer_counts, local_seq_len, n_heads,dim,batch]->[batch,local_seq_len,all2all_buffer_counts*n_heads,dim]
order = [4, 1, 0, 2, 3]
heads_large_chunk = heads_large_chunk.permute(order).contiguous().view(batch_size, local_seq_len,
total_num_large_heads, h_dim)
heads_small_chunk = heads_small_chunk.permute(order).contiguous().view(batch_size, local_seq_len,
total_num_small_heads, h_dim)
elif batch_dim_idx == 1:
#[all2all_buffer_counts, local_seq_len, n_heads,dim,batch]->[local_seq_len,batch,all2all_buffer_counts*n_heads,dim]
order = [1, 4, 0, 2, 3]
heads_large_chunk = heads_large_chunk.permute(order).contiguous().view(local_seq_len, batch_size,
total_num_large_heads, h_dim)
heads_small_chunk = heads_small_chunk.permute(order).contiguous().view(local_seq_len, batch_size,
total_num_small_heads, h_dim)

output = torch.cat([heads_large_chunk, heads_small_chunk], dim=2).contiguous()

inp_shape[scatter_idx] = inp_shape[scatter_idx] // seq_world_size
output_shape= inp_shape[: gather_idx] + \
[total_h,] + \
inp_shape[gather_idx + 1:]

output = output.view(output_shape)

return output


def single_all_to_all(input, scatter_idx, gather_idx, batch_dim_idx, group, async_op=False, handle=None, type=None):
seq_world_size = dist.get_world_size(group)
# we only need num_heads once
num_heads = input.shape[2]

if get_num_kv_heads() is not None or num_heads % seq_world_size != 0:
# Assuming here that the number of heads for q is consistent with kv
# If not, additional logic is required for cases like GQA
if get_num_kv_heads() is None:
assert num_heads > seq_world_size, f"Number of heads ({num_heads}) must be larger than sequence parallel size ({seq_world_size})"
# set heads at first call by num_total_heads.
# then use ``get_num_kv_heads() is not None`` to re-entry uneven path.
set_num_kv_heads(num_heads)
assert async_op == False, "uneven head sp does not support async op"
return uneven_heads_all2all(input, scatter_idx, gather_idx, batch_dim_idx, group)

if batch_dim_idx == 0:
# b, s, n, h
if scatter_idx < 2:
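The worked numbers in the comment block (7 heads over 4 ranks giving [2, 2, 2, 1]) follow from a largest-first split. A toy version of that arithmetic, assuming get_shard_size_list hands the remainder to the leading ranks:

    def toy_shard_sizes(total_heads, world_size):
        # The first (total_heads % world_size) ranks get one extra head.
        small = total_heads // world_size     # chunk_num_heads_small
        num_large = total_heads % world_size  # num_chunk_heads_large
        return [small + 1] * num_large + [small] * (world_size - num_large)

    assert toy_shard_sizes(7, 4) == [2, 2, 2, 1]  # matches the example above
    assert toy_shard_sizes(8, 4) == [2, 2, 2, 2]  # even case is a uniform split
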
2 changes: 2 additions & 0 deletions deepspeed/utils/groups.py
@@ -484,6 +484,8 @@ def _get_sequence_parallel_rank():
global mpu
if mpu is not None and hasattr(mpu, 'get_sequence_parallel_rank'):
return mpu.get_sequence_parallel_rank()
if mesh_device is not None:
return dist.get_rank(mesh_device.get_group(mesh_dim="sequence_parallel"))
return 0


7 changes: 3 additions & 4 deletions deepspeed/utils/logging.py
@@ -7,8 +7,7 @@
import logging
import sys
import os
import torch
from deepspeed.runtime.compiler import is_compile_supported
from deepspeed.runtime.compiler import is_compile_supported, is_compiling

log_levels = {
"debug": logging.DEBUG,
@@ -26,7 +25,7 @@ def create_warning_filter(logger):

def warn_once(record):
nonlocal warn
if is_compile_supported() and torch.compiler.is_compiling() and not warn:
if is_compile_supported() and is_compiling() and not warn:
warn = True
logger.warning("To avoid graph breaks caused by logger in compile-mode, it is recommended to"
" disable logging by setting env var DISABLE_LOGS_WHILE_COMPILING=1")
@@ -39,7 +38,7 @@ def logging_decorator(func):

@functools.wraps(func)
def wrapper(*args, **kwargs):
if torch.compiler.is_compiling():
if is_compiling():
return
else:
return func(*args, **kwargs)
2 changes: 1 addition & 1 deletion op_builder/builder.py
@@ -67,7 +67,7 @@ def get_default_compute_capabilities():
# Special treatment of CUDA 11.0 because compute_86 is not supported.
compute_caps += ";8.0"
else:
compute_caps += ";8.0;8.6"
compute_caps += ";8.0;8.6;9.0"
return compute_caps

