Commit 59dc25c

Merge branch 'master' into Tecorigin-SDAA-accelerator

tjruwase authored Dec 23, 2024
2 parents 6df38db + eea5304 commit 59dc25c

Showing 46 changed files with 1,015 additions and 121 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/cpu-torch-latest.yml
@@ -42,7 +42,7 @@ jobs:
     git clone https://github.com/huggingface/transformers
     cd transformers
     # if needed switch to the last known good SHA until transformers@master is fixed
-    git checkout 6c3f168b3
+    # git checkout 6c3f168b3
     git rev-parse --short HEAD
     pip install .
1 change: 1 addition & 0 deletions .github/workflows/no-torch.yml
@@ -4,6 +4,7 @@ on:
   workflow_dispatch:
   pull_request:
     paths:
+      - 'accelerator/**'
       - '.github/workflows/no-torch.yml'
       - 'op_builder/**'
   schedule:
2 changes: 1 addition & 1 deletion .github/workflows/nv-torch-latest-v100.yml
@@ -38,7 +38,7 @@ jobs:
     git clone https://github.com/huggingface/transformers
     cd transformers
     # if needed switch to the last known good SHA until transformers@master is fixed
-    git checkout 6c3f168b3
+    # git checkout 6c3f168b3
     git rev-parse --short HEAD
     pip install .
5 changes: 4 additions & 1 deletion .github/workflows/nv-torch-nightly-v100.yml
@@ -2,6 +2,9 @@ name: nv-torch-nightly-v100
 
 on:
   workflow_dispatch:
+  pull_request:
+    paths:
+      - '.github/workflows/nv-torch-nightly-v100.yml'
   schedule:
     - cron: "0 0 * * *"
 
@@ -34,7 +37,7 @@ jobs:
     git clone https://github.com/huggingface/transformers
     cd transformers
     # if needed switch to the last known good SHA until transformers@master is fixed
-    # git checkout 1cc453d33
+    # git checkout 6c3f168b3
     git rev-parse --short HEAD
     pip install .
8 changes: 4 additions & 4 deletions .github/workflows/xpu-compile.yml
@@ -31,10 +31,10 @@ jobs:
   run: |
     apt-get update
     apt-get install clinfo libaio-dev python3-pip -y
-    pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torch/
-    pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/intel-extension-for-pytorch/
-    pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/oneccl-bind-pt/
-    pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torchvision/
+    pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torch/
+    pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/intel-extension-for-pytorch/
+    pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/oneccl-bind-pt/
+    pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torchvision/
     pip install https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v3.0.0b2/triton_xpu-3.0.0b2-cp310-cp310-linux_x86_64.whl
     pip install py-cpuinfo numpy
     pip install .[dev,autotuning]
8 changes: 4 additions & 4 deletions .github/workflows/xpu-max1100.yml
@@ -47,10 +47,10 @@ jobs:
   run: |
     apt-get update
     apt-get install clinfo libaio-dev python3-pip -y
-    pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torch/
-    pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/intel-extension-for-pytorch/
-    pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/oneccl-bind-pt/
-    pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torchvision/
+    pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torch/
+    pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/intel-extension-for-pytorch/
+    pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/oneccl-bind-pt/
+    pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torchvision/
     pip install py-cpuinfo numpy
     pip install .[dev,autotuning]
45 changes: 24 additions & 21 deletions CODEOWNERS
@@ -8,49 +8,52 @@
 
 # top-level repo folders
 /.github/ @loadams
-/azure/ @awan-10
-/benchmarks/ @awan-10 @tjruwase
+/azure/ @loadams
+/benchmarks/ @guanhuawang @tjruwase
 /bin/ @loadams
-/csrc/ @awan-10
+/csrc/ @tjruwase
 /deepspeed/ @loadams @tjruwase
-/docker/ @awan-10
+/docker/ @loadams @guanhuawang
 /docs/ @loadams @tjruwase
-/examples/ @awan-10 @tohtana
+/examples/ @jomayeri @tohtana
 /op_builder/ @loadams @tjruwase @jomayeri
-/release/ @loadams
+/release/ @loadams @jomayeri
 /requirements/ @loadams
-/scripts/ @awan-10
+/scripts/ @loadams @tjruwase
 /tests/ @tjruwase @loadams @tohtana
 
 # deepspeed
 /deepspeed/autotuning/ @loadams
 /deepspeed/checkpoint/ @tjruwase
-/deepspeed/comm/ @awan-10
+/deepspeed/comm/ @guanhuawang
 /deepspeed/compression/ @tjruwase
-/deepspeed/elasticity/ @awan-10
+/deepspeed/elasticity/ @tjruwase
 /deepspeed/launcher/ @loadams
-/deepspeed/module_inject/ @awan-10
+/deepspeed/module_inject/ @hwchen2017 @loadams
 /deepspeed/moe/ @tohtana
-/deepspeed/monitor/ @awan-10
+/deepspeed/monitor/ @tjruwase
 /deepspeed/nebula/ @tjruwase
+/deepspeed/nvme/ @tjruwase @jomayeri
 /deepspeed/ops/ @tohtana
 /deepspeed/pipe/ @tohtana @loadams
 /deepspeed/profiling/ @loadams
-/deepspeed/utils/ @tjruwase @awan-10
+/deepspeed/sequence/ @tohtana
+/deepspeed/utils/ @tjruwase @tohtana
 
 # inference
-/deepspeed/inference/ @awan-10
-/deepspeed/model_implementations/ @awan-10
+/deepspeed/inference/ @hwchen2017 @tohtana
+/deepspeed/model_implementations/ @tohtana @loadams
 
 # training
 /deepspeed/runtime/ @tjruwase @tohtana
 /deepspeed/runtime/activation_checkpointing/ @tjruwase
 /deepspeed/runtime/checkpoint_engine/ @tjruwase
-/deepspeed/runtime/comm/ @awan-10
-/deepspeed/runtime/compression/ @awan-10
+/deepspeed/runtime/comm/ @guanhuawang
+/deepspeed/runtime/compression/ @tjruwase
 /deepspeed/runtime/data_pipeline/ @tjruwase
-/deepspeed/runtime/fp16/ @tjruwase
-/deepspeed/runtime/fp16/onebit/ @awan-10
-/deepspeed/runtime/pipe/ @loadams
-/deepspeed/runtime/swap_tensor/ @tjruwase
-/deepspeed/runtime/zero/ @tjruwase
+/deepspeed/runtime/domino/ @guanhuawang @hwchen2017
+/deepspeed/runtime/fp16/ @tjruwase @tohtana
+/deepspeed/runtime/fp16/onebit/ @tjruwase
+/deepspeed/runtime/pipe/ @loadams @tohtana
+/deepspeed/runtime/swap_tensor/ @tjruwase @jomayeri
+/deepspeed/runtime/zero/ @tjruwase @tohtana
6 changes: 4 additions & 2 deletions COMMITTERS.md
@@ -5,5 +5,7 @@
 | Olatunji Ruwase | [tjruwase](https://github.com/tjruwase) | Microsoft |
 | Logan Adams | [loadams](https://github.com/loadams) | Microsoft |
 | Masahiro Tanaka | [tohtana](https://github.com/tohtana) | Microsoft |
-| Jeff Rasley | [jeffra](https://github.com/jeffra) | SnowFlake |
-| Minjia Zhang | [minjiazhang](https://github.com/minjiazhang) | UIUC |
+| Jeff Rasley | [jeffra](https://github.com/jeffra) | SnowFlake |
+| Minjia Zhang | [minjiazhang](https://github.com/minjiazhang) | UIUC |
+| Ashwin Aji | [ashwinma](https://github.com/ashwinma) | AMD |
+| Sam Foreman | [saforem2](https://github.com/saforem2) | Argonne National Laboratory |
4 changes: 2 additions & 2 deletions README.md
@@ -27,7 +27,7 @@
 * [2023/11] [DeepSpeed ZeRO-Offload++: 6x Higher Training Throughput via Collaborative CPU/GPU Twin-Flow](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-offloadpp)
 * [2023/11] [DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/japanese/README.md)]
 * [2023/10] [DeepSpeed-VisualChat: Improve Your Chat Experience with Multi-Round Multi-Image Inputs](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md)] [[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md)]
-* [2023/09] Announcing the DeepSpeed4Science Initiative: Enabling large-scale scientific discovery through sophisticated AI system technologies [[DeepSpeed4Science website](https://deepspeed4science.ai/)] [[Tutorials](https://www.deepspeed.ai/deepspeed4science/)] [[White paper](https://arxiv.org/abs/2310.04610)] [[Blog](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/japanese/README.md)]
+* [2023/09] Announcing the DeepSpeed4Science Initiative: Enabling large-scale scientific discovery through sophisticated AI system technologies [[Tutorials](https://www.deepspeed.ai/deepspeed4science/)] [[White paper](https://arxiv.org/abs/2310.04610)] [[Blog](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/japanese/README.md)]
 
 
 <!-- NOTE: we must use html for news items otherwise links will be broken in the 'more news' section -->
@@ -78,7 +78,7 @@ To further increase the inference efficiency, DeepSpeed offers easy-to-use and f
 
 ## DeepSpeed4Science
 
-In line with Microsoft's mission to solve humanity's most pressing challenges, the DeepSpeed team at Microsoft is responding to this opportunity by launching a new initiative called *DeepSpeed4Science*, aiming to build unique capabilities through AI system technology innovations to help domain experts to unlock today's biggest science mysteries. Learn more: [DeepSpeed4Science website](https://deepspeed4science.ai/) and [tutorials](https://www.deepspeed.ai/deepspeed4science/)
+In line with Microsoft's mission to solve humanity's most pressing challenges, the DeepSpeed team at Microsoft is responding to this opportunity by launching a new initiative called *DeepSpeed4Science*, aiming to build unique capabilities through AI system technology innovations to help domain experts to unlock today's biggest science mysteries. Learn more: [tutorials](https://www.deepspeed.ai/deepspeed4science/)
 
 ---
8 changes: 7 additions & 1 deletion accelerator/cpu_accelerator.py
@@ -3,9 +3,15 @@
 
 # DeepSpeed Team
 
-import torch
 from .abstract_accelerator import DeepSpeedAccelerator
 
+# During the setup stage torch may not be installed; passing on a missing
+# torch allows the op builder related APIs to be executed.
+try:
+    import torch
+except ImportError as e:
+    pass
+
 try:
     import oneccl_bindings_for_pytorch  # noqa: F401 # type: ignore
     oneccl_imported_p = True
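The guarded import keeps `cpu_accelerator.py` importable in environments where torch is not installed yet, such as the `pip install` setup stage, when only op-builder metadata is needed. A minimal sketch of the same pattern outside DeepSpeed (the function names here are illustrative, not DeepSpeed APIs):

```python
# Sketch of the guarded-import pattern: the module imports cleanly without
# torch, and only torch-dependent calls fail at use time.
try:
    import torch
except ImportError:
    torch = None  # setup-stage environments may not have torch yet


def op_builder_flags() -> list:
    # Metadata-only query: safe to call even when torch is absent.
    return ["-O3"]


def default_device() -> str:
    if torch is None:
        raise RuntimeError("torch is required for device queries")
    return "cuda" if torch.cuda.is_available() else "cpu"
```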
2 changes: 1 addition & 1 deletion accelerator/hpu_accelerator.py
@@ -21,8 +21,8 @@ def __init__(self):
         self.apply_hpu_workarounds()
         try:
             import habana_frameworks.torch.hpu as hpu
-            hpu.setDeterministic(True)
             self.hpu = hpu
+            torch.use_deterministic_algorithms(True)
         except ImportError as e:
             raise ValueError(
                 f"HPU_Accelerator requires habana_frameworks.torch.hpu, which is not installed on this system.")
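This swaps the backend-specific `hpu.setDeterministic(True)` for the standard PyTorch switch, which applies to whichever device backend is active:

```python
import torch

# Framework-level determinism: PyTorch raises on operations that have no
# deterministic implementation instead of silently running nondeterministically.
torch.use_deterministic_algorithms(True)
```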
25 changes: 14 additions & 11 deletions accelerator/real_accelerator.py
@@ -131,10 +131,9 @@ def get_accelerator():
     if accelerator_name is None:
         try:
             import intel_extension_for_pytorch as ipex
-
             if ipex._C._has_xpu():
                 accelerator_name = "xpu"
             else:
                 accelerator_name = "cpu"
         except ImportError as e:
             pass
     if accelerator_name is None:
@@ -175,23 +174,27 @@ def get_accelerator():
         except ImportError as e:
             pass
     if accelerator_name is None:
-        # borrow this log from PR#5084
         try:
             import torch
 
-            # Determine if we are on a GPU or x86 CPU with torch.
-            if torch.cuda.is_available():  #ignore-cuda
+            # "torch.cuda.is_available()" provides a stronger guarantee, #ignore-cuda
+            # ensuring that we are free from CUDA initialization errors.
+            # While "torch.cuda.device_count() > 0" check ensures that #ignore-cuda
+            # we won't try to do any CUDA calls when no device is available
+            # For reference: https://github.com/microsoft/DeepSpeed/pull/6810
+            if torch.cuda.device_count() > 0 and torch.cuda.is_available():  #ignore-cuda
                 accelerator_name = "cuda"
-            else:
-                if accel_logger is not None:
-                    accel_logger.warn(
-                        "Setting accelerator to CPU. If you have GPU or other accelerator, we were unable to detect it."
-                    )
-                accelerator_name = "cpu"
         except (RuntimeError, ImportError) as e:
             # TODO need a more decent way to detect which accelerator to use, consider using nvidia-smi command for detection
-            accelerator_name = "cuda"
+            pass
+    if accelerator_name is None:
+        # borrow this log from PR#5084
+        if accel_logger is not None:
+            accel_logger.warn(
+                "Setting accelerator to CPU. If you have GPU or other accelerator, we were unable to detect it.")
+        # cpu added as catch-all when accelerator detection fails
+        accelerator_name = "cpu"
 
     ds_set_method = "auto detect"
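The reordered check first asks `torch.cuda.device_count() > 0`, a cheap query that avoids CUDA calls when no device exists, and only then `torch.cuda.is_available()`, which rules out CUDA initialization errors. A standalone sketch of the same fallback chain (the helper name is illustrative):

```python
import torch


def detect_accelerator() -> str:
    """Sketch of the detection order above: cuda only if a device is visible
    and CUDA initializes cleanly; cpu is the catch-all."""
    try:
        if torch.cuda.device_count() > 0 and torch.cuda.is_available():
            return "cuda"
    except (RuntimeError, ImportError):
        pass  # fall through to the cpu catch-all
    return "cpu"


print(detect_accelerator())  # e.g. "cpu" on a machine with no visible GPU
```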
30 changes: 30 additions & 0 deletions csrc/includes/quantization.h
@@ -52,6 +52,36 @@ void launch_swizzled_quant(int8_t* q_data,
                            int devices_per_node,
                            cudaStream_t stream);
 
+void launch_loco_swizzled_quant(int8_t* quantized_data,
+                                float* quantized_scales,
+                                const __half* uncompressed_data,
+                                __half* error_feedback,
+                                const float err_beta,
+                                int num_bits,
+                                quantize::Type quant_type,
+                                int groups,
+                                int elems_per_group,
+                                int pipelining,
+                                int nodes,
+                                int devices_per_node,
+                                cudaStream_t stream);
+
+void launch_loco_dequant_reduce(int8_t* reduced_data,
+                                float* reduced_scales,
+                                const int8_t* input_data,
+                                const float* input_scales,
+                                int num_gpus,
+                                int num_bits,
+                                quantize::Type quant_type,
+                                int out_groups,
+                                int elems_per_out_group,
+                                int elems_per_in_tensor,
+                                int groups_per_in_tensor,
+                                int elems_per_in_group,
+                                __half2* error_feedback,
+                                const float err_beta,
+                                cudaStream_t stream);
+
 void launch_dequant_reduce(int8_t* reduced_data,
                            float* reduced_scales,
                            const int8_t* input_data,
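The new `loco` entry points thread an `error_feedback` buffer and an `err_beta` decay factor through swizzled quantization and the dequant-reduce step. As a rough NumPy illustration of the error-feedback idea behind these kernels (the exact update rule, and how `err_beta` weights the residual, are assumptions for illustration only, not the CUDA implementation):

```python
import numpy as np


def quantize_with_feedback(x, error, num_bits=8, err_beta=0.9):
    # Compensate the input with the residual carried over from earlier steps,
    # quantize symmetrically, then store the fresh residual for the next call.
    levels = 2 ** (num_bits - 1) - 1
    compensated = x + error
    scale = max(float(np.abs(compensated).max()) / levels, 1e-12)
    q = np.clip(np.round(compensated / scale), -levels - 1, levels).astype(np.int8)
    dequantized = q.astype(np.float32) * scale
    new_error = err_beta * error + (compensated - dequantized)  # assumed rule
    return q, scale, new_error


x = np.random.randn(16).astype(np.float32)
err = np.zeros_like(x)
q, scale, err = quantize_with_feedback(x, err)
```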
1 change: 1 addition & 0 deletions csrc/includes/quantization_utils.h
@@ -24,6 +24,7 @@ constexpr int max_threads = 1024;
 Class to hold the quantization parameters for a given tensor.
 Holds the implementation of the quantization operation.
 */
+
 template <Type qType, int numBits>
 class Params {
 public: