diff --git a/.github/workflows/xpu-compile.yml b/.github/workflows/xpu-compile.yml
index e095e089fc30..9e8bd9d792fb 100644
--- a/.github/workflows/xpu-compile.yml
+++ b/.github/workflows/xpu-compile.yml
@@ -31,10 +31,10 @@ jobs:
       run: |
         apt-get update
         apt-get install clinfo libaio-dev python3-pip -y
-        pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torch/
-        pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/intel-extension-for-pytorch/
-        pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/oneccl-bind-pt/
-        pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torchvision/
+        pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torch/
+        pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/intel-extension-for-pytorch/
+        pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/oneccl-bind-pt/
+        pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torchvision/
         pip install https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v3.0.0b2/triton_xpu-3.0.0b2-cp310-cp310-linux_x86_64.whl
         pip install py-cpuinfo numpy
         pip install .[dev,autotuning]
diff --git a/.github/workflows/xpu-max1100.yml b/.github/workflows/xpu-max1100.yml
index d19e73aeef1c..56bff4a88ba9 100644
--- a/.github/workflows/xpu-max1100.yml
+++ b/.github/workflows/xpu-max1100.yml
@@ -47,10 +47,10 @@ jobs:
       run: |
         apt-get update
         apt-get install clinfo libaio-dev python3-pip -y
-        pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torch/
-        pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/intel-extension-for-pytorch/
-        pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/oneccl-bind-pt/
-        pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torchvision/
+        pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torch/
+        pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/intel-extension-for-pytorch/
+        pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/oneccl-bind-pt/
+        pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torchvision/
         pip install py-cpuinfo numpy
         pip install .[dev,autotuning]
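Not part of the patch, but context for the change above: both workflows install the same IPEX/XPU wheel stack and differ only in the wheel index region (`us` → `cn`). A minimal post-install sanity check might look like the sketch below; it assumes the pinned versions above and that `intel_extension_for_pytorch` registers the `torch.xpu` namespace on import, as it does in the 2.3 series.

```python
# Hypothetical sanity check for the XPU stack installed above; not part
# of this patch. Assumes torch==2.3.1 and intel-extension-for-pytorch
# 2.3.110+xpu, whose import registers the torch.xpu namespace.
import torch
import intel_extension_for_pytorch as ipex  # noqa: F401

print("torch:", torch.__version__)
print("ipex:", ipex.__version__)
print("xpu available:", torch.xpu.is_available())
print("xpu device count:", torch.xpu.device_count())
```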
diff --git a/COMMITTERS.md b/COMMITTERS.md
index bcb8579bf1f7..8418bdf8629d 100644
--- a/COMMITTERS.md
+++ b/COMMITTERS.md
@@ -5,5 +5,7 @@
 | Olatunji Ruwase | [tjruwase](https://github.com/tjruwase) | Microsoft |
 | Logan Adams | [loadams](https://github.com/loadams) | Microsoft |
 | Masahiro Tanaka | [tohtana](https://github.com/tohtana) | Microsoft |
-| Jeff Rasley | [jeffra](https://github.com/jeffra) | SnowFlake |
-| Minjia Zhang | [minjiazhang](https://github.com/minjiazhang) | UIUC |
+| Jeff Rasley | [jeffra](https://github.com/jeffra) | SnowFlake |
+| Minjia Zhang | [minjiazhang](https://github.com/minjiazhang) | UIUC |
+| Ashwin Aji | [ashwinma](https://github.com/ashwinma) | AMD |
+| Sam Foreman | [saforem2](https://github.com/saforem2) | Argonne National Laboratory |
diff --git a/accelerator/real_accelerator.py b/accelerator/real_accelerator.py
index 69e96d285bb8..a6173ac70abd 100644
--- a/accelerator/real_accelerator.py
+++ b/accelerator/real_accelerator.py
@@ -167,7 +167,12 @@ def get_accelerator():
             import torch

             # Determine if we are on a GPU or x86 CPU with torch.
-            if torch.cuda.is_available(): #ignore-cuda
+            # "torch.cuda.is_available()" provides the stronger guarantee, #ignore-cuda
+            # ensuring that we are free from CUDA initialization errors,
+            # while the "torch.cuda.device_count() > 0" check ensures that #ignore-cuda
+            # we won't attempt any CUDA calls when no device is available.
+            # For reference: https://github.com/microsoft/DeepSpeed/pull/6810
+            if torch.cuda.device_count() > 0 and torch.cuda.is_available(): #ignore-cuda
                 accelerator_name = "cuda"
             else:
                 if accel_logger is not None:
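Not part of the patch: the ordering in the new condition is what makes it work. Because `and` short-circuits, `torch.cuda.device_count() > 0` is evaluated first and, per the comment introduced above, avoids issuing CUDA calls on machines with no visible device; `torch.cuda.is_available()` then guards against CUDA initialization errors. A standalone sketch of the same check, using a hypothetical `cuda_device_usable` helper name:

```python
# Standalone illustration of the detection logic introduced above; the
# real implementation lives in accelerator/real_accelerator.py.
import torch


def cuda_device_usable() -> bool:  # hypothetical helper, for illustration
    # device_count() > 0 short-circuits before any CUDA call is attempted
    # when no device is visible; is_available() then confirms the CUDA
    # runtime can actually initialize on machines that do have one.
    return torch.cuda.device_count() > 0 and torch.cuda.is_available()


accelerator_name = "cuda" if cuda_device_usable() else None  # None: keep probing
```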
diff --git a/docs/_data/navigation.yml b/docs/_data/navigation.yml
index b17685b8dc4d..c3ebad4f86af 100755
--- a/docs/_data/navigation.yml
+++ b/docs/_data/navigation.yml
@@ -75,6 +75,8 @@ lnav:
         url: /tutorials/data-efficiency/
       - title: 'DeepNVMe'
         url: /tutorials/deepnvme/
+      - title: 'Domino'
+        url: /tutorials/domino/
       - title: 'DS4Sci_EvoformerAttention'
         url: /tutorials/ds4sci_evoformerattention/
       - title: 'Flops Profiler'
diff --git a/docs/_tutorials/domino.md b/docs/_tutorials/domino.md
new file mode 100644
index 000000000000..6b116cb87463
--- /dev/null
+++ b/docs/_tutorials/domino.md
@@ -0,0 +1,6 @@
+---
+title: "Domino"
+tags: training
+---
+
+Domino achieves near-complete hiding of communication behind computation for tensor-parallel training. Please find our [Domino tutorial](https://github.com/microsoft/DeepSpeedExamples/blob/master/training/DeepSpeed-Domino/README.md) in the DeepSpeedExamples repo.
diff --git a/docs/index.md b/docs/index.md
index 3279682b42d4..3d5f290f2bde 100755
--- a/docs/index.md
+++ b/docs/index.md
@@ -7,25 +7,25 @@ title: "Latest News"
 ---
 DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat).

+* [2024/12] [DeepSpeed Domino: Communication-Free LLM Training Engine](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-domino/README.md)
+
 * [2024/08] [DeepSpeed on Windows](https://github.com/microsoft/DeepSpeed/blob/master/blogs/windows/08-2024/README.md)[[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/windows/08-2024/japanese/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/windows/08-2024/chinese/README.md)]
 * [2024/08] [DeepNVMe: Improving DL Applications through I/O Optimizations](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-gds/README.md)[[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-gds/japanese/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-gds/chinese/README.md)]
 * [2024/07] [DeepSpeed Universal Checkpointing: Efficient and Flexible Checkpointing for Large Scale Distributed Training](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/README.md)[[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/japanese/README.md)]
 * [2024/03] [DeepSpeed-FP6: The Power of FP6-Centric Serving for Large Language Models](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README.md) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md)]
-* [2024/01] [DeepSpeed-FastGen: Introducting Mixtral, Phi-2, and Falcon support with major performance and feature enhancements.](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19)
 More news