diff --git a/.github/workflows/xpu-compile.yml b/.github/workflows/xpu-compile.yml
index e095e089fc30..9e8bd9d792fb 100644
--- a/.github/workflows/xpu-compile.yml
+++ b/.github/workflows/xpu-compile.yml
@@ -31,10 +31,10 @@ jobs:
run: |
apt-get update
apt-get install clinfo libaio-dev python3-pip -y
- pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torch/
- pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/intel-extension-for-pytorch/
- pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/oneccl-bind-pt/
- pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torchvision/
+ pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torch/
+ pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/intel-extension-for-pytorch/
+ pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/oneccl-bind-pt/
+ pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torchvision/
pip install https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v3.0.0b2/triton_xpu-3.0.0b2-cp310-cp310-linux_x86_64.whl
pip install py-cpuinfo numpy
pip install .[dev,autotuning]
diff --git a/.github/workflows/xpu-max1100.yml b/.github/workflows/xpu-max1100.yml
index d19e73aeef1c..56bff4a88ba9 100644
--- a/.github/workflows/xpu-max1100.yml
+++ b/.github/workflows/xpu-max1100.yml
@@ -47,10 +47,10 @@ jobs:
run: |
apt-get update
apt-get install clinfo libaio-dev python3-pip -y
- pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torch/
- pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/intel-extension-for-pytorch/
- pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/oneccl-bind-pt/
- pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torchvision/
+ pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torch/
+ pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/intel-extension-for-pytorch/
+ pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/oneccl-bind-pt/
+ pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torchvision/
pip install py-cpuinfo numpy
pip install .[dev,autotuning]
diff --git a/COMMITTERS.md b/COMMITTERS.md
index bcb8579bf1f7..8418bdf8629d 100644
--- a/COMMITTERS.md
+++ b/COMMITTERS.md
@@ -5,5 +5,7 @@
| Olatunji Ruwase | [tjruwase](https://github.com/tjruwase) | Microsoft |
| Logan Adams | [loadams](https://github.com/loadams) | Microsoft |
| Masahiro Tanaka | [tohtana](https://github.com/tohtana) | Microsoft |
-| Jeff Rasley | [jeffra](https://github.com/jeffra) | SnowFlake |
-| Minjia Zhang | [minjiazhang](https://github.com/minjiazhang) | UIUC |
+| Jeff Rasley | [jeffra](https://github.com/jeffra) | SnowFlake |
+| Minjia Zhang | [minjiazhang](https://github.com/minjiazhang) | UIUC |
+| Ashwin Aji | [ashwinma](https://github.com/ashwinma) | AMD |
+| Sam Foreman | [saforem2](https://github.com/saforem2) | Argonne National Laboratory |
diff --git a/accelerator/real_accelerator.py b/accelerator/real_accelerator.py
index 69e96d285bb8..a6173ac70abd 100644
--- a/accelerator/real_accelerator.py
+++ b/accelerator/real_accelerator.py
@@ -167,7 +167,12 @@ def get_accelerator():
import torch
# Determine if we are on a GPU or x86 CPU with torch.
- if torch.cuda.is_available(): #ignore-cuda
+ # "torch.cuda.is_available()" provides a stronger guarantee, #ignore-cuda
+ # ensuring that we are free from CUDA initialization errors.
+ # While the "torch.cuda.device_count() > 0" check ensures that #ignore-cuda
+ # we won't try to do any CUDA calls when no device is available.
+ # For reference: https://github.com/microsoft/DeepSpeed/pull/6810
+ if torch.cuda.device_count() > 0 and torch.cuda.is_available(): #ignore-cuda
accelerator_name = "cuda"
else:
if accel_logger is not None:
diff --git a/docs/_data/navigation.yml b/docs/_data/navigation.yml
index b17685b8dc4d..c3ebad4f86af 100755
--- a/docs/_data/navigation.yml
+++ b/docs/_data/navigation.yml
@@ -75,6 +75,8 @@ lnav:
url: /tutorials/data-efficiency/
- title: 'DeepNVMe'
url: /tutorials/deepnvme/
+ - title: 'Domino'
+ url: /tutorials/domino/
- title: 'DS4Sci_EvoformerAttention'
url: /tutorials/ds4sci_evoformerattention/
- title: 'Flops Profiler'
diff --git a/docs/_tutorials/domino.md b/docs/_tutorials/domino.md
new file mode 100644
index 000000000000..6b116cb87463
--- /dev/null
+++ b/docs/_tutorials/domino.md
@@ -0,0 +1,6 @@
+---
+title: "Domino"
+tags: training
+---
+
+Domino achieves near-complete communication hiding behind computation for tensor parallel training. Please find our [Domino-tutorial](https://github.com/microsoft/DeepSpeedExamples/blob/master/training/DeepSpeed-Domino/README.md) in the DeepSpeedExamples repo.
diff --git a/docs/index.md b/docs/index.md
index 3279682b42d4..3d5f290f2bde 100755
--- a/docs/index.md
+++ b/docs/index.md
@@ -7,25 +7,25 @@ title: "Latest News"
---
DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat).
+* [2024/12] [DeepSpeed Domino: Communication-Free LLM Training Engine](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-domino/README.md)
+
* [2024/08] [DeepSpeed on Windows](https://github.com/microsoft/DeepSpeed/blob/master/blogs/windows/08-2024/README.md)[[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/windows/08-2024/japanese/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/windows/08-2024/chinese/README.md)]
* [2024/08] [DeepNVMe: Improving DL Applications through I/O Optimizations](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-gds/README.md)[[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-gds/japanese/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-gds/chinese/README.md)]
* [2024/07] [DeepSpeed Universal Checkpointing: Efficient and Flexible Checkpointing for Large Scale Distributed Training](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/README.md)[[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/japanese/README.md)]
* [2024/03] [DeepSpeed-FP6: The Power of FP6-Centric Serving for Large Language Models](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README.md) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md)]
-* [2024/01] [DeepSpeed-FastGen: Introducting Mixtral, Phi-2, and Falcon support with major performance and feature enhancements.](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19)
More news