From b692cdea479fba8201584054d654f639e925a265 Mon Sep 17 00:00:00 2001 From: Joe Mayer <114769929+jomayeri@users.noreply.github.com> Date: Tue, 12 Nov 2024 08:34:17 -0800 Subject: [PATCH 01/16] AIO File Offsets (#6641) Adding the option for a file offset to the read/write functions of AIO & GDS ops. --------- Co-authored-by: jomayeri Co-authored-by: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> Co-authored-by: Olatunji Ruwase Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- accelerator/cpu_accelerator.py | 2 + csrc/aio/common/deepspeed_aio_utils.cpp | 18 ++-- csrc/aio/common/deepspeed_aio_utils.h | 4 +- csrc/aio/py_lib/deepspeed_aio_op_desc.cpp | 6 +- csrc/aio/py_lib/deepspeed_aio_op_desc.h | 4 +- csrc/aio/py_lib/deepspeed_cpu_op.cpp | 19 ++-- csrc/aio/py_lib/deepspeed_cpu_op.h | 3 +- csrc/aio/py_lib/deepspeed_py_aio.cpp | 7 +- csrc/aio/py_lib/deepspeed_py_io_handle.cpp | 65 +++++++++----- csrc/aio/py_lib/deepspeed_py_io_handle.h | 27 ++++-- csrc/aio/py_lib/py_ds_aio.cpp | 24 +++-- csrc/aio/py_test/ds_aio_handle.py | 2 +- csrc/gds/py_lib/deepspeed_gds_op.cpp | 24 +++-- csrc/gds/py_lib/deepspeed_gds_op.h | 3 +- csrc/gds/py_lib/deepspeed_py_gds_handle.cpp | 15 +++- csrc/gds/py_lib/deepspeed_py_gds_handle.h | 3 +- csrc/gds/py_lib/py_ds_gds.cpp | 24 +++-- deepspeed/runtime/swap_tensor/utils.py | 4 +- deepspeed/utils/numa.py | 5 +- tests/unit/ops/aio/test_aio.py | 97 +++++++++++++++++++-- tests/unit/ops/aio/test_gds.py | 87 ++++++++++++++++-- 21 files changed, 342 insertions(+), 101 deletions(-) diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py index 1e4335b19292..0e49bd9f6458 100644 --- a/accelerator/cpu_accelerator.py +++ b/accelerator/cpu_accelerator.py @@ -71,6 +71,8 @@ def device_count(self): # In flat mode, HBM is in a separate NUMA node with no cores. # Ignore these NUMA nodes with no cores.
numa_core_lists = get_numa_cores() + if not numa_core_lists: + return 1 numa_count = 0 prev_core_list = [] for core_list in numa_core_lists: diff --git a/csrc/aio/common/deepspeed_aio_utils.cpp b/csrc/aio/common/deepspeed_aio_utils.cpp index 0536ff6a362e..fb269b58315f 100644 --- a/csrc/aio/common/deepspeed_aio_utils.cpp +++ b/csrc/aio/common/deepspeed_aio_utils.cpp @@ -19,9 +19,14 @@ const int c_io_queue_depth = 8; io_xfer_ctxt::io_xfer_ctxt(const int fd, const int64_t file_offset, + const int64_t buffer_offset, const int64_t num_bytes, const void* buffer) - : _fd(fd), _base_offset(file_offset), _mem_buffer(buffer), _num_bytes(num_bytes) + : _fd(fd), + _file_base_offset(file_offset), + _buffer_base_offset(buffer_offset), + _mem_buffer(buffer), + _num_bytes(num_bytes) { } @@ -41,9 +46,10 @@ void io_prep_context::prep_iocbs(const int n_iocbs, assert(static_cast<size_t>(n_iocbs) <= _iocbs->size()); for (auto i = 0; i < n_iocbs; ++i) { const auto shift = i * _block_size; - const auto xfer_buffer = (char*)start_buffer + _xfer_ctxt->_base_offset + shift; - const auto xfer_offset = _xfer_ctxt->_base_offset + start_offset + shift; + const auto xfer_buffer = (char*)start_buffer + _xfer_ctxt->_buffer_base_offset + shift; + const auto xfer_offset = _xfer_ctxt->_file_base_offset + start_offset + shift; auto byte_count = _block_size; + if ((shift + _block_size) > num_bytes) { byte_count = num_bytes - shift; } if (_read_op) { @@ -79,10 +85,10 @@ int io_prep_generator::prep_iocbs(const int n_iocbs, std::vector<struct iocb*>* auto actual_n_iocbs = min(static_cast<int64_t>(n_iocbs), _remaining_io_blocks); for (auto i = 0; i < actual_n_iocbs; ++i, ++_next_iocb_index) { - const auto xfer_offset = _xfer_ctxt->_base_offset + (_next_iocb_index * _block_size); - const auto xfer_buffer = (char*)_xfer_ctxt->_mem_buffer + xfer_offset; + const auto xfer_buffer = (char*)_xfer_ctxt->_mem_buffer + _xfer_ctxt->_buffer_base_offset + + (_next_iocb_index * _block_size); + const auto xfer_offset = _xfer_ctxt->_file_base_offset + (_next_iocb_index * _block_size); const auto num_bytes = min(static_cast<int64_t>(_block_size), _remaining_bytes); - if (_read_op) { io_prep_pread(iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, num_bytes, xfer_offset); } else { diff --git a/csrc/aio/common/deepspeed_aio_utils.h b/csrc/aio/common/deepspeed_aio_utils.h index 20e81fe8eebd..6b7599acecb4 100644 --- a/csrc/aio/common/deepspeed_aio_utils.h +++ b/csrc/aio/common/deepspeed_aio_utils.h @@ -30,12 +30,14 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
struct io_xfer_ctxt { const int _fd; - const int64_t _base_offset; + const int64_t _file_base_offset; + const int64_t _buffer_base_offset; const void* _mem_buffer; const int64_t _num_bytes; io_xfer_ctxt(const int fd, const int64_t file_offset, + const int64_t buffer_offset, const int64_t num_bytes, const void* buffer); }; diff --git a/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp b/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp index 6f311c5400c7..945251397225 100644 --- a/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp +++ b/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp @@ -13,14 +13,16 @@ io_op_desc_t::io_op_desc_t(const bool read_op, const char* filename, const int64_t file_num_bytes, const int intra_op_parallelism, - const bool validate) + const bool validate, + const int64_t file_offset) : _read_op(read_op), _buffer(buffer), _fd(fd), _filename(filename), _file_num_bytes(file_num_bytes), + _file_offset(file_offset), _intra_op_parallelism(intra_op_parallelism), - _num_bytes_per_thread(file_num_bytes / intra_op_parallelism), + _num_bytes_per_thread(static_cast<const int64_t>(buffer.nbytes()) / intra_op_parallelism), _validate(validate) { } diff --git a/csrc/aio/py_lib/deepspeed_aio_op_desc.h b/csrc/aio/py_lib/deepspeed_aio_op_desc.h index f841b8ce520a..ac1cdf90f78b 100644 --- a/csrc/aio/py_lib/deepspeed_aio_op_desc.h +++ b/csrc/aio/py_lib/deepspeed_aio_op_desc.h @@ -19,6 +19,7 @@ struct io_op_desc_t { const int64_t _num_bytes_per_thread; torch::Tensor _contiguous_buffer; const bool _validate; + const int64_t _file_offset; io_op_desc_t(const bool read_op, const torch::Tensor& buffer, @@ -26,7 +27,8 @@ struct io_op_desc_t { const char* filename, const int64_t file_num_bytes, const int intra_op_parallelism, - const bool validate); + const bool validate, + const int64_t file_offset); virtual void run(const int tid, std::unique_ptr<aio_context>& aio_ctxt, diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.cpp b/csrc/aio/py_lib/deepspeed_cpu_op.cpp index da2ff568d74b..56fb33fb1886 100644 --- a/csrc/aio/py_lib/deepspeed_cpu_op.cpp +++ b/csrc/aio/py_lib/deepspeed_cpu_op.cpp @@ -16,8 +16,16 @@ cpu_op_desc_t::cpu_op_desc_t( const char* filename, const int64_t file_num_bytes, const int intra_op_parallelism, - const bool validate) - : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, intra_op_parallelism, validate), + const bool validate, + const int64_t file_offset) + : io_op_desc_t(read_op, + buffer, + fd, + filename, + file_num_bytes, + intra_op_parallelism, + validate, + file_offset), _cpu_buffer(buffer), _pinned_tensor_mgr(pinned_tensor_mgr), _is_managed_bounce_buffer(false) @@ -66,10 +74,11 @@ void cpu_op_desc_t::run(const int tid, deepspeed_aio_config_t* aio_config) { assert(tid < _intra_op_parallelism); - const auto base_offset = _num_bytes_per_thread * tid; + const auto buffer_base_offset = _num_bytes_per_thread * tid; + const auto file_base_offset = _file_offset + (_num_bytes_per_thread * tid); - std::unique_ptr<io_xfer_ctxt> xfer_ctxt( - new io_xfer_ctxt(_fd, base_offset, _num_bytes_per_thread, data_ptr())); + std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt( + _fd, file_base_offset, buffer_base_offset, _num_bytes_per_thread, data_ptr())); if (aio_config->_overlap_events) { do_aio_operation_overlap(_read_op, aio_ctxt, xfer_ctxt, aio_config, nullptr); diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.h b/csrc/aio/py_lib/deepspeed_cpu_op.h index 9de2fa254048..debaf4a90731 100644 --- a/csrc/aio/py_lib/deepspeed_cpu_op.h +++ b/csrc/aio/py_lib/deepspeed_cpu_op.h @@ -20,7 +20,8 @@ struct cpu_op_desc_t : io_op_desc_t { const char* filename, const int64_t
file_num_bytes, const int intra_op_parallelism, - const bool validate); + const bool validate, + const int64_t file_offset); void run(const int tid, std::unique_ptr<aio_context>& aio_ctxt, diff --git a/csrc/aio/py_lib/deepspeed_py_aio.cpp b/csrc/aio/py_lib/deepspeed_py_aio.cpp index 02b04057d1ac..1ff0397043fa 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio.cpp @@ -52,7 +52,9 @@ int deepspeed_py_aio_write(const torch::Tensor& buffer, auto write_buffer = (char*)buffer.data_ptr(); const auto num_write_bytes = static_cast<const int64_t>(buffer.nbytes()); - std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer)); + + std::unique_ptr<io_xfer_ctxt> xfer_ctxt( + new io_xfer_ctxt(fd, 0, 0, num_write_bytes, write_buffer)); std::unique_ptr<aio_context> aio_ctxt(new aio_context(config._block_size, config._queue_depth)); if (config._overlap_events) { @@ -97,7 +99,8 @@ int deepspeed_py_aio_read(torch::Tensor& buffer, auto read_buffer = (char*)buffer.data_ptr(); assert(static_cast<int64_t>(buffer.nbytes()) == num_file_bytes); - std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer)); + std::unique_ptr<io_xfer_ctxt> xfer_ctxt( + new io_xfer_ctxt(fd, 0, 0, num_file_bytes, read_buffer)); std::unique_ptr<aio_context> aio_ctxt(new aio_context(config._block_size, config._queue_depth)); if (config._overlap_events) { diff --git a/csrc/aio/py_lib/deepspeed_py_io_handle.cpp b/csrc/aio/py_lib/deepspeed_py_io_handle.cpp index 48ea8a1339d4..64d7c2e0541e 100644 --- a/csrc/aio/py_lib/deepspeed_py_io_handle.cpp +++ b/csrc/aio/py_lib/deepspeed_py_io_handle.cpp @@ -58,7 +58,10 @@ const bool deepspeed_io_handle_t::get_overlap_events() const { return _overlap_e const int deepspeed_io_handle_t::get_intra_op_parallelism() const { return _intra_op_parallelism; } -int deepspeed_io_handle_t::read(torch::Tensor& buffer, const char* filename, const bool validate) +int deepspeed_io_handle_t::read(torch::Tensor& buffer, + const char* filename, + const bool validate, + const int64_t file_offset) { const auto start_time = std::chrono::high_resolution_clock::now(); @@ -76,7 +79,8 @@ int deepspeed_io_handle_t::read(torch::Tensor& buffer, const char* filename, con if (fd == -1) { return -1; } auto read_buffer = (char*)buffer.data_ptr(); - std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer)); + std::unique_ptr<io_xfer_ctxt> xfer_ctxt( + new io_xfer_ctxt(fd, file_offset, 0, num_file_bytes, read_buffer)); if (_aio_config._overlap_events) { do_aio_operation_overlap(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); @@ -98,7 +102,8 @@ int deepspeed_io_handle_t::read(torch::Tensor& buffer, const char* filename, con int deepspeed_io_handle_t::write(const torch::Tensor& buffer, const char* filename, - const bool validate) + const bool validate, + const int64_t file_offset) { assert(_aio_ctxt); @@ -109,7 +114,8 @@ int deepspeed_io_handle_t::write(const torch::Tensor& buffer, auto write_buffer = (char*)buffer.data_ptr(); const auto num_write_bytes = static_cast<const int64_t>(buffer.nbytes()); - std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer)); + std::unique_ptr<io_xfer_ctxt> xfer_ctxt( + new io_xfer_ctxt(fd, file_offset, 0, num_write_bytes, write_buffer)); if (_aio_config._overlap_events) { do_aio_operation_overlap(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); @@ -206,7 +212,8 @@ std::shared_ptr<struct io_op_desc_t> deepspeed_io_handle_t::_create_io_op_desc( const int fd, const char* filename, const int64_t file_num_bytes, - const bool validate) + const bool validate, + const int64_t file_offset) { return std::make_shared<cpu_op_desc_t>(read_op, buffer, @@
-215,13 +222,15 @@ std::shared_ptr<struct io_op_desc_t> deepspeed_io_handle_t::_create_io_op_desc( filename, file_num_bytes, _intra_op_parallelism, - validate); + validate, + file_offset); } int deepspeed_io_handle_t::pread(const torch::Tensor& buffer, const char* filename, const bool validate, - const bool async) + const bool async, + const int64_t file_offset) { int64_t num_file_bytes; if (-1 == get_file_size(filename, num_file_bytes)) { @@ -229,20 +238,18 @@ int deepspeed_io_handle_t::pread(const torch::Tensor& buffer, report_file_error(filename, " fstat for read", error_code); return -1; } + + // buffer can exceed file size to enable 4k alignment const auto buffer_bytes = static_cast<int64_t>(buffer.nbytes()); - if (buffer_bytes != num_file_bytes) { - std::cout << filename << ": buffer nbytes != file bytes " << buffer_bytes - << " != " << num_file_bytes << std::endl; - } - assert(buffer_bytes == num_file_bytes); assert((num_file_bytes % _intra_op_parallelism) == 0); - if (!_is_valid_parallel_aio_op(true, num_file_bytes)) { return -1; } + if (!_is_valid_parallel_aio_op(true, buffer_bytes)) { return -1; } const auto fd = open_file(filename, true); if (fd == -1) { return -1; } - auto scheduled_op = _create_io_op_desc(true, buffer, fd, filename, num_file_bytes, validate); + auto scheduled_op = + _create_io_op_desc(true, buffer, fd, filename, num_file_bytes, validate, file_offset); _schedule_aio_work(scheduled_op); @@ -254,7 +261,8 @@ int deepspeed_io_handle_t::pread(const torch::Tensor& buffer, int deepspeed_io_handle_t::pwrite(const torch::Tensor& buffer, const char* filename, const bool validate, - const bool async) + const bool async, + const int64_t file_offset) { const auto num_write_bytes = static_cast<const int64_t>(buffer.nbytes()); assert((num_write_bytes % _intra_op_parallelism) == 0); @@ -264,7 +272,8 @@ int deepspeed_io_handle_t::pwrite(const torch::Tensor& buffer, const auto fd = open_file(filename, false); if (fd == -1) { return -1; } - auto scheduled_op = _create_io_op_desc(false, buffer, fd, filename, num_write_bytes, validate); + auto scheduled_op = + _create_io_op_desc(false, buffer, fd, filename, num_write_bytes, validate, file_offset); _schedule_aio_work(scheduled_op); @@ -273,24 +282,32 @@ int deepspeed_io_handle_t::pwrite(const torch::Tensor& buffer, return wait(); } -int deepspeed_io_handle_t::sync_pread(torch::Tensor& buffer, const char* filename) +int deepspeed_io_handle_t::sync_pread(torch::Tensor& buffer, + const char* filename, + const int64_t file_offset) { - return pread(buffer, filename, false, false); + return pread(buffer, filename, false, false, file_offset); } -int deepspeed_io_handle_t::sync_pwrite(const torch::Tensor& buffer, const char* filename) +int deepspeed_io_handle_t::sync_pwrite(const torch::Tensor& buffer, + const char* filename, + const int64_t file_offset) { - return pwrite(buffer, filename, false, false); + return pwrite(buffer, filename, false, false, file_offset); } -int deepspeed_io_handle_t::async_pread(torch::Tensor& buffer, const char* filename) +int deepspeed_io_handle_t::async_pread(torch::Tensor& buffer, + const char* filename, + const int64_t file_offset) { - return pread(buffer, filename, false, true); + return pread(buffer, filename, false, true, file_offset); } -int deepspeed_io_handle_t::async_pwrite(const torch::Tensor& buffer, const char* filename) +int deepspeed_io_handle_t::async_pwrite(const torch::Tensor& buffer, + const char* filename, + const int64_t file_offset) { - return pwrite(buffer, filename, false, true); + return pwrite(buffer, filename, false, true,
file_offset); } at::Tensor deepspeed_io_handle_t::new_cpu_locked_tensor(const int64_t num_elem, diff --git a/csrc/aio/py_lib/deepspeed_py_io_handle.h b/csrc/aio/py_lib/deepspeed_py_io_handle.h index 4fedf8080818..dfcb4125ab9a 100644 --- a/csrc/aio/py_lib/deepspeed_py_io_handle.h +++ b/csrc/aio/py_lib/deepspeed_py_io_handle.h @@ -38,27 +38,35 @@ struct deepspeed_io_handle_t { const bool get_overlap_events() const; const int get_intra_op_parallelism() const; - int read(torch::Tensor& buffer, const char* filename, const bool validate); + int read(torch::Tensor& buffer, + const char* filename, + const bool validate, + const int64_t file_offset); - int write(const torch::Tensor& buffer, const char* filename, const bool validate); + int write(const torch::Tensor& buffer, + const char* filename, + const bool validate, + const int64_t file_offset); int pread(const torch::Tensor& buffer, const char* filename, const bool validate, - const bool async); + const bool async, + const int64_t file_offset); int pwrite(const torch::Tensor& buffer, const char* filename, const bool validate, - const bool async); + const bool async, + const int64_t file_offset); - int sync_pread(torch::Tensor& buffer, const char* filename); + int sync_pread(torch::Tensor& buffer, const char* filename, const int64_t file_offset); - int sync_pwrite(const torch::Tensor& buffer, const char* filename); + int sync_pwrite(const torch::Tensor& buffer, const char* filename, const int64_t file_offset); - int async_pread(torch::Tensor& buffer, const char* filename); + int async_pread(torch::Tensor& buffer, const char* filename, const int64_t file_offset); - int async_pwrite(const torch::Tensor& buffer, const char* filename); + int async_pwrite(const torch::Tensor& buffer, const char* filename, const int64_t file_offset); // TODO: Make API's args to be shape and dtype. torch::Tensor new_cpu_locked_tensor(const int64_t num_elem, @@ -81,5 +89,6 @@ struct deepspeed_io_handle_t { const int fd, const char* filename, const int64_t file_num_bytes, - const bool validate); + const bool validate, + const int64_t file_offset); }; diff --git a/csrc/aio/py_lib/py_ds_aio.cpp b/csrc/aio/py_lib/py_ds_aio.cpp index b80fa2d6c8e6..bf298b691b81 100644 --- a/csrc/aio/py_lib/py_ds_aio.cpp +++ b/csrc/aio/py_lib/py_ds_aio.cpp @@ -40,14 +40,16 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) "Synchronous and non-parallel file read. Returns count of completed read ops", "buffer"_a, "filename"_a, - "validate"_a) + "validate"_a, + "file_offset"_a = 0) .def("write", &deepspeed_aio_handle_t::write, "Synchronous and non-parallel file write. Returns count of completed write ops", "buffer"_a, "filename"_a, - "validate"_a) + "validate"_a, + "file_offset"_a = 0) .def("pread", &deepspeed_aio_handle_t::pread, @@ -55,7 +57,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) "buffer"_a, "filename"_a, "validate"_a, - "async"_a) + "async"_a, + "file_offset"_a = 0) .def("pwrite", &deepspeed_aio_handle_t::pwrite, @@ -63,33 +66,38 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) "buffer"_a, "filename"_a, "validate"_a, - "async"_a) + "async"_a, + "file_offset"_a = 0) .def("sync_pread", &deepspeed_aio_handle_t::sync_pread, "Synchronous parallel file read. Returns count of completed read ops", "buffer"_a, - "filename"_a) + "filename"_a, + "file_offset"_a = 0) .def("sync_pwrite", &deepspeed_aio_handle_t::sync_pwrite, "Synchronous parallel file write.
Returns count of completed write ops", "buffer"_a, - "filename"_a) + "filename"_a, + "file_offset"_a = 0) .def("async_pread", &deepspeed_aio_handle_t::async_pread, "Asynchronous parallel file read. Returns 0 on success, and " "following wait() returns count of completed ops.", "buffer"_a, - "filename"_a) + "filename"_a, + "file_offset"_a = 0) .def("async_pwrite", &deepspeed_aio_handle_t::async_pwrite, "Asynchronous parallel file write. Returns 0 on success, and following wait() returns " "count of completed ops.", "buffer"_a, - "filename"_a) + "filename"_a, + "file_offset"_a = 0) .def("new_cpu_locked_tensor", &deepspeed_aio_handle_t::new_cpu_locked_tensor, diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py index f4a179deb9ec..6913e9090bf5 100755 --- a/csrc/aio/py_test/ds_aio_handle.py +++ b/csrc/aio/py_test/ds_aio_handle.py @@ -92,7 +92,7 @@ def main_parallel_read(pool_params): start_time = time.time() dest_buffer = BOUNCE_BUFFER if ctxt[BOUNCE_BUFFER] is not None else BUFFER - ret = handle.pread(ctxt[dest_buffer], ctxt['file'], args.validate, True) + ret = handle.pread(ctxt[dest_buffer], ctxt['file'], args.validate, True, 0) assert ret != -1 handle.wait() if dest_buffer == BOUNCE_BUFFER: diff --git a/csrc/gds/py_lib/deepspeed_gds_op.cpp b/csrc/gds/py_lib/deepspeed_gds_op.cpp index f49f74394374..b7055c8cc72b 100644 --- a/csrc/gds/py_lib/deepspeed_gds_op.cpp +++ b/csrc/gds/py_lib/deepspeed_gds_op.cpp @@ -95,8 +95,16 @@ gds_op_desc_t::gds_op_desc_t(const bool read_op, const char* filename, const int64_t file_num_bytes, const int intra_op_parallelism, - const bool validate) - : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, intra_op_parallelism, validate) + const bool validate, + const int64_t file_offset) + : io_op_desc_t(read_op, + buffer, + fd, + filename, + file_num_bytes, + intra_op_parallelism, + validate, + file_offset) { _contiguous_buffer = _buffer.contiguous(); const int64_t device = _buffer.get_device(); @@ -124,17 +132,17 @@ void gds_op_desc_t::run(const int tid, { assert(tid < _intra_op_parallelism); check_cudaruntimecall(cudaSetDevice(_buffer.get_device())); - int64_t buf_offset = data_ptr() + (_num_bytes_per_thread * tid) - (char*)_base_ptr; - const auto file_offset = _num_bytes_per_thread * tid; + const auto buf_offset = data_ptr() + (_num_bytes_per_thread * tid) - (char*)_base_ptr; + const auto tid_file_offset = _file_offset + (_num_bytes_per_thread * tid); if (_read_op) { auto ret = - cuFileRead(_cf_handle, _base_ptr, _num_bytes_per_thread, file_offset, buf_offset); - if (ret < 0) { _report_error(ret, errno, buf_offset); } + cuFileRead(_cf_handle, _base_ptr, _num_bytes_per_thread, tid_file_offset, buf_offset); + if (ret < 0) { _report_error(ret, errno, tid_file_offset); } } else { auto ret = - cuFileWrite(_cf_handle, _base_ptr, _num_bytes_per_thread, file_offset, buf_offset); - if (ret < 0) { _report_error(ret, errno, buf_offset); } + cuFileWrite(_cf_handle, _base_ptr, _num_bytes_per_thread, tid_file_offset, buf_offset); + if (ret < 0) { _report_error(ret, errno, tid_file_offset); } } } diff --git a/csrc/gds/py_lib/deepspeed_gds_op.h b/csrc/gds/py_lib/deepspeed_gds_op.h index 380bb0b9b6ae..d955527b1ba3 100644 --- a/csrc/gds/py_lib/deepspeed_gds_op.h +++ b/csrc/gds/py_lib/deepspeed_gds_op.h @@ -24,7 +24,8 @@ struct gds_op_desc_t : io_op_desc_t { const char* filename, const int64_t file_num_bytes, const int intra_op_parallelism, - const bool validate); + const bool validate, + const int64_t file_offset); void
run(const int tid, std::unique_ptr<aio_context>& aio_ctxt, diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp index c052144a0190..f11245c75a5e 100644 --- a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp +++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp @@ -107,12 +107,19 @@ std::shared_ptr<struct io_op_desc_t> deepspeed_gds_handle_t::_create_io_op_desc( const int fd, const char* filename, const int64_t file_num_bytes, - const bool validate) + const bool validate, + const int64_t file_offset) { if (buffer.is_cuda()) { - return std::make_shared<gds_op_desc_t>( read_op, buffer, fd, filename, file_num_bytes, _intra_op_parallelism, validate); + return std::make_shared<gds_op_desc_t>(read_op, + buffer, + fd, + filename, + file_num_bytes, + _intra_op_parallelism, + validate, + file_offset); } return deepspeed_io_handle_t::_create_io_op_desc( - read_op, buffer, fd, filename, file_num_bytes, validate); + read_op, buffer, fd, filename, file_num_bytes, validate, file_offset); } diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.h b/csrc/gds/py_lib/deepspeed_py_gds_handle.h index 131e83e7b838..25f68e177b2c 100644 --- a/csrc/gds/py_lib/deepspeed_py_gds_handle.h +++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.h @@ -42,7 +42,8 @@ struct deepspeed_gds_handle_t : deepspeed_io_handle_t { const int fd, const char* filename, const int64_t file_num_bytes, - const bool validate); + const bool validate, + const int64_t file_offset); static int s_cuFile_init; }; diff --git a/csrc/gds/py_lib/py_ds_gds.cpp b/csrc/gds/py_lib/py_ds_gds.cpp index 57bf8d2207c4..2f165ee2c32a 100644 --- a/csrc/gds/py_lib/py_ds_gds.cpp +++ b/csrc/gds/py_lib/py_ds_gds.cpp @@ -33,14 +33,16 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) "Synchronous and non-parallel file read. Returns count of completed read ops", "buffer"_a, "filename"_a, - "validate"_a) + "validate"_a, + "file_offset"_a = 0) .def("write", &deepspeed_gds_handle_t::write, "Synchronous and non-parallel file write. Returns count of completed write ops", "buffer"_a, "filename"_a, - "validate"_a) + "validate"_a, + "file_offset"_a = 0) .def("pread", &deepspeed_gds_handle_t::pread, @@ -48,7 +50,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) "buffer"_a, "filename"_a, "validate"_a, - "async"_a) + "async"_a, + "file_offset"_a = 0) .def("pwrite", &deepspeed_gds_handle_t::pwrite, @@ -56,33 +59,38 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) "buffer"_a, "filename"_a, "validate"_a, - "async"_a) + "async"_a, + "file_offset"_a = 0) .def("sync_pread", &deepspeed_gds_handle_t::sync_pread, "Synchronous parallel file read. Returns count of completed read ops", "buffer"_a, - "filename"_a) + "filename"_a, + "file_offset"_a = 0) .def("sync_pwrite", &deepspeed_gds_handle_t::sync_pwrite, "Synchronous parallel file write. Returns count of completed write ops", "buffer"_a, - "filename"_a) + "filename"_a, + "file_offset"_a = 0) .def("async_pread", &deepspeed_gds_handle_t::async_pread, "Asynchronous parallel file read. Returns 0 on success, and " "following wait() returns count of completed ops.", "buffer"_a, - "filename"_a) + "filename"_a, + "file_offset"_a = 0) .def("async_pwrite", &deepspeed_gds_handle_t::async_pwrite, "Asynchronous parallel file write.
Returns 0 on success, and following wait() returns " "count of completed ops.", "buffer"_a, - "filename"_a) + "filename"_a, + "file_offset"_a = 0) .def("new_cpu_locked_tensor", &deepspeed_gds_handle_t::new_cpu_locked_tensor, diff --git a/deepspeed/runtime/swap_tensor/utils.py b/deepspeed/runtime/swap_tensor/utils.py index 90b2d9b8bd31..1f9825c34638 100644 --- a/deepspeed/runtime/swap_tensor/utils.py +++ b/deepspeed/runtime/swap_tensor/utils.py @@ -18,12 +18,12 @@ def swap_in_tensors(swap_handle, tensor_buffers, swap_paths): for buffer, path in zip(tensor_buffers, swap_paths): - assert (swap_handle.async_pread(buffer, path) == 0) + assert (swap_handle.async_pread(buffer, path, 0) == 0) def swap_out_tensors(swap_handle, tensor_buffers, swap_paths): for buffer, path in zip(tensor_buffers, swap_paths): - assert (swap_handle.async_pwrite(buffer, path) == 0) + assert (swap_handle.async_pwrite(buffer, path, 0) == 0) def print_object(obj, name, exclude_list=[]): diff --git a/deepspeed/utils/numa.py b/deepspeed/utils/numa.py index 4fe7cbba90ae..aba3b5179d41 100644 --- a/deepspeed/utils/numa.py +++ b/deepspeed/utils/numa.py @@ -23,7 +23,10 @@ # ] def get_numa_cores(): ret = [] - output = subprocess.check_output(['numactl', '--hardware']).decode("utf-8") + try: + output = subprocess.check_output(['numactl', '--hardware']).decode("utf-8") + except (subprocess.CalledProcessError, OSError): + return [] lines = output.split('\n') for line in lines: if line.startswith('available:'): diff --git a/tests/unit/ops/aio/test_aio.py b/tests/unit/ops/aio/test_aio.py index a074cfca317f..1aa5f647a8aa 100644 --- a/tests/unit/ops/aio/test_aio.py +++ b/tests/unit/ops/aio/test_aio.py @@ -35,16 +35,21 @@ def _get_local_rank(): return 0 -def _do_ref_write(tmpdir, index=0): +def _do_ref_write(tmpdir, index=0, file_size=IO_SIZE): file_suffix = f'{_get_local_rank()}_{index}' ref_file = os.path.join(tmpdir, f'_py_random_{file_suffix}.pt') - ref_buffer = os.urandom(IO_SIZE) + ref_buffer = os.urandom(file_size) with open(ref_file, 'wb') as f: f.write(ref_buffer) return ref_file, ref_buffer +def _get_file_path(tmpdir, file_prefix, index=0): + file_suffix = f'{_get_local_rank()}_{index}' + return os.path.join(tmpdir, f'{file_prefix}_{file_suffix}.pt') + + def _get_test_write_file(tmpdir, index): file_suffix = f'{_get_local_rank()}_{index}' return os.path.join(tmpdir, f'_aio_write_random_{file_suffix}.pt') @@ -103,7 +108,7 @@ def test_parallel_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, over _validate_handle_state(h, single_submit, overlap_events) ref_file, _ = _do_ref_write(tmpdir) - read_status = h.sync_pread(aio_buffer, ref_file) + read_status = h.sync_pread(aio_buffer, ref_file, 0) assert read_status == 1 with open(ref_file, 'rb') as f: @@ -131,7 +136,7 @@ def test_async_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap _validate_handle_state(h, single_submit, overlap_events) ref_file, _ = _do_ref_write(tmpdir) - read_status = h.async_pread(aio_buffer, ref_file) + read_status = h.async_pread(aio_buffer, ref_file, 0) assert read_status == 0 wait_status = h.wait() @@ -172,7 +177,7 @@ def test_parallel_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, ove _validate_handle_state(h, single_submit, overlap_events) - write_status = h.sync_pwrite(aio_buffer, aio_file) + write_status = h.sync_pwrite(aio_buffer, aio_file, 0) assert write_status == 1 if not use_cuda_pinned_tensor: @@ -201,7 +206,7 @@ def test_async_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overla _validate_handle_state(h, single_submit, overlap_events) -
write_status = h.async_pwrite(aio_buffer, aio_file) + write_status = h.async_pwrite(aio_buffer, aio_file, 0) assert write_status == 0 wait_status = h.wait() @@ -258,7 +263,7 @@ def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, use_unpinned_te _validate_handle_state(h, single_submit, overlap_events) for i in range(async_queue): - read_status = h.async_pread(aio_buffers[i], ref_files[i]) + read_status = h.async_pread(aio_buffers[i], ref_files[i], 0) assert read_status == 0 wait_status = h.wait() @@ -305,7 +310,7 @@ def test_write(self, tmpdir, use_cuda_pinned_tensor, async_queue, use_unpinned_t _validate_handle_state(h, single_submit, overlap_events) for i in range(async_queue): - read_status = h.async_pwrite(aio_buffers[i], aio_files[i]) + read_status = h.async_pwrite(aio_buffers[i], aio_files[i], 0) assert read_status == 0 wait_status = h.wait() @@ -320,3 +325,79 @@ def test_write(self, tmpdir, use_cuda_pinned_tensor, async_queue, use_unpinned_t filecmp.clear_cache() assert filecmp.cmp(ref_files[i], aio_files[i], shallow=False) + + +@pytest.mark.parametrize("use_cuda_pinned_tensor", [True, False]) +@pytest.mark.parametrize('file_partitions', [[1, 1, 1], [1, 1, 2], [1, 2, 1], [2, 1, 1]]) +class TestAsyncFileOffset(DistributedTest): + world_size = 1 + + def test_offset_write(self, tmpdir, file_partitions, use_cuda_pinned_tensor): + + _skip_for_invalid_environment(use_cuda_pinned_tensor=use_cuda_pinned_tensor) + ref_file = _get_file_path(tmpdir, '_py_random') + aio_file = _get_file_path(tmpdir, '_aio_random') + partition_unit_size = BLOCK_SIZE + file_size = sum(file_partitions) * partition_unit_size + + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, True, True, IO_PARALLEL) + + if use_cuda_pinned_tensor: + data_buffer = torch.ByteTensor(list(os.urandom(file_size))).pin_memory() + else: + data_buffer = h.new_cpu_locked_tensor(file_size, torch.empty(0, dtype=torch.uint8)) + + file_offsets = [] + next_offset = 0 + for i in range(len(file_partitions)): + file_offsets.append(next_offset) + next_offset += file_partitions[i] * partition_unit_size + + ref_fd = open(ref_file, 'wb') + for i in range(len(file_partitions)): + src_buffer = torch.narrow(data_buffer, 0, file_offsets[i], file_partitions[i] * partition_unit_size) + + ref_fd.write(src_buffer.numpy().tobytes()) + ref_fd.flush() + + assert 1 == h.sync_pwrite(buffer=src_buffer, filename=aio_file, file_offset=file_offsets[i]) + + filecmp.clear_cache() + assert filecmp.cmp(ref_file, aio_file, shallow=False) + + ref_fd.close() + + if not use_cuda_pinned_tensor: + h.free_cpu_locked_tensor(data_buffer) + + def test_offset_read(self, tmpdir, file_partitions, use_cuda_pinned_tensor): + + _skip_for_invalid_environment(use_cuda_pinned_tensor=use_cuda_pinned_tensor) + partition_unit_size = BLOCK_SIZE + file_size = sum(file_partitions) * partition_unit_size + ref_file, _ = _do_ref_write(tmpdir, 0, file_size) + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, True, True, IO_PARALLEL) + + if use_cuda_pinned_tensor: + data_buffer = torch.zeros(file_size, dtype=torch.uint8, device='cpu').pin_memory() + else: + data_buffer = h.new_cpu_locked_tensor(file_size, torch.empty(0, dtype=torch.uint8)) + + file_offsets = [] + next_offset = 0 + for i in range(len(file_partitions)): + file_offsets.append(next_offset) + next_offset += file_partitions[i] * partition_unit_size + + with open(ref_file, 'rb') as ref_fd: + for i in range(len(file_partitions)): + ref_fd.seek(file_offsets[i]) + bytes_to_read = file_partitions[i] * 
partition_unit_size + ref_buf = list(ref_fd.read(bytes_to_read)) + + dst_tensor = torch.narrow(data_buffer, 0, 0, bytes_to_read) + assert 1 == h.sync_pread(dst_tensor, ref_file, file_offsets[i]) + assert dst_tensor.tolist() == ref_buf + + if not use_cuda_pinned_tensor: + h.free_cpu_locked_tensor(data_buffer) diff --git a/tests/unit/ops/aio/test_gds.py b/tests/unit/ops/aio/test_gds.py index e94d42cd22af..d97eff452eb5 100644 --- a/tests/unit/ops/aio/test_gds.py +++ b/tests/unit/ops/aio/test_gds.py @@ -29,16 +29,21 @@ def _get_local_rank(): return 0 -def _do_ref_write(tmpdir, index=0): +def _do_ref_write(tmpdir, index=0, file_size=IO_SIZE): file_suffix = f'{_get_local_rank()}_{index}' ref_file = os.path.join(tmpdir, f'_py_random_{file_suffix}.pt') - ref_buffer = os.urandom(IO_SIZE) + ref_buffer = os.urandom(file_size) with open(ref_file, 'wb') as f: f.write(ref_buffer) return ref_file, ref_buffer +def _get_file_path(tmpdir, file_prefix, index=0): + file_suffix = f'{_get_local_rank()}_{index}' + return os.path.join(tmpdir, f'{file_prefix}_{file_suffix}.pt') + + def _get_test_write_file(tmpdir, index): file_suffix = f'{_get_local_rank()}_{index}' return os.path.join(tmpdir, f'_gds_write_random_{file_suffix}.pt') @@ -78,7 +83,7 @@ def test_parallel_read(self, tmpdir, single_submit, overlap_events): _validate_handle_state(h, single_submit, overlap_events) ref_file, _ = _do_ref_write(tmpdir) - read_status = h.sync_pread(gds_buffer, ref_file) + read_status = h.sync_pread(gds_buffer, ref_file, 0) assert read_status == 1 with open(ref_file, 'rb') as f: @@ -97,7 +102,7 @@ def test_async_read(self, tmpdir, single_submit, overlap_events): _validate_handle_state(h, single_submit, overlap_events) ref_file, _ = _do_ref_write(tmpdir) - read_status = h.async_pread(gds_buffer, ref_file) + read_status = h.async_pread(gds_buffer, ref_file, 0) assert read_status == 0 wait_status = h.wait() @@ -128,7 +133,7 @@ def test_parallel_write(self, tmpdir, single_submit, overlap_events): _validate_handle_state(h, single_submit, overlap_events) - write_status = h.sync_pwrite(gds_buffer, gds_file) + write_status = h.sync_pwrite(gds_buffer, gds_file, 0) assert write_status == 1 h.unpin_device_tensor(gds_buffer) @@ -146,7 +151,7 @@ def test_async_write(self, tmpdir, single_submit, overlap_events): _validate_handle_state(h, single_submit, overlap_events) - write_status = h.async_pwrite(gds_buffer, gds_file) + write_status = h.async_pwrite(gds_buffer, gds_file, 0) assert write_status == 0 wait_status = h.wait() @@ -188,7 +193,7 @@ def test_read(self, tmpdir, async_queue): _validate_handle_state(h, single_submit, overlap_events) for i in range(async_queue): - read_status = h.async_pread(gds_buffers[i], ref_files[i]) + read_status = h.async_pread(gds_buffers[i], ref_files[i], 0) assert read_status == 0 wait_status = h.wait() @@ -225,7 +230,7 @@ def test_write(self, tmpdir, async_queue): _validate_handle_state(h, single_submit, overlap_events) for i in range(async_queue): - read_status = h.async_pwrite(gds_buffers[i], gds_files[i]) + read_status = h.async_pwrite(gds_buffers[i], gds_files[i], 0) assert read_status == 0 wait_status = h.wait() @@ -268,3 +273,69 @@ def test_pin_device_tensor(self, use_new_api): h.free_pinned_device_tensor(pinned_buffer) else: h.unpin_device_tensor(pinned_buffer) + + +@pytest.mark.parametrize('file_partitions', [[1, 1, 1], [1, 1, 2], [1, 2, 1], [2, 1, 1]]) +class TestAsyncFileOffset(DistributedTest): + world_size = 1 + + def test_offset_write(self, tmpdir, file_partitions): + ref_file = 
_get_file_path(tmpdir, '_py_random') + aio_file = _get_file_path(tmpdir, '_aio_random') + partition_unit_size = IO_SIZE + file_size = sum(file_partitions) * partition_unit_size + + h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, True, True, IO_PARALLEL) + + gds_buffer = torch.empty(file_size, dtype=torch.uint8, device=get_accelerator().device_name()) + h.pin_device_tensor(gds_buffer) + + file_offsets = [] + next_offset = 0 + for i in range(len(file_partitions)): + file_offsets.append(next_offset) + next_offset += file_partitions[i] * partition_unit_size + + ref_fd = open(ref_file, 'wb') + for i in range(len(file_partitions)): + src_buffer = torch.narrow(gds_buffer, 0, file_offsets[i], + file_partitions[i] * partition_unit_size).to(device='cpu') + + ref_fd.write(src_buffer.numpy().tobytes()) + ref_fd.flush() + + assert 1 == h.sync_pwrite(buffer=src_buffer, filename=aio_file, file_offset=file_offsets[i]) + + filecmp.clear_cache() + assert filecmp.cmp(ref_file, aio_file, shallow=False) + + ref_fd.close() + + h.unpin_device_tensor(gds_buffer) + + def test_offset_read(self, tmpdir, file_partitions): + partition_unit_size = BLOCK_SIZE + file_size = sum(file_partitions) * partition_unit_size + ref_file, _ = _do_ref_write(tmpdir, 0, file_size) + h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, True, True, IO_PARALLEL) + + gds_buffer = torch.empty(file_size, dtype=torch.uint8, device=get_accelerator().device_name()) + h.pin_device_tensor(gds_buffer) + + file_offsets = [] + next_offset = 0 + for i in range(len(file_partitions)): + file_offsets.append(next_offset) + next_offset += file_partitions[i] * partition_unit_size + + with open(ref_file, 'rb') as ref_fd: + for i in range(len(file_partitions)): + ref_fd.seek(file_offsets[i]) + bytes_to_read = file_partitions[i] * partition_unit_size + ref_buf = list(ref_fd.read(bytes_to_read)) + + dst_tensor = torch.narrow(gds_buffer, 0, 0, bytes_to_read) + assert 1 == h.sync_pread(dst_tensor, ref_file, file_offsets[i]) + assert dst_tensor.tolist() == ref_buf + + h.unpin_device_tensor(gds_buffer) From 877aa0dba673c2aa2157029c28363b804d6ee03d Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Tue, 12 Nov 2024 10:50:02 -0800 Subject: [PATCH 02/16] Update path for BingBertSquad from DeepSpeedExamples (#6746) In https://github.com/microsoft/DeepSpeedExamples/pull/245, the DeepSpeedExamples directory structure was refactored; this updates the paths in DeepSpeed to match those changes. --- docs/_tutorials/bert-finetuning.md | 4 ++-- docs/_tutorials/onebit-adam.md | 4 ++-- tests/model/BingBertSquad/run_BingBertSquad.sh | 2 +- tests/model/BingBertSquad/run_BingBertSquad_sanity.sh | 2 +- tests/model/BingBertSquad/run_tests.sh | 2 +- tests/model/BingBertSquad/test_e2e_squad.py | 4 ++-- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/_tutorials/bert-finetuning.md b/docs/_tutorials/bert-finetuning.md index 3014be18d682..f833acebde9a 100755 --- a/docs/_tutorials/bert-finetuning.md +++ b/docs/_tutorials/bert-finetuning.md @@ -10,14 +10,14 @@ In this tutorial we will be adding DeepSpeed to the BingBert model for the SQuAD If you don't already have a copy of the DeepSpeed repository, please clone it now and check out the DeepSpeedExamples submodule that contains the BingBertSquad -example (DeepSpeedExamples/BingBertSquad) we will be going over in the rest of +example (DeepSpeedExamples/training/BingBertSquad) we will be going over in the rest of this tutorial.
```shell git clone https://github.com/microsoft/DeepSpeed cd DeepSpeed git submodule update --init --recursive -cd DeepSpeedExamples/BingBertSquad +cd DeepSpeedExamples/training/BingBertSquad ``` ### Pre-requisites diff --git a/docs/_tutorials/onebit-adam.md b/docs/_tutorials/onebit-adam.md index b1a8b5369761..e66bba3f818b 100644 --- a/docs/_tutorials/onebit-adam.md +++ b/docs/_tutorials/onebit-adam.md @@ -136,7 +136,7 @@ You can also use a pre-trained BERT model checkpoint from either DeepSpeed, [Hug ### 2.1 Running BingBertSQuAD with DeepSpeed and 1-bit Adam -We provide example scripts under [DeepSpeedExamples/BingBertSquad/1-bit_adam/](https://github.com/microsoft/DeepSpeedExamples/tree/master/BingBertSquad/1-bit_adam). There are 3 sets of scripts corresponding to NCCL-based implementation, MPI-based implementation on Ethernet systems, and MPI-based implementation on InfiniBand systems. For MPI-based implementation, we provide both example scripts when launching with deepspeed or mpirun. +We provide example scripts under [DeepSpeedExamples/training/BingBertSquad/1-bit_adam/](https://github.com/microsoft/DeepSpeedExamples/tree/master/training/BingBertSquad/1-bit_adam). There are three sets of scripts, corresponding to the NCCL-based implementation, the MPI-based implementation on Ethernet systems, and the MPI-based implementation on InfiniBand systems. For the MPI-based implementations, we provide example scripts for launching with either deepspeed or mpirun.
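Editor's note: to illustrate the API surface added in PATCH 01/16, below is a minimal usage sketch of the new `file_offset` argument on the handle bindings. It is not part of the patch series. It assumes DeepSpeed is installed with the `async_io` op buildable on the target system, and the file path `/tmp/ds_aio_demo.bin` is illustrative; the calls mirror those exercised in `tests/unit/ops/aio/test_aio.py`.

```python
# Minimal sketch of the file_offset argument added to the aio_handle bindings
# in PATCH 01/16. Assumption: the async_io op builds on this system; the demo
# path /tmp/ds_aio_demo.bin is hypothetical.
import torch
from deepspeed.ops.op_builder import AsyncIOBuilder

BLOCK_SIZE, QUEUE_DEPTH, IO_PARALLEL = 128 * 1024, 8, 1
handle = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, True, True, IO_PARALLEL)

# Pinned source buffer holding two partitions of BLOCK_SIZE bytes each.
buf = handle.new_cpu_locked_tensor(2 * BLOCK_SIZE, torch.empty(0, dtype=torch.uint8))

# Write each partition to its own offset within the same file.
for i in range(2):
    part = torch.narrow(buf, 0, i * BLOCK_SIZE, BLOCK_SIZE)
    assert 1 == handle.sync_pwrite(part, '/tmp/ds_aio_demo.bin', file_offset=i * BLOCK_SIZE)

# Read the second partition back from its offset; file_offset defaults to 0,
# which preserves the behavior of callers written before this patch.
dst = torch.narrow(buf, 0, 0, BLOCK_SIZE)
assert 1 == handle.sync_pread(dst, '/tmp/ds_aio_demo.bin', BLOCK_SIZE)

handle.free_cpu_locked_tensor(buf)
```

As in the unit tests added by the patch, the offsets here are multiples of the handle block size, which keeps each transfer aligned for the underlying AIO submission path.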