diff --git a/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp.inc b/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp.inc
index 5cda893ec4c..be619cfad48 100644
--- a/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp.inc
+++ b/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp.inc
@@ -17,7 +17,7 @@ public:
     __host__ __device__ static constexpr int dynamic_work_size(
         const int num_rows, int)
     {
-        return num_rows;
+        return num_rows * sizeof(value_type);
     }
 
     /**
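Note on the preconditioner interface change repeated throughout this patch (here and in the dpcpp/reference variants below): dynamic_work_size() now reports its requirement in bytes instead of rows, so the solver-side storage accounting no longer multiplies by sizeof(ValueType). A minimal standalone sketch of the resulting arithmetic, with illustrative names only (not Ginkgo code):

    // Sketch only: shows why a byte-valued work size simplifies the
    // shared-memory budget on the solver side. Names are hypothetical.
    #include <cstdio>

    struct ScalarJacobiLike {
        // Work size reported in bytes, as in this patch.
        static constexpr int dynamic_work_size(int num_rows, int /*num_nz*/)
        {
            return num_rows * static_cast<int>(sizeof(double));
        }
    };

    int main()
    {
        const int num_rows = 96;
        const int num_main_vecs = 5;  // r, z, p, Ap, x for CG
        const int vec_bytes = num_rows * static_cast<int>(sizeof(double));
        const int prec_bytes = ScalarJacobiLike::dynamic_work_size(num_rows, 0);
        // The caller can add vector and preconditioner requirements directly,
        // without multiplying by sizeof(ValueType) a second time.
        std::printf("shared memory needed: %d bytes\n",
                    num_main_vecs * vec_bytes + prec_bytes);
    }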
diff --git a/common/cuda_hip/solver/batch_cg_kernels.hpp.inc b/common/cuda_hip/solver/batch_cg_kernels.hpp.inc
new file mode 100644
index 00000000000..ffee501b58c
--- /dev/null
+++ b/common/cuda_hip/solver/batch_cg_kernels.hpp.inc
@@ -0,0 +1,228 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+template <typename Group, typename BatchMatrixType_entry, typename PrecType,
+          typename ValueType>
+__device__ __forceinline__ void initialize(
+    Group subgroup, const int num_rows, const BatchMatrixType_entry& mat_entry,
+    const ValueType* const b_global_entry,
+    const ValueType* const x_global_entry, ValueType* const x_shared_entry,
+    ValueType* const r_shared_entry, const PrecType& prec_shared,
+    ValueType* const z_shared_entry, ValueType& rho_old_shared_entry,
+    ValueType* const p_shared_entry,
+    typename gko::remove_complex<ValueType>& rhs_norms_sh)
+{
+    // copy x from global to shared memory
+    // r = b
+    for (int iz = threadIdx.x; iz < num_rows; iz += blockDim.x) {
+        x_shared_entry[iz] = x_global_entry[iz];
+        r_shared_entry[iz] = b_global_entry[iz];
+    }
+    __syncthreads();
+
+    // r = b - A*x
+    advanced_apply(static_cast<ValueType>(-1.0), mat_entry, x_shared_entry,
+                   static_cast<ValueType>(1.0), r_shared_entry);
+    __syncthreads();
+
+    // z = precond * r
+    prec_shared.apply(num_rows, r_shared_entry, z_shared_entry);
+    __syncthreads();
+
+    if (threadIdx.x / config::warp_size == 0) {
+        // Compute norms of rhs
+        single_rhs_compute_norm2(subgroup, num_rows, b_global_entry,
+                                 rhs_norms_sh);
+    } else if (threadIdx.x / config::warp_size == 1) {
+        // rho_old = r' * z
+        single_rhs_compute_conj_dot(subgroup, num_rows, r_shared_entry,
+                                    z_shared_entry, rho_old_shared_entry);
+    }
+
+    // p = z
+    for (int iz = threadIdx.x; iz < num_rows; iz += blockDim.x) {
+        p_shared_entry[iz] = z_shared_entry[iz];
+    }
+}
+
+
+template <typename ValueType>
+__device__ __forceinline__ void update_p(const int num_rows,
+                                         const ValueType& rho_new_shared_entry,
+                                         const ValueType& rho_old_shared_entry,
+                                         const ValueType* const z_shared_entry,
+                                         ValueType* const p_shared_entry)
+{
+    for (int li = threadIdx.x; li < num_rows; li += blockDim.x) {
+        const ValueType beta = rho_new_shared_entry / rho_old_shared_entry;
+        p_shared_entry[li] = z_shared_entry[li] + beta * p_shared_entry[li];
+    }
+}
+
+
+template <typename Group, typename ValueType>
+__device__ __forceinline__ void update_x_and_r(
+    Group subgroup, const int num_rows, const ValueType& rho_old_shared_entry,
+    const ValueType* const p_shared_entry,
+    const ValueType* const Ap_shared_entry, ValueType& alpha_shared_entry,
+    ValueType* const x_shared_entry, ValueType* const r_shared_entry)
+{
+    if (threadIdx.x / config::warp_size == 0) {
+        single_rhs_compute_conj_dot(subgroup, num_rows, p_shared_entry,
+                                    Ap_shared_entry, alpha_shared_entry);
+    }
+    __syncthreads();
+
+    for (int li = threadIdx.x; li < num_rows; li += blockDim.x) {
+        const ValueType alpha = rho_old_shared_entry / alpha_shared_entry;
+        x_shared_entry[li] += alpha * p_shared_entry[li];
+        r_shared_entry[li] -= alpha * Ap_shared_entry[li];
+    }
+}
+
+
+template <typename StopType, const int n_shared, const bool prec_shared_bool,
+          typename PrecType, typename LogType, typename BatchMatrixType,
+          typename ValueType>
+__global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf,
+                             const int max_iter,
+                             const gko::remove_complex<ValueType> tol,
+                             LogType logger, PrecType prec_shared,
+                             const BatchMatrixType mat,
+                             const ValueType* const __restrict__ b,
+                             ValueType* const __restrict__ x,
+                             ValueType* const __restrict__ workspace = nullptr)
+{
+    using real_type = typename gko::remove_complex<ValueType>;
+    const auto num_batch_items = mat.num_batch_items;
+    const auto num_rows = mat.num_rows;
+
+    constexpr auto tile_size = config::warp_size;
+    auto thread_block = group::this_thread_block();
+    auto subgroup = group::tiled_partition<tile_size>(thread_block);
+
+    for (size_type batch_id = blockIdx.x; batch_id < num_batch_items;
+         batch_id += gridDim.x) {
+        const int gmem_offset =
+            batch_id * sconf.gmem_stride_bytes / sizeof(ValueType);
+        extern __shared__ char local_mem_sh[];
+
+        ValueType* r_sh;
+        ValueType* z_sh;
+        ValueType* p_sh;
+        ValueType* Ap_sh;
+        ValueType* x_sh;
+        ValueType* prec_work_sh;
+
+        if (n_shared >= 1) {
+            r_sh = reinterpret_cast<ValueType*>(local_mem_sh);
+        } else {
+            r_sh = workspace + gmem_offset;
+        }
+        if (n_shared == 1) {
+            z_sh = workspace + gmem_offset;
+        } else {
+            z_sh = r_sh + sconf.padded_vec_len;
+        }
+        if (n_shared == 2) {
+            p_sh = workspace + gmem_offset;
+        } else {
+            p_sh = z_sh + sconf.padded_vec_len;
+        }
+        if (n_shared == 3) {
+            Ap_sh = workspace + gmem_offset;
+        } else {
+            Ap_sh = p_sh + sconf.padded_vec_len;
+        }
+        if (n_shared == 4) {
+            x_sh = workspace + gmem_offset;
+        } else {
+            x_sh = Ap_sh + sconf.padded_vec_len;
+        }
+        if (!prec_shared_bool && n_shared == 5) {
+            prec_work_sh = workspace + gmem_offset;
+        } else {
+            prec_work_sh = x_sh + sconf.padded_vec_len;
+        }
+
+        __shared__ uninitialized_array<ValueType, 1> rho_old_sh;
+        __shared__ uninitialized_array<ValueType, 1> rho_new_sh;
+        __shared__ uninitialized_array<ValueType, 1> alpha_sh;
+        __shared__ real_type norms_rhs_sh[1];
+        __shared__ real_type norms_res_sh[1];
+
+        const auto mat_entry =
+            gko::batch::matrix::extract_batch_item(mat, batch_id);
+        const ValueType* const b_global_entry =
+            gko::batch::multi_vector::batch_item_ptr(b, 1, num_rows, batch_id);
+        ValueType* const x_global_entry =
+            gko::batch::multi_vector::batch_item_ptr(x, 1, num_rows, batch_id);
+
+        // generate preconditioner
+        prec_shared.generate(batch_id, mat_entry, prec_work_sh);
+
+        // initialization
+        // compute b norms
+        // r = b - A*x
+        // z = precond*r
+        // rho_old = r' * z (' is for hermitian transpose)
+        // p = z
+        initialize(subgroup, num_rows, mat_entry, b_global_entry,
+                   x_global_entry, x_sh, r_sh, prec_shared, z_sh, rho_old_sh[0],
+                   p_sh, norms_rhs_sh[0]);
+        __syncthreads();
+
+        // stopping criterion object
+        StopType stop(tol, norms_rhs_sh);
+
+        int iter = 0;
+        for (; iter < max_iter; iter++) {
+            norms_res_sh[0] = sqrt(abs(rho_old_sh[0]));
+            __syncthreads();
+            if (stop.check_converged(norms_res_sh)) {
+                logger.log_iteration(batch_id, iter, norms_res_sh[0]);
+                break;
+            }
+
+            // Ap = A * p
+            simple_apply(mat_entry, p_sh, Ap_sh);
+            __syncthreads();
+
+            // alpha = rho_old / (p' * Ap)
+            // x = x + alpha * p
+            // r = r - alpha * Ap
+            update_x_and_r(subgroup, num_rows, rho_old_sh[0], p_sh, Ap_sh,
+                           alpha_sh[0], x_sh, r_sh);
+            __syncthreads();
+
+            // z = precond * r
+            prec_shared.apply(num_rows, r_sh, z_sh);
+            __syncthreads();
+
+            if (threadIdx.x / config::warp_size == 0) {
+                // rho_new = (r)' * (z)
+                single_rhs_compute_conj_dot(subgroup, num_rows, r_sh, z_sh,
+                                            rho_new_sh[0]);
+            }
+            __syncthreads();
+
+            // beta = rho_new / rho_old
+            // p = z + beta * p
+            update_p(num_rows, rho_new_sh[0], rho_old_sh[0], z_sh, p_sh);
+            __syncthreads();
+
+            // rho_old = rho_new
+            if (threadIdx.x == 0) {
+                rho_old_sh[0] = rho_new_sh[0];
+            }
+            __syncthreads();
+        }
+
+        logger.log_iteration(batch_id, iter, norms_res_sh[0]);
+
+        // copy x back to global memory
+        single_rhs_copy(num_rows, x_sh, x_global_entry);
+        __syncthreads();
+    }
+}
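The kernel above keeps the first n_shared work vectors in dynamic shared memory and falls back to a per-batch-item slice of a global workspace for the rest. A reduced, self-contained CUDA sketch of that allocation pattern, using two vectors and illustrative names (not the Ginkgo kernel):

    // Sketch: "first n_shared vectors live in shared memory, the rest use a
    // global workspace", reduced to two vectors r and z.
    #include <cstdio>

    template <int n_shared>
    __global__ void two_vec_kernel(int n, const double* b, double* workspace)
    {
        extern __shared__ char local_mem[];
        // Each block (batch item) owns a contiguous 2*n slice of workspace.
        double* r = (n_shared >= 1) ? reinterpret_cast<double*>(local_mem)
                                    : workspace + blockIdx.x * 2 * n;
        double* z = (n_shared >= 2) ? reinterpret_cast<double*>(local_mem) + n
                                    : workspace + blockIdx.x * 2 * n + n;
        for (int i = threadIdx.x; i < n; i += blockDim.x) {
            r[i] = b[i];
            z[i] = 2.0 * r[i];
        }
    }

    int main()
    {
        const int n = 128;
        double *b, *work;
        cudaMalloc(&b, n * sizeof(double));
        cudaMalloc(&work, 2 * n * sizeof(double));
        cudaMemset(b, 0, n * sizeof(double));
        // Both vectors in shared memory -> dynamic size = 2 * n * sizeof(double).
        two_vec_kernel<2><<<1, 64, 2 * n * sizeof(double)>>>(n, b, work);
        cudaDeviceSynchronize();
        std::printf("done: %s\n", cudaGetErrorString(cudaGetLastError()));
        cudaFree(b);
        cudaFree(work);
    }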
diff --git a/core/solver/batch_bicgstab_kernels.hpp b/core/solver/batch_bicgstab_kernels.hpp
index 0741637f3d4..43f55f1356d 100644
--- a/core/solver/batch_bicgstab_kernels.hpp
+++ b/core/solver/batch_bicgstab_kernels.hpp
@@ -88,8 +88,7 @@ void set_gmem_stride_bytes(storage_config& sconf,
         gmem_stride += prec_storage_bytes;
     }
     // align global memory chunks
-    sconf.gmem_stride_bytes =
-        gmem_stride > 0 ? ceildiv(gmem_stride, align_bytes) * align_bytes : 0;
+    sconf.gmem_stride_bytes = ceildiv(gmem_stride, align_bytes) * align_bytes;
 }
 
 
@@ -134,8 +133,7 @@ storage_config compute_shared_storage(const int available_shared_mem,
     using real_type = remove_complex<ValueType>;
     const int vec_size = num_rows * num_rhs * sizeof(ValueType);
     const int num_main_vecs = 9;
-    const int prec_storage =
-        Prectype::dynamic_work_size(num_rows, num_nz) * sizeof(ValueType);
+    const int prec_storage = Prectype::dynamic_work_size(num_rows, num_nz);
     int rem_shared = available_shared_mem;
     // Set default values. Initially all vecs are in global memory.
     // {prec_shared, n_shared, n_global, gmem_stride_bytes, padded_vec_len}
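The removed `gmem_stride > 0 ?` guard appears redundant, since ceildiv(0, align_bytes) * align_bytes already evaluates to 0. A small standalone sketch of the alignment arithmetic shared by set_gmem_stride_bytes and the padded_num_rows computation in the CUDA/HIP callers further down (illustrative values, not Ginkgo code):

    // Sketch: round row counts and per-item strides up to alignment multiples.
    #include <cstdio>

    constexpr int ceildiv(int num, int den) { return (num + den - 1) / den; }

    int main()
    {
        const int num_rows = 93;
        const int align_multiple = 8;  // rows per padding unit (assumed)
        const int padded_num_rows =
            ceildiv(num_rows, align_multiple) * align_multiple;

        // Five CG work vectors per batch item, aligned to align_bytes.
        const int gmem_stride =
            static_cast<int>(5 * padded_num_rows * sizeof(double));
        const int align_bytes = 32;  // assumed alignment granularity
        const int gmem_stride_bytes =
            ceildiv(gmem_stride, align_bytes) * align_bytes;

        std::printf("padded rows: %d, per-item stride: %d bytes\n",
                    padded_num_rows, gmem_stride_bytes);
    }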
diff --git a/core/solver/batch_cg_kernels.hpp b/core/solver/batch_cg_kernels.hpp
index 5ff78524f11..d2c64460be2 100644
--- a/core/solver/batch_cg_kernels.hpp
+++ b/core/solver/batch_cg_kernels.hpp
@@ -121,8 +121,7 @@ storage_config compute_shared_storage(const int available_shared_mem,
     using real_type = remove_complex<ValueType>;
     const int vec_bytes = num_rows * num_rhs * sizeof(ValueType);
     const int num_main_vecs = 5;
-    const int prec_storage =
-        Prectype::dynamic_work_size(num_rows, num_nz) * sizeof(ValueType);
+    const int prec_storage = Prectype::dynamic_work_size(num_rows, num_nz);
    int rem_shared = available_shared_mem;
     // Set default values. Initially all vecs are in global memory.
     // {prec_shared, n_shared, n_global, gmem_stride_bytes, padded_vec_len}
@@ -160,13 +159,13 @@ storage_config compute_shared_storage(const int available_shared_mem,
 }  // namespace batch_cg
 
 
-#define GKO_DECLARE_BATCH_CG_APPLY_KERNEL(_type)                              \
-    void apply(                                                               \
-        std::shared_ptr<const DefaultExecutor> exec,                          \
-        const gko::kernels::batch_cg::settings<remove_complex<_type>>&        \
-            options,                                                          \
-        const batch::BatchLinOp* a, const batch::BatchLinOp* preconditioner,  \
-        const batch::MultiVector<_type>* b, batch::MultiVector<_type>* x,     \
+#define GKO_DECLARE_BATCH_CG_APPLY_KERNEL(_type)                               \
+    void apply(                                                                \
+        std::shared_ptr<const DefaultExecutor> exec,                           \
+        const gko::kernels::batch_cg::settings<remove_complex<_type>>&         \
+            options,                                                           \
+        const batch::BatchLinOp* mat, const batch::BatchLinOp* preconditioner, \
+        const batch::MultiVector<_type>* b, batch::MultiVector<_type>* x,      \
         gko::batch::log::detail::log_data<remove_complex<_type>>& logdata)
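The CUDA and HIP kernel callers below turn the runtime sconf.n_shared value into a compile-time template parameter through a switch over the supported counts. A toy, self-contained sketch of that dispatch pattern (illustrative names, not Ginkgo code):

    // Sketch: runtime-to-compile-time dispatch of the shared-vector count.
    #include <cstdio>

    template <int n_shared>
    void launch(int num_rows)
    {
        std::printf("launching with %d vectors in shared memory, %d rows\n",
                    n_shared, num_rows);
    }

    void dispatch(int n_shared, int num_rows)
    {
        switch (n_shared) {
        case 0: launch<0>(num_rows); break;
        case 1: launch<1>(num_rows); break;
        case 2: launch<2>(num_rows); break;
        default: launch<3>(num_rows); break;
        }
    }

    int main() { dispatch(2, 128); }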
diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
index 57c3612df69..541f3f6b936 100644
--- a/cuda/solver/batch_cg_kernels.cu
+++ b/cuda/solver/batch_cg_kernels.cu
@@ -51,12 +51,173 @@ namespace batch_cg {
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
+#include "common/cuda_hip/solver/batch_cg_kernels.hpp.inc"
+
+
+template <typename StopType, typename PrecType, typename LogType,
+          typename BatchMatrixType, typename ValueType>
+int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
+                              const int num_rows)
+{
+    int num_warps = std::max(num_rows / 4, 2);
+    constexpr int warp_sz = static_cast<int>(config::warp_size);
+    const int min_block_size = 2 * warp_sz;
+    const int device_max_threads =
+        ((std::max(num_rows, min_block_size)) / warp_sz) * warp_sz;
+    cudaFuncAttributes funcattr;
+    cudaFuncGetAttributes(&funcattr,
+                          apply_kernel<StopType, 5, true, PrecType, LogType,
+                                       BatchMatrixType, ValueType>);
+    const int num_regs_used = funcattr.numRegs;
+    int max_regs_blk = 0;
+    cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock,
+                           exec->get_device_id());
+    const int max_threads_regs =
+        ((max_regs_blk / static_cast<int>(num_regs_used)) / warp_sz) * warp_sz;
+    int max_threads = std::min(max_threads_regs, device_max_threads);
+    max_threads = max_threads <= 1024 ? max_threads : 1024;
+    return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size);
+}
+
+
+template <typename StopType, typename PrecType, typename LogType,
+          typename BatchMatrixType, typename ValueType>
+int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec)
+{
+    int shmem_per_sm = 0;
+    cudaDeviceGetAttribute(&shmem_per_sm,
+                           cudaDevAttrMaxSharedMemoryPerMultiprocessor,
+                           exec->get_device_id());
+    GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute(
+        apply_kernel<StopType, 5, true, PrecType, LogType, BatchMatrixType,
+                     ValueType>,
+        cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/));
+    cudaFuncAttributes funcattr;
+    cudaFuncGetAttributes(&funcattr,
+                          apply_kernel<StopType, 5, true, PrecType, LogType,
+                                       BatchMatrixType, ValueType>);
+    return funcattr.maxDynamicSharedSizeBytes;
+}
 
 
 template <typename ValueType>
 using settings = gko::kernels::batch_cg::settings<ValueType>;
 
 
+template <typename CuValueType>
+class kernel_caller {
+public:
+    using value_type = CuValueType;
+
+    kernel_caller(std::shared_ptr<const DefaultExecutor> exec,
+                  const settings<remove_complex<value_type>> settings)
+        : exec_{std::move(exec)}, settings_{settings}
+    {}
+
+    template <typename StopType, const int n_shared,
+              const bool prec_shared_bool, typename PrecType, typename LogType,
+              typename BatchMatrixType>
+    void launch_apply_kernel(
+        const gko::kernels::batch_cg::storage_config& sconf, LogType& logger,
+        PrecType& prec, const BatchMatrixType& mat,
+        const value_type* const __restrict__ b_values,
+        value_type* const __restrict__ x_values,
+        value_type* const __restrict__ workspace_data, const int& block_size,
+        const size_t& shared_size) const
+    {
+        apply_kernel<StopType, n_shared, prec_shared_bool>
+            <<<mat.num_batch_items, block_size, shared_size,
+               exec_->get_stream()>>>(sconf, settings_.max_iterations,
+                                      settings_.residual_tol, logger, prec,
+                                      mat, b_values, x_values, workspace_data);
+    }
+
+    template <typename BatchMatrixType, typename PrecType, typename StopType,
+              typename LogType>
+    void call_kernel(
+        LogType logger, const BatchMatrixType& mat, PrecType prec,
+        const gko::batch::multi_vector::uniform_batch<const value_type>& b,
+        const gko::batch::multi_vector::uniform_batch<value_type>& x) const
+    {
+        using real_type = gko::remove_complex<value_type>;
+        const size_type num_batch_items = mat.num_batch_items;
+        constexpr int align_multiple = 8;
+        const int padded_num_rows =
+            ceildiv(mat.num_rows, align_multiple) * align_multiple;
+        auto shem_guard =
+            gko::kernels::cuda::detail::shared_memory_config_guard<
+                value_type>();
+        const int shmem_per_blk =
+            get_max_dynamic_shared_memory<StopType, PrecType, LogType,
+                                          BatchMatrixType, value_type>(exec_);
+        const int block_size =
+            get_num_threads_per_block<StopType, PrecType, LogType,
+                                      BatchMatrixType, value_type>(
+                exec_, mat.num_rows);
+        GKO_ASSERT(block_size >= 2 * config::warp_size);
+
+        const size_t prec_size = PrecType::dynamic_work_size(
+            padded_num_rows, mat.get_single_item_num_nnz());
+        const auto sconf =
+            gko::kernels::batch_cg::compute_shared_storage<PrecType,
+                                                           value_type>(
+                shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(),
+                b.num_rhs);
+        const size_t shared_size =
+            sconf.n_shared * padded_num_rows * sizeof(value_type) +
+            (sconf.prec_shared ? prec_size : 0);
+        auto workspace = gko::array<value_type>(
+            exec_,
+            sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type));
+        assert(sconf.gmem_stride_bytes % sizeof(value_type) == 0);
+
+        value_type* const workspace_data = workspace.get_data();
+
+        // Template parameters launch_apply_kernel<StopType, n_shared,
+        // prec_shared_bool>
+        if (sconf.prec_shared)
+            launch_apply_kernel<StopType, 5, true>(
+                sconf, logger, prec, mat, b.values, x.values, workspace_data,
+                block_size, shared_size);
+        else {
+            switch (sconf.n_shared) {
+            case 0:
+                launch_apply_kernel<StopType, 0, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 1:
+                launch_apply_kernel<StopType, 1, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 2:
+                launch_apply_kernel<StopType, 2, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 3:
+                launch_apply_kernel<StopType, 3, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 4:
+                launch_apply_kernel<StopType, 4, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            default:
+                GKO_NOT_IMPLEMENTED;
+            }
+        }
+    }
+
+private:
+    std::shared_ptr<const DefaultExecutor> exec_;
+    const settings<remove_complex<value_type>> settings_;
+};
+
+
 template <typename ValueType>
 void apply(std::shared_ptr<const DefaultExecutor> exec,
            const settings<remove_complex<ValueType>>& settings,
@@ -66,7 +227,10 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
            batch::MultiVector<ValueType>* const x,
            batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
 {
-    GKO_NOT_IMPLEMENTED;
+    using cu_value_type = cuda_type<ValueType>;
+    auto dispatcher = batch::solver::create_dispatcher<ValueType>(
+        kernel_caller<cu_value_type>(exec, settings), settings, mat, precon);
+    dispatcher.apply(b, x, logdata);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_APPLY_KERNEL);
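get_num_threads_per_block above caps the block size by the register budget reported for the kernel instantiation. A standalone CUDA sketch of the same occupancy heuristic on a toy kernel and device 0 (illustrative only, not Ginkgo code):

    // Sketch: limit threads per block by registers per thread on device 0.
    #include <algorithm>
    #include <cstdio>

    __global__ void toy_kernel(double* x) { x[threadIdx.x] = threadIdx.x; }

    int main()
    {
        cudaFuncAttributes attr;
        cudaFuncGetAttributes(&attr, toy_kernel);
        int max_regs_per_block = 0;
        cudaDeviceGetAttribute(&max_regs_per_block,
                               cudaDevAttrMaxRegistersPerBlock, 0);

        const int warp_size = 32;
        const int regs_per_thread = std::max(attr.numRegs, 1);
        // Round the register-limited thread count down to a warp multiple.
        const int max_threads_by_regs =
            ((max_regs_per_block / regs_per_thread) / warp_size) * warp_size;
        const int block_size =
            std::min(std::min(max_threads_by_regs, 1024), 256);
        std::printf("regs/thread: %d, block size chosen: %d\n", attr.numRegs,
                    block_size);
    }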
diff --git a/dpcpp/preconditioner/batch_scalar_jacobi.hpp.inc b/dpcpp/preconditioner/batch_scalar_jacobi.hpp.inc
index 7af84aa60bb..de82476519d 100644
--- a/dpcpp/preconditioner/batch_scalar_jacobi.hpp.inc
+++ b/dpcpp/preconditioner/batch_scalar_jacobi.hpp.inc
@@ -16,7 +16,7 @@ public:
      */
     static constexpr int dynamic_work_size(const int num_rows, int)
     {
-        return num_rows;
+        return num_rows * sizeof(value_type);
     }
 
     /**
diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp
index 1288c15b4c0..a47dd594c31 100644
--- a/hip/solver/batch_cg_kernels.hip.cpp
+++ b/hip/solver/batch_cg_kernels.hip.cpp
@@ -50,12 +50,154 @@ namespace batch_cg {
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
+#include "common/cuda_hip/solver/batch_cg_kernels.hpp.inc"
+
+
+template <typename ValueType>
+int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
+                              const int num_rows)
+{
+    int num_warps = std::max(num_rows / 4, 2);
+    constexpr int warp_sz = static_cast<int>(config::warp_size);
+    const int min_block_size = 2 * warp_sz;
+    const int device_max_threads =
+        ((std::max(num_rows, min_block_size)) / warp_sz) * warp_sz;
+    // This value has been taken from ROCm docs. This is the number of
+    // registers that maximizes the occupancy on an AMD GPU (MI200). HIP does
+    // not have an API to query the number of registers a function uses.
+    const int num_regs_used_per_thread = 64;
+    int max_regs_blk = 0;
+    GKO_ASSERT_NO_HIP_ERRORS(hipDeviceGetAttribute(
+        &max_regs_blk, hipDeviceAttributeMaxRegistersPerBlock,
+        exec->get_device_id()));
+    const int max_threads_regs = (max_regs_blk / num_regs_used_per_thread);
+    int max_threads = std::min(max_threads_regs, device_max_threads);
+    max_threads = max_threads <= 1024 ? max_threads : 1024;
+    return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size);
+}
 
 
 template <typename ValueType>
 using settings = gko::kernels::batch_cg::settings<ValueType>;
 
 
+template <typename HipValueType>
+class kernel_caller {
+public:
+    using value_type = HipValueType;
+
+    kernel_caller(std::shared_ptr<const DefaultExecutor> exec,
+                  const settings<remove_complex<value_type>> settings)
+        : exec_{exec}, settings_{settings}
+    {}
+
+    template <typename StopType, const int n_shared,
+              const bool prec_shared_bool, typename PrecType, typename LogType,
+              typename BatchMatrixType>
+    void launch_apply_kernel(
+        const gko::kernels::batch_cg::storage_config& sconf, LogType& logger,
+        PrecType& prec, const BatchMatrixType& mat,
+        const value_type* const __restrict__ b_values,
+        value_type* const __restrict__ x_values,
+        value_type* const __restrict__ workspace_data, const int& block_size,
+        const size_t& shared_size) const
+    {
+        apply_kernel<StopType, n_shared, prec_shared_bool>
+            <<<mat.num_batch_items, block_size, shared_size,
+               exec_->get_stream()>>>(sconf, settings_.max_iterations,
+                                      settings_.residual_tol, logger, prec,
+                                      mat, b_values, x_values, workspace_data);
+    }
+
+
+    template <typename BatchMatrixType, typename PrecType, typename StopType,
+              typename LogType>
+    void call_kernel(
+        LogType logger, const BatchMatrixType& mat, PrecType prec,
+        const gko::batch::multi_vector::uniform_batch<const value_type>& b,
+        const gko::batch::multi_vector::uniform_batch<value_type>& x) const
+    {
+        using real_type = gko::remove_complex<value_type>;
+        const size_type num_batch_items = mat.num_batch_items;
+        constexpr int align_multiple = 8;
+        const int padded_num_rows =
+            ceildiv(mat.num_rows, align_multiple) * align_multiple;
+        int shmem_per_blk = 0;
+        GKO_ASSERT_NO_HIP_ERRORS(hipDeviceGetAttribute(
+            &shmem_per_blk, hipDeviceAttributeMaxSharedMemoryPerBlock,
+            exec_->get_device_id()));
+        const int block_size =
+            get_num_threads_per_block<value_type>(exec_, mat.num_rows);
+        GKO_ASSERT(block_size >= 2 * config::warp_size);
+
+        // Returns amount required in bytes
+        const size_t prec_size = PrecType::dynamic_work_size(
+            padded_num_rows, mat.get_single_item_num_nnz());
+        const auto sconf =
+            gko::kernels::batch_cg::compute_shared_storage<PrecType,
+                                                           value_type>(
+                shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(),
+                b.num_rhs);
+        const size_t shared_size =
+            sconf.n_shared * padded_num_rows * sizeof(value_type) +
+            (sconf.prec_shared ? prec_size : 0);
+        auto workspace = gko::array<value_type>(
+            exec_,
+            sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type));
+        assert(sconf.gmem_stride_bytes % sizeof(value_type) == 0);
+
+        value_type* const workspace_data = workspace.get_data();
+
+        // Template parameters launch_apply_kernel<StopType, n_shared,
+        // prec_shared_bool>
+        if (sconf.prec_shared) {
+            launch_apply_kernel<StopType, 5, true>(
+                sconf, logger, prec, mat, b.values, x.values, workspace_data,
+                block_size, shared_size);
+        } else {
+            switch (sconf.n_shared) {
+            case 0:
+                launch_apply_kernel<StopType, 0, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 1:
+                launch_apply_kernel<StopType, 1, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 2:
+                launch_apply_kernel<StopType, 2, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 3:
+                launch_apply_kernel<StopType, 3, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 4:
+                launch_apply_kernel<StopType, 4, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 5:
+                launch_apply_kernel<StopType, 5, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            default:
+                GKO_NOT_IMPLEMENTED;
+            }
+        }
+    }
+
+private:
+    std::shared_ptr<const DefaultExecutor> exec_;
+    const settings<remove_complex<value_type>> settings_;
+};
+
+
 template <typename ValueType>
 void apply(std::shared_ptr<const DefaultExecutor> exec,
            const settings<remove_complex<ValueType>>& settings,
@@ -65,7 +207,10 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
            batch::MultiVector<ValueType>* const x,
            batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
 {
-    GKO_NOT_IMPLEMENTED;
+    using hip_value_type = hip_type<ValueType>;
+    auto dispatcher = batch::solver::create_dispatcher<ValueType>(
+        kernel_caller<hip_value_type>(exec, settings), settings, mat, precon);
+    dispatcher.apply(b, x, logdata);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_APPLY_KERNEL);
diff --git a/reference/preconditioner/batch_scalar_jacobi.hpp b/reference/preconditioner/batch_scalar_jacobi.hpp
index 0ee09591e88..18b6aa20b14 100644
--- a/reference/preconditioner/batch_scalar_jacobi.hpp
+++ b/reference/preconditioner/batch_scalar_jacobi.hpp
@@ -25,7 +25,10 @@ class ScalarJacobi final {
     /**
      * The size of the work vector required in case of dynamic allocation.
      */
-    static int dynamic_work_size(const int nrows, int) { return nrows; }
+    static int dynamic_work_size(const int nrows, int)
+    {
+        return nrows * sizeof(ValueType);
+    }
 
     /**
      * Sets the input and generates the preconditioner by storing the inverse
diff --git a/test/solver/CMakeLists.txt b/test/solver/CMakeLists.txt
index 9b8cfcb97fa..b66ddd7cebc 100644
--- a/test/solver/CMakeLists.txt
+++ b/test/solver/CMakeLists.txt
@@ -1,5 +1,5 @@
 ginkgo_create_common_test(batch_bicgstab_kernels)
-ginkgo_create_common_test(batch_cg_kernels DISABLE_EXECUTORS cuda hip dpcpp)
+ginkgo_create_common_test(batch_cg_kernels DISABLE_EXECUTORS dpcpp)
 ginkgo_create_common_test(bicg_kernels)
 ginkgo_create_common_test(bicgstab_kernels)
 ginkgo_create_common_test(cb_gmres_kernels)