diff --git a/CMakeLists.txt b/CMakeLists.txt index c98cab48..87e263f3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,9 +44,18 @@ if ((NOT BUILD_TYPE STREQUAL RELEASE) AND (NOT BUILD_TYPE STREQUAL DEBUG)) message(FATAL_ERROR "Only Release or Debug is supported, got `${CMAKE_BUILD_TYPE}`") endif () +option(BUILD_NATIVE "Builds for the current systems CPU and GPU architecture." ON) + # setup some defaults flags for everything set(DEFAULT_DEBUG_FLAGS -O2 -fno-omit-frame-pointer) -set(DEFAULT_RELEASE_FLAGS -O3 -march=native) +set(DEFAULT_RELEASE_FLAGS -O3) +if (BUILD_NATIVE) + if(CMAKE_SYSTEM_PROCESSOR STREQUAL aarch64) + set(DEFAULT_RELEASE_FLAGS ${DEFAULT_RELEASE_FLAGS} -mcpu=native) + else() + set(DEFAULT_RELEASE_FLAGS ${DEFAULT_RELEASE_FLAGS} -march=native) + endif() +endif() macro(hint_flag FLAG DESCRIPTION) if (NOT DEFINED ${FLAG}) @@ -146,17 +155,15 @@ endif () include(cmake/register_models.cmake) # register out models +register_model(serial SERIAL SerialStream.cpp) register_model(omp OMP OMPStream.cpp) register_model(ocl OCL OCLStream.cpp) -register_model(std-data STD_DATA STDDataStream.cpp) -register_model(std-indices STD_INDICES STDIndicesStream.cpp) -register_model(std-ranges STD_RANGES STDRangesStream.cpp) +register_model(std STD STDStream.cpp) register_model(hip HIP HIPStream.cpp) register_model(cuda CUDA CUDAStream.cu) register_model(kokkos KOKKOS KokkosStream.cpp) register_model(sycl SYCL SYCLStream.cpp) -register_model(sycl2020-acc SYCL2020 SYCLStream2020.cpp) -register_model(sycl2020-usm SYCL2020 SYCLStream2020.cpp) +register_model(sycl2020 SYCL2020 SYCLStream2020.cpp) register_model(acc ACC ACCStream.cpp) # defining RAJA collides with the RAJA namespace so USE_RAJA register_model(raja USE_RAJA RAJAStream.cpp) diff --git a/src/Stream.h b/src/Stream.h index 45c144c3..f233f54a 100644 --- a/src/Stream.h +++ b/src/Stream.h @@ -7,14 +7,13 @@ #pragma once +#include +#include #include #include +#include "benchmark.h" -// Array values -#define startA (0.1) -#define startB (0.2) -#define startC (0.0) -#define startScalar (0.4) +using std::intptr_t; template class Stream @@ -31,9 +30,8 @@ class Stream virtual void nstream() = 0; virtual T dot() = 0; - // Copy memory between host and device - virtual void init_arrays(T initA, T initB, T initC) = 0; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) = 0; + // Set pointers to read from arrays + virtual void get_arrays(T const*& a, T const*& b, T const*& c) = 0; }; // Implementation specific device functions diff --git a/src/StreamModels.h b/src/StreamModels.h index 556beb4d..820c08a4 100644 --- a/src/StreamModels.h +++ b/src/StreamModels.h @@ -3,12 +3,8 @@ #if defined(CUDA) #include "CUDAStream.h" -#elif defined(STD_DATA) -#include "STDDataStream.h" -#elif defined(STD_INDICES) -#include "STDIndicesStream.h" -#elif defined(STD_RANGES) -#include "STDRangesStream.hpp" +#elif defined(STD) +#include "STDStream.h" #elif defined(TBB) #include "TBBStream.hpp" #elif defined(THRUST) @@ -31,71 +27,69 @@ #include "SYCLStream2020.h" #elif defined(OMP) #include "OMPStream.h" +#elif defined(SERIAL) +#include "SerialStream.h" #elif defined(FUTHARK) #include "FutharkStream.h" #endif -template -std::unique_ptr> make_stream(intptr_t array_size, int deviceIndex) { +template +std::unique_ptr> make_stream(Args... 
args) { #if defined(CUDA) // Use the CUDA implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(HIP) // Use the HIP implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(HC) // Use the HC implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(OCL) // Use the OpenCL implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(USE_RAJA) // Use the RAJA implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(KOKKOS) // Use the Kokkos implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); -#elif defined(STD_DATA) +#elif defined(STD) // Use the C++ STD data-oriented implementation - return std::make_unique>(array_size, deviceIndex); - -#elif defined(STD_INDICES) - // Use the C++ STD index-oriented implementation - return std::make_unique>(array_size, deviceIndex); - -#elif defined(STD_RANGES) - // Use the C++ STD ranges implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(TBB) // Use the C++20 implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(THRUST) // Use the Thrust implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(ACC) // Use the OpenACC implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(SYCL) || defined(SYCL2020) // Use the SYCL implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(OMP) // Use the OpenMP implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); + +#elif defined(SERIAL) + // Use the Serial implementation + return std::make_unique>(args...); #elif defined(FUTHARK) // Use the Futhark implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #else diff --git a/src/acc/ACCStream.cpp b/src/acc/ACCStream.cpp index a346a39c..034336a4 100644 --- a/src/acc/ACCStream.cpp +++ b/src/acc/ACCStream.cpp @@ -8,11 +8,12 @@ #include "ACCStream.h" template -ACCStream::ACCStream(const intptr_t ARRAY_SIZE, int device) - : array_size{ARRAY_SIZE} +ACCStream::ACCStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC) + : array_size{array_size} { acc_device_t device_type = acc_get_device_type(); - acc_set_device_num(device, device_type); + acc_set_device_num(device_id, device_type); // Set up data region on device this->a = new T[array_size]; @@ -25,6 +26,8 @@ ACCStream::ACCStream(const intptr_t ARRAY_SIZE, int device) #pragma acc enter data create(a[0:array_size], b[0:array_size], c[0:array_size]) {} + + init_arrays(initA, initB, initC); } template @@ -62,7 +65,7 @@ void ACCStream::init_arrays(T initA, T initB, T initC) } template -void ACCStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +void ACCStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) { T *a = this->a; T *b = this->b; @@ -70,12 +73,9 @@ void ACCStream::read_arrays(std::vector& h_a, std::vector& h_b, std::ve #pragma acc update host(a[0:array_size], b[0:array_size], 
c[0:array_size]) {} - for (intptr_t i = 0; i < array_size; i++) - { - h_a[i] = a[i]; - h_b[i] = b[i]; - h_c[i] = c[i]; - } + h_a = a; + h_b = b; + h_c = c; } template diff --git a/src/acc/ACCStream.h b/src/acc/ACCStream.h index 1b053cb4..8345b785 100644 --- a/src/acc/ACCStream.h +++ b/src/acc/ACCStream.h @@ -19,32 +19,25 @@ template class ACCStream : public Stream { - struct A{ - T *a; - T *b; - T *c; - }; - - protected: // Size of arrays intptr_t array_size; - A aa; // Device side pointers - T *a; - T *b; - T *c; + T* restrict a; + T* restrict b; + T* restrict c; public: - ACCStream(const intptr_t, int); + ACCStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~ACCStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/benchmark.h b/src/benchmark.h new file mode 100644 index 00000000..95d675f7 --- /dev/null +++ b/src/benchmark.h @@ -0,0 +1,66 @@ +#pragma once + +#include +#include +#include +#include + +// Array values +#define startA (0.1) +#define startB (0.2) +#define startC (0.0) +#define startScalar (0.4) + +// Benchmark Identifier: identifies individual & groups of benchmarks: +// - Classic: 5 classic kernels: Copy, Mul, Add, Triad, Dot. +// - All: all kernels. +// - Individual kernels only. +enum class BenchId : int {Copy, Mul, Add, Triad, Nstream, Dot, Classic, All}; + +struct Benchmark { + BenchId id; + char const* label; + // Weight counts data elements of original arrays moved each loop iteration - used to calculate achieved BW: + // bytes = weight * sizeof(T) * ARRAY_SIZE -> bw = bytes / dur + size_t weight; + // Is it one of: Copy, Mul, Add, Triad, Dot? 
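// Illustrative note (not part of the original patch): a worked example of the
// weight field above, assuming double precision and the default array_size of
// 2^25 elements. Triad (weight 3) moves 3 * sizeof(double) * 33554432 B ≈ 0.81 GB
// per iteration, so an iteration taking 4 ms reports roughly 0.81 GB / 0.004 s ≈ 200 GB/s.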
+ bool classic = false; +}; + +// Benchmarks in the order in which - if present - should be run for validation purposes: +constexpr size_t num_benchmarks = 6; +constexpr std::array bench = { + Benchmark { .id = BenchId::Copy, .label = "Copy", .weight = 2, .classic = true }, + Benchmark { .id = BenchId::Mul, .label = "Mul", .weight = 2, .classic = true }, + Benchmark { .id = BenchId::Add, .label = "Add", .weight = 3, .classic = true }, + Benchmark { .id = BenchId::Triad, .label = "Triad", .weight = 3, .classic = true }, + Benchmark { .id = BenchId::Dot, .label = "Dot", .weight = 2, .classic = true }, + Benchmark { .id = BenchId::Nstream, .label = "Nstream", .weight = 4, .classic = false } +}; + +// Which buffers are needed by each benchmark +inline bool needs_buffer(BenchId id, char n) { + auto in = [n](std::initializer_list values) { + return std::find(values.begin(), values.end(), n) != values.end(); + }; + switch(id) { + case BenchId::All: return in({'a','b','c'}); + case BenchId::Classic: return in({'a','b','c'}); + case BenchId::Copy: return in({'a','c'}); + case BenchId::Mul: return in({'b','c'}); + case BenchId::Add: return in({'a','b','c'}); + case BenchId::Triad: return in({'a','b','c'}); + case BenchId::Dot: return in({'a','b'}); + case BenchId::Nstream: return in({'a','b','c'}); + default: + std::cerr << "Unknown benchmark" << std::endl; + abort(); + } +} + +// Returns true if the benchmark needs to be run: +inline bool run_benchmark(BenchId selection, Benchmark const& b) { + if (selection == BenchId::All) return true; + if (selection == BenchId::Classic && b.classic) return true; + return selection == b.id; +} diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index 249bab4f..d10e8eef 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -138,6 +138,7 @@ build_gcc() { local name="gcc_build" local cxx="-DCMAKE_CXX_COMPILER=${GCC_CXX:?}" + run_build $name "${GCC_CXX:?}" serial "$cxx" run_build $name "${GCC_CXX:?}" omp "$cxx" if [ "$MODEL" = "all" ] || [ "$MODEL" = "OMP" ]; then # sanity check that it at least runs @@ -151,9 +152,10 @@ build_gcc() { *) dpl_conditional_flags="-DFETCH_ONEDPL=ON -DFETCH_TBB=ON -DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;; esac # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here - run_build $name "${GCC_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" - run_build $name "${GCC_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" - run_build $name "${GCC_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" + run_build $name "${GCC_CXX:?}" std "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=DATA17" + # Requires GCC 14 and newer CMake for C++23 support + #run_build $name "${GCC_CXX:?}" std "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=DATA23" + run_build $name "${GCC_CXX:?}" std "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=INDICES" done run_build $name "${GCC_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB" @@ -219,6 +221,7 @@ build_gcc() { build_clang() { local name="clang_build" local cxx="-DCMAKE_CXX_COMPILER=${CLANG_CXX:?}" + run_build $name "${CLANG_CXX:?}" serial "$cxx" run_build $name "${CLANG_CXX:?}" omp "$cxx" if [ "${CLANG_OMP_OFFLOAD_AMD:-false}" != "false" ]; then @@ -249,9 +252,11 @@ build_clang() { OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" ;; *) dpl_conditional_flags="-DFETCH_ONEDPL=ON -DFETCH_TBB=ON 
-DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;; esac - run_build $name "${CLANG_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" - run_build $name "${CLANG_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" - # run_build $name "${CLANG_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" # not yet supported + run_build $name "${CLANG_CXX:?}" std "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=DATA17" + # Requires GCC 14 and newer CMake for C++23 support + # run_build $name "${CLANG_CXX:?}" std "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=DATA23" + # TODO: clang is too old + #run_build $name "${CLANG_CXX:?}" std "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=INDICES" done run_build $name "${CLANG_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB" @@ -268,14 +273,17 @@ build_clang() { build_nvhpc() { local name="nvhpc_build" local cxx="-DCMAKE_CXX_COMPILER=${NVHPC_NVCXX:?}" - run_build $name "${NVHPC_NVCXX:?}" std-data "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY" - run_build $name "${NVHPC_NVCXX:?}" std-indices "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY" + run_build $name "${NVHPC_NVCXX:?}" std "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DSTDIMPL=DATA17" + # Requires GCC 14 and newer CMake for C++23 support + # run_build $name "${NVHPC_NVCXX:?}" std "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DSTDIMPL=DATA23" + run_build $name "${NVHPC_NVCXX:?}" std "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DSTDIMPL=INDICES" run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=gpu -DTARGET_PROCESSOR=px -DCUDA_ARCH=$NV_ARCH_CCXY" run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=multicore -DTARGET_PROCESSOR=zen" } build_aocc() { + run_build aocc_build "${AOCC_CXX:?}" serial "-DCMAKE_CXX_COMPILER=${AOCC_CXX:?}" run_build aocc_build "${AOCC_CXX:?}" omp "-DCMAKE_CXX_COMPILER=${AOCC_CXX:?}" } @@ -309,6 +317,7 @@ build_icpc() { set -u local name="intel_build" local cxx="-DCMAKE_CXX_COMPILER=${ICPC_CXX:?}" + run_build $name "${ICPC_CXX:?}" serial "$cxx" run_build $name "${ICPC_CXX:?}" omp "$cxx" run_build $name "${ICPC_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" if check_cmake_ver "3.20.0"; then diff --git a/src/cuda/CUDAStream.cu b/src/cuda/CUDAStream.cu index 24d05794..4f5599a7 100644 --- a/src/cuda/CUDAStream.cu +++ b/src/cuda/CUDAStream.cu @@ -5,22 +5,80 @@ // source code #include "CUDAStream.h" +#include -[[noreturn]] inline void error(char const* file, int line, char const* expr, cudaError_t e) { - std::fprintf(stderr, "Error at %s:%d: %s (%d)\n %s\n", file, line, cudaGetErrorString(e), e, expr); +#if !defined(UNROLL_FACTOR) +#define UNROLL_FACTOR 4 +#endif + +[[noreturn]] inline void cuda_error(char const* file, int line, char const* expr, cudaError_t e) { + std::fprintf(stderr, "CUDA Error at %s:%d: %s (%d)\n %s\n", file, line, cudaGetErrorString(e), e, expr); + exit(e); +} + +[[noreturn]] inline void nvml_error(char const* file, int line, char const* expr, nvmlReturn_t e) { + std::fprintf(stderr, "NVML Error at %s:%d: %s (%d)\n %s\n", file, line, nvmlErrorString(e), e, expr); exit(e); } // The do while is there to make sure you remember to put a semi-colon after calling CU -#define CU(EXPR) do { auto __e = (EXPR); if (__e != cudaSuccess) error(__FILE__, __LINE__, #EXPR, __e); } while(false) +#define CU(EXPR) do { auto __e = (EXPR); if (__e != cudaSuccess) cuda_error(__FILE__, __LINE__, #EXPR, __e); } while(false) +#define NVML(EXPR) do { auto __e = (EXPR); if (__e != NVML_SUCCESS) 
nvml_error(__FILE__, __LINE__, #EXPR, __e); } while(false) // It is best practice to include __device__ and constexpr even though in BabelStream it only needs to be __host__ const __host__ __device__ constexpr size_t ceil_div(size_t a, size_t b) { return (a + b - 1) / b; } cudaStream_t stream; +template +T* alloc_device(const intptr_t array_size) { + size_t array_bytes = sizeof(T) * array_size; + T* p = nullptr; +#if defined(MANAGED) + CU(cudaMallocManaged(&p, array_bytes)); +#elif defined(PAGEFAULT) + p = (T*)malloc(array_bytes); +#else + CU(cudaMalloc(&p, array_bytes)); +#endif + if (p == nullptr) throw std::runtime_error("Failed to allocate device array"); + return p; +} + +template +T* alloc_host(const intptr_t array_size) { + size_t array_bytes = sizeof(T) * array_size; + T* p = nullptr; +#if defined(PAGEFAULT) + p = (T*)malloc(array_bytes); +#else + CU(cudaHostAlloc(&p, array_bytes, cudaHostAllocDefault)); +#endif + if (p == nullptr) throw std::runtime_error("Failed to allocate host array"); + return p; +} + +template +void free_device(T* p) { +#if defined(PAGEFAULT) + free(p); +#else + CU(cudaFree(p)); +#endif +} + +template +void free_host(T* p) { +#if defined(PAGEFAULT) + free(p); +#else + CU(cudaFreeHost(p)); +#endif +} + template -CUDAStream::CUDAStream(const intptr_t array_size, const int device_index) +CUDAStream::CUDAStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) : array_size(array_size) { // Set device @@ -33,201 +91,197 @@ CUDAStream::CUDAStream(const intptr_t array_size, const int device_index) CU(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); // Print out device information - std::cout << "Using CUDA device " << getDeviceName(device_index) << std::endl; - std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; -#if defined(MANAGED) - std::cout << "Memory: MANAGED" << std::endl; -#elif defined(PAGEFAULT) - std::cout << "Memory: PAGEFAULT" << std::endl; -#else - std::cout << "Memory: DEFAULT" << std::endl; -#endif - - // Query device for sensible dot kernel block count - cudaDeviceProp props; - CU(cudaGetDeviceProperties(&props, device_index)); - dot_num_blocks = props.multiProcessorCount * 4; + std::cout << "CUDA Driver: " << getDeviceDriver(device_index) << std::endl; + NVML(nvmlInit()); + cudaDeviceProp dprop; + CU(cudaGetDeviceProperties(&dprop, device_index)); + unsigned int memclock; + char mybus[16]; + sprintf(&mybus[0], "%04x:%02x:%02x.0", dprop.pciDomainID, dprop.pciBusID, dprop.pciDeviceID); + nvmlDevice_t nvmldev; + NVML(nvmlDeviceGetHandleByPciBusId(mybus, &nvmldev)); + NVML(nvmlDeviceGetClockInfo(nvmldev, NVML_CLOCK_MEM, &memclock)); + std::cout << "CUDA Device " << device_index << ": \"" + << getDeviceName(device_index) + << "\" " << dprop.multiProcessorCount << " SMs(" << dprop.major << "," << dprop.minor << ") " + << "Memory: " << memclock << " MHz x " << dprop.memoryBusWidth << "-bit = " + << 2.0*memclock*(dprop.memoryBusWidth/8)/1000.0 << " GB/s PEAK, ECC is " + << (dprop.ECCEnabled ? 
"ON" : "OFF") + << std::endl; + + // Print Memory allocation API used for buffers + std::cout << "Memory Allocation: "; + #if defined(MANAGED) + std::cout << "MANAGED"; + #elif defined(PAGEFAULT) + std::cout << "PAGEFAULT"; + #else + std::cout << "DEFAULT"; + #endif + std::cout << std::endl; + + std::cout << "Parallel for kernel config: thread blocks of size " << TBSIZE << std::endl; + + // Set sensible dot kernel block count + dot_num_blocks = dprop.multiProcessorCount * 4; // Size of partial sums for dot kernels size_t sums_bytes = sizeof(T) * dot_num_blocks; size_t array_bytes = sizeof(T) * array_size; size_t total_bytes = array_bytes * size_t(3) + sums_bytes; - std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE << std::endl; + std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE_DOT << std::endl; // Check buffers fit on the device - if (props.totalGlobalMem < total_bytes) - throw std::runtime_error("Device does not have enough memory for all 3 buffers"); + if (dprop.totalGlobalMem < total_bytes) { + std::cerr << "Requested array size of " << total_bytes * 1e-9 + << " GB exceeds memory capacity of " << dprop.totalGlobalMem * 1e-9 << " GB !" << std::endl; + throw std::runtime_error("Device does not have enough memory for all buffers"); + } - // Create device buffers -#if defined(MANAGED) - CU(cudaMallocManaged(&d_a, array_bytes)); - CU(cudaMallocManaged(&d_b, array_bytes)); - CU(cudaMallocManaged(&d_c, array_bytes)); - CU(cudaHostAlloc(&sums, sums_bytes, cudaHostAllocDefault)); -#elif defined(PAGEFAULT) - d_a = (T*)malloc(array_bytes); - d_b = (T*)malloc(array_bytes); - d_c = (T*)malloc(array_bytes); - sums = (T*)malloc(sums_bytes); -#else - CU(cudaMalloc(&d_a, array_bytes)); - CU(cudaMalloc(&d_b, array_bytes)); - CU(cudaMalloc(&d_c, array_bytes)); - CU(cudaHostAlloc(&sums, sums_bytes, cudaHostAllocDefault)); -#endif + // Allocate buffers: + d_a = alloc_device(array_size); + d_b = alloc_device(array_size); + d_c = alloc_device(array_size); + sums = alloc_host(dot_num_blocks); + + // Initialize buffers: + init_arrays(initA, initB, initC); } template CUDAStream::~CUDAStream() { CU(cudaStreamDestroy(stream)); + free_device(d_a); + free_device(d_b); + free_device(d_c); + free_host(sums); +} -#if defined(PAGEFAULT) - free(d_a); - free(d_b); - free(d_c); - free(sums); +template +__global__ void for_each_kernel(size_t array_size, size_t start, F f) { + constexpr int unroll_factor = UNROLL_FACTOR; +#if defined(GRID_STRIDE) + // Grid-stride loop + size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; + #pragma unroll(unroll_factor) + for (; i < array_size; i += (size_t)gridDim.x * blockDim.x) { + f(i); + } +#elif defined(BLOCK_STRIDE) + // Block-stride loop + size_t i = start * blockIdx.x + threadIdx.x; + const size_t e = min(array_size, start * (blockIdx.x + size_t(1)) + threadIdx.x); + #pragma unroll(unroll_factor) + for (; i < e; i += blockDim.x) { + f(i); + } #else - CU(cudaFree(d_a)); - CU(cudaFree(d_b)); - CU(cudaFree(d_c)); - CU(cudaFreeHost(sums)); + #error Must pick grid-stride or block-stride loop #endif } -template -__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC, size_t array_size) -{ - for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) { - a[i] = initA; - b[i] = initB; - c[i] = initC; +template +void for_each(size_t array_size, F f) { + static int threads_per_block = 0; + if 
(threads_per_block == 0) { + // Pick suitable thread block size for F: + int min_blocks_per_grid; + auto dyn_smem = [] __host__ __device__ (int){ return 0; }; + CU(cudaOccupancyMaxPotentialBlockSizeVariableSMem + (&min_blocks_per_grid, &threads_per_block, for_each_kernel, dyn_smem, 0)); + // Clamp to TBSIZE + threads_per_block = std::min(TBSIZE, threads_per_block); } + size_t blocks = ceil_div(array_size / UNROLL_FACTOR, threads_per_block); + size_t start = ceil_div(array_size, (size_t)blocks); + for_each_kernel<<>>(array_size, start, f); + CU(cudaPeekAtLastError()); + CU(cudaStreamSynchronize(stream)); } template void CUDAStream::init_arrays(T initA, T initB, T initC) { - size_t blocks = ceil_div(array_size, TBSIZE); - init_kernel<<>>(d_a, d_b, d_c, initA, initB, initC, array_size); - CU(cudaPeekAtLastError()); - CU(cudaStreamSynchronize(stream)); + for_each(array_size, [=,a=d_a,b=d_b,c=d_c] __device__ (size_t i) { + a[i] = initA; + b[i] = initB; + c[i] = initC; + }); } template -void CUDAStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +void CUDAStream::get_arrays(T const*& a, T const*& b, T const*& c) { - // Copy device memory to host -#if defined(PAGEFAULT) || defined(MANAGED) CU(cudaStreamSynchronize(stream)); - for (intptr_t i = 0; i < array_size; ++i) - { - a[i] = d_a[i]; - b[i] = d_b[i]; - c[i] = d_c[i]; - } +#if defined(PAGEFAULT) || defined(MANAGED) + // Unified memory: return pointers to device memory + a = d_a; + b = d_b; + c = d_c; #else - CU(cudaMemcpy(a.data(), d_a, a.size()*sizeof(T), cudaMemcpyDeviceToHost)); - CU(cudaMemcpy(b.data(), d_b, b.size()*sizeof(T), cudaMemcpyDeviceToHost)); - CU(cudaMemcpy(c.data(), d_c, c.size()*sizeof(T), cudaMemcpyDeviceToHost)); + // No Unified memory: copy data to the host + size_t nbytes = array_size * sizeof(T); + h_a.resize(array_size); + h_b.resize(array_size); + h_c.resize(array_size); + a = h_a.data(); + b = h_b.data(); + c = h_c.data(); + CU(cudaMemcpy(h_a.data(), d_a, nbytes, cudaMemcpyDeviceToHost)); + CU(cudaMemcpy(h_b.data(), d_b, nbytes, cudaMemcpyDeviceToHost)); + CU(cudaMemcpy(h_c.data(), d_c, nbytes, cudaMemcpyDeviceToHost)); #endif } -template -__global__ void copy_kernel(const T * a, T * c, size_t array_size) -{ - for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) { - c[i] = a[i]; - } -} - template void CUDAStream::copy() { - size_t blocks = ceil_div(array_size, TBSIZE); - copy_kernel<<>>(d_a, d_c, array_size); - CU(cudaPeekAtLastError()); - CU(cudaStreamSynchronize(stream)); -} - -template -__global__ void mul_kernel(T * b, const T * c, size_t array_size) -{ - const T scalar = startScalar; - for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) { - b[i] = scalar * c[i]; - } + for_each(array_size, [a=d_a,c=d_c] __device__ (size_t i) { + c[i] = a[i]; + }); } template void CUDAStream::mul() { - size_t blocks = ceil_div(array_size, TBSIZE); - mul_kernel<<>>(d_b, d_c, array_size); - CU(cudaPeekAtLastError()); - CU(cudaStreamSynchronize(stream)); -} - -template -__global__ void add_kernel(const T * a, const T * b, T * c, size_t array_size) -{ - for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) { - c[i] = a[i] + b[i]; - } + for_each(array_size, [b=d_b,c=d_c] __device__ (size_t i) { + b[i] = startScalar * c[i]; + }); } template void CUDAStream::add() { - size_t blocks = ceil_div(array_size, TBSIZE); 
- add_kernel<<>>(d_a, d_b, d_c, array_size); - CU(cudaPeekAtLastError()); - CU(cudaStreamSynchronize(stream)); -} - -template -__global__ void triad_kernel(T * a, const T * b, const T * c, size_t array_size) -{ - const T scalar = startScalar; - for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) { - a[i] = b[i] + scalar * c[i]; - } + for_each(array_size, [a=d_a,b=d_b,c=d_c] __device__ (size_t i) { + c[i] = a[i] + b[i]; + }); } template void CUDAStream::triad() { - size_t blocks = ceil_div(array_size, TBSIZE); - triad_kernel<<>>(d_a, d_b, d_c, array_size); - CU(cudaPeekAtLastError()); - CU(cudaStreamSynchronize(stream)); -} - -template -__global__ void nstream_kernel(T * a, const T * b, const T * c, size_t array_size) -{ - const T scalar = startScalar; - for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) { - a[i] += b[i] + scalar * c[i]; - } + for_each(array_size, [a=d_a,b=d_b,c=d_c] __device__ (size_t i) { + a[i] = b[i] + startScalar * c[i]; + }); } template void CUDAStream::nstream() { - size_t blocks = ceil_div(array_size, TBSIZE); - nstream_kernel<<>>(d_a, d_b, d_c, array_size); - CU(cudaPeekAtLastError()); - CU(cudaStreamSynchronize(stream)); + for_each(array_size, [a=d_a,b=d_b,c=d_c] __device__ (size_t i) { + a[i] += b[i] + startScalar * c[i]; + }); } template __global__ void dot_kernel(const T * a, const T * b, T* sums, size_t array_size) { - __shared__ T smem[TBSIZE]; + __shared__ T smem[TBSIZE_DOT]; T tmp = T(0.); const size_t tidx = threadIdx.x; - for (size_t i = tidx + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) { + size_t i = tidx + (size_t)blockDim.x * blockIdx.x; + for (; i < array_size; i += (size_t)gridDim.x * blockDim.x) { tmp += a[i] * b[i]; } smem[tidx] = tmp; @@ -244,7 +298,7 @@ __global__ void dot_kernel(const T * a, const T * b, T* sums, size_t array_size) template T CUDAStream::dot() { - dot_kernel<<>>(d_a, d_b, sums, array_size); + dot_kernel<<>>(d_a, d_b, sums, array_size); CU(cudaPeekAtLastError()); CU(cudaStreamSynchronize(stream)); diff --git a/src/cuda/CUDAStream.h b/src/cuda/CUDAStream.h index 4b4a1a3a..50e099dc 100644 --- a/src/cuda/CUDAStream.h +++ b/src/cuda/CUDAStream.h @@ -15,7 +15,8 @@ #define IMPLEMENTATION_STRING "CUDA" -#define TBSIZE 1024 +#define TBSIZE 256 +#define TBSIZE_DOT 1024 template class CUDAStream : public Stream @@ -25,27 +26,31 @@ class CUDAStream : public Stream intptr_t array_size; // Host array for partial sums for dot kernel - T *sums; + T* sums; // Device side pointers to arrays - T *d_a; - T *d_b; - T *d_c; + T* d_a; + T* d_b; + T* d_c; + + // If UVM is disabled, host arrays for verification purposes + std::vector h_a, h_b, h_c; // Number of blocks for dot kernel intptr_t dot_num_blocks; public: - CUDAStream(const intptr_t, const int); + CUDAStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~CUDAStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T 
const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/cuda/model.cmake b/src/cuda/model.cmake index 7c1b0d6e..8314e1f2 100644 --- a/src/cuda/model.cmake +++ b/src/cuda/model.cmake @@ -9,6 +9,11 @@ register_flag_optional(MEM "Device memory mode: PAGEFAULT - shared memory, only host pointers allocated." "DEFAULT") +register_flag_optional(STRIDE "Kernel stride: GRID_STRIDE or BLOCK_STRIDE" "GRID_STRIDE") + +register_flag_optional(UNROLL_FACTOR "Kernel unroll factor:" "4") + + register_flag_required(CMAKE_CUDA_COMPILER "Path to the CUDA nvcc compiler") @@ -30,11 +35,17 @@ macro(setup) enable_language(CUDA) register_definitions(${MEM}) + register_definitions(${STRIDE}) # add -forward-unknown-to-host-compiler for compatibility reasons - set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler" "-arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS}) + # add --extended-lambda for device-lambdas + set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler" "-arch=${CUDA_ARCH}" + "--extended-lambda" "-DUNROLL_FACTOR=${UNROLL_FACTOR}" ${CUDA_EXTRA_FLAGS}) string(REPLACE ";" " " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}") + # Link against the NVIDIA Management Library for device information + register_link_library("nvidia-ml") + # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG # appended later wipe_gcc_style_optimisation_flags(CMAKE_CUDA_FLAGS_${BUILD_TYPE}) diff --git a/src/dpl_shim.h b/src/dpl_shim.h index 226693bd..9b8a7acc 100644 --- a/src/dpl_shim.h +++ b/src/dpl_shim.h @@ -29,12 +29,15 @@ T *alloc_raw(size_t size) { return sycl::malloc_shared(size, exe_policy.queue template void dealloc_raw(T *ptr) { sycl::free(ptr, exe_policy.queue()); } +#define WORKAROUND + #else // auto exe_policy = dpl::execution::seq; // auto exe_policy = dpl::execution::par; static constexpr auto exe_policy = dpl::execution::par_unseq; #define USE_STD_PTR_ALLOC_DEALLOC +#define WORKAROUND #endif diff --git a/src/futhark/FutharkStream.cpp b/src/futhark/FutharkStream.cpp index ebd3633b..392ff898 100644 --- a/src/futhark/FutharkStream.cpp +++ b/src/futhark/FutharkStream.cpp @@ -11,9 +11,10 @@ #include "FutharkStream.h" template -FutharkStream::FutharkStream(const int ARRAY_SIZE, int device) +FutharkStream::FutharkStream(BenchId bs, const intptr_t array_size, const int device, + T initA, T initB, T initC) + : array_size(array_size) { - this->array_size = ARRAY_SIZE; this->cfg = futhark_context_config_new(); this->device = "#" + std::to_string(device); #if defined(FUTHARK_BACKEND_cuda) || defined(FUTHARK_BACKEND_opencl) @@ -23,6 +24,7 @@ FutharkStream::FutharkStream(const int ARRAY_SIZE, int device) this->a = NULL; this->b = NULL; this->c = NULL; + init_arrays(initA, initB, initC); } template <> @@ -98,19 +100,31 @@ void FutharkStream::init_arrays(double initA, double initB, double initC } template <> -void FutharkStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { +void FutharkStream::get_arrays(float const*& a_, float const*& b_, float const*& c_) { + h_a.resize(array_size); + h_b.resize(array_size); + h_c.resize(array_size); futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->a, h_a.data()); futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->b, h_b.data()); futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->c, h_c.data()); futhark_context_sync(this->ctx); + a_ = h_a.data(); + b_ = h_b.data(); + c_ = h_c.data(); } template <> -void 
FutharkStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { +void FutharkStream::get_arrays(double const*& a_, double const*& b_, double const*& c_) { + h_a.resize(array_size); + h_b.resize(array_size); + h_c.resize(array_size); futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->a, h_a.data()); futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->b, h_b.data()); futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->c, h_c.data()); futhark_context_sync(this->ctx); + a_ = h_a.data(); + b_ = h_b.data(); + c_ = h_c.data(); } template <> diff --git a/src/futhark/FutharkStream.h b/src/futhark/FutharkStream.h index 6290e79a..eabdabbe 100644 --- a/src/futhark/FutharkStream.h +++ b/src/futhark/FutharkStream.h @@ -44,17 +44,21 @@ class FutharkStream : public Stream void* b; void* c; + // Host side arrays for verification + std::vector h_a, h_b, h_c; + public: - FutharkStream(const int, int); + FutharkStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~FutharkStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/futhark/model.cmake b/src/futhark/model.cmake index edd21fa6..d7b08795 100644 --- a/src/futhark/model.cmake +++ b/src/futhark/model.cmake @@ -44,6 +44,7 @@ macro(setup) elseif (${FUTHARK_BACKEND} STREQUAL "cuda") find_package(CUDA REQUIRED) register_link_library("nvrtc" "cuda" "cudart") + set(CMAKE_C_COMPILER "nvcc") else () message(FATAL_ERROR "Unsupported Futhark backend: ${FUTHARK_BACKEND}") endif() diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index ec02425a..e3878afd 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -25,7 +25,9 @@ void check_error(void) __host__ __device__ constexpr size_t ceil_div(size_t a, size_t b) { return (a + b - 1)/b; } template -HIPStream::HIPStream(const intptr_t ARRAY_SIZE, const int device_index) +HIPStream::HIPStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) + : array_size(array_size) { // Set device int count; @@ -47,13 +49,12 @@ HIPStream::HIPStream(const intptr_t ARRAY_SIZE, const int device_index) std::cout << "Memory: DEFAULT" << std::endl; #endif - array_size = ARRAY_SIZE; // Round dot_num_blocks up to next multiple of (TBSIZE * dot_elements_per_lane) dot_num_blocks = (array_size + (TBSIZE * dot_elements_per_lane - 1)) / (TBSIZE * dot_elements_per_lane); size_t array_bytes = sizeof(T); - array_bytes *= ARRAY_SIZE; - size_t total_bytes = array_bytes * 3; + array_bytes *= array_size; + size_t total_bytes = array_bytes * std::size_t{3}; // Allocate the host array for partial sums for dot kernels using hipHostMalloc. // This creates an array on the host which is visible to the device. 
However, it requires @@ -65,7 +66,7 @@ HIPStream::HIPStream(const intptr_t ARRAY_SIZE, const int device_index) // Check buffers fit on the device hipDeviceProp_t props; hipGetDeviceProperties(&props, 0); - if (props.totalGlobalMem < std::size_t{3}*ARRAY_SIZE*sizeof(T)) + if (props.totalGlobalMem < total_bytes) throw std::runtime_error("Device does not have enough memory for all 3 buffers"); // Create device buffers @@ -88,6 +89,8 @@ HIPStream::HIPStream(const intptr_t ARRAY_SIZE, const int device_index) hipMalloc(&d_c, array_bytes); check_error(); #endif + + init_arrays(initA, initB, initC); } @@ -127,24 +130,28 @@ void HIPStream::init_arrays(T initA, T initB, T initC) } template -void HIPStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +void HIPStream::get_arrays(T const*& a, T const*& b, T const*& c) { - - // Copy device memory to host + hipDeviceSynchronize(); #if defined(PAGEFAULT) || defined(MANAGED) - hipDeviceSynchronize(); - for (intptr_t i = 0; i < array_size; i++) - { - a[i] = d_a[i]; - b[i] = d_b[i]; - c[i] = d_c[i]; - } + // Unified memory: return pointers to device memory + a = d_a; + b = d_b; + c = d_c; #else - hipMemcpy(a.data(), d_a, a.size()*sizeof(T), hipMemcpyDeviceToHost); - check_error(); - hipMemcpy(b.data(), d_b, b.size()*sizeof(T), hipMemcpyDeviceToHost); - check_error(); - hipMemcpy(c.data(), d_c, c.size()*sizeof(T), hipMemcpyDeviceToHost); + // No Unified memory: copy data to the host + size_t nbytes = array_size * sizeof(T); + h_a.resize(array_size); + h_b.resize(array_size); + h_c.resize(array_size); + a = h_a.data(); + b = h_b.data(); + c = h_c.data(); + hipMemcpy(h_a.data(), d_a, nbytes, hipMemcpyDeviceToHost); + check_error(); + hipMemcpy(h_b.data(), d_b, nbytes, hipMemcpyDeviceToHost); + check_error(); + hipMemcpy(h_c.data(), d_c, nbytes, hipMemcpyDeviceToHost); check_error(); #endif } diff --git a/src/hip/HIPStream.h b/src/hip/HIPStream.h index 76ef7df4..a1c45802 100644 --- a/src/hip/HIPStream.h +++ b/src/hip/HIPStream.h @@ -48,20 +48,21 @@ class HIPStream : public Stream T *d_b; T *d_c; + // If UVM is disabled, host arrays for verification purposes + std::vector h_a, h_b, h_c; public: - - HIPStream(const intptr_t, const int); + HIPStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~HIPStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/kokkos/KokkosStream.cpp b/src/kokkos/KokkosStream.cpp index e49d5bcc..fcbdb7a7 100644 --- a/src/kokkos/KokkosStream.cpp +++ b/src/kokkos/KokkosStream.cpp @@ -8,21 +8,23 @@ #include "KokkosStream.hpp" template -KokkosStream::KokkosStream( - const intptr_t ARRAY_SIZE, const int device_index) - : array_size(ARRAY_SIZE) +KokkosStream::KokkosStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) + : array_size(array_size) { Kokkos::initialize(Kokkos::InitializationSettings().set_device_id(device_index)); - d_a = new 
Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_a"), ARRAY_SIZE); - d_b = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_b"), ARRAY_SIZE); - d_c = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_c"), ARRAY_SIZE); + d_a = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_a"), array_size); + d_b = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_b"), array_size); + d_c = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_c"), array_size); hm_a = new typename Kokkos::View::HostMirror(); hm_b = new typename Kokkos::View::HostMirror(); hm_c = new typename Kokkos::View::HostMirror(); *hm_a = create_mirror_view(*d_a); *hm_b = create_mirror_view(*d_b); *hm_c = create_mirror_view(*d_c); + + init_arrays(initA, initB, initC); } template @@ -47,18 +49,14 @@ void KokkosStream::init_arrays(T initA, T initB, T initC) } template -void KokkosStream::read_arrays( - std::vector& a, std::vector& b, std::vector& c) +void KokkosStream::get_arrays(T const*& a, T const*& b, T const*& c) { deep_copy(*hm_a, *d_a); deep_copy(*hm_b, *d_b); deep_copy(*hm_c, *d_c); - for(intptr_t ii = 0; ii < array_size; ++ii) - { - a[ii] = (*hm_a)(ii); - b[ii] = (*hm_b)(ii); - c[ii] = (*hm_c)(ii); - } + a = hm_a->data(); + b = hm_b->data(); + c = hm_c->data(); } template diff --git a/src/kokkos/KokkosStream.hpp b/src/kokkos/KokkosStream.hpp index 8e40119c..bc3ac3ee 100644 --- a/src/kokkos/KokkosStream.hpp +++ b/src/kokkos/KokkosStream.hpp @@ -22,27 +22,27 @@ class KokkosStream : public Stream intptr_t array_size; // Device side pointers to arrays - typename Kokkos::View* d_a; - typename Kokkos::View* d_b; - typename Kokkos::View* d_c; - typename Kokkos::View::HostMirror* hm_a; - typename Kokkos::View::HostMirror* hm_b; - typename Kokkos::View::HostMirror* hm_c; + typename Kokkos::View* d_a; + typename Kokkos::View* d_b; + typename Kokkos::View* d_c; + typename Kokkos::View::HostMirror* hm_a; + typename Kokkos::View::HostMirror* hm_b; + typename Kokkos::View::HostMirror* hm_c; public: - KokkosStream(const intptr_t, const int); + KokkosStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~KokkosStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays( - std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/kokkos/model.cmake b/src/kokkos/model.cmake index 7457eebd..2223c753 100644 --- a/src/kokkos/model.cmake +++ b/src/kokkos/model.cmake @@ -1,5 +1,5 @@ register_flag_optional(CMAKE_CXX_COMPILER - "Any CXX compiler that is supported by CMake detection and RAJA. + "Any CXX compiler that is supported by CMake detection and Kokkos. 
See https://github.com/kokkos/kokkos#primary-tested-compilers-on-x86-are" "c++") @@ -21,7 +21,7 @@ macro(setup) set(CMAKE_CXX_STANDARD 17) # Kokkos 4+ requires CXX >= 17 cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md - + message("KOKKOS_IN_PACKAGE=${KOKKOS_IN_PACKAGE}") if (EXISTS "${KOKKOS_IN_TREE}") message(STATUS "Build using in-tree Kokkos source at `${KOKKOS_IN_TREE}`") add_subdirectory(${KOKKOS_IN_TREE} ${CMAKE_BINARY_DIR}/kokkos) diff --git a/src/main.cpp b/src/main.cpp index ee091259..af9fac6c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -19,11 +19,12 @@ #define VERSION_STRING "5.0" #include "Stream.h" + #include "StreamModels.h" #include "Unit.h" // Default size of 2^25 -intptr_t ARRAY_SIZE = 33554432; +intptr_t array_size = 33554432; size_t num_times = 100; size_t deviceIndex = 0; bool use_float = false; @@ -33,42 +34,11 @@ Unit unit{Unit::Kind::MegaByte}; bool silence_errors = false; std::string csv_separator = ","; -// Benchmark Identifier: identifies individual & groups of benchmarks: -// - Classic: 5 classic kernels: Copy, Mul, Add, Triad, Dot. -// - All: all kernels. -// - Individual kernels only. -enum class BenchId : int {Copy, Mul, Add, Triad, Nstream, Dot, Classic, All}; - -struct Benchmark { - BenchId id; - char const* label; - // Weight counts data elements of original arrays moved each loop iteration - used to calculate achieved BW: - // bytes = weight * sizeof(T) * ARRAY_SIZE -> bw = bytes / dur - size_t weight; - // Is it one of: Copy, Mul, Add, Triad, Dot? - bool classic = false; -}; - -// Benchmarks in the order in which - if present - should be run for validation purposes: -constexpr size_t num_benchmarks = 6; -std::array bench = { - Benchmark { .id = BenchId::Copy, .label = "Copy", .weight = 2, .classic = true }, - Benchmark { .id = BenchId::Mul, .label = "Mul", .weight = 2, .classic = true }, - Benchmark { .id = BenchId::Add, .label = "Add", .weight = 3, .classic = true }, - Benchmark { .id = BenchId::Triad, .label = "Triad", .weight = 3, .classic = true }, - Benchmark { .id = BenchId::Dot, .label = "Dot", .weight = 2, .classic = true }, - Benchmark { .id = BenchId::Nstream, .label = "Nstream", .weight = 4, .classic = false } -}; - // Selected benchmarks to run: default is all 5 classic benchmarks. BenchId selection = BenchId::Classic; // Returns true if the benchmark needs to be run: -bool run_benchmark(Benchmark const& b) { - if (selection == BenchId::All) return true; - if (selection == BenchId::Classic && b.classic) return true; - return selection == b.id; -} +bool run_benchmark(Benchmark const& b) { return run_benchmark(selection, b); } // Benchmark run order // - Classic: runs each bench once in the order above, and repeats n times. @@ -174,8 +144,7 @@ std::vector> run_all(std::unique_ptr>& stream, T& } template -void check_solution(const size_t ntimes, std::vector& a, std::vector& b, std::vector& c, - T& sum); +void check_solution(const size_t ntimes, T const* a, T const* b, T const* c, T sum); // Generic run routine // Runs the kernel(s) and prints output. 
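// Illustrative note (not part of the original patch): every Stream constructor now
// receives the selected BenchId, and benchmark.h adds a needs_buffer() helper, so a
// backend could allocate only the arrays the chosen benchmark touches. No
// implementation in this excerpt does so yet; a hypothetical sketch for a CUDA-style
// backend constructor would be:
//
//   if (needs_buffer(bs, 'a')) d_a = alloc_device<T>(array_size);
//   if (needs_buffer(bs, 'b')) d_b = alloc_device<T>(array_size);
//   if (needs_buffer(bs, 'c')) d_c = alloc_device<T>(array_size);
//
// Copy (weight 2), for instance, only reads a and writes c, so b could be skipped.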
@@ -186,7 +155,7 @@ void run() // Formatting utilities: auto fmt_bw = [&](size_t weight, double dt) { - return unit.fmt((weight * sizeof(T) * ARRAY_SIZE)/dt); + return unit.fmt((weight * sizeof(T) * array_size)/dt); }; auto fmt_csv_header = [] { std::cout @@ -244,47 +213,44 @@ void run() } std::cout << " "; } - std::cout << num_times << " times" << std::endl; - std::cout << "Number of elements: " << ARRAY_SIZE << std::endl; + std::cout << num_times << " times in "; + switch (order) { + case BenchOrder::Classic: std::cout << " Classic"; break; + case BenchOrder::Isolated: std::cout << " Isolated"; break; + default: std::cerr << "Error: Unknown order" << std::endl; abort(); + }; + std::cout << " order " << std::endl; + std::cout << "Number of elements: " << array_size << std::endl; std::cout << "Precision: " << (sizeof(T) == sizeof(float)? "float" : "double") << std::endl; - size_t nbytes = ARRAY_SIZE * sizeof(T); + size_t nbytes = array_size * sizeof(T); std::cout << std::setprecision(1) << std::fixed << "Array size: " << unit.fmt(nbytes) << " " << unit.str() << std::endl; std::cout << "Total size: " << unit.fmt(3.0*nbytes) << " " << unit.str() << std::endl; std::cout.precision(ss); } - std::unique_ptr> stream = make_stream(ARRAY_SIZE, deviceIndex); - auto initElapsedS = time([&] { stream->init_arrays(startA, startB, startC); }); + std::unique_ptr> stream + = make_stream(selection, array_size, deviceIndex, startA, startB, startC); // Result of the Dot kernel, if used. T sum{}; std::vector> timings = run_all(stream, sum); // Create & read host vectors: - std::vector a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE); - auto readElapsedS = time([&] { stream->read_arrays(a, b, c); }); + T const* a; + T const* b; + T const* c; + stream->get_arrays(a, b, c); check_solution(num_times, a, b, c, sum); - auto initBWps = fmt_bw(3, initElapsedS); - auto readBWps = fmt_bw(3, readElapsedS); if (output_as_csv) { fmt_csv_header(); - fmt_csv("Init", 1, ARRAY_SIZE, sizeof(T), initBWps, initElapsedS, initElapsedS, initElapsedS); - fmt_csv("Read", 1, ARRAY_SIZE, sizeof(T), readBWps, readElapsedS, readElapsedS, readElapsedS); } else { - std::cout << "Init: " - << std::setw(7) - << initElapsedS << " s (=" << initBWps << " " << unit.str() << "/s" << ")" << std::endl; - std::cout << "Read: " - << std::setw(7) - << readElapsedS << " s (=" << readBWps << " " << unit.str() << "/s" << ")" << std::endl; - std::cout << std::left << std::setw(12) << "Function" << std::left << std::setw(12) << (std::string(unit.str()) + "/s") @@ -307,15 +273,13 @@ void run() / (double)(num_times - 1); // Display results - fmt_result(bench[i].label, num_times, ARRAY_SIZE, sizeof(T), + fmt_result(bench[i].label, num_times, array_size, sizeof(T), fmt_bw(bench[i].weight, *minmax.first), *minmax.first, *minmax.second, average); } } template -void check_solution(const size_t num_times, - std::vector& a, std::vector& b, std::vector& c, T& sum) -{ +void check_solution(const size_t num_times, T const* a, T const* b, T const* c, T sum) { // Generate correct solution T goldA = startA; T goldB = startB; @@ -332,7 +296,7 @@ void check_solution(const size_t num_times, case BenchId::Add: goldC = goldA + goldB; break; case BenchId::Triad: goldA = goldB + scalar * goldC; break; case BenchId::Nstream: goldA += goldB + scalar * goldC; break; - case BenchId::Dot: goldS = goldA * goldB * T(ARRAY_SIZE); break; // This calculates the answer exactly + case BenchId::Dot: goldS = goldA * goldB * T(array_size); break; // This calculates the answer exactly default: 
std::cerr << "Unimplemented Check: " << bench[b].label << std::endl; abort(); @@ -364,39 +328,42 @@ void check_solution(const size_t num_times, abort(); } - // Error relative tolerance check + // Error relative tolerance check - a higher tolerance is used for reductions. size_t failed = 0; - T epsi = std::numeric_limits::epsilon() * T(100000.0); - auto check = [&](const char* name, T is, T should, T e, size_t i = size_t(-1)) { - if (e > epsi || std::isnan(e) || std::isnan(is)) { + T max_rel = std::numeric_limits::epsilon() * T(100.0); + T max_rel_dot = std::numeric_limits::epsilon() * T(10000000.0); + auto check = [&](const char* name, T is, T should, T mrel, size_t i = size_t(-1)) { + // Relative difference: + T diff = std::abs(is - should); + T abs_is = std::abs(is); + T abs_sh = std::abs(should); + T largest = std::max(abs_is, abs_sh); + T same = diff <= largest * mrel; + if (!same || std::isnan(is)) { ++failed; if (failed > 10) return; std::cerr << "FAILED validation of " << name; if (i != size_t(-1)) std::cerr << "[" << i << "]"; - std::cerr << ": " << is << " != " << should - << ", relative error=" << e << " > " << epsi << std::endl; + std::cerr << ": " << is << " (is) != " << should + << " (should)" << ", diff=" << diff << " > " + << largest * mrel << " (largest=" << largest + << ", max_rel=" << mrel << ")" << std::endl; } }; // Sum - T eS = std::fabs(sum - goldS) / std::fabs(goldS); for (size_t i = 0; i < num_benchmarks; ++i) { if (bench[i].id != BenchId::Dot) continue; if (run_benchmark(bench[i])) - check("sum", sum, goldS, eS); + check("sum", sum, goldS, max_rel_dot); break; } // Calculate the L^infty-norm relative error - for (size_t i = 0; i < a.size(); ++i) { - T vA = a[i], vB = b[i], vC = c[i]; - T eA = std::fabs(vA - goldA) / std::fabs(goldA); - T eB = std::fabs(vB - goldB) / std::fabs(goldB); - T eC = std::fabs(vC - goldC) / std::fabs(goldC); - - check("a", a[i], goldA, eA, i); - check("b", b[i], goldB, eB, i); - check("c", c[i], goldC, eC, i); + for (size_t i = 0; i < array_size; ++i) { + check("a", a[i], goldA, max_rel, i); + check("b", b[i], goldB, max_rel, i); + check("c", c[i], goldC, max_rel, i); } if (failed > 0 && !silence_errors) @@ -442,13 +409,11 @@ void parseArguments(int argc, char *argv[]) else if (!std::string("--arraysize").compare(argv[i]) || !std::string("-s").compare(argv[i])) { - intptr_t array_size; if (++i >= argc || !parseInt(argv[i], &array_size) || array_size <= 0) { std::cerr << "Invalid array size." << std::endl; std::exit(EXIT_FAILURE); } - ARRAY_SIZE = array_size; } else if (!std::string("--numtimes").compare(argv[i]) || !std::string("-n").compare(argv[i])) @@ -510,12 +475,12 @@ void parseArguments(int argc, char *argv[]) { if (++i >= argc) { - std::cerr << "Expected benchmark order after --order. Options: \"classic\" (default), \"isolated\"." + std::cerr << "Expected benchmark order after --order. Options: \"Classic\" (default), \"Isolated\"." << std::endl; exit(EXIT_FAILURE); } auto key = std::string(argv[i]); - if (key == "isolated") + if (key == "Isolated") { order = BenchOrder::Isolated; } @@ -566,7 +531,7 @@ void parseArguments(int argc, char *argv[]) std::cout << " --float Use floats (rather than doubles)" << std::endl; std::cout << " -o --only NAME Only run one benchmark (see --print-names)" << std::endl; std::cout << " --print-names Prints all available benchmark names" << std::endl; - std::cout << " --order Benchmark run order: \"classic\" (default) or \"isolated\"." 
<< std::endl; + std::cout << " --order Benchmark run order: \"Classic\" (default) or \"Isolated\"." << std::endl; std::cout << " --csv Output as csv table" << std::endl; std::cout << " --megabytes Use MB=10^6 for bandwidth calculation (default)" << std::endl; std::cout << " --mibibytes Use MiB=2^20 for bandwidth calculation (default MB=10^6)" << std::endl; diff --git a/src/ocl/OCLStream.cpp b/src/ocl/OCLStream.cpp index c70a701d..fc1ae30c 100644 --- a/src/ocl/OCLStream.cpp +++ b/src/ocl/OCLStream.cpp @@ -100,8 +100,9 @@ std::string kernels{R"CLC( template -OCLStream::OCLStream(const intptr_t ARRAY_SIZE, const int device_index) - : array_size{ARRAY_SIZE} +OCLStream::OCLStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) + : array_size{array_size} { if (!cached) getDeviceList(); @@ -172,18 +173,20 @@ OCLStream::OCLStream(const intptr_t ARRAY_SIZE, const int device_index) // Check buffers fit on the device cl_ulong totalmem = device.getInfo(); cl_ulong maxbuffer = device.getInfo(); - if (maxbuffer < sizeof(T)*ARRAY_SIZE) + if (maxbuffer < sizeof(T)*array_size) throw std::runtime_error("Device cannot allocate a buffer big enough"); - if (totalmem < 3*sizeof(T)*ARRAY_SIZE) + if (totalmem < 3*sizeof(T)*array_size) throw std::runtime_error("Device does not have enough memory for all 3 buffers"); // Create buffers - d_a = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); - d_b = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); - d_c = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); + d_a = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * array_size); + d_b = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * array_size); + d_c = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * array_size); d_sum = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(T) * dot_num_groups); sums = std::vector(dot_num_groups); + + init_arrays(initA, initB, initC); } template @@ -277,11 +280,17 @@ void OCLStream::init_arrays(T initA, T initB, T initC) } template -void OCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +void OCLStream::get_arrays(T const*& a, T const*& b, T const*& c) { - cl::copy(queue, d_a, a.begin(), a.end()); - cl::copy(queue, d_b, b.begin(), b.end()); - cl::copy(queue, d_c, c.begin(), c.end()); + h_a.resize(array_size); + h_b.resize(array_size); + h_c.resize(array_size); + a = h_a.data(); + b = h_b.data(); + c = h_c.data(); + cl::copy(queue, d_a, h_a.begin(), h_a.end()); + cl::copy(queue, d_b, h_b.begin(), h_b.end()); + cl::copy(queue, d_c, h_c.begin(), h_c.end()); } void getDeviceList(void) diff --git a/src/ocl/OCLStream.h b/src/ocl/OCLStream.h index e2366dad..e5405dde 100644 --- a/src/ocl/OCLStream.h +++ b/src/ocl/OCLStream.h @@ -42,6 +42,9 @@ class OCLStream : public Stream cl::Buffer d_c; cl::Buffer d_sum; + // Host-side arrays for verification + std::vector h_a, h_b, h_c; + cl::KernelFunctor *init_kernel; cl::KernelFunctor *copy_kernel; cl::KernelFunctor * mul_kernel; @@ -56,19 +59,19 @@ class OCLStream : public Stream public: - OCLStream(const intptr_t, const int); + OCLStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~OCLStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& 
b, std::vector& c) override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; // Populate the devices list diff --git a/src/omp/OMPStream.cpp b/src/omp/OMPStream.cpp index 09b749fd..f0389373 100644 --- a/src/omp/OMPStream.cpp +++ b/src/omp/OMPStream.cpp @@ -13,10 +13,10 @@ #endif template -OMPStream::OMPStream(const intptr_t ARRAY_SIZE, int device) +OMPStream::OMPStream(BenchId bs, const intptr_t array_size, const int device, + T initA, T initB, T initC) + : array_size(array_size) { - array_size = ARRAY_SIZE; - // Allocate on the host this->a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); this->b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); @@ -32,6 +32,7 @@ OMPStream::OMPStream(const intptr_t ARRAY_SIZE, int device) {} #endif + init_arrays(initA, initB, initC); } template @@ -77,7 +78,7 @@ void OMPStream::init_arrays(T initA, T initB, T initC) } template -void OMPStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +void OMPStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) { #ifdef OMP_TARGET_GPU @@ -87,15 +88,9 @@ void OMPStream::read_arrays(std::vector& h_a, std::vector& h_b, std::ve #pragma omp target update from(a[0:array_size], b[0:array_size], c[0:array_size]) {} #endif - - #pragma omp parallel for - for (intptr_t i = 0; i < array_size; i++) - { - h_a[i] = a[i]; - h_b[i] = b[i]; - h_c[i] = c[i]; - } - + h_a = a; + h_b = b; + h_c = c; } template diff --git a/src/omp/OMPStream.h b/src/omp/OMPStream.h index 40770005..fca4906c 100644 --- a/src/omp/OMPStream.h +++ b/src/omp/OMPStream.h @@ -29,16 +29,17 @@ class OMPStream : public Stream T *c; public: - OMPStream(const intptr_t, int); + OMPStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~OMPStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/raja/RAJAStream.cpp b/src/raja/RAJAStream.cpp index 6d6e8342..35fe6e8d 100644 --- a/src/raja/RAJAStream.cpp +++ b/src/raja/RAJAStream.cpp @@ -16,8 +16,9 @@ using RAJA::forall; #endif template -RAJAStream::RAJAStream(const intptr_t ARRAY_SIZE, const int device_index) - : array_size(ARRAY_SIZE), range(0, ARRAY_SIZE) +RAJAStream::RAJAStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) + : array_size(array_size), range(0, array_size) { #ifdef RAJA_TARGET_CPU @@ -25,11 +26,13 @@ RAJAStream::RAJAStream(const intptr_t ARRAY_SIZE, const int device_index) d_b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); d_c = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); #else - cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); - cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); - cudaMallocManaged((void**)&d_c, sizeof(T)*ARRAY_SIZE, 
cudaMemAttachGlobal); + cudaMallocManaged((void**)&d_a, sizeof(T)*array_size, cudaMemAttachGlobal); + cudaMallocManaged((void**)&d_b, sizeof(T)*array_size, cudaMemAttachGlobal); + cudaMallocManaged((void**)&d_c, sizeof(T)*array_size, cudaMemAttachGlobal); cudaDeviceSynchronize(); #endif + + init_arrays(initA, initB, initC); } template @@ -61,12 +64,11 @@ void RAJAStream::init_arrays(T initA, T initB, T initC) } template -void RAJAStream::read_arrays( - std::vector& a, std::vector& b, std::vector& c) +void RAJAStream::get_arrays(T const*& a, T const*& b, T const*& c) { - std::copy(d_a, d_a + array_size, a.data()); - std::copy(d_b, d_b + array_size, b.data()); - std::copy(d_c, d_c + array_size, c.data()); + a = d_a; + b = d_b; + c = d_c; } template diff --git a/src/raja/RAJAStream.hpp b/src/raja/RAJAStream.hpp index e98b0778..a2565ccc 100644 --- a/src/raja/RAJAStream.hpp +++ b/src/raja/RAJAStream.hpp @@ -50,19 +50,18 @@ class RAJAStream : public Stream T* d_c; public: - - RAJAStream(const intptr_t, const int); + RAJAStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~RAJAStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays( - std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/serial/SerialStream.cpp b/src/serial/SerialStream.cpp new file mode 100644 index 00000000..394a657c --- /dev/null +++ b/src/serial/SerialStream.cpp @@ -0,0 +1,132 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, Tom Lin +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include // For aligned_alloc +#include "SerialStream.h" + +#ifndef ALIGNMENT +#define ALIGNMENT (2*1024*1024) // 2MB +#endif + +template +SerialStream::SerialStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC) + : array_size{array_size} +{ + // Allocate on the host + this->a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); + this->b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); + this->c = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); + + init_arrays(initA, initB, initC); +} + +template +SerialStream::~SerialStream() +{ + free(a); + free(b); + free(c); +} + +template +void SerialStream::init_arrays(T initA, T initB, T initC) +{ + intptr_t array_size = this->array_size; + for (intptr_t i = 0; i < array_size; i++) + { + a[i] = initA; + b[i] = initB; + c[i] = initC; + } +} + +template +void SerialStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) +{ + h_a = a; + h_b = b; + h_c = c; +} + +template +void SerialStream::copy() +{ + for (intptr_t i = 0; i < array_size; i++) + { + c[i] = a[i]; + } +} + +template +void SerialStream::mul() +{ + const T scalar = startScalar; + for (intptr_t i = 0; i < array_size; i++) + { + b[i] = scalar * c[i]; + } +} + +template +void SerialStream::add() +{ + for (intptr_t i = 0; i < array_size; i++) + { + c[i] = a[i] + b[i]; + } +} + +template +void SerialStream::triad() +{ + const T scalar = startScalar; 
+ for (intptr_t i = 0; i < array_size; i++) + { + a[i] = b[i] + scalar * c[i]; + } +} + +template +void SerialStream::nstream() +{ + const T scalar = startScalar; + for (intptr_t i = 0; i < array_size; i++) + { + a[i] += b[i] + scalar * c[i]; + } +} + +template +T SerialStream::dot() +{ + T sum{}; + for (intptr_t i = 0; i < array_size; i++) + { + sum += a[i] * b[i]; + } + return sum; +} + + + +void listDevices(void) +{ + std::cout << "0: CPU" << std::endl; +} + +std::string getDeviceName(const int) +{ + return std::string("Device name unavailable"); +} + +std::string getDeviceDriver(const int) +{ + return std::string("Device driver unavailable"); +} +template class SerialStream; +template class SerialStream; diff --git a/src/serial/SerialStream.h b/src/serial/SerialStream.h new file mode 100644 index 00000000..9f5653d4 --- /dev/null +++ b/src/serial/SerialStream.h @@ -0,0 +1,42 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, Tom Lin +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include + +#include "Stream.h" + + +#define IMPLEMENTATION_STRING "Serial" + +template +class SerialStream : public Stream +{ + protected: + // Size of arrays + intptr_t array_size; + + // Device side pointers + T *a, *b, *c; + + public: + SerialStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); + ~SerialStream(); + + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; + + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); +}; diff --git a/src/serial/model.cmake b/src/serial/model.cmake new file mode 100644 index 00000000..d1818466 --- /dev/null +++ b/src/serial/model.cmake @@ -0,0 +1,4 @@ +macro(setup) + # Nothing to do +endmacro() + diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp deleted file mode 100644 index 3efeb1b3..00000000 --- a/src/std-data/STDDataStream.cpp +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
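With the interface change in this patch, host-resident models such as Serial, OMP and RAJA (CPU) publish their own buffers through get_arrays() instead of copying into std::vector as read_arrays() did. A minimal sketch of a caller follows, assuming the Stream<T> base class as modified by this patch; dot_of_results is an illustrative helper, not part of the benchmark:

```cpp
// Illustrative consumer of the new get_arrays() interface (not part of the patch).
// For host-resident models the returned pointers alias the implementation's own
// arrays, so verification can walk them without a copy; device models (e.g. OCL)
// stage into host-side vectors first and hand back pointers to those.
#include <cstdint>
#include "Stream.h"   // Stream<T> as modified by this patch

template <class T>
T dot_of_results(Stream<T>& stream, std::intptr_t n)
{
  T const *a = nullptr, *b = nullptr, *c = nullptr;
  stream.get_arrays(a, b, c);          // read-only views of a, b and c

  T sum{};
  for (std::intptr_t i = 0; i < n; i++)
    sum += a[i] * b[i] + c[i];         // any read-only reduction over the results
  return sum;
}
```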
-// Updated 2021 by University of Bristol -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#include "STDDataStream.h" - -template -STDDataStream::STDDataStream(const intptr_t ARRAY_SIZE, int device) - noexcept : array_size{ARRAY_SIZE}, - a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) -{ - std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; -#ifdef USE_ONEDPL - std::cout << "Using oneDPL backend: "; -#if ONEDPL_USE_DPCPP_BACKEND - std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; -#elif ONEDPL_USE_TBB_BACKEND - std::cout << "TBB " TBB_VERSION_STRING; -#elif ONEDPL_USE_OPENMP_BACKEND - std::cout << "OpenMP"; -#else - std::cout << "Default"; -#endif - std::cout << std::endl; -#endif -} - -template -STDDataStream::~STDDataStream() { - dealloc_raw(a); - dealloc_raw(b); - dealloc_raw(c); -} - -template -void STDDataStream::init_arrays(T initA, T initB, T initC) -{ - std::fill(exe_policy, a, a + array_size, initA); - std::fill(exe_policy, b, b + array_size, initB); - std::fill(exe_policy, c, c + array_size, initC); -} - -template -void STDDataStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) -{ - std::copy(a, a + array_size, h_a.begin()); - std::copy(b, b + array_size, h_b.begin()); - std::copy(c, c + array_size, h_c.begin()); -} - -template -void STDDataStream::copy() -{ - // c[i] = a[i] - std::copy(exe_policy, a, a + array_size, c); -} - -template -void STDDataStream::mul() -{ - // b[i] = scalar * c[i]; - std::transform(exe_policy, c, c + array_size, b, [scalar = startScalar](T ci){ return scalar*ci; }); -} - -template -void STDDataStream::add() -{ - // c[i] = a[i] + b[i]; - std::transform(exe_policy, a, a + array_size, b, c, std::plus()); -} - -template -void STDDataStream::triad() -{ - // a[i] = b[i] + scalar * c[i]; - std::transform(exe_policy, b, b + array_size, c, a, [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; }); -} - -template -void STDDataStream::nstream() -{ - // a[i] += b[i] + scalar * c[i]; - // Need to do in two stages with C++11 STL. - // 1: a[i] += b[i] - // 2: a[i] += scalar * c[i]; - std::transform(exe_policy, a, a + array_size, b, a, [](T ai, T bi){ return ai + bi; }); - std::transform(exe_policy, a, a + array_size, c, a, [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; }); -} - - -template -T STDDataStream::dot() -{ - // sum = 0; sum += a[i]*b[i]; return sum; - return std::transform_reduce(exe_policy, a, a + array_size, b, T{}); -} - -void listDevices(void) -{ - std::cout << "Listing devices is not supported by the Parallel STL" << std::endl; -} - -std::string getDeviceName(const int) -{ - return std::string("Device name unavailable"); -} - -std::string getDeviceDriver(const int) -{ - return std::string("Device driver unavailable"); -} -template class STDDataStream; -template class STDDataStream; diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h deleted file mode 100644 index d92864be..00000000 --- a/src/std-data/STDDataStream.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. 
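The data-oriented nstream deleted above needs two std::transform passes because the standard binary transform combines at most two input ranges; the unified STDStream later in this patch keeps that two-pass shape for DATA17 and fuses it via std::views::zip for DATA23. A standalone sketch of the two-pass form, assuming C++17 parallel algorithms are available (libstdc++ additionally needs TBB at link time):

```cpp
// Standalone sketch of the C++17 two-pass nstream: a[i] += b[i] + scalar * c[i]
// must be split, since std::transform accepts at most two input ranges.
#include <algorithm>
#include <execution>
#include <vector>
#include <cstdio>

int main()
{
  const std::size_t n = 1 << 20;
  const double scalar = 0.4;
  std::vector<double> a(n, 0.1), b(n, 0.2), c(n, 0.0);

  // pass 1: a[i] += b[i]
  std::transform(std::execution::par_unseq, a.begin(), a.end(), b.begin(), a.begin(),
                 [](double ai, double bi) { return ai + bi; });
  // pass 2: a[i] += scalar * c[i]
  std::transform(std::execution::par_unseq, a.begin(), a.end(), c.begin(), a.begin(),
                 [scalar](double ai, double ci) { return ai + scalar * ci; });

  std::printf("a[0] = %f\n", a[0]);
  return 0;
}
```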
-// Updated 2021 by University of Bristol -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#pragma once -#include "dpl_shim.h" - -#include -#include -#include "Stream.h" - -#define IMPLEMENTATION_STRING "STD (data-oriented)" - - -template -class STDDataStream : public Stream -{ - protected: - // Size of arrays - intptr_t array_size; - - // Device side pointers - T *a, *b, *c; - - public: - STDDataStream(const intptr_t, int) noexcept; - ~STDDataStream(); - - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; -}; - diff --git a/src/std-data/model.cmake b/src/std-data/model.cmake deleted file mode 100644 index 837d26bf..00000000 --- a/src/std-data/model.cmake +++ /dev/null @@ -1,53 +0,0 @@ - -register_flag_optional(CMAKE_CXX_COMPILER - "Any CXX compiler that is supported by CMake detection" - "c++") - -register_flag_optional(NVHPC_OFFLOAD - "Enable offloading support (via the non-standard `-stdpar=gpu`) for the new NVHPC SDK. - The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) - - Possible values are: - cc35 - Compile for compute capability 3.5 - cc50 - Compile for compute capability 5.0 - cc60 - Compile for compute capability 6.0 - cc62 - Compile for compute capability 6.2 - cc70 - Compile for compute capability 7.0 - cc72 - Compile for compute capability 7.2 - cc75 - Compile for compute capability 7.5 - cc80 - Compile for compute capability 8.0 - ccall - Compile for all supported compute capabilities" - "") - -register_flag_optional(USE_TBB - "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." - "OFF") - -register_flag_optional(USE_ONEDPL - "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends. - - Possible values are: - OPENMP - Implements policies using OpenMP. - CMake will handle any flags needed to enable OpenMP if the compiler supports it. - TBB - Implements policies using TBB. - TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. - DPCPP - Implements policies through SYCL2020. - This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." 
- "OFF") - -macro(setup) - set(CMAKE_CXX_STANDARD 17) - if (NVHPC_OFFLOAD) - set(NVHPC_FLAGS -stdpar=gpu -gpu=${NVHPC_OFFLOAD}) - # propagate flags to linker so that it links with the gpu stuff as well - register_append_cxx_flags(ANY ${NVHPC_FLAGS}) - register_append_link_flags(${NVHPC_FLAGS}) - endif () - if (USE_TBB) - register_link_library(TBB::tbb) - endif () - if (USE_ONEDPL) - register_definitions(USE_ONEDPL) - register_link_library(oneDPL) - endif () -endmacro() diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp deleted file mode 100644 index 473d93d0..00000000 --- a/src/std-indices/STDIndicesStream.cpp +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright (c) 2021 Tom Deakin and Tom Lin -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#include "STDIndicesStream.h" - -#ifndef ALIGNMENT -#define ALIGNMENT (2*1024*1024) // 2MB -#endif - -template -STDIndicesStream::STDIndicesStream(const intptr_t ARRAY_SIZE, int device) -noexcept : array_size{ARRAY_SIZE}, range(0, array_size), - a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) -{ - std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; -#ifdef USE_ONEDPL - std::cout << "Using oneDPL backend: "; -#if ONEDPL_USE_DPCPP_BACKEND - std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; -#elif ONEDPL_USE_TBB_BACKEND - std::cout << "TBB " TBB_VERSION_STRING; -#elif ONEDPL_USE_OPENMP_BACKEND - std::cout << "OpenMP"; -#else - std::cout << "Default"; -#endif - std::cout << std::endl; -#endif -} - -template -STDIndicesStream::~STDIndicesStream() { - dealloc_raw(a); - dealloc_raw(b); - dealloc_raw(c); -} - -template -void STDIndicesStream::init_arrays(T initA, T initB, T initC) -{ - std::fill(exe_policy, a, a + array_size, initA); - std::fill(exe_policy, b, b + array_size, initB); - std::fill(exe_policy, c, c + array_size, initC); -} - -template -void STDIndicesStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) -{ - std::copy(a, a + array_size, h_a.begin()); - std::copy(b, b + array_size, h_b.begin()); - std::copy(c, c + array_size, h_c.begin()); -} - -template -void STDIndicesStream::copy() -{ - // c[i] = a[i] - std::copy(exe_policy, a, a + array_size, c); -} - -template -void STDIndicesStream::mul() -{ - // b[i] = scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), b, [c = this->c, scalar = startScalar](intptr_t i) { - return scalar * c[i]; - }); -} - -template -void STDIndicesStream::add() -{ - // c[i] = a[i] + b[i]; - std::transform(exe_policy, range.begin(), range.end(), c, [a = this->a, b = this->b](intptr_t i) { - return a[i] + b[i]; - }); -} - -template -void STDIndicesStream::triad() -{ - // a[i] = b[i] + scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), a, [b = this->b, c = this->c, scalar = startScalar](intptr_t i) { - return b[i] + scalar * c[i]; - }); -} - -template -void STDIndicesStream::nstream() -{ - // a[i] += b[i] + scalar * c[i]; - // Need to do in two stages with C++11 STL. 
- // 1: a[i] += b[i] - // 2: a[i] += scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), a, [a = this->a, b = this->b, c = this->c, scalar = startScalar](intptr_t i) { - return a[i] + b[i] + scalar * c[i]; - }); -} - - -template -T STDIndicesStream::dot() -{ - // sum = 0; sum += a[i]*b[i]; return sum; - return std::transform_reduce(exe_policy, a, a + array_size, b, T{}); -} - -void listDevices(void) -{ - std::cout << "Listing devices is not supported by the Parallel STL" << std::endl; -} - -std::string getDeviceName(const int) -{ - return std::string("Device name unavailable"); -} - -std::string getDeviceDriver(const int) -{ - return std::string("Device driver unavailable"); -} -template class STDIndicesStream; -template class STDIndicesStream; diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h deleted file mode 100644 index 8a8f5de8..00000000 --- a/src/std-indices/STDIndicesStream.h +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (c) 2021 Tom Deakin and Tom Lin -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#pragma once -#include "dpl_shim.h" - -#include -#include -#include "Stream.h" - -#define IMPLEMENTATION_STRING "STD (index-oriented)" - -// A lightweight counting iterator which will be used by the STL algorithms -// NB: C++ <= 17 doesn't have this built-in, and it's only added later in ranges-v3 (C++2a) which this -// implementation doesn't target -template -class ranged { -public: - class iterator { - friend class ranged; - public: - using difference_type = N; - using value_type = N; - using pointer = const N*; - using reference = N; - using iterator_category = std::random_access_iterator_tag; - - // XXX This is not part of the iterator spec, it gets picked up by oneDPL if enabled. - // Without this, the DPL SYCL backend collects the iterator data on the host and copies to the device. - // This type is unused for any nother STL impl. 
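The hand-rolled `ranged` counting iterator being deleted here exists only because C++17 lacks a standard one; the replacement src/std implementation later in this patch uses std::views::iota (or thrust::counting_iterator under NVHPC) instead. A minimal sketch of the same index-oriented pattern over std::views::iota, assuming a C++20 toolchain whose parallel algorithms accept iota iterators, as this patch itself relies on:

```cpp
// Minimal sketch: index-oriented copy kernel driven by std::views::iota,
// the C++20 replacement for the hand-rolled `ranged` iterator deleted here.
#include <algorithm>
#include <cstdint>
#include <execution>
#include <ranges>
#include <vector>

int main()
{
  const std::intptr_t n = 1024;
  std::vector<double> a(n, 0.1), c(n);
  double* pa = a.data();
  double* pc = c.data();

  // c[i] = a[i], expressed over indices rather than over the data itself
  std::for_each_n(std::execution::par_unseq,
                  std::views::iota(std::intptr_t{0}).begin(), n,
                  [pa, pc](std::intptr_t i) { pc[i] = pa[i]; });
  return 0;
}
```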
- using is_passed_directly = std::true_type; - - reference operator *() const { return i_; } - iterator &operator ++() { ++i_; return *this; } - iterator operator ++(int) { iterator copy(*this); ++i_; return copy; } - - iterator &operator --() { --i_; return *this; } - iterator operator --(int) { iterator copy(*this); --i_; return copy; } - - iterator &operator +=(N by) { i_+=by; return *this; } - - value_type operator[](const difference_type &i) const { return i_ + i; } - - difference_type operator-(const iterator &it) const { return i_ - it.i_; } - iterator operator+(const value_type v) const { return iterator(i_ + v); } - - bool operator ==(const iterator &other) const { return i_ == other.i_; } - bool operator !=(const iterator &other) const { return i_ != other.i_; } - bool operator < (const iterator &other) const { return i_ < other.i_; } - - protected: - explicit iterator(N start) : i_ (start) {} - - private: - N i_; - }; - - [[nodiscard]] iterator begin() const { return begin_; } - [[nodiscard]] iterator end() const { return end_; } - ranged(N begin, N end) : begin_(begin), end_(end) {} -private: - iterator begin_; - iterator end_; -}; - -template -class STDIndicesStream : public Stream -{ - protected: - // Size of arrays - intptr_t array_size; - - // induction range - ranged range; - - // Device side pointers - T *a, *b, *c; - - public: - STDIndicesStream(const intptr_t, int) noexcept; - ~STDIndicesStream(); - - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; -}; - diff --git a/src/std-indices/model.cmake b/src/std-indices/model.cmake deleted file mode 100644 index 60ef575f..00000000 --- a/src/std-indices/model.cmake +++ /dev/null @@ -1,53 +0,0 @@ - -register_flag_optional(CMAKE_CXX_COMPILER - "Any CXX compiler that is supported by CMake detection" - "c++") - -register_flag_optional(NVHPC_OFFLOAD - "Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK. - The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) - - Possible values are: - cc35 - Compile for compute capability 3.5 - cc50 - Compile for compute capability 5.0 - cc60 - Compile for compute capability 6.0 - cc62 - Compile for compute capability 6.2 - cc70 - Compile for compute capability 7.0 - cc72 - Compile for compute capability 7.2 - cc75 - Compile for compute capability 7.5 - cc80 - Compile for compute capability 8.0 - ccall - Compile for all supported compute capabilities" - "") - -register_flag_optional(USE_TBB - "Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." - "OFF") - -register_flag_optional(USE_ONEDPL - "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends. - - Possible values are: - OPENMP - Implements policies using OpenMP. - CMake will handle any flags needed to enable OpenMP if the compiler supports it. - TBB - Implements policies using TBB. - TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. - DPCPP - Implements policies through SYCL2020. - This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." 
- "OFF") - -macro(setup) - set(CMAKE_CXX_STANDARD 17) - if (NVHPC_OFFLOAD) - set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD}) - # propagate flags to linker so that it links with the gpu stuff as well - register_append_cxx_flags(ANY ${NVHPC_FLAGS}) - register_append_link_flags(${NVHPC_FLAGS}) - endif () - if (USE_TBB) - register_link_library(TBB::tbb) - endif () - if (USE_ONEDPL) - register_definitions(USE_ONEDPL) - register_link_library(oneDPL) - endif () -endmacro() diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp deleted file mode 100644 index 8b7ada4b..00000000 --- a/src/std-ranges/STDRangesStream.cpp +++ /dev/null @@ -1,157 +0,0 @@ -// Copyright (c) 2020 Tom Deakin -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#include "STDRangesStream.hpp" -#include - -#ifndef ALIGNMENT -#define ALIGNMENT (2*1024*1024) // 2MB -#endif - -template -STDRangesStream::STDRangesStream(const intptr_t ARRAY_SIZE, int device) -noexcept : array_size{ARRAY_SIZE}, - a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) -{ - std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; -#ifdef USE_ONEDPL - std::cout << "Using oneDPL backend: "; -#if ONEDPL_USE_DPCPP_BACKEND - std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; -#elif ONEDPL_USE_TBB_BACKEND - std::cout << "TBB " TBB_VERSION_STRING; -#elif ONEDPL_USE_OPENMP_BACKEND - std::cout << "OpenMP"; -#else - std::cout << "Default"; -#endif - std::cout << std::endl; -#endif -} - -template -STDRangesStream::~STDRangesStream() { - dealloc_raw(a); - dealloc_raw(b); - dealloc_raw(c); -} - -template -void STDRangesStream::init_arrays(T initA, T initB, T initC) -{ - std::for_each_n( - exe_policy, - std::views::iota((intptr_t)0).begin(), array_size, // loop range - [=, this] (intptr_t i) { - a[i] = initA; - b[i] = initB; - c[i] = initC; - } - ); -} - -template -void STDRangesStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) -{ - // Element-wise copy. 
- std::copy(a, a + array_size, h_a.begin()); - std::copy(b, b + array_size, h_b.begin()); - std::copy(c, c + array_size, h_c.begin()); -} - -template -void STDRangesStream::copy() -{ - std::for_each_n( - exe_policy, - std::views::iota((intptr_t)0).begin(), array_size, - [=, this] (intptr_t i) { - c[i] = a[i]; - } - ); -} - -template -void STDRangesStream::mul() -{ - const T scalar = startScalar; - - std::for_each_n( - exe_policy, - std::views::iota((intptr_t)0).begin(), array_size, - [=, this] (intptr_t i) { - b[i] = scalar * c[i]; - } - ); -} - -template -void STDRangesStream::add() -{ - std::for_each_n( - exe_policy, - std::views::iota((intptr_t)0).begin(), array_size, - [=, this] (intptr_t i) { - c[i] = a[i] + b[i]; - } - ); -} - -template -void STDRangesStream::triad() -{ - const T scalar = startScalar; - - std::for_each_n( - exe_policy, - std::views::iota((intptr_t)0).begin(), array_size, - [=, this] (intptr_t i) { - a[i] = b[i] + scalar * c[i]; - } - ); -} - -template -void STDRangesStream::nstream() -{ - const T scalar = startScalar; - - std::for_each_n( - exe_policy, - std::views::iota((intptr_t)0).begin(), array_size, - [=, this] (intptr_t i) { - a[i] += b[i] + scalar * c[i]; - } - ); -} - -template -T STDRangesStream::dot() -{ - // sum += a[i] * b[i]; - return - std::transform_reduce( - exe_policy, - a, a + array_size, b, T{}); -} - -void listDevices(void) -{ - std::cout << "C++20 does not expose devices" << std::endl; -} - -std::string getDeviceName(const int) -{ - return std::string("Device name unavailable"); -} - -std::string getDeviceDriver(const int) -{ - return std::string("Device driver unavailable"); -} - -template class STDRangesStream; -template class STDRangesStream; diff --git a/src/std-ranges/STDRangesStream.hpp b/src/std-ranges/STDRangesStream.hpp deleted file mode 100644 index 51680c62..00000000 --- a/src/std-ranges/STDRangesStream.hpp +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2020 Tom Deakin -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#pragma once -#include "dpl_shim.h" - -#include -#include -#include "Stream.h" - -#define IMPLEMENTATION_STRING "STD C++ ranges" - -template -class STDRangesStream : public Stream -{ - protected: - // Size of arrays - intptr_t array_size; - - // Device side pointers - T *a, *b, *c; - - public: - STDRangesStream(const intptr_t, int) noexcept; - ~STDRangesStream(); - - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; - -}; - diff --git a/src/std-ranges/model.cmake b/src/std-ranges/model.cmake deleted file mode 100644 index d7fd6a8b..00000000 --- a/src/std-ranges/model.cmake +++ /dev/null @@ -1,68 +0,0 @@ - -register_flag_optional(CMAKE_CXX_COMPILER - "Any CXX compiler that is supported by CMake detection and supports C++20 Ranges" - "c++") - -register_flag_optional(NVHPC_OFFLOAD - "Enable offloading support (via the non-standard `-stdpar=gpu`) for the new NVHPC SDK. 
- The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) - - Possible values are: - cc35 - Compile for compute capability 3.5 - cc50 - Compile for compute capability 5.0 - cc60 - Compile for compute capability 6.0 - cc62 - Compile for compute capability 6.2 - cc70 - Compile for compute capability 7.0 - cc72 - Compile for compute capability 7.2 - cc75 - Compile for compute capability 7.5 - cc80 - Compile for compute capability 8.0 - ccall - Compile for all supported compute capabilities" - "") - -register_flag_optional(USE_TBB - "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." - "OFF") - -register_flag_optional(USE_ONEDPL - "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends. - - Possible values are: - OPENMP - Implements policies using OpenMP. - CMake will handle any flags needed to enable OpenMP if the compiler supports it. - TBB - Implements policies using TBB. - TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. - DPCPP - Implements policies through SYCL2020. - This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." - "OFF") - -macro(setup) - - # TODO this needs to eventually be removed when CMake adds proper C++20 support or at least update the flag used here - - # C++ 2a is too new, disable CMake's std flags completely: - set(CMAKE_CXX_EXTENSIONS OFF) - set(CMAKE_CXX_STANDARD_REQUIRED OFF) - unset(CMAKE_CXX_STANDARD) # drop any existing standard we have set by default - # and append our own: - register_append_cxx_flags(ANY -std=c++20) - if (NVHPC_OFFLOAD) - set(NVHPC_FLAGS -stdpar=gpu -gpu=${NVHPC_OFFLOAD}) - # propagate flags to linker so that it links with the gpu stuff as well - register_append_cxx_flags(ANY ${NVHPC_FLAGS}) - register_append_link_flags(${NVHPC_FLAGS}) - endif () - if (USE_TBB) - register_link_library(TBB::tbb) - endif () - if (USE_ONEDPL) - register_definitions(USE_ONEDPL) - register_link_library(oneDPL) - endif () -endmacro() - -macro(setup_target NAME) - if (USE_ONEDPL) - target_compile_features(${NAME} INTERFACE cxx_std_20) - target_compile_features(oneDPL INTERFACE cxx_std_20) - endif () -endmacro() diff --git a/src/std/STDStream.cpp b/src/std/STDStream.cpp new file mode 100644 index 00000000..ae3cd016 --- /dev/null +++ b/src/std/STDStream.cpp @@ -0,0 +1,209 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
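The new STDStream.cpp that begins here guards its index-oriented (INDICES) path with a workaround: under NVHPC's `-stdpar=gpu` it swaps std::views::iota for thrust::counting_iterator, and the file itself prints that this is a non-portable accommodation. A compact sketch of the substituted iterator in the same for_each_n pattern, assuming Thrust headers (shipped with the NVHPC SDK / CUDA Toolkit) are on the include path:

```cpp
// Sketch of the counting-iterator substitution used by the INDICES path:
// thrust::counting_iterator stands in for std::views::iota under -stdpar=gpu.
#include <thrust/iterator/counting_iterator.h>
#include <algorithm>
#include <cstdint>
#include <execution>
#include <vector>

int main()
{
  const std::intptr_t n = 1024;
  const double scalar = 0.4;
  std::vector<double> a(n), b(n, 0.2), c(n, 0.0);
  double *pa = a.data(), *pb = b.data(), *pc = c.data();

  // a[i] = b[i] + scalar * c[i]
  std::for_each_n(std::execution::par_unseq,
                  thrust::counting_iterator<std::intptr_t>(0), n,
                  [=](std::intptr_t i) { pa[i] = pb[i] + scalar * pc[i]; });
  return 0;
}
```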
+// Updated 2021 by University of Bristol +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "STDStream.h" +#include +#include + +#if defined(DATA23) || defined(INDICES) +#include +#endif + + // OneDPL workaround; TODO: remove this eventually +#include "dpl_shim.h" + +#ifdef INDICES +// NVHPC workaround: TODO: remove this eventually +#if defined(__NVCOMPILER) && defined(_NVHPC_STDPAR_GPU) +#define WORKAROUND +#include +auto counting_iter(intptr_t i) { return thrust::counting_iterator(i); } +auto counting_range(intptr_t b, intptr_t e) { + struct R { + thrust::counting_iterator b, e; + thrust::counting_iterator begin() { return b; } + thrust::counting_iterator end() { return e; } + }; + return R { .b = counting_iter(b), .e = counting_iter(e) }; +} +#else // NVHPC Workaround +auto counting_iter(intptr_t i) { return std::views::iota(i).begin(); } +auto counting_range(intptr_t b, intptr_t e) { return std::views::iota(b, e); } +#endif // NVHPC Workaround +#endif // INDICES + +template +STDStream::STDStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC) + noexcept : array_size{array_size}, + a(alloc_raw(array_size)), b(alloc_raw(array_size)), c(alloc_raw(array_size)) +{ + std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; +#ifdef USE_ONEDPL + std::cout << "Using oneDPL backend: "; +#if ONEDPL_USE_DPCPP_BACKEND + std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; +#elif ONEDPL_USE_TBB_BACKEND + std::cout << "TBB " TBB_VERSION_STRING; +#elif ONEDPL_USE_OPENMP_BACKEND + std::cout << "OpenMP"; +#else + std::cout << "Default"; +#endif + std::cout << std::endl; +#endif + +#ifdef WORKAROUND + std::cout << "Non-conforming implementation: requires non-portable workarounds to run STREAM" << std::endl; +#endif + init_arrays(initA, initB, initC); +} + +template +STDStream::~STDStream() { + dealloc_raw(a); + dealloc_raw(b); + dealloc_raw(c); +} + +template +void STDStream::init_arrays(T initA, T initB, T initC) +{ + std::fill_n(exe_policy, a, array_size, initA); + std::fill_n(exe_policy, b, array_size, initB); + std::fill_n(exe_policy, c, array_size, initC); +} + +template +void STDStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) +{ + h_a = a; + h_b = b; + h_c = c; +} + +template +void STDStream::copy() +{ + // c[i] = a[i] +#if defined(DATA17) || defined(DATA23) + std::copy(exe_policy, a, a + array_size, c); +#elif INDICES + std::for_each_n(exe_policy, counting_iter(0), array_size, [a=a,c=c](intptr_t i) { + c[i] = a[i]; + }); +#else + #error unimplemented +#endif +} + +template +void STDStream::mul() +{ + // b[i] = scalar * c[i]; +#if defined(DATA17) || defined(DATA23) + std::transform(exe_policy, c, c + array_size, b, [](T ci){ return startScalar*ci; }); +#elif INDICES + std::for_each_n(exe_policy, counting_iter(0), array_size, [b=b, c=c](intptr_t i) { + b[i] = startScalar * c[i]; + }); +#else + #error unimplemented +#endif +} + +template +void STDStream::add() +{ + // c[i] = a[i] + b[i]; +#if defined(DATA17) || defined(DATA23) + std::transform(exe_policy, a, a + array_size, b, c, std::plus()); +#elif INDICES + std::for_each_n(exe_policy, counting_iter(0), array_size, [a=a, b=b, c=c](intptr_t i) { + c[i] = a[i] + b[i]; + }); +#else + #error unimplemented +#endif +} + +template +void STDStream::triad() +{ + // a[i] = b[i] + scalar * c[i]; +#if defined(DATA17) || defined(DATA23) + std::transform(exe_policy, b, b + array_size, c, a, [scalar 
= startScalar](T bi, T ci){ return bi+scalar*ci; }); +#elif INDICES + std::for_each_n(exe_policy, counting_iter(0), array_size, [a=a, b=b, c=c](intptr_t i) { + a[i] = b[i] + startScalar * c[i]; + }); +#else + #error unimplemented +#endif +} + +template +void STDStream::nstream() +{ + // a[i] += b[i] + scalar * c[i]; +#if defined(DATA17) + // Need to do in two round-trips with C++17 STL. + // 1: a[i] += b[i] + // 2: a[i] += scalar * c[i]; + std::transform(exe_policy, a, a + array_size, b, a, [](T ai, T bi){ return ai + bi; }); + std::transform(exe_policy, a, a + array_size, c, a, [](T ai, T ci){ return ai + startScalar*ci; }); +#elif DATA23 + // Requires GCC 14.1 (Ubuntu 24.04): + auto as = std::ranges::subrange(a, a + array_size); + auto bs = std::ranges::subrange(b, b + array_size); + auto cs = std::ranges::subrange(c, c + array_size); + auto r = std::views::zip(as, bs, cs); + std::transform(exe_policy, r.begin(), r.end(), a, [](auto vs) { + auto [a, b, c] = vs; + return a + b + startScalar * c; + }); +#elif INDICES + std::for_each_n(exe_policy, counting_iter(0), array_size, [a=a,b=b,c=c](intptr_t i) { + a[i] += b[i] + startScalar * c[i]; + }); +#else + #error unimplemented +#endif +} + + +template +T STDStream::dot() +{ +#if defined(DATA17) || defined(DATA23) + // sum = 0; sum += a[i] * b[i]; return sum; + return std::transform_reduce(exe_policy, a, a + array_size, b, T{0}); +#elif INDICES + auto r = counting_range(intptr_t(0), array_size); + return std::transform_reduce(exe_policy, r.begin(), r.end(), T{0}, std::plus{}, [a=a, b=b](intptr_t i) { + return a[i] * b[i]; + }); +#else + #error unimplemented +#endif +} + +void listDevices(void) +{ + std::cout << "Listing devices is not supported by the Parallel STL" << std::endl; +} + +std::string getDeviceName(const int) +{ + return std::string("Device name unavailable"); +} + +std::string getDeviceDriver(const int) +{ + return std::string("Device driver unavailable"); +} +template class STDStream; +template class STDStream; diff --git a/src/std/STDStream.h b/src/std/STDStream.h new file mode 100644 index 00000000..254d68d7 --- /dev/null +++ b/src/std/STDStream.h @@ -0,0 +1,51 @@ +// Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. 
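Both dot() variants just above lean on std::transform_reduce; in the two-range form used by the DATA paths the element-wise operation defaults to multiplies and the reduction to plus, which is exactly the dot product. A small self-contained check of that default, assuming a C++17 standard library:

```cpp
// Self-contained check of the transform_reduce form used by dot():
// the two-range overload defaults to (plus, multiplies), i.e. sum += a[i] * b[i].
#include <cassert>
#include <execution>
#include <numeric>
#include <vector>

int main()
{
  std::vector<double> a(1000, 0.1), b(1000, 0.2);
  const double sum = std::transform_reduce(std::execution::par_unseq,
                                           a.begin(), a.end(), b.begin(), 0.0);
  assert(sum > 19.99 && sum < 20.01);   // 1000 * (0.1 * 0.2) == 20, up to rounding
  return 0;
}
```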
+// Updated 2021 by University of Bristol +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include +#include "Stream.h" + +#ifdef DATA17 +#define STDIMPL "DATA17" +#elif DATA23 +#define STDIMPL "DATA23" +#elif INDICES +#define STDIMPL "INDICES" +#else +#error unimplemented +#endif + +#define IMPLEMENTATION_STRING "STD (" STDIMPL ")" + + +template +class STDStream : public Stream +{ + protected: + // Size of arrays + intptr_t array_size; + + // Device side pointers + T *a, *b, *c; + + public: + STDStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC) noexcept; + ~STDStream(); + + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; + + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); +}; + diff --git a/src/std/model.cmake b/src/std/model.cmake new file mode 100644 index 00000000..d66e6c87 --- /dev/null +++ b/src/std/model.cmake @@ -0,0 +1,115 @@ +register_flag_optional(CMAKE_CXX_COMPILER + "Any CXX compiler that is supported by CMake detection" + "c++") + +register_flag_optional(NVHPC_OFFLOAD + "Enable offloading support (via the non-standard `-stdpar=gpu`) for the new NVHPC SDK. + The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) + + Possible values are: + cc35 - Compile for compute capability 3.5 + cc50 - Compile for compute capability 5.0 + cc60 - Compile for compute capability 6.0 + cc62 - Compile for compute capability 6.2 + cc70 - Compile for compute capability 7.0 + cc72 - Compile for compute capability 7.2 + cc75 - Compile for compute capability 7.5 + cc80 - Compile for compute capability 8.0 + cc90 - Compile for compute capability 8.0 + ccall - Compile for all supported compute capabilities + ccnative - Compiles for compute capability of current device" + "") + +register_flag_optional(NVHPC_MULTICORE + "Enable multicore parallelization with the NVHPC SDK." + "") + +register_flag_optional(ACPP_OFFLOAD + "Enable offloading support (via the non-standard `-acpp-stdpar`) for AdaptiveCpp." + "OFF") + +register_flag_optional(USE_TBB + "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." + "OFF") + +register_flag_optional(USE_ONEDPL + "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends. + + Possible values are: + OPENMP - Implements policies using OpenMP. + CMake will handle any flags needed to enable OpenMP if the compiler supports it. + TBB - Implements policies using TBB. + TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. + DPCPP - Implements policies through SYCL2020. + This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." + "OFF") + +register_flag_optional(STDIMPL + "Implementation strategy (default = DATA20): + DATA17 - Parallel algorithms over data (requires C++17). + DATA23 - (default) Parallel algorithms over data (requires C++20). + INDICES - Parallel algorithms over indices (requires C++20)." + "DATA20" +) + +register_flag_optional(AMDGPU_TARGET_OFFLOAD + "Enable offloading support (via the non-standard `-stdpar`) for + Clang/LLVM. 
The values are AMDGPU processors (https://www.llvm.org/docs/AMDGPUUsage.html#processors) + which will be passed in via `--offload-arch=` argument. + + Example values are: + gfx906 - Compile for Vega20 GPUs + gfx908 - Compile for CDNA1 GPUs + gfx90a - Compile for CDNA2 GPUs + gfx942 - Compile for CDNA3 GPUs + gfx1030 - Compile for RDNA2 NV21 GPUs + gfx1100 - Compile for RDNA3 NV31 GPUs" + "") + +macro(setup) + register_definitions(${STDIMPL}) + if (${STDIMPL} STREQUAL "DATA17") + set(CMAKE_CXX_STANDARD 17) + elseif (${STDIMPL} STREQUAL "INDICES") + set(CMAKE_CXX_STANDARD 20) + elseif (${STDIMPL} STREQUAL "DATA23") + set(CMAKE_CXX_STANDARD 23) + endif () + if (NVHPC_OFFLOAD) + set(NVHPC_FLAGS -stdpar=gpu -gpu=${NVHPC_OFFLOAD}) + # propagate flags to linker so that it links with the gpu stuff as well + register_append_cxx_flags(ANY ${NVHPC_FLAGS}) + register_append_link_flags(${NVHPC_FLAGS}) + endif () + if (NVHPC_MULTICORE) + set(NVHPC_FLAGS -stdpar=multicore) + # propagate flags to linker so that it links with the gpu stuff as well + register_append_cxx_flags(ANY ${NVHPC_FLAGS}) + register_append_link_flags(${NVHPC_FLAGS}) + endif () + + if (ACPP_OFFLOAD) + set(ACPP_FLAGS --acpp-stdpar) + register_append_cxx_flags(ANY ${ACPP_FLAGS}) + register_append_link_flags(${ACPP_FLAGS}) + endif () + if (USE_TBB) + if (FETCH_TBB) + register_link_library(TBB::tbb) + else () + register_link_library(tbb) + endif () + endif () + if (USE_ONEDPL) + register_definitions(USE_ONEDPL) + register_link_library(oneDPL) + endif () + if (AMDGPU_TARGET_OFFLOAD) + set(AMDGPU_TARGET_OFFLOAD_FLAGS --hipstdpar --offload-arch=${AMDGPU_TARGET_OFFLOAD}) + if (NOT AMDGPU_TARGET_OFFLOAD MATCHES "^gfx9" OR AMDGPU_INTERPOSE_ALLOC) + list(APPEND AMDGPU_TARGET_OFFLOAD_FLAGS --hipstdpar-interpose-alloc) + endif () + register_append_cxx_flags(ANY ${AMDGPU_TARGET_OFFLOAD_FLAGS}) + register_append_link_flags(--hipstdpar) + endif () +endmacro() diff --git a/src/sycl/SYCLStream.cpp b/src/sycl/SYCLStream.cpp index e99454e6..5c00211e 100644 --- a/src/sycl/SYCLStream.cpp +++ b/src/sycl/SYCLStream.cpp @@ -17,13 +17,13 @@ std::vector devices; void getDeviceList(void); template -SYCLStream::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index) +SYCLStream::SYCLStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) + : array_size(array_size) { if (!cached) getDeviceList(); - array_size = ARRAY_SIZE; - if (device_index >= devices.size()) throw std::runtime_error("Invalid device index"); device dev = devices[device_index]; @@ -79,6 +79,8 @@ SYCLStream::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index) d_b = new buffer(array_size); d_c = new buffer(array_size); d_sum = new buffer(dot_num_groups); + + init_arrays(initA, initB, initC); } template @@ -238,17 +240,14 @@ void SYCLStream::init_arrays(T initA, T initB, T initC) } template -void SYCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +void SYCLStream::get_arrays(T const*& a, T const*& b, T const*& c) { auto _a = d_a->template get_access(); auto _b = d_b->template get_access(); auto _c = d_c->template get_access(); - for (int i = 0; i < array_size; i++) - { - a[i] = _a[i]; - b[i] = _b[i]; - c[i] = _c[i]; - } + a = &_a[0]; + b = &_b[0]; + c = &_c[0]; } void getDeviceList(void) diff --git a/src/sycl/SYCLStream.h b/src/sycl/SYCLStream.h index 1a40242d..94c3c4e9 100644 --- a/src/sycl/SYCLStream.h +++ b/src/sycl/SYCLStream.h @@ -54,19 +54,19 @@ class SYCLStream : public Stream public: - SYCLStream(const intptr_t, 
const int); + SYCLStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~SYCLStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; // Populate the devices list diff --git a/src/sycl/model.cmake b/src/sycl/model.cmake index 3826c3c7..72aa7c40 100644 --- a/src/sycl/model.cmake +++ b/src/sycl/model.cmake @@ -9,22 +9,34 @@ register_flag_required(SYCL_COMPILER ONEAPI-ICPX - icpx as a standalone compiler ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh --include-intel-llvm`) DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm) - HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL)") + HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL) + AdaptiveCpp - AdaptiveCpp compiler (https://github.com/adaptivecpp/adaptivecpp)") register_flag_optional(SYCL_COMPILER_DIR "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`: ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first) ONEAPI-Clang - set to the directory that contains the Intel clang++ binary. - HIPSYCL|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" + AdaptiveCpp|HIPSYCL|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" "") macro(setup) set(CMAKE_CXX_STANDARD 17) + if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp") + set(adaptivecpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake/adaptivecpp) - if (${SYCL_COMPILER} STREQUAL "HIPSYCL") - + if (NOT EXISTS "${AdaptiveCpp_DIR}") + message(WARNING "Falling back to AdaptiveCpp < 0.9.0 CMake structure") + set(AdaptiveCpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake) + endif () + if (NOT EXISTS "${AdaptiveCpp_DIR}") + message(FATAL_ERROR "Can't find the appropriate CMake definitions for AdaptiveCpp") + endif () + # register_definitions(_GLIBCXX_USE_CXX11_ABI=0) + find_package(AdaptiveCpp CONFIG REQUIRED) + message(STATUS "ok") + elseif (${SYCL_COMPILER} STREQUAL "HIPSYCL") set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake/hipSYCL) if (NOT EXISTS "${hipSYCL_DIR}") @@ -38,7 +50,6 @@ macro(setup) # register_definitions(_GLIBCXX_USE_CXX11_ABI=0) find_package(hipSYCL CONFIG REQUIRED) message(STATUS "ok") - elseif (${SYCL_COMPILER} STREQUAL "DPCPP") set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++) include_directories(${SYCL_COMPILER_DIR}/include/sycl) @@ -62,7 +73,14 @@ endmacro() macro(setup_target NAME) - if (${SYCL_COMPILER} STREQUAL "HIPSYCL") + if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp") + # so AdaptiveCpp has this weird (and bad) CMake usage where they append their + # own custom integration header flags AFTER the target has been specified + # hence this macro here + add_sycl_to_target( + TARGET ${NAME} + SOURCES ${IMPL_SOURCES}) + elseif (${SYCL_COMPILER} STREQUAL "HIPSYCL") # so hipSYCL has this weird (and 
bad) CMake usage where they append their # own custom integration header flags AFTER the target has been specified # hence this macro here diff --git a/src/sycl2020-acc/SYCLStream2020.cpp b/src/sycl2020-acc/SYCLStream2020.cpp deleted file mode 100644 index 742be95b..00000000 --- a/src/sycl2020-acc/SYCLStream2020.cpp +++ /dev/null @@ -1,289 +0,0 @@ - -// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#include "SYCLStream2020.h" - -#include - -// Cache list of devices -bool cached = false; -std::vector devices; -void getDeviceList(void); - -template -SYCLStream::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index) -: array_size {ARRAY_SIZE}, - d_a {ARRAY_SIZE}, - d_b {ARRAY_SIZE}, - d_c {ARRAY_SIZE}, - d_sum {1} -{ - if (!cached) - getDeviceList(); - - if (device_index >= devices.size()) - throw std::runtime_error("Invalid device index"); - - sycl::device dev = devices[device_index]; - - // Print out device information - std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl; - std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; - - // Check device can support FP64 if needed - if (sizeof(T) == sizeof(double)) - { - if (!dev.has(sycl::aspect::fp64)) - { - throw std::runtime_error("Device does not support double precision, please use --float"); - } - } - - queue = std::make_unique(dev, sycl::async_handler{[&](sycl::exception_list l) - { - bool error = false; - for(auto e: l) - { - try - { - std::rethrow_exception(e); - } - catch (sycl::exception e) - { - std::cout << e.what(); - error = true; - } - } - if(error) - { - throw std::runtime_error("SYCL errors detected"); - } - }}); - - // No longer need list of devices - devices.clear(); - cached = true; - - -} - - -template -void SYCLStream::copy() -{ - queue->submit([&](sycl::handler &cgh) - { - sycl::accessor ka {d_a, cgh, sycl::read_only}; - sycl::accessor kc {d_c, cgh, sycl::write_only}; - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) - { - kc[idx] = ka[idx]; - }); - }); - queue->wait(); -} - -template -void SYCLStream::mul() -{ - const T scalar = startScalar; - queue->submit([&](sycl::handler &cgh) - { - sycl::accessor kb {d_b, cgh, sycl::write_only}; - sycl::accessor kc {d_c, cgh, sycl::read_only}; - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) - { - kb[idx] = scalar * kc[idx]; - }); - }); - queue->wait(); -} - -template -void SYCLStream::add() -{ - queue->submit([&](sycl::handler &cgh) - { - sycl::accessor ka {d_a, cgh, sycl::read_only}; - sycl::accessor kb {d_b, cgh, sycl::read_only}; - sycl::accessor kc {d_c, cgh, sycl::write_only}; - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) - { - kc[idx] = ka[idx] + kb[idx]; - }); - }); - queue->wait(); -} - -template -void SYCLStream::triad() -{ - const T scalar = startScalar; - queue->submit([&](sycl::handler &cgh) - { - sycl::accessor ka {d_a, cgh, sycl::write_only}; - sycl::accessor kb {d_b, cgh, sycl::read_only}; - sycl::accessor kc {d_c, cgh, sycl::read_only}; - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) - { - ka[idx] = kb[idx] + scalar * kc[idx]; - }); - }); - queue->wait(); -} - -template -void SYCLStream::nstream() -{ - const T scalar = startScalar; - - queue->submit([&](sycl::handler &cgh) - { - sycl::accessor ka {d_a, cgh}; - sycl::accessor kb {d_b, cgh, sycl::read_only}; - sycl::accessor kc {d_c, cgh, 
sycl::read_only}; - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) - { - ka[idx] += kb[idx] + scalar * kc[idx]; - }); - }); - queue->wait(); -} - -template -T SYCLStream::dot() -{ - - queue->submit([&](sycl::handler &cgh) - { - sycl::accessor ka {d_a, cgh, sycl::read_only}; - sycl::accessor kb {d_b, cgh, sycl::read_only}; - - cgh.parallel_for(sycl::range<1>{array_size}, - // Reduction object, to perform summation - initialises the result to zero - // hipSYCL doesn't sypport the initialize_to_identity property yet -#if defined(__HIPSYCL__) || defined(__OPENSYCL__) - sycl::reduction(d_sum. template get_access(cgh), sycl::plus()), -#else - sycl::reduction(d_sum, cgh, sycl::plus(), sycl::property::reduction::initialize_to_identity{}), -#endif - [=](sycl::id<1> idx, auto& sum) - { - sum += ka[idx] * kb[idx]; - }); - - }); - - // Get access on the host, and return a copy of the data (single number) - // This will block until the result is available, so no need to wait on the queue. - sycl::host_accessor result {d_sum, sycl::read_only}; - return result[0]; - -} - -template -void SYCLStream::init_arrays(T initA, T initB, T initC) -{ - queue->submit([&](sycl::handler &cgh) - { - sycl::accessor ka {d_a, cgh, sycl::write_only, sycl::no_init}; - sycl::accessor kb {d_b, cgh, sycl::write_only, sycl::no_init}; - sycl::accessor kc {d_c, cgh, sycl::write_only, sycl::no_init}; - - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) - { - ka[idx] = initA; - kb[idx] = initB; - kc[idx] = initC; - }); - }); - - queue->wait(); -} - -template -void SYCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) -{ - sycl::host_accessor _a {d_a, sycl::read_only}; - sycl::host_accessor _b {d_b, sycl::read_only}; - sycl::host_accessor _c {d_c, sycl::read_only}; - for (int i = 0; i < array_size; i++) - { - a[i] = _a[i]; - b[i] = _b[i]; - c[i] = _c[i]; - } -} - -void getDeviceList(void) -{ - // Ask SYCL runtime for all devices in system - devices = sycl::device::get_devices(); - cached = true; -} - -void listDevices(void) -{ - getDeviceList(); - - // Print device names - if (devices.size() == 0) - { - std::cerr << "No devices found." 
<< std::endl; - } - else - { - std::cout << std::endl; - std::cout << "Devices:" << std::endl; - for (int i = 0; i < devices.size(); i++) - { - std::cout << i << ": " << getDeviceName(i) << std::endl; - } - std::cout << std::endl; - } -} - -std::string getDeviceName(const int device) -{ - if (!cached) - getDeviceList(); - - std::string name; - - if (device < devices.size()) - { - name = devices[device].get_info(); - } - else - { - throw std::runtime_error("Error asking for name for non-existant device"); - } - - return name; -} - -std::string getDeviceDriver(const int device) -{ - if (!cached) - getDeviceList(); - - std::string driver; - - if (device < devices.size()) - { - driver = devices[device].get_info(); - } - else - { - throw std::runtime_error("Error asking for driver for non-existant device"); - } - - return driver; -} - -template class SYCLStream; -template class SYCLStream; diff --git a/src/sycl2020-acc/SYCLStream2020.h b/src/sycl2020-acc/SYCLStream2020.h deleted file mode 100644 index cd515f87..00000000 --- a/src/sycl2020-acc/SYCLStream2020.h +++ /dev/null @@ -1,54 +0,0 @@ - -// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#pragma once - -#include -#include - -#include "Stream.h" - -#include - -#define IMPLEMENTATION_STRING "SYCL2020 accessors" - -template -class SYCLStream : public Stream -{ - protected: - // Size of arrays - size_t array_size; - - // SYCL objects - // Queue is a pointer because we allow device selection - std::unique_ptr queue; - - // Buffers - sycl::buffer d_a; - sycl::buffer d_b; - sycl::buffer d_c; - sycl::buffer d_sum; - - public: - - SYCLStream(const intptr_t, const int); - ~SYCLStream() = default; - - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; - -}; - -// Populate the devices list -void getDeviceList(void); diff --git a/src/sycl2020-acc/model.cmake b/src/sycl2020-acc/model.cmake deleted file mode 100644 index 3826c3c7..00000000 --- a/src/sycl2020-acc/model.cmake +++ /dev/null @@ -1,73 +0,0 @@ - -register_flag_optional(CMAKE_CXX_COMPILER - "Any CXX compiler that is supported by CMake detection, this is used for host compilation when required by the SYCL compiler" - "c++") - -register_flag_required(SYCL_COMPILER - "Compile using the specified SYCL compiler implementation - Supported values are - ONEAPI-ICPX - icpx as a standalone compiler - ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh --include-intel-llvm`) - DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm) - HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL)") - -register_flag_optional(SYCL_COMPILER_DIR - "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`: - ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first) - ONEAPI-Clang - set to the directory that contains the Intel clang++ binary. 
- HIPSYCL|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" - "") - -macro(setup) - set(CMAKE_CXX_STANDARD 17) - - - if (${SYCL_COMPILER} STREQUAL "HIPSYCL") - - - set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake/hipSYCL) - - if (NOT EXISTS "${hipSYCL_DIR}") - message(WARNING "Falling back to hipSYCL < 0.9.0 CMake structure") - set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake) - endif () - if (NOT EXISTS "${hipSYCL_DIR}") - message(FATAL_ERROR "Can't find the appropriate CMake definitions for hipSYCL") - endif () - - # register_definitions(_GLIBCXX_USE_CXX11_ABI=0) - find_package(hipSYCL CONFIG REQUIRED) - message(STATUS "ok") - - elseif (${SYCL_COMPILER} STREQUAL "DPCPP") - set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++) - include_directories(${SYCL_COMPILER_DIR}/include/sycl) - register_append_cxx_flags(ANY -fsycl) - register_append_link_flags(-fsycl) - elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-ICPX") - set(CMAKE_CXX_COMPILER icpx) - set(CMAKE_C_COMPILER icx) - register_append_cxx_flags(ANY -fsycl) - register_append_link_flags(-fsycl) - elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-Clang") - set(CMAKE_CXX_COMPILER clang++) - set(CMAKE_C_COMPILER clang) - register_append_cxx_flags(ANY -fsycl) - register_append_link_flags(-fsycl) - else () - message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported") - endif () - -endmacro() - - -macro(setup_target NAME) - if (${SYCL_COMPILER} STREQUAL "HIPSYCL") - # so hipSYCL has this weird (and bad) CMake usage where they append their - # own custom integration header flags AFTER the target has been specified - # hence this macro here - add_sycl_to_target( - TARGET ${NAME} - SOURCES ${IMPL_SOURCES}) - endif () -endmacro() diff --git a/src/sycl2020-usm/SYCLStream2020.h b/src/sycl2020-usm/SYCLStream2020.h deleted file mode 100644 index 811c26ef..00000000 --- a/src/sycl2020-usm/SYCLStream2020.h +++ /dev/null @@ -1,54 +0,0 @@ - -// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#pragma once - -#include -#include - -#include "Stream.h" - -#include - -#define IMPLEMENTATION_STRING "SYCL2020 USM" - -template -class SYCLStream : public Stream -{ - protected: - // Size of arrays - size_t array_size; - - // SYCL objects - // Queue is a pointer because we allow device selection - std::unique_ptr queue; - - // Buffers - T *a{}; - T *b{}; - T *c{}; - T *sum{}; - - public: - - SYCLStream(const intptr_t, const int); - ~SYCLStream(); - - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; - -}; - -// Populate the devices list -void getDeviceList(void); diff --git a/src/sycl2020-usm/SYCLStream2020.cpp b/src/sycl2020/SYCLStream2020.cpp similarity index 62% rename from src/sycl2020-usm/SYCLStream2020.cpp rename to src/sycl2020/SYCLStream2020.cpp index e4c6ec27..56292046 100644 --- a/src/sycl2020-usm/SYCLStream2020.cpp +++ b/src/sycl2020/SYCLStream2020.cpp @@ -1,5 +1,5 @@ -// Copyright (c) 2015-23 Tom Deakin, Simon McIntosh-Smith, and Tom Lin +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, // University of Bristol HPC // // For full license terms please see the LICENSE file 
distributed with this @@ -15,8 +15,9 @@ std::vector devices; void getDeviceList(void); template -SYCLStream::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index) -: array_size {ARRAY_SIZE} +SYCLStream::SYCLStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) + : array_size(array_size) { if (!cached) getDeviceList(); @@ -60,24 +61,36 @@ SYCLStream::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index) } }}); + // Allocate memory + #ifdef SYCL2020ACC + d_a = sycl::buffer{array_size}; + d_b = sycl::buffer{array_size}; + d_c = sycl::buffer{array_size}; + d_sum = sycl::buffer{1}; + #elif SYCL2020USM a = sycl::malloc_shared(array_size, *queue); b = sycl::malloc_shared(array_size, *queue); c = sycl::malloc_shared(array_size, *queue); sum = sycl::malloc_shared(1, *queue); - + #else + #error unimplemented + #endif + // No longer need list of devices devices.clear(); cached = true; - + init_arrays(initA, initB, initC); } template SYCLStream::~SYCLStream() { - sycl::free(a, *queue); - sycl::free(b, *queue); - sycl::free(c, *queue); - sycl::free(sum, *queue); +#ifdef SYCL2020USM + sycl::free(a, *queue); + sycl::free(b, *queue); + sycl::free(c, *queue); + sycl::free(sum, *queue); +#endif } template @@ -85,7 +98,11 @@ void SYCLStream::copy() { queue->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::range<1>{array_size}, [=, c = this->c, a = this->a](sycl::id<1> idx) +#ifdef SYCL2020ACC + sycl::accessor a {d_a, cgh, sycl::read_only}; + sycl::accessor c {d_c, cgh, sycl::write_only}; +#endif + cgh.parallel_for(sycl::range<1>{array_size}, [c=c,a=a](sycl::id<1> idx) { c[idx] = a[idx]; }); @@ -99,7 +116,11 @@ void SYCLStream::mul() const T scalar = startScalar; queue->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::range<1>{array_size}, [=, b = this->b, c = this->c](sycl::id<1> idx) +#ifdef SYCL2020ACC + sycl::accessor b {d_b, cgh, sycl::write_only}; + sycl::accessor c {d_c, cgh, sycl::read_only}; +#endif + cgh.parallel_for(sycl::range<1>{array_size}, [=,b=b,c=c](sycl::id<1> idx) { b[idx] = scalar * c[idx]; }); @@ -112,7 +133,12 @@ void SYCLStream::add() { queue->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::range<1>{array_size}, [=, c = this->c, a = this->a, b = this->b](sycl::id<1> idx) +#ifdef SYCL2020ACC + sycl::accessor a {d_a, cgh, sycl::read_only}; + sycl::accessor b {d_b, cgh, sycl::read_only}; + sycl::accessor c {d_c, cgh, sycl::write_only}; +#endif + cgh.parallel_for(sycl::range<1>{array_size}, [c=c,a=a,b=b](sycl::id<1> idx) { c[idx] = a[idx] + b[idx]; }); @@ -126,7 +152,12 @@ void SYCLStream::triad() const T scalar = startScalar; queue->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::range<1>{array_size}, [=, a = this->a, b = this->b, c = this->c](sycl::id<1> idx) +#ifdef SYCL2020ACC + sycl::accessor a {d_a, cgh, sycl::write_only}; + sycl::accessor b {d_b, cgh, sycl::read_only}; + sycl::accessor c {d_c, cgh, sycl::read_only}; +#endif + cgh.parallel_for(sycl::range<1>{array_size}, [=,a=a,b=b,c=c](sycl::id<1> idx) { a[idx] = b[idx] + scalar * c[idx]; }); @@ -138,10 +169,14 @@ template void SYCLStream::nstream() { const T scalar = startScalar; - queue->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::range<1>{array_size}, [=, a = this->a, b = this->b, c = this->c](sycl::id<1> idx) +#if SYCL2020ACC + sycl::accessor a {d_a, cgh}; + sycl::accessor b {d_b, cgh, sycl::read_only}; + sycl::accessor c {d_c, cgh, sycl::read_only}; +#endif + cgh.parallel_for(sycl::range<1>{array_size}, 
[=,a=a,b=b,c=c](sycl::id<1> idx) { a[idx] += b[idx] + scalar * c[idx]; }); @@ -154,19 +189,22 @@ T SYCLStream::dot() { queue->submit([&](sycl::handler &cgh) { +#if SYCL2020ACC + sycl::accessor a {d_a, cgh, sycl::read_only}; + sycl::accessor b {d_b, cgh, sycl::read_only}; +#endif cgh.parallel_for(sycl::range<1>{array_size}, // Reduction object, to perform summation - initialises the result to zero - // hipSYCL doesn't sypport the initialize_to_identity property yet -#if defined(__HIPSYCL__) || defined(__OPENSYCL__) + // AdaptiveCpp doesn't sypport the initialize_to_identity property yet +#if defined(__HIPSYCL__) || defined(__OPENSYCL__) || defined(__ADAPTIVECPP__) sycl::reduction(sum, sycl::plus()), #else sycl::reduction(sum, sycl::plus(), sycl::property::reduction::initialize_to_identity{}), #endif - [a = this->a, b = this->b](sycl::id<1> idx, auto& sum) + [a=a,b=b](sycl::id<1> idx, auto& sum) { sum += a[idx] * b[idx]; }); - }); queue->wait(); return *sum; @@ -177,26 +215,32 @@ void SYCLStream::init_arrays(T initA, T initB, T initC) { queue->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::range<1>{array_size}, [=, a = this->a, b = this->b, c = this->c](sycl::id<1> idx) +#if SYCL2020ACC + sycl::accessor a {d_a, cgh, sycl::write_only, sycl::no_init}; + sycl::accessor b {d_b, cgh, sycl::write_only, sycl::no_init}; + sycl::accessor c {d_c, cgh, sycl::write_only, sycl::no_init}; +#endif + cgh.parallel_for(sycl::range<1>{array_size}, [=,a=a,b=b,c=c](sycl::id<1> idx) { a[idx] = initA; b[idx] = initB; c[idx] = initC; }); }); - queue->wait(); } template -void SYCLStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +void SYCLStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) { - for (int i = 0; i < array_size; i++) - { - h_a[i] = a[i]; - h_b[i] = b[i]; - h_c[i] = c[i]; - } +#if SYCL2020ACC + sycl::host_accessor a {d_a, sycl::read_only}; + sycl::host_accessor b {d_b, sycl::read_only}; + sycl::host_accessor c {d_c, sycl::read_only}; +#endif + h_a = &a[0]; + h_b = &b[0]; + h_c = &c[0]; } void getDeviceList(void) diff --git a/src/sycl2020/SYCLStream2020.h b/src/sycl2020/SYCLStream2020.h new file mode 100644 index 00000000..3b4e1ef0 --- /dev/null +++ b/src/sycl2020/SYCLStream2020.h @@ -0,0 +1,60 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include + +#include "Stream.h" + +#include + +#ifdef SYCL2020ACC +#define SYCLIMPL "Accessors" +#elif SYCL2020USM +#define SYCLIMPL "USM" +#else +#error unimplemented +#endif + +#define IMPLEMENTATION_STRING "SYCL2020 " SYCLIMPL + +template +class SYCLStream : public Stream +{ + protected: + // Size of arrays + size_t array_size; + + // SYCL objects + // Queue is a pointer because we allow device selection + std::unique_ptr queue; + + // Buffers + T *a, *b, *c, *sum{}; + sycl::buffer d_a, d_b, d_c, d_sum; + + public: + + SYCLStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); + ~SYCLStream(); + + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; + + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); +}; + +// Populate the devices list +void getDeviceList(void); diff --git a/src/sycl2020-usm/model.cmake b/src/sycl2020/model.cmake similarity index 65% rename 
from src/sycl2020-usm/model.cmake rename to src/sycl2020/model.cmake index 950daefd..d6452534 100644 --- a/src/sycl2020-usm/model.cmake +++ b/src/sycl2020/model.cmake @@ -9,23 +9,40 @@ register_flag_required(SYCL_COMPILER ONEAPI-ICPX - icpx as a standalone compiler ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh --include-intel-llvm`) DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm) - HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL)") + HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL) + AdaptiveCpp - AdaptiveCpp compiler (https://github.com/adaptivecpp/adaptivecpp)") register_flag_optional(SYCL_COMPILER_DIR "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`: ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first) ONEAPI-Clang - set to the directory that contains the Intel clang++ binary. - HIPSYCL|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" + AdaptiveCpp|HIPSYCL|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" "") +register_flag_optional(SYCL_ACCESS + "Data access method: + - ACCESSOR + - USM" + "ACCESSOR") macro(setup) set(CMAKE_CXX_STANDARD 17) + register_definitions(${SYCL_ACCESS}) + if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp") + set(adaptivecpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake/adaptivecpp) + if (NOT EXISTS "${AdaptiveCpp_DIR}") + message(WARNING "Falling back to AdaptiveCpp < 0.9.0 CMake structure") + set(AdaptiveCpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake) + endif () + if (NOT EXISTS "${AdaptiveCpp_DIR}") + message(FATAL_ERROR "Can't find the appropriate CMake definitions for AdaptiveCpp") + endif () - if (${SYCL_COMPILER} STREQUAL "HIPSYCL") - - + # register_definitions(_GLIBCXX_USE_CXX11_ABI=0) + find_package(AdaptiveCpp CONFIG REQUIRED) + message(STATUS "ok") + elseif (${SYCL_COMPILER} STREQUAL "HIPSYCL") set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake/hipSYCL) if (NOT EXISTS "${hipSYCL_DIR}") @@ -39,7 +56,6 @@ macro(setup) # register_definitions(_GLIBCXX_USE_CXX11_ABI=0) find_package(hipSYCL CONFIG REQUIRED) message(STATUS "ok") - elseif (${SYCL_COMPILER} STREQUAL "DPCPP") set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++) include_directories(${SYCL_COMPILER_DIR}/include/sycl) @@ -63,7 +79,14 @@ endmacro() macro(setup_target NAME) - if (${SYCL_COMPILER} STREQUAL "HIPSYCL") + if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp") + # so AdaptiveCpp has this weird (and bad) CMake usage where they append their + # own custom integration header flags AFTER the target has been specified + # hence this macro here + add_sycl_to_target( + TARGET ${NAME} + SOURCES ${IMPL_SOURCES}) + elseif (${SYCL_COMPILER} STREQUAL "HIPSYCL") # so hipSYCL has this weird (and bad) CMake usage where they append their # own custom integration header flags AFTER the target has been specified # hence this macro here diff --git a/src/tbb/TBBStream.cpp b/src/tbb/TBBStream.cpp index 75af6141..01508022 100644 --- a/src/tbb/TBBStream.cpp +++ b/src/tbb/TBBStream.cpp @@ -20,15 +20,16 @@ #endif template -TBBStream::TBBStream(const intptr_t ARRAY_SIZE, int device) - : partitioner(), range(0, (size_t)ARRAY_SIZE), +TBBStream::TBBStream(BenchId bs, const intptr_t array_size, const int device, + T initA, T initB, T initC) + : partitioner(), range(0, (size_t)array_size), #ifdef 
USE_VECTOR - a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) + a(array_size), b(array_size), c(array_size) #else - array_size(ARRAY_SIZE), - a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), - b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), - c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)) + array_size(array_size), + a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * array_size)), + b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * array_size)), + c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * array_size)) #endif { if(device != 0){ @@ -36,6 +37,8 @@ TBBStream::TBBStream(const intptr_t ARRAY_SIZE, int device) } std::cout << "Using TBB partitioner: " PARTITIONER_NAME << std::endl; std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; + + init_arrays(initA, initB, initC); } @@ -54,12 +57,17 @@ void TBBStream::init_arrays(T initA, T initB, T initC) } template -void TBBStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +void TBBStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) { - // Element-wise copy. - std::copy(BEGIN(a), END(a), h_a.begin()); - std::copy(BEGIN(b), END(b), h_b.begin()); - std::copy(BEGIN(c), END(c), h_c.begin()); +#ifdef USE_VECTOR + h_a = a.data(); + h_b = b.data(); + h_c = c.data(); +#else + h_a = a; + h_b = b; + h_c = c; +#endif } template diff --git a/src/tbb/TBBStream.hpp b/src/tbb/TBBStream.hpp index 80f11c17..0a73e892 100644 --- a/src/tbb/TBBStream.hpp +++ b/src/tbb/TBBStream.hpp @@ -31,7 +31,6 @@ using tbb_partitioner = tbb::auto_partitioner; #define PARTITIONER_NAME "auto_partitioner" #endif - template class TBBStream : public Stream { @@ -48,17 +47,17 @@ class TBBStream : public Stream #endif public: - TBBStream(const intptr_t, int); + TBBStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~TBBStream() = default; - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; - diff --git a/src/thrust/ThrustStream.cu b/src/thrust/ThrustStream.cu index 84b27b8e..321470b8 100644 --- a/src/thrust/ThrustStream.cu +++ b/src/thrust/ThrustStream.cu @@ -19,7 +19,8 @@ static inline void synchronise() } template -ThrustStream::ThrustStream(const intptr_t array_size, int device) +ThrustStream::ThrustStream(BenchId bs, const intptr_t array_size, const int device, + T initA, T initB, T initC) : array_size{array_size}, a(array_size), b(array_size), c(array_size) { std::cout << "Using CUDA device: " << getDeviceName(device) << std::endl; std::cout << "Driver: " << getDeviceDriver(device) << std::endl; @@ -36,8 +37,6 @@ ThrustStream::ThrustStream(const intptr_t array_size, int device) std::cout << "Thrust backend: TBB" << std::endl; #elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CPP std::cout << "Thrust backend: CPP" << std::endl; -#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_TBB - std::cout << "Thrust backend: TBB" << std::endl; #else #if defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_HIP @@ -48,6 
+47,7 @@ ThrustStream::ThrustStream(const intptr_t array_size, int device) #endif + init_arrays(initA, initB, initC); } template @@ -60,11 +60,23 @@ void ThrustStream::init_arrays(T initA, T initB, T initC) } template -void ThrustStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) -{ +void ThrustStream::get_arrays(T const*& a_, T const*& b_, T const*& c_) +{ + #if defined(MANAGED) + a_ = &*a.data(); + b_ = &*b.data(); + c_ = &*c.data(); + #else + h_a.resize(array_size); + h_b.resize(array_size); + h_c.resize(array_size); thrust::copy(a.begin(), a.end(), h_a.begin()); thrust::copy(b.begin(), b.end(), h_b.begin()); thrust::copy(c.begin(), c.end(), h_c.begin()); + a_ = h_a.data(); + b_ = h_b.data(); + c_ = h_c.data(); + #endif } template diff --git a/src/thrust/ThrustStream.h b/src/thrust/ThrustStream.h index b0acd80f..676ecaeb 100644 --- a/src/thrust/ThrustStream.h +++ b/src/thrust/ThrustStream.h @@ -26,28 +26,25 @@ class ThrustStream : public Stream intptr_t array_size; #if defined(MANAGED) - thrust::universtal_vector a; - thrust::universtal_vector b; - thrust::universtal_vector c; + thrust::universal_vector a, b, c; #else - thrust::device_vector a; - thrust::device_vector b; - thrust::device_vector c; + thrust::device_vector a, b, c; + std::vector h_a, h_b, h_c; #endif public: - ThrustStream(const intptr_t, int); + ThrustStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~ThrustStream() = default; - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/thrust/model.cmake b/src/thrust/model.cmake index 6b82ef59..23627c11 100644 --- a/src/thrust/model.cmake +++ b/src/thrust/model.cmake @@ -18,8 +18,7 @@ register_flag_optional(BACKEND " "CUDA") - register_flag_optional(MANAGED "Enabled managed memory mode." - "OFF") +register_flag_optional(MANAGED "Enabled managed memory mode." "OFF") register_flag_optional(CMAKE_CUDA_COMPILER "[THRUST_IMPL==CUDA] Path to the CUDA nvcc compiler"
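
The device-listing helpers kept in the SYCL2020 sources above (getDeviceList, listDevices, getDeviceName, getDeviceDriver) are thin wrappers over the SYCL 2020 device-query API. A minimal standalone sketch of the same queries follows; list_devices is an illustrative name, not a symbol from this patch:

    #include <sycl/sycl.hpp>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    void list_devices()
    {
      // Ask the SYCL runtime for every visible device, then print index, name, and driver.
      std::vector<sycl::device> devices = sycl::device::get_devices();
      for (std::size_t i = 0; i < devices.size(); i++)
      {
        std::cout << i << ": " << devices[i].get_info<sycl::info::device::name>()
                  << " (driver " << devices[i].get_info<sycl::info::device::driver_version>()
                  << ")" << std::endl;
      }
    }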
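The merged sycl2020 model above folds the former accessor and USM variants into one source file, selecting storage with the SYCL2020ACC / SYCL2020USM macros while sharing every kernel body. A condensed sketch of that pattern for a single fill kernel, assuming a SYCL 2020 compiler; MyStream, d_c, c, and fill are illustrative stand-ins for the members in the patch:

    #include <sycl/sycl.hpp>
    #include <cstddef>

    template <typename T>
    struct MyStream
    {
      sycl::queue q;
      std::size_t n;
    #ifdef SYCL2020ACC
      sycl::buffer<T> d_c;
      explicit MyStream(std::size_t size) : q{}, n{size}, d_c{sycl::range<1>{size}} {}
    #else // SYCL2020USM
      T *c;
      explicit MyStream(std::size_t size) : q{}, n{size}, c{sycl::malloc_shared<T>(size, q)} {}
      ~MyStream() { sycl::free(c, q); }
    #endif

      void fill(T value)
      {
        q.submit([&](sycl::handler &cgh)
        {
    #ifdef SYCL2020ACC
          // Accessor build: create the accessor inside the command group under the
          // same name c, so the kernel body below is identical for both builds.
          sycl::accessor c {d_c, cgh, sycl::write_only, sycl::no_init};
    #endif
          cgh.parallel_for(sycl::range<1>{n}, [=, c = c](sycl::id<1> idx) { c[idx] = value; });
        });
        q.wait();
      }
    };

Compiled with SYCL2020ACC defined, fill() runs through buffers and accessors; otherwise the same kernel runs through shared USM, which is the trade-off the SYCL_ACCESS build option exposes.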
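The dot() kernel above relies on a SYCL 2020 reduction, with a fallback branch because AdaptiveCpp/hipSYCL do not yet support the initialize_to_identity property; on that branch the reduction combines into whatever value the result variable already holds. A minimal USM sketch of the same idea, zero-initialising the result explicitly; q, x, y, n, and partial are illustrative names, not symbols from the patch:

    #include <sycl/sycl.hpp>
    #include <cstddef>

    template <typename T>
    T dot_usm(sycl::queue &q, const T *x, const T *y, std::size_t n)
    {
      T *partial = sycl::malloc_shared<T>(1, q);
      *partial = T{0}; // without initialize_to_identity the reduction adds to the existing value
      q.submit([&](sycl::handler &cgh)
      {
        cgh.parallel_for(sycl::range<1>{n},
                         sycl::reduction(partial, sycl::plus<T>()),
                         [=](sycl::id<1> idx, auto &sum) { sum += x[idx] * y[idx]; });
      }).wait();
      T result = *partial;
      sycl::free(partial, q);
      return result;
    }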
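Across all the models touched above, read_arrays(), which copied device data into caller-owned std::vectors, is replaced by get_arrays(), which hands back read-only pointers into storage the Stream still owns (buffer or USM memory for SYCL2020, the raw or vector backing store for TBB, host-side mirror vectors for the non-managed Thrust build). A caller-side sketch of the new contract; verify and the expected values are illustrative only:

    #include <cstdint>
    #include "Stream.h"

    // Assumes the Stream<T> interface from this patch: get_arrays(T const*&, T const*&, T const*&).
    template <typename T>
    bool verify(Stream<T> &stream, std::intptr_t array_size, T expectA, T expectB, T expectC)
    {
      T const *a = nullptr, *b = nullptr, *c = nullptr;
      stream.get_arrays(a, b, c); // pointers stay owned by the Stream; never free them
      for (std::intptr_t i = 0; i < array_size; i++)
        if (a[i] != expectA || b[i] != expectB || c[i] != expectC)
          return false;
      return true;
    }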