diff --git a/CMakeLists.txt b/CMakeLists.txt
index 36bcc155a9b29..deacb50958b84 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,9 +12,7 @@
 # Preamble
 
 cmake_minimum_required(VERSION 3.27.1 FATAL_ERROR)
-add_definitions(-DGPUCA_NO_FAST_MATH=1)
-set(GPUCA_NO_FAST_MATH 1)
-set(GPUCA_NO_FAST_MATH_WHOLEO2 1)
+
 # it's important to specify accurately the list of languages. for instance C and
 # C++ as we _do_ have some C files to compile explicitely as C (e.g. gl3w.c)
 project(O2 LANGUAGES C CXX VERSION 1.2.0)
diff --git a/DataFormats/Reconstruction/include/ReconstructionDataFormats/PID.h b/DataFormats/Reconstruction/include/ReconstructionDataFormats/PID.h
index 432bfdefba497..3f6ab06f6b2f2 100644
--- a/DataFormats/Reconstruction/include/ReconstructionDataFormats/PID.h
+++ b/DataFormats/Reconstruction/include/ReconstructionDataFormats/PID.h
@@ -94,8 +94,8 @@ class PID
   static constexpr ID Alpha = 8;
 
   static constexpr ID First = Electron;
-  static constexpr ID Last = Alpha;    ///< if extra IDs added, update this !!!
-  static constexpr ID NIDs = Last + 1; ///< number of defined IDs
+  static constexpr ID Last = Alpha;     ///< if extra IDs added, update this !!!
+  static constexpr ID NIDs = Last + 1;  ///< number of defined IDs
 
   // PID for derived particles
   static constexpr ID PI0 = 9;
@@ -136,6 +136,7 @@ class PID
   }
   GPUd() static const char* getName(ID id) { return pid_constants::sNames[id]; }
 #endif
+
  private:
   ID mID = Pion;
 
@@ -148,8 +149,7 @@ class PID
 
   GPUdi() static constexpr ID nameToID(char const* name, ID id)
   {
-    return id > LastExt ? id : sameStr(name, pid_constants::sNames[id]) ? id
-                                                                        : nameToID(name, id + 1);
+    return id > LastExt ? id : sameStr(name, pid_constants::sNames[id]) ? id : nameToID(name, id + 1);
   }
 #endif
 
diff --git a/DataFormats/Reconstruction/src/TrackParametrization.cxx b/DataFormats/Reconstruction/src/TrackParametrization.cxx
index 738548c1ce5eb..a30d74d940977 100644
--- a/DataFormats/Reconstruction/src/TrackParametrization.cxx
+++ b/DataFormats/Reconstruction/src/TrackParametrization.cxx
@@ -592,7 +592,7 @@ GPUd() void TrackParametrization<value_T>::printParam() const
 #else
   printf("X:%+.4e Alp:%+.3e Par: %+.4e %+.4e %+.4e %+.4e %+.4e |Q|:%d %s",
          getX(), getAlpha(), getY(), getZ(), getSnp(), getTgl(), getQ2Pt(), getAbsCharge(),
-#ifndef __OPENCL__
+#if !defined(__OPENCL__) && defined(GPUCA_GPU_DEBUG_PRINT)
          getPID().getName()
 #else
          ""
@@ -618,7 +618,7 @@ GPUd() void TrackParametrization<value_T>::printParamHexadecimal()
          gpu::CAMath::Float2UIntReint(getTgl()),
          gpu::CAMath::Float2UIntReint(getQ2Pt()),
          gpu::CAMath::Float2UIntReint(getAbsCharge()),
-#ifndef __OPENCL__
+#if !defined(__OPENCL__) && defined(GPUCA_GPU_DEBUG_PRINT)
          getPID().getName()
 #else
          ""
diff --git a/DataFormats/Reconstruction/src/TrackParametrizationWithError.cxx b/DataFormats/Reconstruction/src/TrackParametrizationWithError.cxx
index f1ba9a6798842..b07fd8d6c3e6d 100644
--- a/DataFormats/Reconstruction/src/TrackParametrizationWithError.cxx
+++ b/DataFormats/Reconstruction/src/TrackParametrizationWithError.cxx
@@ -1188,7 +1188,7 @@ template <typename value_T>
 std::string TrackParametrizationWithError<value_T>::asString() const
 {
   return TrackParametrization<value_t>::asString() +
-         fmt::format(" \nCovMat: [{:+.3e}]\n        [{:+.3e} {:+.3e}]\n        [{:+.3e} {:+.3e} {:+.3e}]\n        [{:+.3e} {:+.3e} {:+.3e} {:+.3e}]\n        [{:+.3e} {:+.3e} {:+.3e} {:+.3e} {:+.3e}]",
+         fmt::format(" Cov: [{:+.3e}] [{:+.3e} {:+.3e}] [{:+.3e} {:+.3e} {:+.3e}] [{:+.3e} {:+.3e} {:+.3e} {:+.3e}] [{:+.3e} {:+.3e} {:+.3e} {:+.3e} {:+.3e}]",
                      mC[kSigY2], mC[kSigZY], mC[kSigZ2], mC[kSigSnpY], mC[kSigSnpZ], mC[kSigSnp2], mC[kSigTglY],
                      mC[kSigTglZ], mC[kSigTglSnp], mC[kSigTgl2], mC[kSigQ2PtY], mC[kSigQ2PtZ], mC[kSigQ2PtSnp], mC[kSigQ2PtTgl],
                      mC[kSigQ2Pt2]);
@@ -1198,7 +1198,7 @@ template <typename value_T>
 std::string TrackParametrizationWithError<value_T>::asStringHexadecimal()
 {
   return TrackParametrization<value_t>::asStringHexadecimal() +
-         fmt::format(" \n<> CovMat: [{:x}]\n<>         [{:x} {:x}]\n<>         [{:x} {:x} {:x}]\n<>         [{:x} {:x} {:x} {:x}]\n<>         [{:x} {:x} {:x} {:x} {:x}]",
+         fmt::format(" Cov: [{:x}] [{:x} {:x}] [{:x} {:x} {:x}] [{:x} {:x} {:x} {:x}] [{:x} {:x} {:x} {:x} {:x}]",
                      reinterpret_cast<const unsigned int&>(mC[kSigY2]), reinterpret_cast<const unsigned int&>(mC[kSigZY]), reinterpret_cast<const unsigned int&>(mC[kSigZ2]),
                      reinterpret_cast<const unsigned int&>(mC[kSigSnpY]), reinterpret_cast<const unsigned int&>(mC[kSigSnpZ]), reinterpret_cast<const unsigned int&>(mC[kSigSnp2]),
                      reinterpret_cast<const unsigned int&>(mC[kSigTglY]), reinterpret_cast<const unsigned int&>(mC[kSigTglZ]), reinterpret_cast<const unsigned int&>(mC[kSigTglSnp]),
@@ -1217,13 +1217,9 @@ GPUd() void TrackParametrizationWithError<value_T>::print() const
 #else
   TrackParametrization<value_T>::printParam();
   printf(
-    "\n%7s [%+.3e]\n"
-    "%7s [%+.3e %+.3e]\n"
-    "%7s [%+.3e %+.3e %+.3e]\n"
-    "%7s [%+.3e %+.3e %+.3e %+.3e]\n"
-    "%7s [%+.3e %+.3e %+.3e %+.3e %+.3e]\n",
-    "CovMat:", mC[kSigY2], "", mC[kSigZY], mC[kSigZ2], "", mC[kSigSnpY], mC[kSigSnpZ], mC[kSigSnp2], "", mC[kSigTglY],
-    mC[kSigTglZ], mC[kSigTglSnp], mC[kSigTgl2], "", mC[kSigQ2PtY], mC[kSigQ2PtZ], mC[kSigQ2PtSnp], mC[kSigQ2PtTgl],
+    "\nCov: [%+.3e] [%+.3e %+.3e] [%+.3e %+.3e %+.3e] [%+.3e %+.3e %+.3e %+.3e] [%+.3e %+.3e %+.3e %+.3e %+.3e]",
+    mC[kSigY2], mC[kSigZY], mC[kSigZ2], mC[kSigSnpY], mC[kSigSnpZ], mC[kSigSnp2], mC[kSigTglY],
+    mC[kSigTglZ], mC[kSigTglSnp], mC[kSigTgl2], mC[kSigQ2PtY], mC[kSigQ2PtZ], mC[kSigQ2PtSnp], mC[kSigQ2PtTgl],
     mC[kSigQ2Pt2]);
 #endif
 }
@@ -1238,16 +1234,11 @@ GPUd() void TrackParametrizationWithError<value_T>::printHexadecimal()
 #else
   TrackParametrization<value_T>::printParamHexadecimal();
   printf(
-    "\n<> %7s [%x]\n"
-    "<> %7s [%x %x]\n"
-    "<> %7s [%x %x %x]\n"
-    "<> %7s [%x %x %x %x]\n"
-    "<> %7s [%x %x %x %x %x]\n",
-    "<> CovMat:",
-    gpu::CAMath::Float2UIntReint(mC[kSigY2]), "",
-    gpu::CAMath::Float2UIntReint(mC[kSigZY]), gpu::CAMath::Float2UIntReint(mC[kSigZ2]), "",
-    gpu::CAMath::Float2UIntReint(mC[kSigSnpY]), gpu::CAMath::Float2UIntReint(mC[kSigSnpZ]), gpu::CAMath::Float2UIntReint(mC[kSigSnp2]), "",
-    gpu::CAMath::Float2UIntReint(mC[kSigTglY]), gpu::CAMath::Float2UIntReint(mC[kSigTglZ]), gpu::CAMath::Float2UIntReint(mC[kSigTglSnp]), gpu::CAMath::Float2UIntReint(mC[kSigTgl2]), "",
+    "\nCov: [%x] [%x %x] [%x %x %x] [%x %x %x %x] [%x %x %x %x %x]",
+    gpu::CAMath::Float2UIntReint(mC[kSigY2]),
+    gpu::CAMath::Float2UIntReint(mC[kSigZY]), gpu::CAMath::Float2UIntReint(mC[kSigZ2]),
+    gpu::CAMath::Float2UIntReint(mC[kSigSnpY]), gpu::CAMath::Float2UIntReint(mC[kSigSnpZ]), gpu::CAMath::Float2UIntReint(mC[kSigSnp2]),
+    gpu::CAMath::Float2UIntReint(mC[kSigTglY]), gpu::CAMath::Float2UIntReint(mC[kSigTglZ]), gpu::CAMath::Float2UIntReint(mC[kSigTglSnp]), gpu::CAMath::Float2UIntReint(mC[kSigTgl2]),
     gpu::CAMath::Float2UIntReint(mC[kSigQ2PtY]), gpu::CAMath::Float2UIntReint(mC[kSigQ2PtZ]), gpu::CAMath::Float2UIntReint(mC[kSigQ2PtSnp]), gpu::CAMath::Float2UIntReint(mC[kSigQ2PtTgl]), gpu::CAMath::Float2UIntReint(mC[kSigQ2Pt2]));
 #endif
 }
diff --git a/Detectors/Base/src/MatLayerCylSet.cxx b/Detectors/Base/src/MatLayerCylSet.cxx
index afb103f70fee4..12156fc55b381 100644
--- a/Detectors/Base/src/MatLayerCylSet.cxx
+++ b/Detectors/Base/src/MatLayerCylSet.cxx
@@ -18,7 +18,7 @@
 #include "GPUCommonLogger.h"
 #include <TFile.h>
 #include "CommonUtils/TreeStreamRedirector.h"
-// #define _DBG_LOC_ // for local debugging only
+//#define _DBG_LOC_ // for local debugging only
 
 #endif // !GPUCA_ALIGPUCODE
 #undef NDEBUG
@@ -256,7 +256,7 @@ void MatLayerCylSet::print(bool data) const
          float(getFlatBufferSize()) / 1024 / 1024);
 }
 
-#endif //! GPUCA_ALIGPUCODE
+#endif //!GPUCA_ALIGPUCODE
 
 #ifndef GPUCA_GPUCODE
 //________________________________________________________________________________
@@ -391,8 +391,8 @@ GPUd() MatBudget MatLayerCylSet::getMatBudget(float x0, float y0, float z0, floa
   } // loop over layers
 
   if (rval.length != 0.f) {
-    rval.meanRho /= rval.length;    // average
-    rval.meanX2X0 *= ray.getDist(); // normalize
+    rval.meanRho /= rval.length;                                       // average
+    rval.meanX2X0 *= ray.getDist();                                    // normalize
   }
   rval.length = ray.getDist();
 
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
index 9b71d95188aa5..532e4d1492d0b 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
@@ -499,20 +499,18 @@ void TimeFrameGPU<nLayers>::loadClustersDevice()
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadTrackingFrameInfoDevice(const int iteration)
 {
-  for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
-    LOGP(debug, "gpu-transfer: loading {} tfinfo on layer {}, for {} MB.", mTrackingFrameInfo[iLayer].size(), iLayer, mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo) / MB);
-    allocMemAsync(reinterpret_cast<void**>(&mTrackingFrameInfoDevice[iLayer]), mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), nullptr, getExtAllocator());
-    // Register and move data
-    if (!iteration) {
+  if (!iteration) {
+    for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
+      LOGP(info, "gpu-transfer: loading {} tfinfo on layer {}, for {} MB.", mTrackingFrameInfo[iLayer].size(), iLayer, mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo) / MB);
+      allocMemAsync(reinterpret_cast<void**>(&mTrackingFrameInfoDevice[iLayer]), mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), nullptr, getExtAllocator());
+      // Register and move data
       checkGPUError(cudaHostRegister(mTrackingFrameInfo[iLayer].data(), mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaHostRegisterPortable));
+      checkGPUError(cudaMemcpyAsync(mTrackingFrameInfoDevice[iLayer], mTrackingFrameInfo[iLayer].data(), mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
     }
-    checkGPUError(cudaMemcpyAsync(mTrackingFrameInfoDevice[iLayer], mTrackingFrameInfo[iLayer].data(), mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-  }
-  allocMemAsync(reinterpret_cast<void**>(&mTrackingFrameInfoDeviceArray), nLayers * sizeof(TrackingFrameInfo*), nullptr, getExtAllocator());
-  if (!iteration) {
+    allocMemAsync(reinterpret_cast<void**>(&mTrackingFrameInfoDeviceArray), nLayers * sizeof(TrackingFrameInfo*), nullptr, getExtAllocator());
     checkGPUError(cudaHostRegister(mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaHostRegisterPortable));
+    checkGPUError(cudaMemcpyAsync(mTrackingFrameInfoDeviceArray, mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   }
-  checkGPUError(cudaMemcpyAsync(mTrackingFrameInfoDeviceArray, mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
 }
 
 template <int nLayers>
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
index 4af0781b5cdb2..274c0b573e454 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
@@ -86,11 +86,7 @@ GPUd() bool fitTrack(TrackITSExt& track,
     if (!track.o2::track::TrackParCovF::rotate(trackingHit.alphaTrackingFrame)) {
       return false;
     }
-#ifdef __HIPCC__
-    if (!track.propagateTo(trackingHit.xTrackingFrame, Bz)) {
-      return false;
-    }
-#else
+
     if (!prop->propagateToX(track,
                             trackingHit.xTrackingFrame,
                             Bz,
@@ -99,10 +95,8 @@ GPUd() bool fitTrack(TrackITSExt& track,
                             matCorrType)) {
       return false;
     }
-#endif
-#ifndef __HIPCC__
+
     if (matCorrType == o2::base::PropagatorF::MatCorrType::USEMatCorrNONE) {
-#endif
       track.setChi2(track.getChi2() + track.getPredictedChi2(trackingHit.positionTrackingFrame, trackingHit.covarianceTrackingFrame));
       if (!track.TrackParCov::update(trackingHit.positionTrackingFrame, trackingHit.covarianceTrackingFrame)) {
         return false;
@@ -113,9 +107,7 @@ GPUd() bool fitTrack(TrackITSExt& track,
       if (!track.correctForMaterial(xx0, xx0 * radiationLength * density, true)) {
         return false;
       }
-#ifndef __HIPCC__
     }
-#endif
 
     auto predChi2{track.getPredictedChi2(trackingHit.positionTrackingFrame, trackingHit.covarianceTrackingFrame)};
 
diff --git a/Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx b/Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx
index b52c20eaef83a..6e7114d9ca54d 100644
--- a/Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx
+++ b/Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx
@@ -39,7 +39,7 @@ struct ClusterHelper {
 
 float MSangle(float mass, float p, float xX0)
 {
-  float beta = p / std::hypot(mass, p);
+  float beta = p / o2::gpu::CAMath::Hypot(mass, p);
   return 0.0136f * o2::gpu::CAMath::Sqrt(xX0) * (1.f + 0.038f * o2::gpu::CAMath::Log(xX0)) / (beta * p);
 }
 
@@ -376,12 +376,11 @@ void TimeFrame::initialise(const int iteration, const TrackingParameters& trkPar
   for (unsigned int iLayer{0}; iLayer < mClusters.size(); ++iLayer) {
     mMSangles[iLayer] = MSangle(0.14f, trkParam.TrackletMinPt, trkParam.LayerxX0[iLayer]);
     mPositionResolution[iLayer] = o2::gpu::CAMath::Sqrt(0.5f * (trkParam.SystErrorZ2[iLayer] + trkParam.SystErrorY2[iLayer]) + trkParam.LayerResolution[iLayer] * trkParam.LayerResolution[iLayer]);
-
     if (iLayer < mClusters.size() - 1) {
       const float& r1 = trkParam.LayerRadii[iLayer];
       const float& r2 = trkParam.LayerRadii[iLayer + 1];
-      const float res1 = std::hypot(trkParam.PVres, mPositionResolution[iLayer]);
-      const float res2 = std::hypot(trkParam.PVres, mPositionResolution[iLayer + 1]);
+      const float res1 = o2::gpu::CAMath::Hypot(trkParam.PVres, mPositionResolution[iLayer]);
+      const float res2 = o2::gpu::CAMath::Hypot(trkParam.PVres, mPositionResolution[iLayer + 1]);
       const float cosTheta1half = o2::gpu::CAMath::Sqrt(1.f - Sq(0.5f * r1 * oneOverR));
       const float cosTheta2half = o2::gpu::CAMath::Sqrt(1.f - Sq(0.5f * r2 * oneOverR));
       float x = r2 * cosTheta1half - r1 * cosTheta2half;
@@ -430,7 +429,7 @@ void TimeFrame::fillPrimaryVerticesXandAlpha()
   }
   mPValphaX.reserve(mPrimaryVertices.size());
   for (auto& pv : mPrimaryVertices) {
-    mPValphaX.emplace_back(std::array<float, 2>{std::hypot(pv.getX(), pv.getY()), math_utils::computePhi(pv.getX(), pv.getY())});
+    mPValphaX.emplace_back(std::array<float, 2>{o2::gpu::CAMath::Hypot(pv.getX(), pv.getY()), math_utils::computePhi(pv.getX(), pv.getY())});
   }
 }
 
diff --git a/GPU/Common/GPUCommonMath.h b/GPU/Common/GPUCommonMath.h
index fbf67c1369b19..cf15d9ee0409b 100644
--- a/GPU/Common/GPUCommonMath.h
+++ b/GPU/Common/GPUCommonMath.h
@@ -211,10 +211,12 @@ GPUdi() float GPUCommonMath::Modf(float x, float y) { return CHOICE(fmodf(x, y),
 
 GPUdi() unsigned int GPUCommonMath::Float2UIntReint(const float& x)
 {
-#if !defined(GPUCA_GPUCODE) || defined(__OPENCL__) || defined(__OPENCL_HOST__)
-  return reinterpret_cast<const unsigned int&>(x);
-#else
+#if defined(GPUCA_GPUCODE_DEVICE) && (defined(__CUDACC__) || defined(__HIPCC__))
   return __float_as_uint(x);
+#elif defined(GPUCA_GPUCODE_DEVICE) && (defined(__OPENCL__) || defined(__OPENCLCPP__))
+  return as_uint(x);
+#else
+  return reinterpret_cast<const unsigned int&>(x);
 #endif
 }