diff --git a/cpp/daal/src/algorithms/cholesky/cholesky_impl.i b/cpp/daal/src/algorithms/cholesky/cholesky_impl.i
index 76716837ff1..8331773609c 100755
--- a/cpp/daal/src/algorithms/cholesky/cholesky_impl.i
+++ b/cpp/daal/src/algorithms/cholesky/cholesky_impl.i
@@ -151,13 +151,13 @@ bool CholeskyKernel<algorithmFPType, method, cpu>::copyToFullMatrix(NumericTable
 
             for (size_t i = iBlock * blockSize; i < endBlock; i++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t j = 0; j <= i; j++)
                 {
                     pL[i * dim + j] = pA[i * dim + j];
                 }
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t j = (i + 1); j < dim; j++)
                 {
@@ -176,13 +176,13 @@ bool CholeskyKernel<algorithmFPType, method, cpu>::copyToFullMatrix(NumericTable
             {
                 const size_t ind = (i + 1) * i / 2;
 
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t j = 0; j <= i; j++)
                 {
                     pL[i * dim + j] = pA[ind + j];
                 }
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t j = (i + 1); j < dim; j++)
                 {
@@ -201,13 +201,13 @@ bool CholeskyKernel<algorithmFPType, method, cpu>::copyToFullMatrix(NumericTable
             {
                 const size_t ind = (2 * dim - j + 1) * j / 2;
 
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = 0; i < j; i++)
                 {
                     pL[i * dim + j] = algorithmFPType(0);
                 }
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = j; i < dim; i++)
                 {
@@ -247,7 +247,7 @@ services::Status CholeskyKernel<algorithmFPType, method, cpu>::copyToLowerTriang
             {
                 const size_t ind = (i + 1) * i / 2;
 
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t j = 0; j <= i; j++)
                 {
@@ -276,7 +276,7 @@ services::Status CholeskyKernel<algorithmFPType, method, cpu>::copyToLowerTriang
             {
                 const size_t ind = (j + 1) * j / 2;
 
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = 0; i <= j; i++)
                 {
diff --git a/cpp/daal/src/algorithms/covariance/covariance_impl.i b/cpp/daal/src/algorithms/covariance/covariance_impl.i
index 2448a8e9b90..297159ca4b5 100644
--- a/cpp/daal/src/algorithms/covariance/covariance_impl.i
+++ b/cpp/daal/src/algorithms/covariance/covariance_impl.i
@@ -227,7 +227,7 @@ public:
                 /* Sum input array elements in case of non-normalized data */
                 for (DAAL_INT i = 0; i < nRows; i++)
                 {
-                    PRAGMA_IVDEP
+                    PRAGMA_FORCE_SIMD
                     PRAGMA_VECTOR_ALWAYS
                     for (DAAL_INT j = 0; j < _nFeatures; j++)
                     {
@@ -269,7 +269,7 @@ public:
         }
 
         /// It is safe to use aligned loads and stores because the data in TArrayScalableCalloc data structures is aligned
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         PRAGMA_VECTOR_ALIGNED
         for (size_t i = 0; i < (_nFeatures * _nFeatures); i++)
@@ -286,7 +286,7 @@ public:
                 return;
             }
             /// It is safe to use aligned loads and stores because the data is aligned
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             PRAGMA_VECTOR_ALIGNED
             for (size_t i = 0; i < _nFeatures; i++)
@@ -407,7 +407,7 @@ services::Status updateDenseCrossProductAndSums(bool isNormalized, size_t nFeatu
             }
             for (size_t i = 0; i < nFeatures; i++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t j = 0; j < nFeatures; j++)
                 {
@@ -501,7 +501,7 @@ void mergeCrossProductAndSums(size_t nFeatures, const algorithmFPType * partialC
         if (nObsValue == 0)
         {
             daal::threader_for(nFeatures, nFeatures, [=](size_t i) {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t j = 0; j <= i; j++)
                 {
@@ -517,7 +517,7 @@ void mergeCrossProductAndSums(size_t nFeatures, const algorithmFPType * partialC
             algorithmFPType invNewNObs     = 1.0 / (nObsValue + partialNObsValue);
 
             daal::threader_for(nFeatures, nFeatures, [=](size_t i) {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t j = 0; j <= i; j++)
                 {
diff --git a/cpp/daal/src/algorithms/dtrees/dtrees_train_data_helper.i b/cpp/daal/src/algorithms/dtrees/dtrees_train_data_helper.i
index 16064a701b0..a36c9da7bd2 100644
--- a/cpp/daal/src/algorithms/dtrees/dtrees_train_data_helper.i
+++ b/cpp/daal/src/algorithms/dtrees/dtrees_train_data_helper.i
@@ -419,7 +419,7 @@ int doPartition(SizeType n, const IndexType * aIdx, const ResponseType * aRespon
     SizeType iRight  = 0;
     int iRowSplitVal = -1;
 
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     PRAGMA_VECTOR_ALWAYS
     for (SizeType i = 0; i < n; ++i)
     {
@@ -457,7 +457,7 @@ int doPartitionIdx(SizeType n, const IndexType * aIdx, const IndexType * aIdx2,
 
     if (aIdx2)
     {
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (SizeType i = 0; i < n; ++i)
         {
@@ -478,7 +478,7 @@ int doPartitionIdx(SizeType n, const IndexType * aIdx, const IndexType * aIdx2,
     }
     else
     {
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (SizeType i = 0; i < n; ++i)
         {
diff --git a/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_impl.i b/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_impl.i
index 9b5eb542864..4cc98e908e4 100644
--- a/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_impl.i
+++ b/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_impl.i
@@ -68,7 +68,7 @@ DAAL_FORCEINLINE void fillResults(const size_t nClasses, const enum VotingMethod
 {
     if (votingMethod == VotingMethod::unweighted || probas == nullptr)
     {
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < blockSize; ++i)
         {
@@ -80,7 +80,7 @@ DAAL_FORCEINLINE void fillResults(const size_t nClasses, const enum VotingMethod
     {
         for (size_t i = 0; i < blockSize; ++i)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t j = 0; j < nClasses; ++j)
             {
@@ -340,7 +340,7 @@ Status PredictClassificationTask<algorithmFPType, cpu>::predictByTrees(const siz
         }
         else if (_votingMethod == VotingMethod::weighted)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = 0; i < _nClasses; ++i)
             {
@@ -357,7 +357,7 @@ Status PredictClassificationTask<algorithmFPType, cpu>::predictByTrees(const siz
                     sum += resPtr[i];
                 }
 
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = 0; i < _nClasses; ++i)
                 {
@@ -403,7 +403,7 @@ Status PredictClassificationTask<algorithmFPType, cpu>::predictByTreesWithoutCon
         }
         else if (_votingMethod == VotingMethod::weighted)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = 0; i < _nClasses; ++i)
             {
@@ -420,7 +420,7 @@ Status PredictClassificationTask<algorithmFPType, cpu>::predictByTreesWithoutCon
                     sum += resPtr[i];
                 }
 
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = 0; i < _nClasses; ++i)
                 {
@@ -449,7 +449,7 @@ Status PredictClassificationTask<algorithmFPType, cpu>::parallelPredict(const al
 
     SafeStatus safeStat;
 
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     PRAGMA_VECTOR_ALWAYS
     for (size_t i = 0; i < treeSize; ++i)
     {
@@ -769,7 +769,7 @@ DAAL_FORCEINLINE Status PredictClassificationTask<algorithmFPType, cpu>::predict
     }
     if (probPtr != nullptr)
     {
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t j = 0; j < _nClasses; ++j)
         {
@@ -891,7 +891,7 @@ DAAL_FORCEINLINE Status PredictClassificationTask<float, avx512>::predictOneRowB
             {
                 const size_t treeSize          = _aTree[iTree + i]->getNumberOfRows();
                 const DecisionTreeNode * aNode = (const DecisionTreeNode *)(*_aTree[iTree + i]).getArray();
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t j = 0; j < treeSize; ++j)
                 {
@@ -1002,7 +1002,7 @@ DAAL_FORCEINLINE Status PredictClassificationTask<float, avx512>::predictOneRowB
     }
     if (probPtr != nullptr)
     {
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t j = 0; j < _nClasses; ++j)
         {
diff --git a/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_train_dense_default_impl.i b/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_train_dense_default_impl.i
index 6561366ded0..4e157357f93 100644
--- a/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_train_dense_default_impl.i
+++ b/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_train_dense_default_impl.i
@@ -138,7 +138,7 @@ protected: //enables specific functions for UnorderedRespHelperBest
         const double one       = double(1);
         const double cDiv      = isZero<double, cpu>(sqWeights) ? one : (one / sqWeights);
         double var             = one;
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < _nClasses; ++i) var -= cDiv * double(imp.hist[i]) * double(imp.hist[i]);
         imp.var = var;
@@ -193,7 +193,7 @@ int UnorderedRespHelperBest<algorithmFPType, cpu>::findSplitByHistDefault(int nD
 
         if (!split.featureUnordered)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t iClass = 0; iClass < _nClasses; ++iClass) histLeft[iClass] += nSamplesPerClass[i * _nClasses + iClass];
         }
@@ -201,7 +201,7 @@ int UnorderedRespHelperBest<algorithmFPType, cpu>::findSplitByHistDefault(int nD
 
         if (split.featureUnordered)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             //one against others
             for (size_t iClass = 0; iClass < _nClasses; ++iClass) histLeft[iClass] = nSamplesPerClass[i * _nClasses + iClass];
@@ -210,7 +210,7 @@ int UnorderedRespHelperBest<algorithmFPType, cpu>::findSplitByHistDefault(int nD
         auto histTotal           = curImpurity.hist.get();
         algorithmFPType sumLeft  = 0;
         algorithmFPType sumRight = 0;
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         //proximal impurity improvement
         for (size_t iClass = 0; iClass < _nClasses; ++iClass)
@@ -485,7 +485,7 @@ bool UnorderedRespHelperBest<algorithmFPType, cpu>::findSplitCategoricalFeature(
         if ((count < nMinSplitPart) || ((n - count) < nMinSplitPart) || (leftWeights < minWeightLeaf)
             || ((totalWeights - leftWeights) < minWeightLeaf))
             continue;
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t j = 0; j < _nClasses; ++j) _impRight.hist[j] = curImpurity.hist[j] - _impLeft.hist[j];
         calcGini(leftWeights, _impLeft);
@@ -632,7 +632,7 @@ public:
         DAAL_ASSERT(n > 0);
         node.count    = n;
         node.impurity = imp.var;
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < this->_nClasses; ++i)
         {
@@ -681,7 +681,7 @@ protected:
         auto histTotal = total.get();
         auto histRight = right.get();
         auto histLeft  = left.get();
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t iClass = 0; iClass < this->_nClasses; ++iClass) histRight[iClass] = histTotal[iClass] - histLeft[iClass];
     }
@@ -1089,7 +1089,7 @@ int UnorderedRespHelperRandom<algorithmFPType, cpu>::findSplitByHistDefault(int
         nLeft       = nFeatIdx[idx];
         leftWeights = featWeights[idx];
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         //one against others
         for (size_t iClass = 0; iClass < this->_nClasses; ++iClass) histLeft[iClass] = nSamplesPerClass[idx * this->_nClasses + iClass];
@@ -1108,7 +1108,7 @@ int UnorderedRespHelperRandom<algorithmFPType, cpu>::findSplitByHistDefault(int
             nLeft += nFeatIdx[i];
             leftWeights += featWeights[i];
 
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t iClass = 0; iClass < this->_nClasses; ++iClass) histLeft[iClass] += nSamplesPerClass[i * this->_nClasses + iClass];
         }
@@ -1120,7 +1120,7 @@ int UnorderedRespHelperRandom<algorithmFPType, cpu>::findSplitByHistDefault(int
         auto histTotal           = curImpurity.hist.get();
         algorithmFPType sumLeft  = 0;
         algorithmFPType sumRight = 0;
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         //proximal impurity improvement
         for (size_t iClass = 0; iClass < this->_nClasses; ++iClass)
@@ -1186,7 +1186,7 @@ int UnorderedRespHelperRandom<algorithmFPType, cpu>::findSplitFewClasses(int nDi
         {
             minidx++;
 
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t iClass = 0; iClass < K; ++iClass)
             {
@@ -1202,7 +1202,7 @@ int UnorderedRespHelperRandom<algorithmFPType, cpu>::findSplitFewClasses(int nDi
         while ((minidx < maxidx) && isZero<IndexType, cpu>(thisNFeatIdx)) thisNFeatIdx = nFeatIdx[++minidx];
         nLeft = thisNFeatIdx;
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t iClass = 0; iClass < K; ++iClass)
         {
@@ -1217,7 +1217,7 @@ int UnorderedRespHelperRandom<algorithmFPType, cpu>::findSplitFewClasses(int nDi
         return idxFeatureBestSplit;
 
     //set histLeft
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     PRAGMA_VECTOR_ALWAYS
     for (size_t iClass = 0; iClass < K; ++iClass) histLeft[iClass] = nSamplesPerClass[minidx * K + iClass];
 
@@ -1228,7 +1228,7 @@ int UnorderedRespHelperRandom<algorithmFPType, cpu>::findSplitFewClasses(int nDi
         while ((minidx < maxidx) && isZero<algorithmFPType, cpu>(thisNFeatIdx))
         {
             maxidx--;
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t iClass = 0; iClass < K; ++iClass)
             {
@@ -1260,7 +1260,7 @@ int UnorderedRespHelperRandom<algorithmFPType, cpu>::findSplitFewClasses(int nDi
         //iterate idx down to a bin with values for FinalizeBestSplit
         algorithmFPType thisNFeatIdx(0);
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t iC = 0; iC < K; ++iC) thisNFeatIdx += nSamplesPerClass[idx * K + iC];
         while ((minidx < idx) && isZero<algorithmFPType, cpu>(thisNFeatIdx))
@@ -1276,13 +1276,13 @@ int UnorderedRespHelperRandom<algorithmFPType, cpu>::findSplitFewClasses(int nDi
 
         if (split.featureUnordered) //only need last index
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t iClass = 0; iClass < K; ++iClass) histLeft[iClass] = nSamplesPerClass[idx * K + iClass];
         }
         else //sum over all to idx
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = minidx + 1; i <= idx; i++)
             {
@@ -1290,7 +1290,7 @@ int UnorderedRespHelperRandom<algorithmFPType, cpu>::findSplitFewClasses(int nDi
             }
         }
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t iClass = 0; iClass < K; ++iClass)
             leftWeights += histLeft[iClass]; //histleft is forced to float, and may cause issues with algorithmFPType = double
@@ -1307,7 +1307,7 @@ int UnorderedRespHelperRandom<algorithmFPType, cpu>::findSplitFewClasses(int nDi
         if (split.featureUnordered) //only need last index
         {
             nLeft = nFeatIdx[idx];
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t iClass = 0; iClass < K; ++iClass) histLeft[iClass] = nSamplesPerClass[idx * K + iClass];
         }
@@ -1316,13 +1316,13 @@ int UnorderedRespHelperRandom<algorithmFPType, cpu>::findSplitFewClasses(int nDi
             for (size_t i = minidx + 1; i <= idx; i++)
             {
                 nLeft += nFeatIdx[i];
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t iClass = 0; iClass < K; ++iClass) histLeft[iClass] += nSamplesPerClass[i * K + iClass];
             }
         }
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t iClass = 0; iClass < K; ++iClass) leftWeights += histLeft[iClass];
     }
@@ -1412,7 +1412,7 @@ bool UnorderedRespHelperRandom<algorithmFPType, cpu>::findSplitOrderedFeature(co
 
     if (noWeights)
     {
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (i = 0; i < r; ++i)
         {
@@ -1423,7 +1423,7 @@ bool UnorderedRespHelperRandom<algorithmFPType, cpu>::findSplitOrderedFeature(co
     }
     else
     {
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (i = 0; i < r; ++i)
         {
@@ -1550,7 +1550,7 @@ bool UnorderedRespHelperRandom<algorithmFPType, cpu>::findSplitCategoricalFeatur
         if ((count < nMinSplitPart) || ((n - count) < nMinSplitPart) || (leftWeights < minWeightLeaf)
             || ((totalWeights - leftWeights) < minWeightLeaf))
             continue;
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t j = 0; j < this->_nClasses; ++j) this->_impRight.hist[j] = curImpurity.hist[j] - this->_impLeft.hist[j];
         this->calcGini(leftWeights, this->_impLeft);
@@ -1618,9 +1618,13 @@ public:
         {
             OOBClassificationData * dst       = (OOBClassificationData *)other.oobBuf;
             const OOBClassificationData * src = (const OOBClassificationData *)this->oobBuf;
-            PRAGMA_IVDEP
+            const size_t n                    = _nClasses * nSamples;
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
-            for (size_t i = 0, n = _nClasses * nSamples; i < n; ++i) dst[i] += src[i];
+            for (size_t i = 0; i < n; ++i)
+            {
+                dst[i] += src[i];
+            }
         }
     }
     Status finalizeOOBError(const NumericTable * resp, algorithmFPType * res, algorithmFPType * resPerObs, algorithmFPType * resAccuracy,
diff --git a/cpp/daal/src/algorithms/dtrees/forest/df_train_dense_default_impl.i b/cpp/daal/src/algorithms/dtrees/forest/df_train_dense_default_impl.i
index 1289d31c5f7..7321356c41b 100644
--- a/cpp/daal/src/algorithms/dtrees/forest/df_train_dense_default_impl.i
+++ b/cpp/daal/src/algorithms/dtrees/forest/df_train_dense_default_impl.i
@@ -243,9 +243,12 @@ void TreeThreadCtxBase<algorithmFPType, cpu>::finalizeVarImp(training::VariableI
         }
         else
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
-            for (size_t i = 0; i < nVars; ++i) varImp[i] = 0;
+            for (size_t i = 0; i < nVars; ++i)
+            {
+                varImp[i] = 0;
+            }
         }
     }
     else if (mode == training::MDI)
@@ -363,7 +366,7 @@ services::Status copyBinIndex(const size_t nRows, const size_t nCols, const Inde
 
         for (size_t i = iStart; i < iEnd; ++i)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t j = 0; j < nCols; ++j)
             {
@@ -725,8 +728,6 @@ services::Status TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, H
     DAAL_CHECK_MALLOC(_aSample.get() && _helper.reset(_nSamples) && _helper.resetWeights(_nSamples) && _aFeatureBuf.get() && _aFeatureIndexBuf.get()
                       && _aFeatureIdx.get());
 
-    PRAGMA_IVDEP
-    PRAGMA_VECTOR_ALWAYS
     for (size_t i = 0; i < _nFeatureBufs; ++i)
     {
         _aFeatureBuf[i].reset(_data->getNumberOfRows());
@@ -745,17 +746,23 @@ services::Status TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, H
     else
     {
         auto aSample = _aSample.get();
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
-        for (size_t i = 0; i < _nSamples; ++i) aSample[i] = i;
+        for (size_t i = 0; i < _nSamples; ++i)
+        {
+            aSample[i] = i;
+        }
     }
     //init responses buffer, keep _aSample values in it
     DAAL_CHECK_MALLOC(_helper.init(_data, _resp, _aSample.get(), _weights));
 
     //use _aSample as an array of response indices stored by helper from now on
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     PRAGMA_VECTOR_ALWAYS
-    for (size_t i = 0; i < _aSample.size(); ++i) _aSample[i] = i;
+    for (size_t i = 0; i < _aSample.size(); ++i)
+    {
+        _aSample[i] = i;
+    }
 
     setupHostApp();
 
diff --git a/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_dense_default_impl.i b/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_dense_default_impl.i
index 5f284ad6048..a0fbcb41e53 100644
--- a/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_dense_default_impl.i
+++ b/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_dense_default_impl.i
@@ -942,7 +942,7 @@ int OrderedRespHelperRandom<algorithmFPType, cpu>::findBestSplitByHist(size_t nD
         }
         else
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = minidx; i <= idx; ++i)
             {
@@ -962,7 +962,7 @@ int OrderedRespHelperRandom<algorithmFPType, cpu>::findBestSplitByHist(size_t nD
         }
         else
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = minidx; i <= idx; ++i)
             {
@@ -1263,7 +1263,7 @@ public:
         algorithmFPType sumMeanDiff        = 0;
         RegErr<algorithmFPType, cpu> * ptr = (RegErr<algorithmFPType, cpu> *)this->oobBuf;
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < nSamples; ++i)
         {
diff --git a/cpp/daal/src/algorithms/dtrees/gbt/classification/gbt_classification_predict_dense_default_batch_impl.i b/cpp/daal/src/algorithms/dtrees/gbt/classification/gbt_classification_predict_dense_default_batch_impl.i
index 4d5d4829d74..3cfa6881687 100644
--- a/cpp/daal/src/algorithms/dtrees/gbt/classification/gbt_classification_predict_dense_default_batch_impl.i
+++ b/cpp/daal/src/algorithms/dtrees/gbt/classification/gbt_classification_predict_dense_default_batch_impl.i
@@ -464,7 +464,7 @@ services::Status PredictBinaryClassificationTask<algorithmFPType, cpu>::run(cons
             const size_t finishRow = (((iBlock + 1) == nBlocks) ? nRows : (iBlock + 1) * blockSize);
             daal::internal::MathInst<algorithmFPType, cpu>::vExp(finishRow - startRow, res + startRow, expVal + startRow);
 
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t iRow = startRow; iRow < finishRow; ++iRow)
             {
@@ -515,7 +515,7 @@ services::Status PredictBinaryClassificationTask<algorithmFPType, cpu>::run(cons
         {
             // convert the score to a class label
             typedef services::internal::SignBit<algorithmFPType, cpu> SignBit;
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             for (size_t iRow = 0; iRow < nRows; ++iRow)
             {
                 // probability is a sigmoid(f) hence sign(f) can be checked
@@ -787,7 +787,7 @@ void PredictMulticlassTask<algorithmFPType, cpu>::predictByTreesVector(algorithm
         gbt::prediction::internal::predictForTreeVector<algorithmFPType, TreeType, cpu, hasUnorderedFeatures, hasAnyMissing, vectorBlockSize>(
             *this->_aTree[iTree], this->_featHelper, x, v, dispatcher);
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t j = 0; j < vectorBlockSize; ++j) val[(iTree % nClasses) + j * nClasses] += v[j];
     }
diff --git a/cpp/daal/src/algorithms/dtrees/gbt/classification/gbt_classification_train_dense_default_impl.i b/cpp/daal/src/algorithms/dtrees/gbt/classification/gbt_classification_train_dense_default_impl.i
index 5f10da858c2..9b84b45f33d 100644
--- a/cpp/daal/src/algorithms/dtrees/gbt/classification/gbt_classification_train_dense_default_impl.i
+++ b/cpp/daal/src/algorithms/dtrees/gbt/classification/gbt_classification_train_dense_default_impl.i
@@ -71,7 +71,7 @@ public:
             const size_t end   = iBlock + 1 > nSurplus ? start + nPerBlock : start + (nPerBlock + 1);
             if (sampleInd)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = start; i < end; i++)
                 {
@@ -83,7 +83,7 @@ public:
             }
             else
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = start; i < end; i++)
                 {
@@ -96,7 +96,7 @@ public:
             daal::internal::MathInst<algorithmFPType, cpu>::vExp(end - start, exp + start, exp + start);
             if (sampleInd)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = start; i < end; i++)
                 {
@@ -107,7 +107,7 @@ public:
             }
             else
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = start; i < end; i++)
                 {
@@ -139,7 +139,7 @@ public:
             algorithmFPType * p  = bUseTLS ? lsData.local() : buf;
             const size_t iSample = (sampleInd ? sampleInd[i] : i);
             getSoftmax(f + _nClasses * iSample, p);
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t k = 0; k < _nClasses; ++k)
             {
@@ -165,7 +165,7 @@ protected:
         {
             if (maxArg < arg[i]) maxArg = arg[i];
         }
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < _nClasses; ++i)
         {
@@ -180,7 +180,7 @@ protected:
         for (size_t i = 0; i < _nClasses; ++i) sum += res[i];
 
         sum = algorithmFPType(1.) / sum;
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < _nClasses; ++i) res[i] *= sum;
     }
diff --git a/cpp/daal/src/algorithms/dtrees/gbt/gbt_predict_dense_default_impl.i b/cpp/daal/src/algorithms/dtrees/gbt/gbt_predict_dense_default_impl.i
index fd76f8da721..34960d4c9c9 100644
--- a/cpp/daal/src/algorithms/dtrees/gbt/gbt_predict_dense_default_impl.i
+++ b/cpp/daal/src/algorithms/dtrees/gbt/gbt_predict_dense_default_impl.i
@@ -112,7 +112,7 @@ inline void predictForTreeVector(const DecisionTreeType & t, const FeatureTypes
 
     for (FeatureIndexType itr = 0; itr < maxLvl; itr++)
     {
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (FeatureIndexType k = 0; k < vectorBlockSize; k++)
         {
@@ -122,7 +122,7 @@ inline void predictForTreeVector(const DecisionTreeType & t, const FeatureTypes
         }
     }
 
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     PRAGMA_VECTOR_ALWAYS
     for (FeatureIndexType k = 0; k < vectorBlockSize; k++)
     {
diff --git a/cpp/daal/src/algorithms/dtrees/gbt/gbt_train_dense_default_impl.i b/cpp/daal/src/algorithms/dtrees/gbt/gbt_train_dense_default_impl.i
index 13bf793be15..c3762dcf732 100644
--- a/cpp/daal/src/algorithms/dtrees/gbt/gbt_train_dense_default_impl.i
+++ b/cpp/daal/src/algorithms/dtrees/gbt/gbt_train_dense_default_impl.i
@@ -177,7 +177,7 @@ protected:
         const auto nF    = nRows * _nTrees;
         //initialize f. TODO: input argument
         algorithmFPType * pf = f();
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < nF; ++i) pf[i] = initValue;
     }
@@ -255,7 +255,7 @@ double TrainBatchTaskBase<algorithmFPType, BinIndexType, cpu>::computeLeafWeight
     LoopHelper<cpu>::run(inParallel, nBlocks, [&](size_t iBlock) {
         const size_t start = iBlock + 1 > nSurplus ? nPerBlock * iBlock + nSurplus : (nPerBlock + 1) * iBlock;
         const size_t end   = iBlock + 1 > nSurplus ? start + nPerBlock : start + (nPerBlock + 1);
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = start; i < end; i++) pf[idx[i] * this->_nTrees + iTree] += inc;
     });
@@ -463,7 +463,7 @@ services::Status computeTypeDisp(HostAppIface * pHostApp, const NumericTable * x
 
             for (size_t i = iStart; i < iEnd; ++i)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t j = 0; j < nCols; ++j)
                 {
diff --git a/cpp/daal/src/algorithms/dtrees/gbt/gbt_train_hist_kernel.i b/cpp/daal/src/algorithms/dtrees/gbt/gbt_train_hist_kernel.i
index 4b3bee150d9..cb31a063e4e 100644
--- a/cpp/daal/src/algorithms/dtrees/gbt/gbt_train_hist_kernel.i
+++ b/cpp/daal/src/algorithms/dtrees/gbt/gbt_train_hist_kernel.i
@@ -208,7 +208,7 @@ public:
         algorithmFPType * aGHSumPrevFP   = (algorithmFPType *)aGHSumPrev;
         algorithmFPType * aGHSumsOtherFP = (algorithmFPType *)aGHSumsOther;
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < nUnique * 4; ++i)
         {
@@ -237,7 +237,7 @@ struct ComputeGHSumByRows
         const size_t nCacheLinesToPrefetchOneRow = nFeatures / elementsInCacheLine + !!(nFeatures % elementsInCacheLine);
 
         RowIndexType i = iStart;
-        PRAGMA_IVDEP
+
         for (; i < iEndWithPrefetch; ++i)
         {
             DAAL_PREFETCH_READ_T0(pgh + 2 * aIdx[i + prefetchOffset]);
@@ -246,7 +246,7 @@ struct ComputeGHSumByRows
 
             const BinIndexType * featIdx = indexedFeature + aIdx[i] * nFeatures;
 
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             for (RowIndexType j = 0; j < nFeatures; j++)
             {
                 const size_t idx = 4 * (UniquesArr[j] + (size_t)featIdx[j]);
@@ -256,12 +256,11 @@ struct ComputeGHSumByRows
             }
         }
 
-        PRAGMA_IVDEP
         for (; i < iEnd; ++i)
         {
             const BinIndexType * featIdx = indexedFeature + aIdx[i] * nFeatures;
 
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             for (RowIndexType j = 0; j < nFeatures; j++)
             {
                 const size_t idx = 4 * (UniquesArr[j] + (size_t)featIdx[j]);
@@ -285,19 +284,19 @@ struct MergeGHSums
         algorithmFPType * cur = (algorithmFPType *)res.ghSums;
         algorithmFPType * ptr = results[0] + 4 * iStart;
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < 4 * nUnique; i++) cur[i] = ptr[i];
 
         for (size_t iB = 1; iB < nBlocks; ++iB)
         {
             algorithmFPType * ptr = results[iB] + 4 * iStart;
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = 0; i < 4 * nUnique; i++) cur[i] += ptr[i];
         }
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < nUnique; ++i)
         {
@@ -340,7 +339,7 @@ struct ComputeGHSumByRows<RowIndexType, BinIndexType, float, SSE42_ALL>
         addsPtr[3]      = 0.0f;
 
         RowIndexType i = iStart;
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         for (; i < iEndWithPrefetch; ++i)
         {
             DAAL_PREFETCH_READ_T0(pgh + 2 * aIdx[i + prefetchOffset]);
@@ -351,7 +350,7 @@ struct ComputeGHSumByRows<RowIndexType, BinIndexType, float, SSE42_ALL>
             addsPtr[0]                   = pgh[2 * aIdx[i]];
             addsPtr[1]                   = pgh[2 * aIdx[i] + 1];
 
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (IndexType j = 0; j < nFeatures; j++)
             {
@@ -362,14 +361,14 @@ struct ComputeGHSumByRows<RowIndexType, BinIndexType, float, SSE42_ALL>
             }
         }
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         for (; i < iEnd; ++i)
         {
             const BinIndexType * featIdx = indexedFeature + aIdx[i] * nFeatures;
             addsPtr[0]                   = pgh[2 * aIdx[i]];
             addsPtr[1]                   = pgh[2 * aIdx[i] + 1];
 
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             for (IndexType j = 0; j < nFeatures; j++)
             {
                 const size_t idx = 4 * (UniquesArr[j] + (size_t)featIdx[j]);
diff --git a/cpp/daal/src/algorithms/dtrees/gbt/gbt_train_partition.i b/cpp/daal/src/algorithms/dtrees/gbt/gbt_train_partition.i
index 760bbb9debe..a8ea7d33c92 100644
--- a/cpp/daal/src/algorithms/dtrees/gbt/gbt_train_partition.i
+++ b/cpp/daal/src/algorithms/dtrees/gbt/gbt_train_partition.i
@@ -185,7 +185,7 @@ protected:
 
             if (featureUnordered)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (IndexType i = iStart; i < iEnd; ++i)
                 {
@@ -197,7 +197,7 @@ protected:
             }
             else
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (IndexType i = iStart; i < iEnd; ++i)
                 {
diff --git a/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_predict_dense_default_batch_impl.i b/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_predict_dense_default_batch_impl.i
index ec3c3969dfd..a5b124b254d 100644
--- a/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_predict_dense_default_batch_impl.i
+++ b/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_predict_dense_default_batch_impl.i
@@ -397,7 +397,7 @@ services::Status PredictRegressionTask<algorithmFPType, cpu>::predictContributio
     algorithmFPType * contribsOn   = buffer + 2 * elementsInMatrix;
 
     // Copy nominal values (for bias term) to the condition = 0 buffer
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     PRAGMA_VECTOR_ALWAYS
     for (size_t i = 0ul; i < nRowsData; ++i)
     {
@@ -557,7 +557,7 @@ void PredictRegressionTask<algorithmFPType, cpu>::predictByTreesVector(size_t iF
         gbt::prediction::internal::predictForTreeVector<algorithmFPType, TreeType, cpu, hasUnorderedFeatures, hasAnyMissing, vectorBlockSize>(
             *_aTree[iTree], _featHelper, x, v, dispatcher);
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t row = 0ul; row < vectorBlockSize; ++row)
         {
diff --git a/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_train_dense_default_impl.i b/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_train_dense_default_impl.i
index d5a2081818c..0fd670a2994 100755
--- a/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_train_dense_default_impl.i
+++ b/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_train_dense_default_impl.i
@@ -65,7 +65,7 @@ public:
             const size_t end   = iBlock + 1 > nSurplus ? start + nPerBlock : start + (nPerBlock + 1);
             if (sampleInd)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = start; i < end; i++)
                 {
@@ -75,7 +75,7 @@ public:
             }
             else
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = start; i < end; i++)
                 {
diff --git a/cpp/daal/src/algorithms/elastic_net/elastic_net_train_dense_default_impl.i b/cpp/daal/src/algorithms/elastic_net/elastic_net_train_dense_default_impl.i
index 2d6fe05d0fb..2701f05a12a 100755
--- a/cpp/daal/src/algorithms/elastic_net/elastic_net_train_dense_default_impl.i
+++ b/cpp/daal/src/algorithms/elastic_net/elastic_net_train_dense_default_impl.i
@@ -132,7 +132,7 @@ services::Status TrainBatchKernel<algorithmFPType, method, cpu>::compute(
 
             for (size_t i = 0; i < numRowsInBlock; ++i)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t id = 0; id < nDependentVariables; ++id)
                 {
@@ -142,7 +142,7 @@ services::Status TrainBatchKernel<algorithmFPType, method, cpu>::compute(
         });
         yTlsData.reduceTo(yMeansPtr, nDependentVariables);
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < nDependentVariables; ++i)
         {
@@ -160,7 +160,7 @@ services::Status TrainBatchKernel<algorithmFPType, method, cpu>::compute(
 
             for (size_t i = 0; i < numRowsInBlock; ++i)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t id = 0; id < nDependentVariables; ++id)
                 {
@@ -190,7 +190,7 @@ services::Status TrainBatchKernel<algorithmFPType, method, cpu>::compute(
 
             for (size_t i = 0; i < numRowsInBlock; ++i)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t j = 0; j < nFeatures; ++j)
                 {
diff --git a/cpp/daal/src/algorithms/em/em_gmm_dense_default_batch_impl.i b/cpp/daal/src/algorithms/em/em_gmm_dense_default_batch_impl.i
index 64757061fee..f0890cb3555 100755
--- a/cpp/daal/src/algorithms/em/em_gmm_dense_default_batch_impl.i
+++ b/cpp/daal/src/algorithms/em/em_gmm_dense_default_batch_impl.i
@@ -178,7 +178,7 @@ void EMKernelTask<algorithmFPType, method, cpu>::stepE(const size_t nVectorsInCu
             for (size_t i = 0; i < nVectorsInCurrentBlock; i++)
             {
                 algorithmFPType tp = 0;
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t j = 0; j < nFeatures; j++)
                 {
@@ -198,7 +198,7 @@ void EMKernelTask<algorithmFPType, method, cpu>::stepE(const size_t nVectorsInCu
         {
             for (size_t j = 0; j < nFeatures; j++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = 0; i < nVectorsInCurrentBlock; i++)
                 {
@@ -211,7 +211,7 @@ void EMKernelTask<algorithmFPType, method, cpu>::stepE(const size_t nVectorsInCu
 
             algorithmFPType addition = t.logAlpha[k] + t.logSqrtInvDetSigma[k];
 
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = 0; i < nVectorsInCurrentBlock; i++)
             {
@@ -220,7 +220,7 @@ void EMKernelTask<algorithmFPType, method, cpu>::stepE(const size_t nVectorsInCu
 
             for (size_t j = 0; j < nFeatures; j++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = 0; i < nVectorsInCurrentBlock; i++)
                 {
@@ -228,7 +228,7 @@ void EMKernelTask<algorithmFPType, method, cpu>::stepE(const size_t nVectorsInCu
                 }
             }
 
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = 0; i < nVectorsInCurrentBlock; i++)
             {
@@ -239,7 +239,7 @@ void EMKernelTask<algorithmFPType, method, cpu>::stepE(const size_t nVectorsInCu
 
     t.partLogLikelyhood        = 0;
     algorithmFPType * maxInRow = t.rowSum;
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     PRAGMA_VECTOR_ALWAYS
     for (size_t i = 0; i < nVectorsInCurrentBlock; i++)
     {
@@ -248,7 +248,7 @@ void EMKernelTask<algorithmFPType, method, cpu>::stepE(const size_t nVectorsInCu
 
     for (size_t k = 1; k < nComponents; k++)
     {
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < nVectorsInCurrentBlock; i++)
         {
@@ -261,7 +261,7 @@ void EMKernelTask<algorithmFPType, method, cpu>::stepE(const size_t nVectorsInCu
 
     for (size_t k = 0; k < nComponents; k++)
     {
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < nVectorsInCurrentBlock; i++)
         {
@@ -273,14 +273,14 @@ void EMKernelTask<algorithmFPType, method, cpu>::stepE(const size_t nVectorsInCu
         }
     }
 
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     PRAGMA_VECTOR_ALWAYS
     for (size_t i = 0; i < nVectorsInCurrentBlock; i++)
     {
         t.partLogLikelyhood += maxInRow[i];
     }
 
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     PRAGMA_VECTOR_ALWAYS
     for (size_t i = 0; i < nVectorsInCurrentBlock; i++)
     {
@@ -291,7 +291,7 @@ void EMKernelTask<algorithmFPType, method, cpu>::stepE(const size_t nVectorsInCu
 
     for (size_t k = 0; k < nComponents; k++)
     {
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < nVectorsInCurrentBlock; i++)
         {
@@ -301,7 +301,7 @@ void EMKernelTask<algorithmFPType, method, cpu>::stepE(const size_t nVectorsInCu
 
     t.rowSumInv         = t.rowSum;
     algorithmFPType one = 1.0;
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     PRAGMA_VECTOR_ALWAYS
     for (size_t i = 0; i < nVectorsInCurrentBlock; i++)
     {
@@ -310,7 +310,7 @@ void EMKernelTask<algorithmFPType, method, cpu>::stepE(const size_t nVectorsInCu
 
     for (size_t k = 0; k < nComponents; k++)
     {
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < nVectorsInCurrentBlock; i++)
         {
@@ -334,7 +334,7 @@ algorithmFPType EMKernelTask<algorithmFPType, method, cpu>::computePartialLogLik
     MathInst<algorithmFPType, cpu>::vLog(nVectorsInCurrentBlock, t.rowSumInv, logRowSumInv);
 
     algorithmFPType loglikPartial = t.partLogLikelyhood;
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     PRAGMA_VECTOR_ALWAYS
     for (size_t i = 0; i < nVectorsInCurrentBlock; i++)
     {
diff --git a/cpp/daal/src/algorithms/implicit_als/implicit_als_train_init_csr_default_batch_impl.i b/cpp/daal/src/algorithms/implicit_als/implicit_als_train_init_csr_default_batch_impl.i
index 1d21ff4965f..43439535a6e 100644
--- a/cpp/daal/src/algorithms/implicit_als/implicit_als_train_init_csr_default_batch_impl.i
+++ b/cpp/daal/src/algorithms/implicit_als/implicit_als_train_init_csr_default_batch_impl.i
@@ -113,7 +113,7 @@ services::Status ImplicitALSInitKernel<algorithmFPType, fastCSR, cpu>::reduceSum
         {
             algorithmFPType * const s = arrSum[k];
 
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t j = start; j < end; ++j)
             {
@@ -164,14 +164,14 @@ services::Status ImplicitALSInitKernel<algorithmFPType, fastCSR, cpu>::compute(c
 
     s |= this->randFactors(nItems, nFactors, itemsFactors, engine);
 
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     PRAGMA_VECTOR_ALWAYS
     for (size_t i = 0; i < nItems; i++) // if number of not null elems is equal 0
     {
         notNullElemSum[i] = (notNullElemSum[i] == algorithmFPType(0.0) ? algorithmFPType(1.0) : notNullElemSum[i]);
     }
 
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     PRAGMA_VECTOR_ALWAYS
     for (size_t i = 0; i < nItems; i++)
     {
diff --git a/cpp/daal/src/algorithms/implicit_als/implicit_als_train_init_csr_default_distr_impl.i b/cpp/daal/src/algorithms/implicit_als/implicit_als_train_init_csr_default_distr_impl.i
index 09c3f2b9b77..dc02e9d1ab8 100644
--- a/cpp/daal/src/algorithms/implicit_als/implicit_als_train_init_csr_default_distr_impl.i
+++ b/cpp/daal/src/algorithms/implicit_als/implicit_als_train_init_csr_default_distr_impl.i
@@ -279,7 +279,7 @@ Status ImplicitALSInitDistrKernel<algorithmFPType, fastCSR, cpu>::computePartial
 
             algorithmFPType itemsSum = 0;
 
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t k = start; k < end; ++k)
             {
diff --git a/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_train_dense_default_impl.i b/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_train_dense_default_impl.i
index 5cc08923966..4241682084b 100644
--- a/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_train_dense_default_impl.i
+++ b/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_train_dense_default_impl.i
@@ -350,7 +350,7 @@ Status KNNClassificationTrainBatchKernel<algorithmFpType, training::defaultDense
                     size_t i = first;
                     b.upper  = dx[indexes[i]];
                     b.lower  = dx[indexes[i]];
-                    PRAGMA_IVDEP
+
                     for (++i; i < last; ++i)
                     {
                         if (b.lower > dx[indexes[i]])
@@ -416,7 +416,7 @@ size_t KNNClassificationTrainBatchKernel<algorithmFpType, training::defaultDense
             const_cast<NumericTable &>(x).getBlockOfColumnValues(j, 0, xRowCount, readOnly, columnBD);
             const algorithmFpType * const dx = columnBD.getBlockPtr();
 
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             for (size_t i = 0; i < elementCount; ++i)
             {
                 sampleValues[i] = dx[indexes[start + i]];
@@ -580,7 +580,7 @@ algorithmFpType KNNClassificationTrainBatchKernel<algorithmFpType, training::def
     histTLS.reduce([=, &masterHist](Hist * v) -> void {
         if (v)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t j = 0; j < sampleCount; ++j)
             {
@@ -734,8 +734,6 @@ size_t KNNClassificationTrainBatchKernel<algorithmFpType, training::defaultDense
         size_t left  = first;
         size_t right = last - 1;
 
-        PRAGMA_IVDEP
-        PRAGMA_VECTOR_ALWAYS
         for (;;)
         {
             while ((left <= right) && (dx[indexes[left]] < median))
diff --git a/cpp/daal/src/algorithms/kernel_function/kernel_function_csr_impl.i b/cpp/daal/src/algorithms/kernel_function/kernel_function_csr_impl.i
index 1017cc4ec0c..7bfdf7a8717 100644
--- a/cpp/daal/src/algorithms/kernel_function/kernel_function_csr_impl.i
+++ b/cpp/daal/src/algorithms/kernel_function/kernel_function_csr_impl.i
@@ -178,7 +178,7 @@ inline double KernelCSRImplBase<double, avx512>::computeDotProduct(const size_t
         double partialSum[8];
         _mm512_storeu_pd(partialSum, vSum);
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (int i = 0; i < 8; i++)
         {
@@ -289,7 +289,7 @@ inline float KernelCSRImplBase<float, avx512>::computeDotProduct(const size_t st
         double partialSum[8];
         _mm512_storeu_pd(partialSum, vSum);
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (int i = 0; i < 8; i++)
         {
diff --git a/cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_dense_default_impl.i b/cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_dense_default_impl.i
index f8e5925ce22..5c6cc4aa933 100755
--- a/cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_dense_default_impl.i
+++ b/cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_dense_default_impl.i
@@ -65,7 +65,7 @@ services::Status KernelImplRBF<defaultDense, algorithmFPType, cpu>::computeInter
     //compute
     const algorithmFPType invSqrSigma = (algorithmFPType)(1.0 / (par->sigma * par->sigma));
     algorithmFPType factor            = 0.0;
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     PRAGMA_VECTOR_ALWAYS
     for (size_t i = 0; i < nFeatures; i++)
     {
@@ -102,7 +102,7 @@ services::Status KernelImplRBF<defaultDense, algorithmFPType, cpu>::computeInter
     for (size_t i = 0; i < nVectors1; i++)
     {
         algorithmFPType factor = 0.0;
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t j = 0; j < nFeatures; j++)
         {
diff --git a/cpp/daal/src/algorithms/kernel_function/polynomial/kernel_function_polynomial_csr_fast_impl.i b/cpp/daal/src/algorithms/kernel_function/polynomial/kernel_function_polynomial_csr_fast_impl.i
index bff8c1d7a30..14fab573d52 100644
--- a/cpp/daal/src/algorithms/kernel_function/polynomial/kernel_function_polynomial_csr_fast_impl.i
+++ b/cpp/daal/src/algorithms/kernel_function/polynomial/kernel_function_polynomial_csr_fast_impl.i
@@ -139,7 +139,7 @@ services::Status KernelImplPolynomial<fastCSR, algorithmFPType, cpu>::computeInt
         if (k != one || b != zero)
         {
             daal::threader_for_optional(nVectors1, nVectors1, [=](size_t i) {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t j = 0; j <= i; j++)
                 {
@@ -158,7 +158,7 @@ services::Status KernelImplPolynomial<fastCSR, algorithmFPType, cpu>::computeInt
         }
 
         daal::threader_for_optional(nVectors1, nVectors1, [=](size_t i) {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t j = i + 1; j < nVectors1; j++)
             {
diff --git a/cpp/daal/src/algorithms/kernel_function/polynomial/kernel_function_polynomial_dense_default_impl.i b/cpp/daal/src/algorithms/kernel_function/polynomial/kernel_function_polynomial_dense_default_impl.i
index b95ee30d666..153e80cd7a7 100644
--- a/cpp/daal/src/algorithms/kernel_function/polynomial/kernel_function_polynomial_dense_default_impl.i
+++ b/cpp/daal/src/algorithms/kernel_function/polynomial/kernel_function_polynomial_dense_default_impl.i
@@ -63,7 +63,7 @@ services::Status KernelImplPolynomial<defaultDense, algorithmFPType, cpu>::compu
 
     //compute
     dataR[0] = 0.0;
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     PRAGMA_VECTOR_ALWAYS
     for (size_t i = 0; i < nFeatures; i++)
     {
@@ -102,7 +102,7 @@ services::Status KernelImplPolynomial<defaultDense, algorithmFPType, cpu>::compu
     services::internal::service_memset_seq<algorithmFPType, cpu>(dataR, b, nVectors1);
     for (size_t i = 0; i < nVectors1; i++)
     {
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t j = 0; j < nFeatures; j++)
         {
@@ -183,7 +183,7 @@ services::Status KernelImplPolynomial<defaultDense, algorithmFPType, cpu>::compu
                 BlasInst<algorithmFPType, cpu>::xxgemm(&trans, &notrans, &nRowsInBlock2, &nRowsInBlock1, (DAAL_INT *)&nFeatures, &alpha, dataA2,
                                                        (DAAL_INT *)&nFeatures, dataA1, (DAAL_INT *)&nFeatures, &beta, dataR, (DAAL_INT *)&nVectors2);
 
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = 0; i < nRowsInBlock1; ++i)
                 {
@@ -218,7 +218,7 @@ services::Status KernelImplPolynomial<defaultDense, algorithmFPType, cpu>::compu
                 BlasInst<algorithmFPType, cpu>::xxgemm(&trans, &notrans, &nRowsInBlock1, &nRowsInBlock2, (DAAL_INT *)&nFeatures, &alpha, dataA1,
                                                        (DAAL_INT *)&nFeatures, dataA2, (DAAL_INT *)&nFeatures, &beta, mklBuff, &ldc2);
 
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = 0; i < blockSize * blockSize; ++i)
                 {
diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_batch_impl.i b/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_batch_impl.i
index 7640be0cb51..08c9b38a55b 100644
--- a/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_batch_impl.i
+++ b/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_batch_impl.i
@@ -150,7 +150,7 @@ Status KMeansBatchKernel<method, algorithmFPType, cpu>::compute(const NumericTab
                 {
                     const algorithmFPType coeff = 1.0 / clusterS0[i];
 
-                    PRAGMA_IVDEP
+                    PRAGMA_FORCE_SIMD
                     PRAGMA_VECTOR_ALWAYS
                     for (size_t j = 0; j < p; j++)
                     {
@@ -167,7 +167,7 @@ Status KMeansBatchKernel<method, algorithmFPType, cpu>::compute(const NumericTab
                     ReadRows<algorithmFPType, cpu> mtRow(ntData, cIndices[cPos], 1);
                     const algorithmFPType * row = mtRow.get();
 
-                    PRAGMA_IVDEP
+                    PRAGMA_FORCE_SIMD
                     PRAGMA_VECTOR_ALWAYS
                     for (size_t j = 0; j < p; j++)
                     {
diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_distr_step1_impl.i b/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_distr_step1_impl.i
index c612cf4f9a3..c96580fe3f4 100644
--- a/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_distr_step1_impl.i
+++ b/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_distr_step1_impl.i
@@ -180,7 +180,7 @@ Status KMeansDistributedStep1Kernel<method, algorithmFPType, cpu>::finalizeCompu
     DAAL_CHECK_BLOCK_STATUS(outBlock);
     int * outAssignments = outBlock.get();
 
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     for (size_t i = 0; i < n; i++)
     {
         outAssignments[i] = inAssignments[i];
diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_impl.i b/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_impl.i
index c8297ddb336..8e366e8e2fc 100644
--- a/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_impl.i
+++ b/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_impl.i
@@ -68,7 +68,7 @@ struct TaskKMeansLloyd
             for (size_t k = 0; k < clNum; k++)
             {
                 algorithmFPType sum = algorithmFPType(0);
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_ICC_NO16(omp simd reduction(+ : sum))
                 for (size_t j = 0; j < dim; j++)
                 {
@@ -185,7 +185,7 @@ Status TaskKMeansLloyd<algorithmFPType, cpu>::addNTToTaskThreadedDense(const Num
 
         for (size_t j = 0; j < nClusters; j++)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = 0; i < blockSize; i++)
             {
@@ -223,7 +223,7 @@ Status TaskKMeansLloyd<algorithmFPType, cpu>::addNTToTaskThreadedDense(const Num
             const size_t minIdx        = *((algIntType *)&(x_clusters[i]));
             algorithmFPType minGoalVal = x_clusters[i + blockSize];
 
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             for (size_t j = 0; j < p; j++)
             {
                 cS1[minIdx * p + j] += data[i * p + j];
@@ -372,7 +372,7 @@ int TaskKMeansLloyd<algorithmFPType, cpu>::kmeansUpdateCluster(int jidx, centroi
 
     tls_task->reduce([=](TlsTask<algorithmFPType, cpu> * tt) -> void {
         int j;
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         for (j = 0; j < dim; j++)
         {
             s1[j] += tt->cS1[idx * dim + j];
@@ -392,7 +392,7 @@ void TaskKMeansLloyd<algorithmFPType, cpu>::kmeansComputeCentroids(int * cluster
             service_memset_seq<double, cpu>(auxData, 0.0, dim);
             clusterS0[i] = kmeansUpdateCluster<double>(i, auxData);
 
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t j = 0; j < dim; j++)
             {
diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_postprocessing.h b/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_postprocessing.h
index 598bd40e7a6..131a23af549 100644
--- a/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_postprocessing.h
+++ b/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_postprocessing.h
@@ -76,7 +76,7 @@ struct PostProcessing<lloydDense, algorithmFPType, cpu>
         for (size_t k = 0; k < nClusters; k++)
         {
             algorithmFPType sum = algorithmFPType(0);
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_ICC_NO16(omp simd reduction(+ : sum))
             for (size_t j = 0; j < p; j++)
             {
@@ -203,7 +203,7 @@ struct PostProcessing<lloydCSR, algorithmFPType, cpu>
         {
             clSq[k]             = 0;
             algorithmFPType sum = algorithmFPType(0);
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_ICC_NO16(omp simd reduction(+ : sum))
             for (size_t j = 0; j < p; j++)
             {
diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i b/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i
index be8859690b3..71e826c0260 100644
--- a/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i
+++ b/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i
@@ -198,7 +198,7 @@ public:
         for (size_t iRow = 0u; iRow < nRowsToProcess; iRow++)
         {
             algorithmFPType dist2 = algorithmFPType(0);
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = 0u; i < dim; i++)
             {
@@ -224,7 +224,7 @@ public:
         const algorithmFPType * pData = ntDataBD.get();
         algorithmFPType res(0.);
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < dim; ++i)
         {
diff --git a/cpp/daal/src/algorithms/lasso_regression/lasso_regression_train_dense_default_impl.i b/cpp/daal/src/algorithms/lasso_regression/lasso_regression_train_dense_default_impl.i
index 4c1a5e599a6..96e1cc062e4 100755
--- a/cpp/daal/src/algorithms/lasso_regression/lasso_regression_train_dense_default_impl.i
+++ b/cpp/daal/src/algorithms/lasso_regression/lasso_regression_train_dense_default_impl.i
@@ -132,7 +132,7 @@ services::Status TrainBatchKernel<algorithmFPType, method, cpu>::compute(
 
             for (size_t i = 0; i < numRowsInBlock; ++i)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t id = 0; id < nDependentVariables; ++id)
                 {
@@ -142,7 +142,7 @@ services::Status TrainBatchKernel<algorithmFPType, method, cpu>::compute(
         });
         yTlsData.reduceTo(yMeansPtr, nDependentVariables);
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < nDependentVariables; ++i)
         {
@@ -160,7 +160,7 @@ services::Status TrainBatchKernel<algorithmFPType, method, cpu>::compute(
 
             for (size_t i = 0; i < numRowsInBlock; ++i)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t id = 0; id < nDependentVariables; ++id)
                 {
@@ -188,7 +188,7 @@ services::Status TrainBatchKernel<algorithmFPType, method, cpu>::compute(
 
             for (size_t i = 0; i < numRowsInBlock; ++i)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t j = 0; j < nFeatures; ++j)
                 {
diff --git a/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_finalize_impl.i b/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_finalize_impl.i
index e3601d74439..18c76fb752d 100644
--- a/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_finalize_impl.i
+++ b/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_finalize_impl.i
@@ -127,7 +127,7 @@ Status FinalizeKernel<algorithmFPType, cpu>::compute(const NumericTable & xtxTab
     {
         for (size_t i = 0; i < nResponses; i++)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t j = 1; j < nBetas; j++)
             {
@@ -140,7 +140,7 @@ Status FinalizeKernel<algorithmFPType, cpu>::compute(const NumericTable & xtxTab
     {
         for (size_t i = 0; i < nResponses; i++)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t j = 0; j < nBetas - 1; j++)
             {
diff --git a/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_update_impl.i b/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_update_impl.i
index e9698ddc674..5628e3b64e0 100644
--- a/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_update_impl.i
+++ b/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_update_impl.i
@@ -101,7 +101,7 @@ Status ThreadingTask<algorithmFPType, cpu>::update(DAAL_INT startRow, DAAL_INT n
 
         for (DAAL_INT i = 0; i < nRows; i++, xPtr += nFeatures)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (DAAL_INT j = 0; j < nFeatures; j++)
             {
@@ -124,7 +124,7 @@ Status ThreadingTask<algorithmFPType, cpu>::update(DAAL_INT startRow, DAAL_INT n
         const algorithmFPType * yPtr = y;
         for (DAAL_INT i = 0; i < nRows; i++, yPtr += _nResponses)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (DAAL_INT j = 0; j < _nResponses; j++)
             {
@@ -140,7 +140,7 @@ void ThreadingTask<algorithmFPType, cpu>::reduce(algorithmFPType * xtx, algorith
 {
     {
         DAAL_PROFILER_THREADING_TASK(reduce.syrkX);
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < (_nBetasIntercept * _nBetasIntercept); i++)
         {
@@ -150,7 +150,7 @@ void ThreadingTask<algorithmFPType, cpu>::reduce(algorithmFPType * xtx, algorith
 
     {
         DAAL_PROFILER_THREADING_TASK(reduce.gemmXY);
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < (_nBetasIntercept * _nResponses); i++)
         {
diff --git a/cpp/daal/src/algorithms/linear_model/linear_model_train_qr_common_impl.i b/cpp/daal/src/algorithms/linear_model/linear_model_train_qr_common_impl.i
index a7fd526f13e..b497a8933e5 100755
--- a/cpp/daal/src/algorithms/linear_model/linear_model_train_qr_common_impl.i
+++ b/cpp/daal/src/algorithms/linear_model/linear_model_train_qr_common_impl.i
@@ -87,7 +87,7 @@ Status CommonKernel<algorithmFPType, cpu>::computeQRForBlock(DAAL_INT p, DAAL_IN
 
     for (size_t i = 0; i < nRowsInR; ++i)
     {
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t j = 0; j <= i + jOffset; ++j)
         {
@@ -112,7 +112,7 @@ Status CommonKernel<algorithmFPType, cpu>::computeQRForBlock(DAAL_INT p, DAAL_IN
         for (size_t i = 0; i < p - n; ++i)
         {
             r[i * p + i] = one;
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t j = 0; j < i; ++j)
             {
diff --git a/cpp/daal/src/algorithms/linear_model/linear_model_train_qr_finalize_impl.i b/cpp/daal/src/algorithms/linear_model/linear_model_train_qr_finalize_impl.i
index 5c9068cbb22..2a260573bb5 100755
--- a/cpp/daal/src/algorithms/linear_model/linear_model_train_qr_finalize_impl.i
+++ b/cpp/daal/src/algorithms/linear_model/linear_model_train_qr_finalize_impl.i
@@ -87,7 +87,7 @@ Status FinalizeKernel<algorithmFPType, cpu>::compute(const NumericTable & rTable
 
             for (size_t i = 0; i < nResponses; i++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t j = 0; j < nBetasIntercept; j++)
                 {
@@ -115,7 +115,7 @@ Status FinalizeKernel<algorithmFPType, cpu>::compute(const NumericTable & rTable
     {
         for (size_t i = 0; i < nResponses; i++)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t j = 1; j < nBetas; j++)
             {
@@ -128,7 +128,7 @@ Status FinalizeKernel<algorithmFPType, cpu>::compute(const NumericTable & rTable
     {
         for (size_t i = 0; i < nResponses; i++)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t j = 0; j < nBetas - 1; j++)
             {
diff --git a/cpp/daal/src/algorithms/logistic_regression/logistic_regression_predict_dense_default_batch_impl.i b/cpp/daal/src/algorithms/logistic_regression/logistic_regression_predict_dense_default_batch_impl.i
index cbaa63d87a1..9a1562172ac 100644
--- a/cpp/daal/src/algorithms/logistic_regression/logistic_regression_predict_dense_default_batch_impl.i
+++ b/cpp/daal/src/algorithms/logistic_regression/logistic_regression_predict_dense_default_batch_impl.i
@@ -180,7 +180,7 @@ protected:
             s |= gemvSoa(x, beta + 1, xb, nRows, nCols, xOffset);
             if (bIntercept)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = 0; i < nRows; ++i) xb[i] += beta[0];
             }
diff --git a/cpp/daal/src/algorithms/logitboost/logitboost_impl.i b/cpp/daal/src/algorithms/logitboost/logitboost_impl.i
index 479101ccf21..b6fdee988c7 100644
--- a/cpp/daal/src/algorithms/logitboost/logitboost_impl.i
+++ b/cpp/daal/src/algorithms/logitboost/logitboost_impl.i
@@ -110,7 +110,7 @@ void UpdateP(size_t nc, size_t n, algorithmFPType * F, algorithmFPType * P, algo
 
         algorithmFPType invs = (algorithmFPType)1.0 / s;
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t j = 0; j < nc; j++)
         {
diff --git a/cpp/daal/src/algorithms/logitboost/logitboost_train_friedman_impl.i b/cpp/daal/src/algorithms/logitboost/logitboost_train_friedman_impl.i
index 89a8f6f9e09..a517725e500 100755
--- a/cpp/daal/src/algorithms/logitboost/logitboost_train_friedman_impl.i
+++ b/cpp/daal/src/algorithms/logitboost/logitboost_train_friedman_impl.i
@@ -172,7 +172,7 @@ services::Status UpdateFPNew(size_t nc, size_t n, algorithmFPType * F, algorithm
         /* Update additive function's values
            Step 2.b) of the Algorithm 6 from [1] */
         /* i-row contains Fi() for all classes in i-th point x */
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = start; i < start + size; i++)
         {
@@ -206,7 +206,7 @@ services::Status UpdateFPNew(size_t nc, size_t n, algorithmFPType * F, algorithm
            Step 2.c) of the Algorithm 6 from [1] */
         const bool useFullBuffer = size * nc <= n;
         if (useFullBuffer) daal::internal::MathInst<algorithmFPType, cpu>::vExp(nc * size, F + start * nc, buffer);
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < size; i++)
         {
diff --git a/cpp/daal/src/algorithms/low_order_moments/low_order_moments_estimates_batch.i b/cpp/daal/src/algorithms/low_order_moments/low_order_moments_estimates_batch.i
index 3c071e5ac73..ee573b95204 100644
--- a/cpp/daal/src/algorithms/low_order_moments/low_order_moments_estimates_batch.i
+++ b/cpp/daal/src/algorithms/low_order_moments/low_order_moments_estimates_batch.i
@@ -278,7 +278,7 @@ Status compute_estimates(NumericTable * dataTable, Result * result)
 
                 const algorithmFPType * const argi = _dataArray_block + i * _cd.nFeatures;
 
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t j = 0; j < _cd.nFeatures; j++)
                 {
@@ -373,7 +373,7 @@ Status compute_estimates(NumericTable * dataTable, Result * result)
                     size_t _jstart = iFeatureBlock * numFeaturesInBlock;
                     size_t _jend   = _jstart + ((iFeatureBlock < (numFeatureBlocks - 1)) ? numFeaturesInBlock : numFeaturesInLastBlock);
 
-                    PRAGMA_IVDEP
+                    PRAGMA_FORCE_SIMD
                     PRAGMA_VECTOR_ALWAYS
                     for (size_t j = _jstart; j < _jend; j++)
                     {
@@ -408,7 +408,7 @@ Status compute_estimates(NumericTable * dataTable, Result * result)
             }    /* if(_cd.nFeatures >= _THREAD_REDUCTION_MIN_SIZE_) */
             else /* if(_cd.nFeatures < _THREAD_REDUCTION_MIN_SIZE_) */
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t j = 0; j < _cd.nFeatures; j++)
                 {
@@ -458,7 +458,7 @@ Status compute_estimates(NumericTable * dataTable, Result * result)
 
 #if (defined _VART_ENABLE_ || defined _SORM_ENABLE_)
         const algorithmFPType _invN = algorithmFPType(1.0) / algorithmFPType(_cd.nVectors);
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t j = 0; j < _cd.nFeatures; j++)
         {
diff --git a/cpp/daal/src/algorithms/low_order_moments/low_order_moments_estimates_online.i b/cpp/daal/src/algorithms/low_order_moments/low_order_moments_estimates_online.i
index 93f4ef1d30d..72ec07a02a7 100644
--- a/cpp/daal/src/algorithms/low_order_moments/low_order_moments_estimates_online.i
+++ b/cpp/daal/src/algorithms/low_order_moments/low_order_moments_estimates_online.i
@@ -311,7 +311,7 @@ Status compute_estimates(NumericTable * dataTable, PartialResult * partialResult
 #if defined _MEAN_ENABLE_ || defined _SORM_ENABLE_
                 const algorithmFPType _invN = algorithmFPType(1.0) / algorithmFPType(_td->nvectors + 1);
 #endif
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t j = 0; j < _cd.nFeatures; j++)
                 {
@@ -376,7 +376,7 @@ Status compute_estimates(NumericTable * dataTable, PartialResult * partialResult
             algorithmFPType mean_scale     = algorithmFPType(1.0) / (n1_p_n2);
             algorithmFPType variance_scale = algorithmFPType(1.0) / (n1_p_n2 - algorithmFPType(1.0));
 
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t j = 0; j < _cd.nFeatures; j++)
             {
@@ -415,7 +415,7 @@ Status compute_estimates(NumericTable * dataTable, PartialResult * partialResult
         if (isOnline)
         {
 #if (defined _SUM_ENABLE_) || (defined _MEAN_ENABLE_)
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = 0; i < _cd.nFeatures; i++)
             {
@@ -430,7 +430,7 @@ Status compute_estimates(NumericTable * dataTable, PartialResult * partialResult
             algorithmFPType nVectorsM1 = (algorithmFPType)(_cd.nVectors - 1);
             if (!isOnline)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = 0; i < _cd.nFeatures; i++)
                 {
@@ -441,7 +441,7 @@ Status compute_estimates(NumericTable * dataTable, PartialResult * partialResult
             {
                 if (nObs == 0)
                 {
-                    PRAGMA_IVDEP
+                    PRAGMA_FORCE_SIMD
                     PRAGMA_VECTOR_ALWAYS
                     for (size_t i = 0; i < _cd.nFeatures; i++)
                     {
@@ -454,7 +454,7 @@ Status compute_estimates(NumericTable * dataTable, PartialResult * partialResult
                     algorithmFPType invNVectors     = 1.0 / (algorithmFPType)_cd.nVectors;
                     algorithmFPType coeff           = (algorithmFPType)(nObs * _cd.nVectors) / (algorithmFPType)(nObs + _cd.nVectors);
 
-                    PRAGMA_IVDEP
+                    PRAGMA_FORCE_SIMD
                     PRAGMA_VECTOR_ALWAYS
                     for (size_t i = 0; i < _cd.nFeatures; i++)
                     {
diff --git a/cpp/daal/src/algorithms/low_order_moments/low_order_moments_impl.i b/cpp/daal/src/algorithms/low_order_moments/low_order_moments_impl.i
index b85d3261374..0ed711dea1c 100755
--- a/cpp/daal/src/algorithms/low_order_moments/low_order_moments_impl.i
+++ b/cpp/daal/src/algorithms/low_order_moments/low_order_moments_impl.i
@@ -383,7 +383,7 @@ Status retrievePrecomputedStatsIfPossible(const size_t nFeatures, const size_t n
 
     const algorithmFPType invNVectors = 1.0 / (algorithmFPType)nVectors;
 
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     PRAGMA_VECTOR_ALWAYS
     for (size_t i = 0; i < nFeatures; i++)
     {
@@ -408,7 +408,7 @@ Status computeSumAndVariance(size_t nFeatures, size_t nVectors, algorithmFPType
 
     if (isOnline)
     {
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < nFeatures; i++)
         {
@@ -488,7 +488,7 @@ Status computeMinMaxAndSumOfSquared(const size_t nFeatures, const size_t nVector
             max   = &_array[nfeatures * 1];
             sumSq = &_array[nfeatures * 2];
 
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t j = 0; j < nfeatures; j++)
             {
@@ -524,7 +524,7 @@ Status computeMinMaxAndSumOfSquared(const size_t nFeatures, const size_t nVector
 
         for (size_t i = startRows; i < startRows + chunkRows; i++)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t j = 0; j < nFeatures; j++)
             {
@@ -538,7 +538,7 @@ Status computeMinMaxAndSumOfSquared(const size_t nFeatures, const size_t nVector
     });
 
     tslData.reduce([&](TslData * localTslData) {
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t j = 0; j < nFeatures; j++)
         {
@@ -569,7 +569,7 @@ void computeSumOfSquaredDiffsFromMean(size_t nFeatures, size_t nVectors, size_t
         const algorithmFPType nVectorsM1 = (algorithmFPType)(nVectors - 1);
         if (!isOnline)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = 0; i < nFeatures; i++)
             {
@@ -580,7 +580,7 @@ void computeSumOfSquaredDiffsFromMean(size_t nFeatures, size_t nVectors, size_t
 
         if (prevNVectors == 0)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = 0; i < nFeatures; i++)
             {
@@ -860,7 +860,7 @@ void finalize(LowOrderMomentsFinalizeTask<algorithmFPType, cpu> & task)
 
     size_t nFeatures = task.nFeatures;
 
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     PRAGMA_VECTOR_ALWAYS
     for (size_t i = 0; i < nFeatures; i++)
     {
diff --git a/cpp/daal/src/algorithms/multiclassclassifier/multiclassclassifier_predict_votebased_impl.i b/cpp/daal/src/algorithms/multiclassclassifier/multiclassclassifier_predict_votebased_impl.i
index fafd8e3cd5d..f24604c8e83 100644
--- a/cpp/daal/src/algorithms/multiclassclassifier/multiclassclassifier_predict_votebased_impl.i
+++ b/cpp/daal/src/algorithms/multiclassclassifier/multiclassclassifier_predict_votebased_impl.i
@@ -112,7 +112,7 @@ public:
             if (!s) return Status(ErrorMultiClassFailedToComputeTwoClassPrediction).add(s);
 
             /* Compute votes for the block of input observations */
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = 0; i < nRows; ++i)
             {
diff --git a/cpp/daal/src/algorithms/multiclassclassifier/multiclassclassifier_train_oneagainstone_impl.i b/cpp/daal/src/algorithms/multiclassclassifier/multiclassclassifier_train_oneagainstone_impl.i
index f92189adbfb..3845952ca6f 100644
--- a/cpp/daal/src/algorithms/multiclassclassifier/multiclassclassifier_train_oneagainstone_impl.i
+++ b/cpp/daal/src/algorithms/multiclassclassifier/multiclassclassifier_train_oneagainstone_impl.i
@@ -337,7 +337,7 @@ Status SubTaskDense<algorithmFPType, cpu>::copyDataIntoSubtable(size_t nFeatures
         originalIndicesMap[nRows] = ix;
         _mtX.next(ix, 1);
         DAAL_CHECK_BLOCK_STATUS(_mtX);
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t jx = 0; jx < nFeatures; jx++) this->_subsetX.get()[nRows * nFeatures + jx] = _mtX.get()[jx];
         this->_subsetY[nRows] = label;
@@ -362,14 +362,14 @@ Status SubTaskCSR<algorithmFPType, cpu>::copyDataIntoSubtable(size_t nFeatures,
         originalIndicesMap[nRows] = ix;
         _mtX.next(ix, 1);
         DAAL_CHECK_BLOCK_STATUS(_mtX);
-        const size_t nNonZeroValuesInRow = _mtX.rows()[1] - _mtX.rows()[0];
-        const size_t * colIndices        = _mtX.cols();
-        PRAGMA_IVDEP
-        PRAGMA_VECTOR_ALWAYS
+        const size_t nNonZeroValuesInRow  = _mtX.rows()[1] - _mtX.rows()[0];
+        const size_t * colIndices         = _mtX.cols();
+        const algorithmFPType * mtXValues = _mtX.values();
+        algorithmFPType * subsetXData     = this->_subsetX.get();
         for (size_t jx = 0; jx < nNonZeroValuesInRow; ++jx, ++dataIndex)
         {
-            this->_subsetX.get()[dataIndex] = _mtX.values()[jx];
-            _colIndicesX[dataIndex]         = colIndices[jx];
+            subsetXData[dataIndex]  = mtXValues[jx];
+            _colIndicesX[dataIndex] = colIndices[jx];
         }
         _rowOffsetsX[nRows + 1] = _rowOffsetsX[nRows] + nNonZeroValuesInRow;
         this->_subsetY[nRows]   = label;
diff --git a/cpp/daal/src/algorithms/naivebayes/naivebayes_predict_fast_impl.i b/cpp/daal/src/algorithms/naivebayes/naivebayes_predict_fast_impl.i
index b3385013635..1e8d028bac6 100644
--- a/cpp/daal/src/algorithms/naivebayes/naivebayes_predict_fast_impl.i
+++ b/cpp/daal/src/algorithms/naivebayes/naivebayes_predict_fast_impl.i
@@ -173,7 +173,7 @@ services::Status methodSpecific<defaultDense, algorithmFPType, cpu>::getPredicti
         int max_c                 = 0;
         algorithmFPType max_c_val = -(services::internal::MaxVal<algorithmFPType>::get());
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         for (size_t cl = 0; cl < c; cl++)
         {
             algorithmFPType val = buff[j * c + cl];
@@ -221,7 +221,7 @@ services::Status methodSpecific<fastCSR, algorithmFPType, cpu>::getPredictionDat
         int max_c                 = 0;
         algorithmFPType max_c_val = -(services::internal::MaxVal<algorithmFPType>::get());
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         for (size_t cl = 0; cl < c; cl++)
         {
             algorithmFPType val = buff[j + cl * n];
diff --git a/cpp/daal/src/algorithms/naivebayes/naivebayes_train_impl.i b/cpp/daal/src/algorithms/naivebayes/naivebayes_train_impl.i
index 74a6732140d..71c5079834d 100644
--- a/cpp/daal/src/algorithms/naivebayes/naivebayes_train_impl.i
+++ b/cpp/daal/src/algorithms/naivebayes/naivebayes_train_impl.i
@@ -197,7 +197,7 @@ Status collectCounters(const Parameter * nbPar, NumericTable * ntData, NumericTa
     tls_n_ci.reduce([=](algorithmFPType * v) {
         if (!v) return;
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         PRAGMA_VECTOR_ALIGNED
         for (size_t j = 0; j < c; j++)
@@ -230,13 +230,13 @@ Status mergeModels(const Parameter * nbPar, size_t p, size_t nModels, PartialMod
         const algorithmFPType * in_n_ci = rrCi.set(models[i]->getClassGroupSum().get(), 0, c);
         DAAL_CHECK_BLOCK_STATUS(rrCi);
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         for (size_t j = 0; j < c; j++)
         {
             n_c[j] += in_n_c[j];
         }
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         for (size_t j = 0; j < p * c; j++)
         {
             n_ci[j] += in_n_ci[j];
@@ -361,13 +361,13 @@ services::Status NaiveBayesBatchTrainKernel<algorithmFPType, method, cpu>::compu
         return Status(ErrorMemoryAllocationFailed);
     }
 
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     for (size_t j = 0; j < c; j++)
     {
         n_c[j] = 0;
     }
 
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     for (size_t j = 0; j < p * c; j++)
     {
         n_ci[j] = 0;
diff --git a/cpp/daal/src/algorithms/normalization/minmax/minmax_impl.i b/cpp/daal/src/algorithms/normalization/minmax/minmax_impl.i
index e9c07f25333..6828a3507f6 100644
--- a/cpp/daal/src/algorithms/normalization/minmax/minmax_impl.i
+++ b/cpp/daal/src/algorithms/normalization/minmax/minmax_impl.i
@@ -106,7 +106,7 @@ Status MinMaxKernel<algorithmFPType, method, cpu>::processBlock(const NumericTab
 
     for (size_t i = 0; i < blockSize; i++)
     {
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t j = 0; j < nColumns; j++)
         {
diff --git a/cpp/daal/src/algorithms/normalization/zscore/zscore_dense_sum_impl.i b/cpp/daal/src/algorithms/normalization/zscore/zscore_dense_sum_impl.i
index dff721e281e..d9669e6c1d2 100644
--- a/cpp/daal/src/algorithms/normalization/zscore/zscore_dense_sum_impl.i
+++ b/cpp/daal/src/algorithms/normalization/zscore/zscore_dense_sum_impl.i
@@ -52,7 +52,7 @@ Status ZScoreKernel<algorithmFPType, sumDense, cpu>::computeMeanVariance_thr(Num
     algorithmFPType invNm1 = algorithmFPType(1.0) / (algorithmFPType(nVectors) - algorithmFPType(1.0));
 
     /* Compute means from sums */
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     PRAGMA_VECTOR_ALWAYS
     for (size_t j = 0; j < nFeatures; j++)
     {
@@ -89,7 +89,7 @@ Status ZScoreKernel<algorithmFPType, sumDense, cpu>::computeMeanVariance_thr(Num
 
         for (size_t i = 0; i < _nRows; i++)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t j = 0; j < nFeatures; j++)
             {
@@ -103,7 +103,7 @@ Status ZScoreKernel<algorithmFPType, sumDense, cpu>::computeMeanVariance_thr(Num
     tls_data.reduce([&](algorithmFPType * pVariances) {
         if (pVariances)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t j = 0; j < nFeatures; j++)
             {
@@ -114,7 +114,7 @@ Status ZScoreKernel<algorithmFPType, sumDense, cpu>::computeMeanVariance_thr(Num
     });
     /* Convert array of variances to unbiased */
 
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     PRAGMA_VECTOR_ALWAYS
     for (size_t j = 0; j < nFeatures; j++)
     {
diff --git a/cpp/daal/src/algorithms/normalization/zscore/zscore_impl.i b/cpp/daal/src/algorithms/normalization/zscore/zscore_impl.i
index 7000f806cb6..bb6daa34365 100644
--- a/cpp/daal/src/algorithms/normalization/zscore/zscore_impl.i
+++ b/cpp/daal/src/algorithms/normalization/zscore/zscore_impl.i
@@ -71,7 +71,7 @@ Status ZScoreKernelBase<algorithmFPType, cpu>::common_compute(NumericTable & inp
 
                 for (size_t i = 0; i < _nRows; i++)
                 {
-                    PRAGMA_IVDEP
+                    PRAGMA_FORCE_SIMD
                     PRAGMA_VECTOR_ALWAYS
                     for (size_t j = 0; j < _nFeatures; j++)
                     {
@@ -119,7 +119,7 @@ Status ZScoreKernelBase<algorithmFPType, cpu>::common_compute(NumericTable & inp
 
             for (size_t i = 0; i < _nRows; i++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t j = 0; j < _nFeatures; j++)
                 {
@@ -145,7 +145,7 @@ Status ZScoreKernelBase<algorithmFPType, cpu>::common_compute(NumericTable & inp
 
             for (size_t i = 0; i < _nRows; i++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t j = 0; j < _nFeatures; j++)
                 {
diff --git a/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/cross_entropy_loss_dense_default_batch_impl.i b/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/cross_entropy_loss_dense_default_batch_impl.i
index d6e51a93669..57b3bb82ced 100644
--- a/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/cross_entropy_loss_dense_default_batch_impl.i
+++ b/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/cross_entropy_loss_dense_default_batch_impl.i
@@ -61,7 +61,7 @@ static void applyBetaImpl(const algorithmFPType * x, const algorithmFPType * bet
         BlasInst<algorithmFPType, cpu>::xxgemm(&trans, &notrans, &m, &n, &k, &one, beta + 1, &ldb, x, &k, &zero, xb, &m);
     if (bIntercept)
     {
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < nRows; ++i)
         {
@@ -97,13 +97,13 @@ void CrossEntropyLossKernel<algorithmFPType, method, cpu>::softmax(const algorit
         const algorithmFPType * const pArg = arg + iRow * nCols;
         algorithmFPType * const pRes       = res + iRow * nCols;
         algorithmFPType maxArg             = pArg[0];
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 1; i < nCols; ++i)
         {
             if (maxArg < pArg[i]) maxArg = pArg[i];
         }
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < nCols; ++i)
         {
@@ -120,14 +120,14 @@ void CrossEntropyLossKernel<algorithmFPType, method, cpu>::softmax(const algorit
         {
             algorithmFPType * const pRes = res + iRow * nCols;
             algorithmFPType sum(0.);
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = 0; i < nCols; ++i)
             {
                 sum += pRes[i];
             }
             sum = static_cast<algorithmFPType>(1.) / sum;
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = 0; i < nCols; ++i)
             {
@@ -143,14 +143,14 @@ void CrossEntropyLossKernel<algorithmFPType, method, cpu>::softmax(const algorit
         {
             algorithmFPType * const pRes = res + iRow * nCols;
             algorithmFPType sum(0.);
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = 0; i < nCols; ++i)
             {
                 sum += pRes[i];
             }
             sum = static_cast<algorithmFPType>(1.) / sum;
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = 0; i < nCols; ++i)
             {
@@ -291,7 +291,7 @@ services::Status CrossEntropyLossKernel<algorithmFPType, method, cpu>::doCompute
             {
                 curentNorm = 0;
 
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t j = 0; j < p; j++)
                 {
diff --git a/cpp/daal/src/algorithms/objective_function/logistic_loss/logistic_loss_dense_default_batch_impl.i b/cpp/daal/src/algorithms/objective_function/logistic_loss/logistic_loss_dense_default_batch_impl.i
index 625d99a104a..62dc7007e42 100644
--- a/cpp/daal/src/algorithms/objective_function/logistic_loss/logistic_loss_dense_default_batch_impl.i
+++ b/cpp/daal/src/algorithms/objective_function/logistic_loss/logistic_loss_dense_default_batch_impl.i
@@ -61,7 +61,7 @@ static void applyBetaImpl(const algorithmFPType * x, const algorithmFPType * bet
     }
     if (bIntercept)
     {
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < n; ++i)
         {
@@ -81,7 +81,7 @@ template <typename algorithmFPType, CpuType cpu>
 static void vexp(const algorithmFPType * f, algorithmFPType * exp, size_t n)
 {
     const algorithmFPType expThreshold = daal::internal::MathInst<algorithmFPType, cpu>::vExpThreshold();
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     PRAGMA_VECTOR_ALWAYS
     for (size_t i = 0; i < n; ++i)
     {
@@ -96,7 +96,7 @@ static void vexp(const algorithmFPType * f, algorithmFPType * exp, size_t n)
 template <typename algorithmFPType, CpuType cpu>
 static void sigmoids(algorithmFPType * exp, size_t n, size_t offset)
 {
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     PRAGMA_VECTOR_ALWAYS
     for (size_t i = 0; i < n; ++i)
     {
@@ -112,7 +112,7 @@ void LogLossKernel<algorithmFPType, method, cpu>::sigmoid(const algorithmFPType
     //s = exp(-f)
     vexp<algorithmFPType, cpu>(f, s, n);
     //s = sigm(f)
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     PRAGMA_VECTOR_ALWAYS
     for (size_t i = 0; i < n; ++i)
     {
@@ -370,7 +370,7 @@ services::Status LogLossKernel<algorithmFPType, method, cpu>::doCompute(const Nu
                 const DAAL_INT nN          = static_cast<DAAL_INT>(nRowsToProcess);
                 algorithmFPType * const pg = grads.get() + iBlock * p;
 
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = 0; i < nRowsToProcess; ++i)
                 {
@@ -380,7 +380,7 @@ services::Status LogLossKernel<algorithmFPType, method, cpu>::doCompute(const Nu
                 daal::internal::BlasInst<algorithmFPType, cpu>::xxgemm(&notrans, &notrans, &dim, &yDim, &nN, &one, xLocal, &dim, sgPtrLocal, &nN,
                                                                        &zero, pg, &dim);
 
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = 0; i < nRowsToProcess; ++i)
                 {
diff --git a/cpp/daal/src/algorithms/objective_function/mse/mse_dense_default_batch_impl.i b/cpp/daal/src/algorithms/objective_function/mse/mse_dense_default_batch_impl.i
index b6691f2bcc6..9bd98333e72 100644
--- a/cpp/daal/src/algorithms/objective_function/mse/mse_dense_default_batch_impl.i
+++ b/cpp/daal/src/algorithms/objective_function/mse/mse_dense_default_batch_impl.i
@@ -129,7 +129,7 @@ inline services::Status MSEKernel<algorithmFPType, method, cpu>::compute(Numeric
                     result |= daal::services::internal::daal_memcpy_s(residualPtr, n * yDim * sizeof(algorithmFPType), Y,
                                                                       n * yDim * sizeof(algorithmFPType));
                     size_t compute_matrix = 0;
-                    PRAGMA_IVDEP
+                    PRAGMA_FORCE_SIMD
                     PRAGMA_VECTOR_ALWAYS
                     for (size_t i = 0; i < (nTheta + 1) * yDim; i++)
                     {
@@ -237,7 +237,7 @@ inline services::Status MSEKernel<algorithmFPType, method, cpu>::compute(Numeric
 
                         tlsData.reduceTo(hessianDiagonalPtr, nTheta);
 
-                        PRAGMA_IVDEP
+                        PRAGMA_FORCE_SIMD
                         PRAGMA_VECTOR_ALWAYS
                         for (size_t j = 0; j < nTheta; ++j)
                         {
@@ -278,7 +278,7 @@ inline services::Status MSEKernel<algorithmFPType, method, cpu>::compute(Numeric
                         {
                             if (previousFeatureId == 0 && parameter->interceptFlag)
                             {
-                                PRAGMA_IVDEP
+                                PRAGMA_FORCE_SIMD
                                 PRAGMA_VECTOR_ALWAYS
                                 for (size_t i = 0; i < nDataRows; i++) /*threader for*/
                                 {
@@ -305,7 +305,7 @@ inline services::Status MSEKernel<algorithmFPType, method, cpu>::compute(Numeric
                     {
                         for (size_t i = 0; i < nDataRows; i++) /*threader for*/
                         {
-                            PRAGMA_IVDEP
+                            PRAGMA_FORCE_SIMD
                             PRAGMA_VECTOR_ALWAYS
                             for (size_t ic = 0; ic < yDim; ic++) dotPtr[ic] += residualPtr[i * yDim + ic];
                         }
@@ -366,11 +366,11 @@ inline services::Status MSEKernel<algorithmFPType, method, cpu>::compute(Numeric
                     XY.reset(dim * yDim);
                     XYPtr = XY.get();
 
-                    PRAGMA_IVDEP
+                    PRAGMA_FORCE_SIMD
                     PRAGMA_VECTOR_ALWAYS
                     for (size_t i = 0; i < dim * yDim; i++) XYPtr[i] = 0;
 
-                    PRAGMA_IVDEP
+                    PRAGMA_FORCE_SIMD
                     PRAGMA_VECTOR_ALWAYS
                     for (size_t i = 0; i < dim * dim; i++) gramMatrixPtr[i] = 0;
                     char uplo = 'L';
@@ -415,13 +415,13 @@ inline services::Status MSEKernel<algorithmFPType, method, cpu>::compute(Numeric
                         });
                     }
                     tlsData.reduce([&](algorithmFPType * local) {
-                        PRAGMA_IVDEP
+                        PRAGMA_FORCE_SIMD
                         PRAGMA_VECTOR_ALWAYS
                         for (size_t j = 0; j < dim * yDim; j++)
                         {
                             XYPtr[j] += local[j];
                         }
-                        PRAGMA_IVDEP
+                        PRAGMA_FORCE_SIMD
                         PRAGMA_VECTOR_ALWAYS
                         for (size_t j = 0; j < dim * dim; j++)
                         {
@@ -431,7 +431,7 @@ inline services::Status MSEKernel<algorithmFPType, method, cpu>::compute(Numeric
                     const size_t dimension = dim;
                     for (size_t i = 0; i < dimension; i++)
                     {
-                        PRAGMA_IVDEP
+                        PRAGMA_FORCE_SIMD
                         PRAGMA_VECTOR_ALWAYS
                         for (size_t j = i; j < dimension; j++) gramMatrixPtr[j * dim + i] = gramMatrixPtr[i * dim + j];
                     }
@@ -574,7 +574,7 @@ inline services::Status MSEKernel<algorithmFPType, method, cpu>::compute(Numeric
                         }
                         tlsData.reduceTo(hessianDiagonalPtr, nTheta);
 
-                        PRAGMA_IVDEP
+                        PRAGMA_FORCE_SIMD
                         PRAGMA_VECTOR_ALWAYS
                         for (size_t j = 0; j < nTheta; ++j)
                         {
diff --git a/cpp/daal/src/algorithms/optimization_solver/adagrad/adagrad_dense_default_impl.i b/cpp/daal/src/algorithms/optimization_solver/adagrad/adagrad_dense_default_impl.i
index 92d15fc7a9a..cd17049eedc 100755
--- a/cpp/daal/src/algorithms/optimization_solver/adagrad/adagrad_dense_default_impl.i
+++ b/cpp/daal/src/algorithms/optimization_solver/adagrad/adagrad_dense_default_impl.i
@@ -209,7 +209,7 @@ services::Status AdagradKernel<algorithmFPType, method, cpu>::compute(HostAppIfa
                     processByBlocks<cpu>(
                         nRows,
                         [=](size_t startOffset, size_t nRowsInBlock) {
-                            PRAGMA_IVDEP
+                            PRAGMA_FORCE_SIMD
                             PRAGMA_VECTOR_ALWAYS
                             for (size_t j = startOffset; j < startOffset + nRowsInBlock; j++)
                             {
@@ -225,7 +225,7 @@ services::Status AdagradKernel<algorithmFPType, method, cpu>::compute(HostAppIfa
         processByBlocks<cpu>(
             nRows,
             [=](size_t startOffset, size_t nRowsInBlock) {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t j = startOffset; j < startOffset + nRowsInBlock; j++)
                 {
diff --git a/cpp/daal/src/algorithms/optimization_solver/iterative_solver_kernel.h b/cpp/daal/src/algorithms/optimization_solver/iterative_solver_kernel.h
index 8a6683ed517..e94ef7884b8 100755
--- a/cpp/daal/src/algorithms/optimization_solver/iterative_solver_kernel.h
+++ b/cpp/daal/src/algorithms/optimization_solver/iterative_solver_kernel.h
@@ -95,7 +95,7 @@ class IterativeSolverKernel : public Kernel
                 algorithmFPType * normPtr = normTls.local();
                 DAAL_CHECK_THR(normPtr, services::ErrorMemoryAllocationFailed);
                 PRAGMA_VECTOR_ALWAYS
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 for (size_t j = 0; j < nRowsInBlock; j++)
                 {
                     *normPtr += vecLocal[j] * vecLocal[j];
@@ -117,7 +117,7 @@ class IterativeSolverKernel : public Kernel
         res = 0;
         if (nElements < blockStartThreshold)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t j = 0; j < nElements; j++)
             {
diff --git a/cpp/daal/src/algorithms/optimization_solver/lbfgs/lbfgs_dense_default_impl.i b/cpp/daal/src/algorithms/optimization_solver/lbfgs/lbfgs_dense_default_impl.i
index 23ef6bdf7a6..5f57ed231c5 100755
--- a/cpp/daal/src/algorithms/optimization_solver/lbfgs/lbfgs_dense_default_impl.i
+++ b/cpp/daal/src/algorithms/optimization_solver/lbfgs/lbfgs_dense_default_impl.i
@@ -550,7 +550,7 @@ template <typename algorithmFPType, CpuType cpu>
 void LBFGSTask<algorithmFPType, cpu>::computeCorrectionPairImpl(size_t correctionIndex, const algorithmFPType * hessian, bool useWolfeConditions)
 {
     algorithmFPType * s = correctionS + correctionIndex * this->argumentSize;
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     PRAGMA_VECTOR_ALWAYS
     for (size_t j = 0; j < this->argumentSize; j++)
     {
@@ -568,7 +568,7 @@ void LBFGSTask<algorithmFPType, cpu>::computeCorrectionPairImpl(size_t correctio
         algorithmFPType * gradientPrev = (algorithmFPType *)_gradientPrevPtr.get();
         algorithmFPType * gradientCurr = (algorithmFPType *)_gradientCurrPtr.get();
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t j = 0; j < this->argumentSize; j++)
         {
diff --git a/cpp/daal/src/algorithms/optimization_solver/saga/saga_dense_default_impl.i b/cpp/daal/src/algorithms/optimization_solver/saga/saga_dense_default_impl.i
index 97639f68d93..04705edc5c8 100755
--- a/cpp/daal/src/algorithms/optimization_solver/saga/saga_dense_default_impl.i
+++ b/cpp/daal/src/algorithms/optimization_solver/saga/saga_dense_default_impl.i
@@ -284,7 +284,7 @@ services::Status SagaKernel<algorithmFPType, method, cpu>::compute(HostAppIface
         result |= daal::services::internal::daal_memcpy_s(previous, sizeArgument * sizeof(algorithmFPType), workValue,
                                                           sizeArgument * sizeof(algorithmFPType));
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t k = 0; k < sizeArgument; k++)
         {
@@ -313,7 +313,7 @@ services::Status SagaKernel<algorithmFPType, method, cpu>::compute(HostAppIface
             }
         }
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t k = 0; k < sizeArgument; k++)
         {
diff --git a/cpp/daal/src/algorithms/outlierdetection_multivariate/outlierdetection_multivariate_dense_default_impl.i b/cpp/daal/src/algorithms/outlierdetection_multivariate/outlierdetection_multivariate_dense_default_impl.i
index 93fbf23a684..bb8fffaad90 100644
--- a/cpp/daal/src/algorithms/outlierdetection_multivariate/outlierdetection_multivariate_dense_default_impl.i
+++ b/cpp/daal/src/algorithms/outlierdetection_multivariate/outlierdetection_multivariate_dense_default_impl.i
@@ -64,7 +64,7 @@ inline void OutlierDetectionKernel<algorithmFPType, method, cpu>::mahalanobisDis
     algorithmFPType * dataCenPtr    = dataCen;
     for (size_t i = 0; i < nVectors; i++, dataPtr += nFeatures, dataCenPtr += nFeatures)
     {
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t j = 0; j < nFeatures; j++)
         {
@@ -79,7 +79,7 @@ inline void OutlierDetectionKernel<algorithmFPType, method, cpu>::mahalanobisDis
     for (size_t i = 0; i < nVectors; i++, dataCenPtr += nFeatures, dataCenInvScatterPtr += nFeatures)
     {
         distance[i] = zero;
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t j = 0; j < nFeatures; j++)
         {
diff --git a/cpp/daal/src/algorithms/outlierdetection_univariate/outlierdetection_univariate_kernel.h b/cpp/daal/src/algorithms/outlierdetection_univariate/outlierdetection_univariate_kernel.h
index a4ba5865f17..cf985c6e16f 100644
--- a/cpp/daal/src/algorithms/outlierdetection_univariate/outlierdetection_univariate_kernel.h
+++ b/cpp/daal/src/algorithms/outlierdetection_univariate/outlierdetection_univariate_kernel.h
@@ -60,7 +60,7 @@ struct OutlierDetectionKernel : public Kernel
         const algorithmFPType zero(0.0);
         const algorithmFPType one(1.0);
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t j = 0; j < nFeatures; j++)
         {
@@ -97,7 +97,7 @@ struct OutlierDetectionKernel : public Kernel
             algorithmFPType diff;
             for (size_t i = 0; i < nRowsInBlock; i++, dataPtr += nFeatures, weightPtr += nFeatures)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t j = 0; j < nFeatures; j++)
                 {
diff --git a/cpp/daal/src/algorithms/pca/metrics/pca_explained_variance_default_batch_impl.i b/cpp/daal/src/algorithms/pca/metrics/pca_explained_variance_default_batch_impl.i
index 7baf293d87e..0b60e6784d9 100644
--- a/cpp/daal/src/algorithms/pca/metrics/pca_explained_variance_default_batch_impl.i
+++ b/cpp/daal/src/algorithms/pca/metrics/pca_explained_variance_default_batch_impl.i
@@ -75,7 +75,7 @@ Status ExplainedVarianceKernel<method, algorithmFPType, cpu>::compute(const Nume
         if (id >= nComponents) noiseSum += pEigenvalues[id];
     }
 
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     PRAGMA_VECTOR_ALWAYS
     for (size_t id = 0; id < nComponents; ++id)
     {
diff --git a/cpp/daal/src/algorithms/pca/pca_dense_svd_batch_impl.i b/cpp/daal/src/algorithms/pca/pca_dense_svd_batch_impl.i
index 352dd0b229c..cbed9a6d78b 100644
--- a/cpp/daal/src/algorithms/pca/pca_dense_svd_batch_impl.i
+++ b/cpp/daal/src/algorithms/pca/pca_dense_svd_batch_impl.i
@@ -230,7 +230,7 @@ services::Status PCASVDBatchKernel<algorithmFPType, ParameterType, cpu>::normali
         {
             const algorithmFPType _invN = algorithmFPType(1.0) / algorithmFPType(tls_data_local->nvectors + 1);
 
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t j = 0; j < nFeatures; j++)
             {
@@ -258,7 +258,7 @@ services::Status PCASVDBatchKernel<algorithmFPType, ParameterType, cpu>::normali
         const algorithmFPType inv_n1_p_n2       = algorithmFPType(1.0) / (n1_p_n2);
         const algorithmFPType inv_n1_p_n2_m1    = algorithmFPType(1.0) / (n1_p_n2 - algorithmFPType(1.0));
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t j = 0; j < nFeatures; j++)
         {
@@ -280,7 +280,7 @@ services::Status PCASVDBatchKernel<algorithmFPType, ParameterType, cpu>::normali
     if (!safeStat) return safeStat.detach();
 
     /* Convert array of variances to inverse sigma's */
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     PRAGMA_VECTOR_ALWAYS
     for (size_t j = 0; j < nFeatures; j++)
     {
@@ -297,7 +297,7 @@ services::Status PCASVDBatchKernel<algorithmFPType, ParameterType, cpu>::normali
 
         for (size_t i = 0; i < nVectors_local; i++)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t j = 0; j < nFeatures; j++)
             {
diff --git a/cpp/daal/src/algorithms/pca/pca_dense_svd_online_impl.i b/cpp/daal/src/algorithms/pca/pca_dense_svd_online_impl.i
index 09b932c4965..4d03f27fe84 100644
--- a/cpp/daal/src/algorithms/pca/pca_dense_svd_online_impl.i
+++ b/cpp/daal/src/algorithms/pca/pca_dense_svd_online_impl.i
@@ -156,7 +156,7 @@ inline void normalizeData(const size_t nObservations, const size_t nFeatures, co
 {
     for (size_t i = 0; i < nObservations; i++)
     {
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t j = 0; j < nFeatures; j++)
         {
diff --git a/cpp/daal/src/algorithms/pca/transform/pca_transform_dense_default_batch_impl.i b/cpp/daal/src/algorithms/pca/transform/pca_transform_dense_default_batch_impl.i
index b8947cc856d..a5e8b4f1524 100644
--- a/cpp/daal/src/algorithms/pca/transform/pca_transform_dense_default_batch_impl.i
+++ b/cpp/daal/src/algorithms/pca/transform/pca_transform_dense_default_batch_impl.i
@@ -78,7 +78,7 @@ services::Status ComputeInvSigmas(NumericTable * pVariances, TArray<algorithmFPT
         DAAL_CHECK_BLOCK_STATUS(dataRows);
         const algorithmFPType * pRawVariances = dataRows.get();
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t varianceId = 0; varianceId < numFeatures; ++varianceId)
         {
@@ -181,14 +181,14 @@ services::Status TransformKernel<algorithmFPType, method, cpu>::compute(NumericT
             for (size_t rowId = 0; rowId < numRows; ++rowId)
             {
                 /* compute centering if numMeans != 0 */
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t colId = 0; colId < numMeans; ++colId)
                 {
                     pCopyBlock[rowId * numMeans + colId] = pDataBlock[rowId * numMeans + colId] - pRawMeans[colId];
                 }
                 /* compute normalization to unit variance if numInvSigmas!= 0 */
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t colId = 0; colId < numInvSigmas; ++colId)
                 {
@@ -203,7 +203,7 @@ services::Status TransformKernel<algorithmFPType, method, cpu>::compute(NumericT
         {
             for (size_t rowId = 0; rowId < numRows; ++rowId)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t colId = 0; colId < numComponents; ++colId)
                 {
diff --git a/cpp/daal/src/algorithms/qr/qr_dense_default_batch_impl.i b/cpp/daal/src/algorithms/qr/qr_dense_default_batch_impl.i
index 3e052f7b0af..b28d40b6167 100644
--- a/cpp/daal/src/algorithms/qr/qr_dense_default_batch_impl.i
+++ b/cpp/daal/src/algorithms/qr/qr_dense_default_batch_impl.i
@@ -252,7 +252,7 @@ Status QRBatchKernel<algorithmFPType, method, cpu>::compute_thr(const size_t na,
             /* Get transposed Q from A */
             for (size_t i = 0; i < cols_local; i++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 for (size_t j = 0; j < brows_local; j++)
                 {
                     QT_local[i * brows_local + j] = A_block[i + j * cols_local];
@@ -266,7 +266,7 @@ Status QRBatchKernel<algorithmFPType, method, cpu>::compute_thr(const size_t na,
             /* Transpose Q */
             for (size_t i = 0; i < cols_local; i++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 for (size_t j = 0; j < brows_local; j++)
                 {
                     Q_block[i + j * cols_local] = QT_local[i * brows_local + j];
@@ -277,16 +277,13 @@ Status QRBatchKernel<algorithmFPType, method, cpu>::compute_thr(const size_t na,
             for (size_t i = 0; i < cols_local; i++)
             {
                 size_t j;
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 for (j = 0; j <= i; j++)
                 {
                     RT_buff[k * cols_local + i * cols_local * blocks + j] = RT_local[i * cols_local + j];
                 }
-                PRAGMA_IVDEP
-                for (; j < cols_local; j++)
-                {
-                    RT_buff[k * cols_local + i * cols_local * blocks + j] = 0.0;
-                }
+
+                service_memset<algorithmFPType, cpu>(&RT_buff[k * cols_local + i * cols_local * blocks + i + 1], 0.0, cols_local - i - 1);
             }
         });
     }
@@ -307,7 +304,7 @@ Status QRBatchKernel<algorithmFPType, method, cpu>::compute_thr(const size_t na,
         algorithmFPType * R_output = bkR_output.get();
         for (size_t i = 0; i < cols; i++)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             for (size_t j = 0; j < cols; j++)
             {
                 R_output[i + j * cols] = R_buff[i * cols + j];
@@ -336,7 +333,7 @@ Status QRBatchKernel<algorithmFPType, method, cpu>::compute_thr(const size_t na,
         /* Transpose RB */
         for (size_t i = 0; i < cols_local; i++)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             for (size_t j = 0; j < cols_local; j++)
             {
                 RT_local[j * cols_local + i] = RT_buff[j * cols_local * blocks + k * cols_local + i];
@@ -346,7 +343,7 @@ Status QRBatchKernel<algorithmFPType, method, cpu>::compute_thr(const size_t na,
         /* Transpose Q to QT */
         for (size_t i = 0; i < cols_local; i++)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             for (size_t j = 0; j < brows_local; j++)
             {
                 QT_local[i * brows_local + j] = Q_block[i + j * cols_local];
@@ -360,7 +357,7 @@ Status QRBatchKernel<algorithmFPType, method, cpu>::compute_thr(const size_t na,
         /* Transpose result Q */
         for (size_t i = 0; i < cols_local; i++)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             for (size_t j = 0; j < brows_local; j++)
             {
                 Q_block[i + j * cols_local] = QT_result_local[i * brows_local + j];
diff --git a/cpp/daal/src/algorithms/qr/qr_dense_default_pcl_impl.i b/cpp/daal/src/algorithms/qr/qr_dense_default_pcl_impl.i
index ce438f8b1fa..1659cf96a0a 100755
--- a/cpp/daal/src/algorithms/qr/qr_dense_default_pcl_impl.i
+++ b/cpp/daal/src/algorithms/qr/qr_dense_default_pcl_impl.i
@@ -339,7 +339,7 @@ static void tsqr(algorithmFPType * A, const size_t nrows, const size_t ncols, al
                     A_local[i * ncols + j] = a_local[i + j * local_tiles * ncols];
                 }
 
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = j + 1; i < ncols; i++)
                 {
@@ -352,7 +352,7 @@ static void tsqr(algorithmFPType * A, const size_t nrows, const size_t ncols, al
             // If onlyV then no needs to save to A array (inplace)
             for (size_t j = 0; j < ncols; j++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = j + 1; i < ncols; i++)
                 {
@@ -406,7 +406,7 @@ static void tsqr(algorithmFPType * A, const size_t nrows, const size_t ncols, al
             // Reset area under upper triangle to 0. Just in case Intel(R) MKL set them.
             for (size_t j = 0; j < ncols; j++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = j + 1; i < ncols; i++)
                 {
@@ -418,14 +418,14 @@ static void tsqr(algorithmFPType * A, const size_t nrows, const size_t ncols, al
 
         for (size_t j = 0; j < ncols; j++)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = 0; i < j + 1; i++)
             {
                 R_local[i + Rda * j] = a_local[i + local_tiles * ncols * j];
             }
 
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = j + 1; i < ncols; i++)
             {
@@ -456,7 +456,7 @@ static void tsqr(algorithmFPType * A, const size_t nrows, const size_t ncols, al
 
             for (size_t j = 0; j < ncols; j++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = 0; i < j + 1; i++)
                 {
@@ -470,7 +470,7 @@ static void tsqr(algorithmFPType * A, const size_t nrows, const size_t ncols, al
         // of only V required - save only upper part of R array
         for (size_t j = 0; j < ncols; j++)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = 0; i < j + 1; i++)
             {
@@ -532,13 +532,13 @@ static void tsgetq(algorithmFPType * A, const size_t nrows, const size_t ncols,
         // Copy triangles from A
         for (size_t j = 0; j < ncols; j++)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = 0; i < j + 1; i++)
             {
                 R_local[i + Rda * j] = A_local[i * ncols + j];
             }
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = j + 1; i < ncols; i++)
             {
@@ -591,13 +591,13 @@ static void tsgetq(algorithmFPType * A, const size_t nrows, const size_t ncols,
         // Copy stacked triangle to top of "a" buffer
         for (size_t j = 0; j < ncols; j++)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = 0; i < j + 1; i++)
             {
                 a[i + j * local_tiles * ncols] = R_local[i + Rda * j];
             }
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = j + 1; i < local_tiles * ncols; i++)
             {
@@ -608,7 +608,7 @@ static void tsgetq(algorithmFPType * A, const size_t nrows, const size_t ncols,
         // Zero out top of "b" buffer
         for (size_t j = 0; j < ncols; j++)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = 0; i < ncols; i++)
             {
@@ -632,7 +632,7 @@ static void tsgetq(algorithmFPType * A, const size_t nrows, const size_t ncols,
             // Copy Q into bottom portion of "b" buffer
             for (size_t j = 0; j < ncols; j++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = 0; i < height; i++)
                 {
@@ -643,7 +643,7 @@ static void tsgetq(algorithmFPType * A, const size_t nrows, const size_t ncols,
             // Zero out bottom portion of "a" buffer
             for (size_t j = 0; j < ncols; j++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = 0; i < height; i++)
                 {
@@ -672,7 +672,7 @@ static void tsgetq(algorithmFPType * A, const size_t nrows, const size_t ncols,
             // Just in case Intel(R) MKL wrote something here
             for (size_t j = 0; j < ncols; j++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = j + 1; i < ncols; i++)
                 {
@@ -696,7 +696,7 @@ static void tsgetq(algorithmFPType * A, const size_t nrows, const size_t ncols,
         // Copy entire Q factor into "b" buffer
         for (size_t j = 0; j < ncols; j++)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = j + 1; i < first_height; i++)
             {
@@ -708,7 +708,7 @@ static void tsgetq(algorithmFPType * A, const size_t nrows, const size_t ncols,
         // Only apply Q to upper triangle of "a".
         for (size_t j = 0; j < ncols; j++)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = ncols; i < first_height; i++)
             {
@@ -807,14 +807,14 @@ static void tsapplyq(algorithmFPType * A, const size_t nrows, const size_t ncols
         // Fill "R_local" buffer with stacked upper triangular matrices
         for (size_t j = 0; j < ncols; j++)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = 0; i < j + 1; i++)
             {
                 R_local[i + Rda * j] = A_local[i * ncols + j];
             }
 
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = j + 1; i < ncols; i++)
             {
@@ -827,7 +827,7 @@ static void tsapplyq(algorithmFPType * A, const size_t nrows, const size_t ncols
             // Fill "R2_local" top square with top square of matrix being multiplied
             for (size_t j = 0; j < ncols; j++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = 0; i < ncols; i++)
                 {
@@ -840,7 +840,7 @@ static void tsapplyq(algorithmFPType * A, const size_t nrows, const size_t ncols
             // memset 0
             for (size_t j = 0; j < ncols; j++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = 0; i < ncols; i++)
                 {
@@ -900,7 +900,7 @@ static void tsapplyq(algorithmFPType * A, const size_t nrows, const size_t ncols
         // Copy my square of R2 to top of "a" buffer
         for (size_t j = 0; j < ncols; j++)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = 0; i < ncols; i++)
             {
@@ -911,7 +911,7 @@ static void tsapplyq(algorithmFPType * A, const size_t nrows, const size_t ncols
         // Zero out top of "b" buffer
         for (size_t j = 0; j < ncols; j++)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = 0; i < ncols; i++)
             {
@@ -936,7 +936,7 @@ static void tsapplyq(algorithmFPType * A, const size_t nrows, const size_t ncols
             // Copy Q factor into bottom portion of "b" buffer
             for (size_t j = 0; j < ncols; j++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = 0; i < height; i++)
                 {
@@ -947,7 +947,7 @@ static void tsapplyq(algorithmFPType * A, const size_t nrows, const size_t ncols
             // Zero out bottom portion of "a" buffer
             for (size_t j = 0; j < ncols; j++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = 0; i < height; i++)
                 {
@@ -974,7 +974,7 @@ static void tsapplyq(algorithmFPType * A, const size_t nrows, const size_t ncols
             // Copy bottom portion of "a" buffer to output
             for (size_t j = 0; j < ncols; j++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = 0; i < height; i++)
                 {
@@ -987,7 +987,7 @@ static void tsapplyq(algorithmFPType * A, const size_t nrows, const size_t ncols
         // Fill "b" buffer with entire Q factor
         for (size_t j = 0; j < ncols; j++)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = j + 1; i < first_height; i++)
             {
@@ -998,7 +998,7 @@ static void tsapplyq(algorithmFPType * A, const size_t nrows, const size_t ncols
         // Zero out bottom portion of "a" buffer
         for (size_t j = 0; j < ncols; j++)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = ncols; i < first_height; i++)
             {
@@ -1023,7 +1023,7 @@ static void tsapplyq(algorithmFPType * A, const size_t nrows, const size_t ncols
         // Write result from "a" buffer to output
         for (size_t j = 0; j < ncols; j++)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = 0; i < first_height; i++)
             {
@@ -1090,7 +1090,7 @@ static int qr_pcl(const algorithmFPType * A_in,                        /* nrows
                 e = num;
             }
 
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = b; i < e; i++)
             {
@@ -1109,14 +1109,14 @@ static int qr_pcl(const algorithmFPType * A_in,                        /* nrows
 
         for (size_t i = 0; i < ncols; i++)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t j = 0; j < i; j++)
             {
                 R_out[i * ncols + j] = 0;
             }
 
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t j = i; j < ncols; j++)
             {
@@ -1191,7 +1191,7 @@ static int svd_pcl(algorithmFPType * A_in,
                 e = num;
             }
 
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = b; i < e; i++)
             {
@@ -1231,14 +1231,14 @@ static int svd_pcl(algorithmFPType * A_in,
 
             for (size_t j = 0; j < ncols; j++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = 0; i < j + 1; i++)
                 {
                     R[j * ncols + i] = R_out[i * ncols + j];
                 }
 
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = j + 1; i < ncols; i++)
                 {
@@ -1256,14 +1256,14 @@ static int svd_pcl(algorithmFPType * A_in,
 
             for (size_t j = 0; j < ncols; j++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = 0; i < j + 1; i++)
                 {
                     R[j * ncols + i] = V[i * ncols + j];
                 }
 
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t i = j + 1; i < ncols; i++)
                 {
@@ -1290,7 +1290,7 @@ static int svd_pcl(algorithmFPType * A_in,
         {
             for (size_t i = 0; i < ncols; i++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t j = 0; j < ncols; j++)
                 {
diff --git a/cpp/daal/src/algorithms/service_kernel_math.h b/cpp/daal/src/algorithms/service_kernel_math.h
index 0fcb2b0eb94..d029656c32c 100644
--- a/cpp/daal/src/algorithms/service_kernel_math.h
+++ b/cpp/daal/src/algorithms/service_kernel_math.h
@@ -169,7 +169,7 @@ class EuclideanDistances : public PairwiseDistances<FPType, cpu>
         const FPType * const aa = normBufferA.get() + aOffset;
         const FPType * const bb = (&_a == &_b) ? normBufferA.get() + bOffset : normBufferB.get() + bOffset;
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < nRowsC; i++)
         {
@@ -276,7 +276,7 @@ class EuclideanDistances : public PairwiseDistances<FPType, cpu>
             for (size_t i = 0; i < end - begin; i++)
             {
                 FPType sum = FPType(0);
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_ICC_NO16(omp simd reduction(+ : sum))
                 for (size_t j = 0; j < nCols; j++)
                 {
@@ -354,7 +354,7 @@ class CosineDistances : public EuclideanDistances<FPType, cpu>
 
         for (size_t i = 0; i < nRowsC; i++)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t j = 0; j < nColsC; j++)
             {
@@ -775,7 +775,7 @@ bool solveEquationsSystemWithSpectralDecomposition(FPType * a, FPType * b, size_
     DAAL_INT num_taken = static_cast<DAAL_INT>(n) - num_discarded;
     daal::internal::MathInst<FPType, cpu>::vSqrt(num_taken, eigenvalues.get() + num_discarded, eigenvalues.get() + num_discarded);
     DAAL_INT one = 1;
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     for (size_t col = num_discarded; col < n; col++)
     {
         const FPType scale = eigenvalues[col];
diff --git a/cpp/daal/src/algorithms/svd/svd_dense_default_batch_impl.i b/cpp/daal/src/algorithms/svd/svd_dense_default_batch_impl.i
index 94d2df1e118..955a48f0eff 100644
--- a/cpp/daal/src/algorithms/svd/svd_dense_default_batch_impl.i
+++ b/cpp/daal/src/algorithms/svd/svd_dense_default_batch_impl.i
@@ -365,7 +365,7 @@ Status SVDBatchKernel<algorithmFPType, method, cpu>::compute_thr(const size_t na
             /* Get transposed Q from A */
             for (size_t i = 0; i < cols_local; i++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 for (size_t j = 0; j < brows_local; j++)
                 {
                     QT_local[i * brows_local + j] = A_block[i + j * cols_local];
@@ -383,7 +383,7 @@ Status SVDBatchKernel<algorithmFPType, method, cpu>::compute_thr(const size_t na
             /* Transpose Q */
             for (size_t i = 0; i < cols_local; i++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 for (size_t j = 0; j < brows_local; j++)
                 {
                     Q_block[i + j * cols_local] = QT_local[i * brows_local + j];
@@ -394,16 +394,13 @@ Status SVDBatchKernel<algorithmFPType, method, cpu>::compute_thr(const size_t na
             for (size_t i = 0; i < cols_local; i++)
             {
                 size_t j;
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 for (j = 0; j <= i; j++)
                 {
                     RT_buff[k * cols_local + i * cols_local * blocks + j] = RT_local[i * cols_local + j];
                 }
-                PRAGMA_IVDEP
-                for (; j < cols_local; j++)
-                {
-                    RT_buff[k * cols_local + i * cols_local * blocks + j] = 0.0;
-                }
+                /* Zero columns (i, cols_local) of row i. Sequential fill: this body already runs as a task of the enclosing lambda, so no nested threading */
+                service_memset_seq<algorithmFPType, cpu>(&RT_buff[k * cols_local + i * cols_local * blocks + i + 1], algorithmFPType(0), cols_local - i - 1);
             }
         });
     }
@@ -446,7 +443,7 @@ Status SVDBatchKernel<algorithmFPType, method, cpu>::compute_thr(const size_t na
             /* Transpose result R and save to V output */
             for (size_t i = 0; i < cols; i++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 for (size_t j = 0; j < nComponents; j++)
                 {
                     V_output[i + j * cols] = V_buff[i * cols + j];
@@ -484,7 +481,7 @@ Status SVDBatchKernel<algorithmFPType, method, cpu>::compute_thr(const size_t na
             /* Transpose RB */
             for (size_t i = 0; i < cols_local; i++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 for (size_t j = 0; j < cols_local; j++)
                 {
                     RT_block[i * cols_local + j] = R_buff[j * cols_local * blocks + k * cols_local + i];
@@ -494,7 +491,7 @@ Status SVDBatchKernel<algorithmFPType, method, cpu>::compute_thr(const size_t na
             /* Transpose Q to QT */
             for (size_t i = 0; i < cols_local; i++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 for (size_t j = 0; j < brows_local; j++)
                 {
                     QT_local[i * brows_local + j] = Q_block[i + j * cols_local];
@@ -504,7 +501,7 @@ Status SVDBatchKernel<algorithmFPType, method, cpu>::compute_thr(const size_t na
             /* Transpose R to RT */
             for (size_t i = 0; i < cols_local; i++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 for (size_t j = 0; j < cols_local; j++)
                 {
                     RT_local[i * cols_local + j] = RT_block[i + j * cols_local];
@@ -518,7 +515,7 @@ Status SVDBatchKernel<algorithmFPType, method, cpu>::compute_thr(const size_t na
             /* Transpose result Q */
             for (size_t i = 0; i < cols_local; i++)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 for (size_t j = 0; j < brows_local; j++)
                 {
                     U_block[i + j * cols_local] = QT_result_local[i * brows_local + j];
@@ -594,7 +591,7 @@ Status SVDBatchKernel<algorithmFPType, method, cpu>::compute_pcl(const size_t na
         algorithmFPType * tV = vBlock.get();
         for (size_t i = 0; i < nComponents; i++)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             for (size_t j = 0; j < n; j++)
             {
                 tV[i * n + j] = V[i * n + j];
diff --git a/cpp/daal/src/algorithms/svd/svd_dense_default_impl.i b/cpp/daal/src/algorithms/svd/svd_dense_default_impl.i
index c94b145a802..1c5662cf433 100755
--- a/cpp/daal/src/algorithms/svd/svd_dense_default_impl.i
+++ b/cpp/daal/src/algorithms/svd/svd_dense_default_impl.i
@@ -181,7 +181,7 @@ Status compute_QR_on_one_node(DAAL_INT m, DAAL_INT n, algorithmFPType * a_q, DAA
     // Get R of the QR factorization formed by xgeqrf
     for (DAAL_INT i = 0; i < nColumnsInQ; ++i)
     {
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (DAAL_INT j = 0; j <= i; ++j)
         {
@@ -194,14 +194,14 @@ Status compute_QR_on_one_node(DAAL_INT m, DAAL_INT n, algorithmFPType * a_q, DAA
         const algorithmFPType zero(0.0);
         for (size_t i = m; i < n; ++i)
         {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t j = 0; j < m; ++j)
             {
                 r[i * ldr + j] = a_q[i * lda_q + j];
             }
 
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t j = m; j <= i; ++j)
             {
diff --git a/cpp/daal/src/algorithms/svm/svm_train_thunder_impl.i b/cpp/daal/src/algorithms/svm/svm_train_thunder_impl.i
index 104fd10f801..0d94b813afb 100644
--- a/cpp/daal/src/algorithms/svm/svm_train_thunder_impl.i
+++ b/cpp/daal/src/algorithms/svm/svm_train_thunder_impl.i
@@ -357,7 +357,7 @@ services::Status SVMTrainImpl<thunder, algorithmFPType, cpu>::SMOBlockSolver(
         daal::threader_for(nBlocks, nBlocks, [&](const size_t iBlock) {
             const size_t startRow = iBlock * blockSizeWS;
 
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (size_t i = startRow; i < startRow + blockSizeWS; ++i)
             {
@@ -375,7 +375,7 @@ services::Status SVMTrainImpl<thunder, algorithmFPType, cpu>::SMOBlockSolver(
                 Ii |= (yLocal[i] > 0) ? positive : negative;
                 I[i] = Ii;
 
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t j = 0; j < nWS; ++j)
                 {
@@ -508,7 +508,7 @@ services::Status SVMTrainImpl<thunder, algorithmFPType, cpu>::SMOBlockSolver(
         const algorithmFPType * const KBjBlock = &kernelLocal[Bj * nWS];
 
         /* Update gradient */
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < nWS; i++)
         {
@@ -521,7 +521,7 @@ services::Status SVMTrainImpl<thunder, algorithmFPType, cpu>::SMOBlockSolver(
     localDiff = firstDiff;
 
     /* Compute diff and scatter to alpha vector */
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     PRAGMA_VECTOR_ALWAYS
     for (size_t i = 0; i < nWS; ++i)
     {
@@ -557,7 +557,7 @@ services::Status SVMTrainImpl<thunder, algorithmFPType, cpu>::updateGrad(algorit
 
             if (startRowGrad < nVectors && startRowGrad + nRowsInBlockGrad > nVectors)
             {
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t j = 0; j < nRowsInBlockGrad; ++j)
                 {
diff --git a/cpp/daal/src/algorithms/tsne/tsne_gradient_descent_impl.i b/cpp/daal/src/algorithms/tsne/tsne_gradient_descent_impl.i
index 2122c8c532e..0062954904e 100644
--- a/cpp/daal/src/algorithms/tsne/tsne_gradient_descent_impl.i
+++ b/cpp/daal/src/algorithms/tsne/tsne_gradient_descent_impl.i
@@ -115,7 +115,7 @@ struct MemoryCtxType
         const DataType * xInit = xInitDataBlock.get();
         const DataType * yInit = yInitDataBlock.get();
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < capacity; i++)
         {
@@ -133,7 +133,7 @@ struct MemoryCtxType
         DataType * xInit = xInitDataBlock.get();
         DataType * yInit = yInitDataBlock.get();
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < _capacity; i++)
         {
diff --git a/cpp/daal/src/data_management/roc_auc_score.cpp b/cpp/daal/src/data_management/roc_auc_score.cpp
index 059c641eb49..93690366883 100644
--- a/cpp/daal/src/data_management/roc_auc_score.cpp
+++ b/cpp/daal/src/data_management/roc_auc_score.cpp
@@ -79,7 +79,7 @@ services::Status rocAucScoreImpl(const NumericTablePtr & truePrediction, const N
         }
         elementsInBlock = j - i + 1;
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t j = 0; j < elementsInBlock; ++j)
         {
diff --git a/cpp/daal/src/data_management/train_test_split.cpp b/cpp/daal/src/data_management/train_test_split.cpp
index df7ad0d6d1e..16f567974ee 100755
--- a/cpp/daal/src/data_management/train_test_split.cpp
+++ b/cpp/daal/src/data_management/train_test_split.cpp
@@ -170,7 +170,7 @@ services::Status assignColumnValues(const DataType * origDataPtr, const NumericT
     DataType * dataPtr = dataBlock.get();
     DAAL_CHECK_MALLOC(dataPtr);
 
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     PRAGMA_VECTOR_ALWAYS
     for (size_t i = 0; i < nRows; ++i)
     {
@@ -232,7 +232,7 @@ services::Status assignRows(const DataType * origDataPtr, const NumericTablePtr
 
     for (size_t i = 0; i < nRows; ++i)
     {
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t j = 0; j < nColumns; ++j)
         {
diff --git a/cpp/daal/src/externals/service_memory.h b/cpp/daal/src/externals/service_memory.h
index 8cb4bc1b9f0..2858c111366 100644
--- a/cpp/daal/src/externals/service_memory.h
+++ b/cpp/daal/src/externals/service_memory.h
@@ -60,7 +60,7 @@ void service_memset_seq(T * const ptr, const T value, const size_t num)
     {
         /// Use aligned stores
         const unsigned int num32 = static_cast<unsigned int>(num);
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         PRAGMA_VECTOR_ALIGNED
         for (unsigned int i = 0; i < num32; i++)
@@ -70,7 +70,7 @@ void service_memset_seq(T * const ptr, const T value, const size_t num)
     }
     else
     {
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < num; i++)
         {
@@ -250,7 +250,7 @@ T * service_memset(T * const ptr, const T value, const size_t num)
             end = num;
         }
 
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = block * blockSize; i < end; i++)
         {
@@ -264,7 +264,7 @@ T * service_memset(T * const ptr, const T value, const size_t num)
 template <typename T, CpuType cpu>
 void service_memset_incrementing(T * const ptr, const T startValue, const size_t num)
 {
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     PRAGMA_VECTOR_ALWAYS
     for (size_t i = 0; i < num; i++)
     {
diff --git a/cpp/daal/src/externals/service_spblas.h b/cpp/daal/src/externals/service_spblas.h
index 4d959bc6cc9..71a0640c08d 100644
--- a/cpp/daal/src/externals/service_spblas.h
+++ b/cpp/daal/src/externals/service_spblas.h
@@ -163,7 +163,7 @@ struct SpBlas
             for (size_t ind1 = 0; ind1 < nnzCol1; ++ind1)
             {
                 fpType * ptr_ = res.ptr + rowPtr1[ind1] * res.stride;
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (size_t ind2 = 0; ind2 < nnzCol2; ++ind2)
                 {
diff --git a/cpp/daal/src/externals/service_spblas_ref.h b/cpp/daal/src/externals/service_spblas_ref.h
index 2106e689a0e..a5aa495f002 100644
--- a/cpp/daal/src/externals/service_spblas_ref.h
+++ b/cpp/daal/src/externals/service_spblas_ref.h
@@ -180,14 +180,13 @@ struct RefSpBlas
         DAAL_INT offset = pntrb[row] - 1;
         DAAL_INT nnz    = pntrb[row + 1] - pntrb[row];
         DAAL_INT csrcol = col + 1;
-#pragma omp simd
+
+        for (DAAL_INT i = 0; i < nnz; ++i)
         {
-            for (DAAL_INT i = 0; i < nnz; ++i)
-            {
-                if (csrcol < indx[offset + i]) break;
-                if (csrcol == indx[offset + i]) return val[offset + i];
-            }
+            if (csrcol < indx[offset + i]) break;
+            if (csrcol == indx[offset + i]) return val[offset + i];
         }
+
         return fpType(0);
     }
     static void csrmm(const DAAL_INT * m, const DAAL_INT * n, const DAAL_INT * k, const fpType * alpha, const fpType * a, const DAAL_INT * indx,
diff --git a/cpp/daal/src/externals/service_stat.h b/cpp/daal/src/externals/service_stat.h
index 4f091f08d2b..34fb242d276 100644
--- a/cpp/daal/src/externals/service_stat.h
+++ b/cpp/daal/src/externals/service_stat.h
@@ -109,7 +109,7 @@ struct Statistics
 
             fpType wsum = 0;
 
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             PRAGMA_ICC_NO16(omp simd reduction(+ : wsum))
             for (size_t i = 0; i < nRows; i++)
diff --git a/cpp/daal/src/services/service_defines.h b/cpp/daal/src/services/service_defines.h
index 54e510afef4..cdb7541783f 100644
--- a/cpp/daal/src/services/service_defines.h
+++ b/cpp/daal/src/services/service_defines.h
@@ -45,7 +45,7 @@ DAAL_EXPORT bool daal_check_is_intel_cpu();
 #define DAAL_CHECK_CPU_ENVIRONMENT (daal_check_is_intel_cpu())
 
 #if defined(__INTEL_COMPILER)
-    #define PRAGMA_IVDEP            _Pragma("ivdep")
+    #define PRAGMA_FORCE_SIMD       _Pragma("ivdep")
     #define PRAGMA_NOVECTOR         _Pragma("novector")
     #define PRAGMA_VECTOR_ALIGNED   _Pragma("vector aligned")
     #define PRAGMA_VECTOR_UNALIGNED _Pragma("vector unaligned")
@@ -55,8 +55,13 @@ DAAL_EXPORT bool daal_check_is_intel_cpu();
     #define PRAGMA_ICC_NO16(ARGS)   PRAGMA_ICC_TO_STR(ARGS)
     #define DAAL_TYPENAME           typename
 #elif defined(__GNUC__)
-    #define PRAGMA_IVDEP
+    #if defined(TARGET_ARM)
+        /* Honoured only when built with -fopenmp-simd; tells the compiler the loop has no loop-carried dependencies */
+        #define PRAGMA_FORCE_SIMD _Pragma("omp simd")
+    #else
+        #define PRAGMA_FORCE_SIMD
+    #endif
     #define PRAGMA_NOVECTOR
     #define PRAGMA_VECTOR_ALIGNED
     #define PRAGMA_VECTOR_UNALIGNED
     #define PRAGMA_VECTOR_ALWAYS
@@ -65,7 +70,7 @@ DAAL_EXPORT bool daal_check_is_intel_cpu();
     #define PRAGMA_ICC_NO16(ARGS)
     #define DAAL_TYPENAME typename
 #elif defined(_MSC_VER)
-    #define PRAGMA_IVDEP
+    #define PRAGMA_FORCE_SIMD
     #define PRAGMA_NOVECTOR
     #define PRAGMA_VECTOR_ALIGNED
     #define PRAGMA_VECTOR_UNALIGNED
@@ -75,7 +80,7 @@ DAAL_EXPORT bool daal_check_is_intel_cpu();
     #define PRAGMA_ICC_NO16(ARGS)
     #define DAAL_TYPENAME typename
 #else
-    #define PRAGMA_IVDEP
+    #define PRAGMA_FORCE_SIMD
     #define PRAGMA_NOVECTOR
     #define PRAGMA_VECTOR_ALIGNED
     #define PRAGMA_VECTOR_UNALIGNED
diff --git a/cpp/daal/src/services/service_utils.h b/cpp/daal/src/services/service_utils.h
index fb766339a86..c5f8e6091dd 100644
--- a/cpp/daal/src/services/service_utils.h
+++ b/cpp/daal/src/services/service_utils.h
@@ -272,7 +272,7 @@ void transpose(const algorithmFPType * src, size_t rows, size_t cols, algorithmF
 {
     for (size_t j = 0; j < cols; j++)
     {
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (size_t i = 0; i < rows; i++)
         {
diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_model_manager.hpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_model_manager.hpp
index c0dd4cca565..ef82f821e30 100644
--- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_model_manager.hpp
+++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_model_manager.hpp
@@ -116,7 +116,7 @@ class infer_model_manager {
             Index* const lc = lc_list_host.get_mutable_data() + tree_idx * max_tree_size_;
             Float* const fv = fv_list_host.get_mutable_data() + tree_idx * max_tree_size_;
 
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (Index i = 0; i < tree_size; i++) {
                 fi[i] = static_cast<Index>(dt_node_list[i].featureIndex);
@@ -128,7 +128,7 @@ class infer_model_manager {
                 const double* probas = daal_model_ptr->getProbas(tree_idx);
                 Float* pv = probas_list_host.get_mutable_data() +
                             tree_idx * max_tree_size_ * ctx_.class_count;
-                PRAGMA_IVDEP
+                PRAGMA_FORCE_SIMD
                 PRAGMA_VECTOR_ALWAYS
                 for (Index i = 0; i < tree_size * ctx_.class_count; i++) {
                     pv[i] = static_cast<Float>(probas[i]);
diff --git a/cpp/oneapi/dal/backend/common.hpp b/cpp/oneapi/dal/backend/common.hpp
index d3bd2ec3ed5..1563b774bec 100644
--- a/cpp/oneapi/dal/backend/common.hpp
+++ b/cpp/oneapi/dal/backend/common.hpp
@@ -26,10 +26,14 @@
 #include "oneapi/dal/detail/common.hpp"
 
 #if defined(__INTEL_COMPILER)
-#define PRAGMA_IVDEP         _Pragma("ivdep")
+#define PRAGMA_FORCE_SIMD    _Pragma("ivdep")
 #define PRAGMA_VECTOR_ALWAYS _Pragma("vector always")
 #else
-#define PRAGMA_IVDEP
+#if defined(TARGET_ARM)
+#define PRAGMA_FORCE_SIMD _Pragma("omp simd")
+#else
+#define PRAGMA_FORCE_SIMD
+#endif
 #define PRAGMA_VECTOR_ALWAYS
 #endif
 
diff --git a/cpp/oneapi/dal/table/backend/convert/common.hpp b/cpp/oneapi/dal/table/backend/convert/common.hpp
index b84ad724d2a..2965117bab4 100644
--- a/cpp/oneapi/dal/table/backend/convert/common.hpp
+++ b/cpp/oneapi/dal/table/backend/convert/common.hpp
@@ -39,7 +39,7 @@ inline dal::array<Type> extract_by_indices(const Index* indices,
     auto result = dal::array<Type>::empty(count);
     auto* const output = result.get_mutable_data();
 
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     for (std::int64_t i = 0l; i < count; ++i) {
         const Index idx = indices[i];
         output[i] = values[idx];
diff --git a/cpp/oneapi/dal/table/backend/convert/common_convert.cpp b/cpp/oneapi/dal/table/backend/convert/common_convert.cpp
index ab20a0f6060..84b55132ca8 100644
--- a/cpp/oneapi/dal/table/backend/convert/common_convert.cpp
+++ b/cpp/oneapi/dal/table/backend/convert/common_convert.cpp
@@ -45,7 +45,7 @@ dal::array<Pointer> compute_pointers(const dal::array<dal::byte_t>& data,
     auto pointers = dal::array<ptr_t>::empty(count);
     ptr_t* raw_pointers = pointers.get_mutable_data();
 
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     for (std::int64_t row = 0l; row < count; ++row) {
         raw_pointers[row] = source + raw_offsets[row];
     }
@@ -65,7 +65,7 @@ dal::array<std::int64_t> compute_output_offsets(data_type output_type,
     detail::check_mul_overflow(row_count, row_stride_in_bytes);
     std::int64_t* const raw_offsets = offsets.get_mutable_data();
 
-    PRAGMA_IVDEP
+    PRAGMA_FORCE_SIMD
     for (std::int64_t row = 0l; row < row_count; ++row) {
         raw_offsets[row] = row * row_stride_in_bytes;
     }
diff --git a/cpp/oneapi/dal/table/backend/convert/copy_convert_impl_cpu.cpp b/cpp/oneapi/dal/table/backend/convert/copy_convert_impl_cpu.cpp
index 533ad9026f4..8b75dddcc42 100644
--- a/cpp/oneapi/dal/table/backend/convert/copy_convert_impl_cpu.cpp
+++ b/cpp/oneapi/dal/table/backend/convert/copy_convert_impl_cpu.cpp
@@ -69,7 +69,7 @@ struct copy_converter_impl {
             backend::copy(out, inp, count);
         }
         else {
-            PRAGMA_IVDEP
+            PRAGMA_FORCE_SIMD
             PRAGMA_VECTOR_ALWAYS
             for (std::int64_t i = 0l; i < count; ++i) {
                 out[i] = static_cast<out_t>(inp[i]);
@@ -81,7 +81,7 @@ struct copy_converter_impl {
                             std::int64_t out_stride,
                             const inp_t* inp,
                             std::int64_t count) {
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (std::int64_t i = 0l; i < count; ++i) {
             const std::int64_t out_offset = i * out_stride;
@@ -94,7 +94,7 @@ struct copy_converter_impl {
                             const inp_t* inp,
                             std::int64_t inp_stride,
                             std::int64_t count) {
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         PRAGMA_VECTOR_ALWAYS
         for (std::int64_t i = 0l; i < count; ++i) {
             const std::int64_t inp_offset = i * inp_stride;
@@ -110,7 +110,7 @@ struct copy_converter_impl {
         // Let's trust compiler to decide if the loop should be
         // vectorized or not. It can be suboptimal if strides are
         // too large
-        PRAGMA_IVDEP
+        PRAGMA_FORCE_SIMD
         for (std::int64_t i = 0l; i < count; ++i) {
             const std::int64_t out_offset = i * out_stride;
             const std::int64_t inp_offset = i * inp_stride;
diff --git a/dev/make/compiler_definitions/gnu.ref.arm.mk b/dev/make/compiler_definitions/gnu.ref.arm.mk
index a78f213183a..fd16d394fda 100644
--- a/dev/make/compiler_definitions/gnu.ref.arm.mk
+++ b/dev/make/compiler_definitions/gnu.ref.arm.mk
@@ -22,7 +22,7 @@ include dev/make/compiler_definitions/gnu.mk
 
 PLATs.gnu = lnxarm
 
-COMPILER.all.gnu =  ${CXX} -march=armv8-a+sve -fwrapv -fno-strict-overflow -fno-delete-null-pointer-checks \
+COMPILER.all.gnu =  ${CXX} -march=armv8-a+sve -fopenmp-simd -ftree-vectorize -fwrapv -fno-strict-overflow -fno-delete-null-pointer-checks \
                     -DDAAL_REF -DONEDAL_REF -DDAAL_CPU=sve -Werror -Wreturn-type $(if $(RNG_OPENRNG), -DOPENRNG_BACKEND)
 
 link.dynamic.all.gnu = ${CXX} -march=native
diff --git a/docs/source/contribution/cpu_features.rst b/docs/source/contribution/cpu_features.rst
index f0615e0b2eb..0c82774f054 100644
--- a/docs/source/contribution/cpu_features.rst
+++ b/docs/source/contribution/cpu_features.rst
@@ -180,7 +180,7 @@ instruction set specific code. The implementation is located in the file `abc_cl
 
 Although the implementation of the ``method1`` does not contain any instruction set specific code, it is
 expected that the developers leverage SIMD related macros available in |short_name|.
-For example, ``PRAGMA_IVDEP``, ``PRAGMA_VECTOR_ALWAYS``, ``PRAGMA_VECTOR_ALIGNED`` and other pragmas defined in
+For example, ``PRAGMA_FORCE_SIMD``, ``PRAGMA_VECTOR_ALWAYS``, ``PRAGMA_VECTOR_ALIGNED`` and other pragmas defined in
 `service_defines.h <https://github.com/uxlfoundation/oneDAL/blob/main/cpp/daal/src/services/service_defines.h>`_.
 This will guide the compiler to generate more efficient code for the target architecture.