diff --git a/cpp/daal/src/algorithms/cholesky/cholesky_impl.i b/cpp/daal/src/algorithms/cholesky/cholesky_impl.i index 76716837ff1..8331773609c 100755 --- a/cpp/daal/src/algorithms/cholesky/cholesky_impl.i +++ b/cpp/daal/src/algorithms/cholesky/cholesky_impl.i @@ -151,13 +151,13 @@ bool CholeskyKernel::copyToFullMatrix(NumericTable for (size_t i = iBlock * blockSize; i < endBlock; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j <= i; j++) { pL[i * dim + j] = pA[i * dim + j]; } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = (i + 1); j < dim; j++) { @@ -176,13 +176,13 @@ bool CholeskyKernel::copyToFullMatrix(NumericTable { const size_t ind = (i + 1) * i / 2; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j <= i; j++) { pL[i * dim + j] = pA[ind + j]; } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = (i + 1); j < dim; j++) { @@ -201,13 +201,13 @@ bool CholeskyKernel::copyToFullMatrix(NumericTable { const size_t ind = (2 * dim - j + 1) * j / 2; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < j; i++) { pL[i * dim + j] = algorithmFPType(0); } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = j; i < dim; i++) { @@ -247,7 +247,7 @@ services::Status CholeskyKernel::copyToLowerTriang { const size_t ind = (i + 1) * i / 2; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j <= i; j++) { @@ -276,7 +276,7 @@ services::Status CholeskyKernel::copyToLowerTriang { const size_t ind = (j + 1) * j / 2; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i <= j; i++) { diff --git a/cpp/daal/src/algorithms/covariance/covariance_impl.i b/cpp/daal/src/algorithms/covariance/covariance_impl.i index 2448a8e9b90..297159ca4b5 100644 --- a/cpp/daal/src/algorithms/covariance/covariance_impl.i +++ b/cpp/daal/src/algorithms/covariance/covariance_impl.i @@ -227,7 +227,7 @@ public: /* Sum input array elements in case of non-normalized data */ for (DAAL_INT i = 0; i < nRows; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (DAAL_INT j = 0; j < _nFeatures; j++) { @@ -269,7 +269,7 @@ public: } /// It is safe to use aligned loads and stores because the data in TArrayScalableCalloc data structures is aligned - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS PRAGMA_VECTOR_ALIGNED for (size_t i = 0; i < (_nFeatures * _nFeatures); i++) @@ -286,7 +286,7 @@ public: return; } /// It is safe to use aligned loads and stores because the data is aligned - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS PRAGMA_VECTOR_ALIGNED for (size_t i = 0; i < _nFeatures; i++) @@ -407,7 +407,7 @@ services::Status updateDenseCrossProductAndSums(bool isNormalized, size_t nFeatu } for (size_t i = 0; i < nFeatures; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nFeatures; j++) { @@ -501,7 +501,7 @@ void mergeCrossProductAndSums(size_t nFeatures, const algorithmFPType * partialC if (nObsValue == 0) { daal::threader_for(nFeatures, nFeatures, [=](size_t i) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j <= i; j++) { @@ -517,7 +517,7 @@ void mergeCrossProductAndSums(size_t nFeatures, const algorithmFPType * partialC algorithmFPType invNewNObs = 1.0 / (nObsValue + partialNObsValue); daal::threader_for(nFeatures, nFeatures, [=](size_t i) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j <= i; j++) { diff --git a/cpp/daal/src/algorithms/dtrees/dtrees_train_data_helper.i b/cpp/daal/src/algorithms/dtrees/dtrees_train_data_helper.i index 16064a701b0..a36c9da7bd2 100644 --- a/cpp/daal/src/algorithms/dtrees/dtrees_train_data_helper.i +++ b/cpp/daal/src/algorithms/dtrees/dtrees_train_data_helper.i @@ -419,7 +419,7 @@ int doPartition(SizeType n, const IndexType * aIdx, const ResponseType * aRespon SizeType iRight = 0; int iRowSplitVal = -1; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (SizeType i = 0; i < n; ++i) { @@ -457,7 +457,7 @@ int doPartitionIdx(SizeType n, const IndexType * aIdx, const IndexType * aIdx2, if (aIdx2) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (SizeType i = 0; i < n; ++i) { @@ -478,7 +478,7 @@ int doPartitionIdx(SizeType n, const IndexType * aIdx, const IndexType * aIdx2, } else { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (SizeType i = 0; i < n; ++i) { diff --git a/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_impl.i b/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_impl.i index 9b5eb542864..4cc98e908e4 100644 --- a/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_impl.i +++ b/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_impl.i @@ -68,7 +68,7 @@ DAAL_FORCEINLINE void fillResults(const size_t nClasses, const enum VotingMethod { if (votingMethod == VotingMethod::unweighted || probas == nullptr) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < blockSize; ++i) { @@ -80,7 +80,7 @@ DAAL_FORCEINLINE void fillResults(const size_t nClasses, const enum VotingMethod { for (size_t i = 0; i < blockSize; ++i) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nClasses; ++j) { @@ -340,7 +340,7 @@ Status PredictClassificationTask::predictByTrees(const siz } else if (_votingMethod == VotingMethod::weighted) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < _nClasses; ++i) { @@ -357,7 +357,7 @@ Status PredictClassificationTask::predictByTrees(const siz sum += resPtr[i]; } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < _nClasses; ++i) { @@ -403,7 +403,7 @@ Status PredictClassificationTask::predictByTreesWithoutCon } else if (_votingMethod == VotingMethod::weighted) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < _nClasses; ++i) { @@ -420,7 +420,7 @@ Status PredictClassificationTask::predictByTreesWithoutCon sum += resPtr[i]; } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < _nClasses; ++i) { @@ -449,7 +449,7 @@ Status PredictClassificationTask::parallelPredict(const al SafeStatus safeStat; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < treeSize; ++i) { @@ -769,7 +769,7 @@ DAAL_FORCEINLINE Status PredictClassificationTask::predict } if (probPtr != nullptr) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < _nClasses; ++j) { @@ -891,7 +891,7 @@ DAAL_FORCEINLINE Status PredictClassificationTask::predictOneRowB { const size_t treeSize = _aTree[iTree + i]->getNumberOfRows(); const DecisionTreeNode * aNode = (const DecisionTreeNode *)(*_aTree[iTree + i]).getArray(); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < treeSize; ++j) { @@ -1002,7 +1002,7 @@ DAAL_FORCEINLINE Status PredictClassificationTask::predictOneRowB } if (probPtr != nullptr) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < _nClasses; ++j) { diff --git a/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_train_dense_default_impl.i b/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_train_dense_default_impl.i index 6561366ded0..4e157357f93 100644 --- a/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_train_dense_default_impl.i +++ b/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_train_dense_default_impl.i @@ -138,7 +138,7 @@ protected: //enables specific functions for UnorderedRespHelperBest const double one = double(1); const double cDiv = isZero(sqWeights) ? one : (one / sqWeights); double var = one; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < _nClasses; ++i) var -= cDiv * double(imp.hist[i]) * double(imp.hist[i]); imp.var = var; @@ -193,7 +193,7 @@ int UnorderedRespHelperBest::findSplitByHistDefault(int nD if (!split.featureUnordered) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t iClass = 0; iClass < _nClasses; ++iClass) histLeft[iClass] += nSamplesPerClass[i * _nClasses + iClass]; } @@ -201,7 +201,7 @@ int UnorderedRespHelperBest::findSplitByHistDefault(int nD if (split.featureUnordered) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS //one against others for (size_t iClass = 0; iClass < _nClasses; ++iClass) histLeft[iClass] = nSamplesPerClass[i * _nClasses + iClass]; @@ -210,7 +210,7 @@ int UnorderedRespHelperBest::findSplitByHistDefault(int nD auto histTotal = curImpurity.hist.get(); algorithmFPType sumLeft = 0; algorithmFPType sumRight = 0; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS //proximal impurity improvement for (size_t iClass = 0; iClass < _nClasses; ++iClass) @@ -485,7 +485,7 @@ bool UnorderedRespHelperBest::findSplitCategoricalFeature( if ((count < nMinSplitPart) || ((n - count) < nMinSplitPart) || (leftWeights < minWeightLeaf) || ((totalWeights - leftWeights) < minWeightLeaf)) continue; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < _nClasses; ++j) _impRight.hist[j] = curImpurity.hist[j] - _impLeft.hist[j]; calcGini(leftWeights, _impLeft); @@ -632,7 +632,7 @@ public: DAAL_ASSERT(n > 0); node.count = n; node.impurity = imp.var; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < this->_nClasses; ++i) { @@ -681,7 +681,7 @@ protected: auto histTotal = total.get(); auto histRight = right.get(); auto histLeft = left.get(); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t iClass = 0; iClass < this->_nClasses; ++iClass) histRight[iClass] = histTotal[iClass] - histLeft[iClass]; } @@ -1089,7 +1089,7 @@ int UnorderedRespHelperRandom::findSplitByHistDefault(int nLeft = nFeatIdx[idx]; leftWeights = featWeights[idx]; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS //one against others for (size_t iClass = 0; iClass < this->_nClasses; ++iClass) histLeft[iClass] = nSamplesPerClass[idx * this->_nClasses + iClass]; @@ -1108,7 +1108,7 @@ int UnorderedRespHelperRandom::findSplitByHistDefault(int nLeft += nFeatIdx[i]; leftWeights += featWeights[i]; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t iClass = 0; iClass < this->_nClasses; ++iClass) histLeft[iClass] += nSamplesPerClass[i * this->_nClasses + iClass]; } @@ -1120,7 +1120,7 @@ int UnorderedRespHelperRandom::findSplitByHistDefault(int auto histTotal = curImpurity.hist.get(); algorithmFPType sumLeft = 0; algorithmFPType sumRight = 0; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS //proximal impurity improvement for (size_t iClass = 0; iClass < this->_nClasses; ++iClass) @@ -1186,7 +1186,7 @@ int UnorderedRespHelperRandom::findSplitFewClasses(int nDi { minidx++; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t iClass = 0; iClass < K; ++iClass) { @@ -1202,7 +1202,7 @@ int UnorderedRespHelperRandom::findSplitFewClasses(int nDi while ((minidx < maxidx) && isZero(thisNFeatIdx)) thisNFeatIdx = nFeatIdx[++minidx]; nLeft = thisNFeatIdx; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t iClass = 0; iClass < K; ++iClass) { @@ -1217,7 +1217,7 @@ int UnorderedRespHelperRandom::findSplitFewClasses(int nDi return idxFeatureBestSplit; //set histLeft - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t iClass = 0; iClass < K; ++iClass) histLeft[iClass] = nSamplesPerClass[minidx * K + iClass]; @@ -1228,7 +1228,7 @@ int UnorderedRespHelperRandom::findSplitFewClasses(int nDi while ((minidx < maxidx) && isZero(thisNFeatIdx)) { maxidx--; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t iClass = 0; iClass < K; ++iClass) { @@ -1260,7 +1260,7 @@ int UnorderedRespHelperRandom::findSplitFewClasses(int nDi //iterate idx down to a bin with values for FinalizeBestSplit algorithmFPType thisNFeatIdx(0); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t iC = 0; iC < K; ++iC) thisNFeatIdx += nSamplesPerClass[idx * K + iC]; while ((minidx < idx) && isZero(thisNFeatIdx)) @@ -1276,13 +1276,13 @@ int UnorderedRespHelperRandom::findSplitFewClasses(int nDi if (split.featureUnordered) //only need last index { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t iClass = 0; iClass < K; ++iClass) histLeft[iClass] = nSamplesPerClass[idx * K + iClass]; } else //sum over all to idx { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = minidx + 1; i <= idx; i++) { @@ -1290,7 +1290,7 @@ int UnorderedRespHelperRandom::findSplitFewClasses(int nDi } } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t iClass = 0; iClass < K; ++iClass) leftWeights += histLeft[iClass]; //histleft is forced to float, and may cause issues with algorithmFPType = double @@ -1307,7 +1307,7 @@ int UnorderedRespHelperRandom::findSplitFewClasses(int nDi if (split.featureUnordered) //only need last index { nLeft = nFeatIdx[idx]; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t iClass = 0; iClass < K; ++iClass) histLeft[iClass] = nSamplesPerClass[idx * K + iClass]; } @@ -1316,13 +1316,13 @@ int UnorderedRespHelperRandom::findSplitFewClasses(int nDi for (size_t i = minidx + 1; i <= idx; i++) { nLeft += nFeatIdx[i]; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t iClass = 0; iClass < K; ++iClass) histLeft[iClass] += nSamplesPerClass[i * K + iClass]; } } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t iClass = 0; iClass < K; ++iClass) leftWeights += histLeft[iClass]; } @@ -1412,7 +1412,7 @@ bool UnorderedRespHelperRandom::findSplitOrderedFeature(co if (noWeights) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (i = 0; i < r; ++i) { @@ -1423,7 +1423,7 @@ bool UnorderedRespHelperRandom::findSplitOrderedFeature(co } else { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (i = 0; i < r; ++i) { @@ -1550,7 +1550,7 @@ bool UnorderedRespHelperRandom::findSplitCategoricalFeatur if ((count < nMinSplitPart) || ((n - count) < nMinSplitPart) || (leftWeights < minWeightLeaf) || ((totalWeights - leftWeights) < minWeightLeaf)) continue; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < this->_nClasses; ++j) this->_impRight.hist[j] = curImpurity.hist[j] - this->_impLeft.hist[j]; this->calcGini(leftWeights, this->_impLeft); @@ -1618,9 +1618,13 @@ public: { OOBClassificationData * dst = (OOBClassificationData *)other.oobBuf; const OOBClassificationData * src = (const OOBClassificationData *)this->oobBuf; - PRAGMA_IVDEP + const size_t n = _nClasses * nSamples; + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS - for (size_t i = 0, n = _nClasses * nSamples; i < n; ++i) dst[i] += src[i]; + for (size_t i = 0; i < n; ++i) + { + dst[i] += src[i]; + } } } Status finalizeOOBError(const NumericTable * resp, algorithmFPType * res, algorithmFPType * resPerObs, algorithmFPType * resAccuracy, diff --git a/cpp/daal/src/algorithms/dtrees/forest/df_train_dense_default_impl.i b/cpp/daal/src/algorithms/dtrees/forest/df_train_dense_default_impl.i index 1289d31c5f7..7321356c41b 100644 --- a/cpp/daal/src/algorithms/dtrees/forest/df_train_dense_default_impl.i +++ b/cpp/daal/src/algorithms/dtrees/forest/df_train_dense_default_impl.i @@ -243,9 +243,12 @@ void TreeThreadCtxBase::finalizeVarImp(training::VariableI } else { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS - for (size_t i = 0; i < nVars; ++i) varImp[i] = 0; + for (size_t i = 0; i < nVars; ++i) + { + varImp[i] = 0; + } } } else if (mode == training::MDI) @@ -363,7 +366,7 @@ services::Status copyBinIndex(const size_t nRows, const size_t nCols, const Inde for (size_t i = iStart; i < iEnd; ++i) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nCols; ++j) { @@ -725,8 +728,6 @@ services::Status TrainBatchTaskBasegetNumberOfRows()); @@ -745,17 +746,23 @@ services::Status TrainBatchTaskBase::findBestSplitByHist(size_t nD } else { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = minidx; i <= idx; ++i) { @@ -962,7 +962,7 @@ int OrderedRespHelperRandom::findBestSplitByHist(size_t nD } else { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = minidx; i <= idx; ++i) { @@ -1263,7 +1263,7 @@ public: algorithmFPType sumMeanDiff = 0; RegErr * ptr = (RegErr *)this->oobBuf; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nSamples; ++i) { diff --git a/cpp/daal/src/algorithms/dtrees/gbt/classification/gbt_classification_predict_dense_default_batch_impl.i b/cpp/daal/src/algorithms/dtrees/gbt/classification/gbt_classification_predict_dense_default_batch_impl.i index 4d5d4829d74..3cfa6881687 100644 --- a/cpp/daal/src/algorithms/dtrees/gbt/classification/gbt_classification_predict_dense_default_batch_impl.i +++ b/cpp/daal/src/algorithms/dtrees/gbt/classification/gbt_classification_predict_dense_default_batch_impl.i @@ -464,7 +464,7 @@ services::Status PredictBinaryClassificationTask::run(cons const size_t finishRow = (((iBlock + 1) == nBlocks) ? nRows : (iBlock + 1) * blockSize); daal::internal::MathInst::vExp(finishRow - startRow, res + startRow, expVal + startRow); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t iRow = startRow; iRow < finishRow; ++iRow) { @@ -515,7 +515,7 @@ services::Status PredictBinaryClassificationTask::run(cons { // convert the score to a class label typedef services::internal::SignBit SignBit; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (size_t iRow = 0; iRow < nRows; ++iRow) { // probability is a sigmoid(f) hence sign(f) can be checked @@ -787,7 +787,7 @@ void PredictMulticlassTask::predictByTreesVector(algorithm gbt::prediction::internal::predictForTreeVector( *this->_aTree[iTree], this->_featHelper, x, v, dispatcher); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < vectorBlockSize; ++j) val[(iTree % nClasses) + j * nClasses] += v[j]; } diff --git a/cpp/daal/src/algorithms/dtrees/gbt/classification/gbt_classification_train_dense_default_impl.i b/cpp/daal/src/algorithms/dtrees/gbt/classification/gbt_classification_train_dense_default_impl.i index 5f10da858c2..9b84b45f33d 100644 --- a/cpp/daal/src/algorithms/dtrees/gbt/classification/gbt_classification_train_dense_default_impl.i +++ b/cpp/daal/src/algorithms/dtrees/gbt/classification/gbt_classification_train_dense_default_impl.i @@ -71,7 +71,7 @@ public: const size_t end = iBlock + 1 > nSurplus ? start + nPerBlock : start + (nPerBlock + 1); if (sampleInd) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = start; i < end; i++) { @@ -83,7 +83,7 @@ public: } else { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = start; i < end; i++) { @@ -96,7 +96,7 @@ public: daal::internal::MathInst::vExp(end - start, exp + start, exp + start); if (sampleInd) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = start; i < end; i++) { @@ -107,7 +107,7 @@ public: } else { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = start; i < end; i++) { @@ -139,7 +139,7 @@ public: algorithmFPType * p = bUseTLS ? lsData.local() : buf; const size_t iSample = (sampleInd ? sampleInd[i] : i); getSoftmax(f + _nClasses * iSample, p); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t k = 0; k < _nClasses; ++k) { @@ -165,7 +165,7 @@ protected: { if (maxArg < arg[i]) maxArg = arg[i]; } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < _nClasses; ++i) { @@ -180,7 +180,7 @@ protected: for (size_t i = 0; i < _nClasses; ++i) sum += res[i]; sum = algorithmFPType(1.) / sum; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < _nClasses; ++i) res[i] *= sum; } diff --git a/cpp/daal/src/algorithms/dtrees/gbt/gbt_predict_dense_default_impl.i b/cpp/daal/src/algorithms/dtrees/gbt/gbt_predict_dense_default_impl.i index fd76f8da721..34960d4c9c9 100644 --- a/cpp/daal/src/algorithms/dtrees/gbt/gbt_predict_dense_default_impl.i +++ b/cpp/daal/src/algorithms/dtrees/gbt/gbt_predict_dense_default_impl.i @@ -112,7 +112,7 @@ inline void predictForTreeVector(const DecisionTreeType & t, const FeatureTypes for (FeatureIndexType itr = 0; itr < maxLvl; itr++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (FeatureIndexType k = 0; k < vectorBlockSize; k++) { @@ -122,7 +122,7 @@ inline void predictForTreeVector(const DecisionTreeType & t, const FeatureTypes } } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (FeatureIndexType k = 0; k < vectorBlockSize; k++) { diff --git a/cpp/daal/src/algorithms/dtrees/gbt/gbt_train_dense_default_impl.i b/cpp/daal/src/algorithms/dtrees/gbt/gbt_train_dense_default_impl.i index 13bf793be15..c3762dcf732 100644 --- a/cpp/daal/src/algorithms/dtrees/gbt/gbt_train_dense_default_impl.i +++ b/cpp/daal/src/algorithms/dtrees/gbt/gbt_train_dense_default_impl.i @@ -177,7 +177,7 @@ protected: const auto nF = nRows * _nTrees; //initialize f. TODO: input argument algorithmFPType * pf = f(); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nF; ++i) pf[i] = initValue; } @@ -255,7 +255,7 @@ double TrainBatchTaskBase::computeLeafWeight LoopHelper::run(inParallel, nBlocks, [&](size_t iBlock) { const size_t start = iBlock + 1 > nSurplus ? nPerBlock * iBlock + nSurplus : (nPerBlock + 1) * iBlock; const size_t end = iBlock + 1 > nSurplus ? start + nPerBlock : start + (nPerBlock + 1); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = start; i < end; i++) pf[idx[i] * this->_nTrees + iTree] += inc; }); @@ -463,7 +463,7 @@ services::Status computeTypeDisp(HostAppIface * pHostApp, const NumericTable * x for (size_t i = iStart; i < iEnd; ++i) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nCols; ++j) { diff --git a/cpp/daal/src/algorithms/dtrees/gbt/gbt_train_hist_kernel.i b/cpp/daal/src/algorithms/dtrees/gbt/gbt_train_hist_kernel.i index 4b3bee150d9..cb31a063e4e 100644 --- a/cpp/daal/src/algorithms/dtrees/gbt/gbt_train_hist_kernel.i +++ b/cpp/daal/src/algorithms/dtrees/gbt/gbt_train_hist_kernel.i @@ -208,7 +208,7 @@ public: algorithmFPType * aGHSumPrevFP = (algorithmFPType *)aGHSumPrev; algorithmFPType * aGHSumsOtherFP = (algorithmFPType *)aGHSumsOther; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nUnique * 4; ++i) { @@ -237,7 +237,7 @@ struct ComputeGHSumByRows const size_t nCacheLinesToPrefetchOneRow = nFeatures / elementsInCacheLine + !!(nFeatures % elementsInCacheLine); RowIndexType i = iStart; - PRAGMA_IVDEP + for (; i < iEndWithPrefetch; ++i) { DAAL_PREFETCH_READ_T0(pgh + 2 * aIdx[i + prefetchOffset]); @@ -246,7 +246,7 @@ struct ComputeGHSumByRows const BinIndexType * featIdx = indexedFeature + aIdx[i] * nFeatures; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (RowIndexType j = 0; j < nFeatures; j++) { const size_t idx = 4 * (UniquesArr[j] + (size_t)featIdx[j]); @@ -256,12 +256,11 @@ struct ComputeGHSumByRows } } - PRAGMA_IVDEP for (; i < iEnd; ++i) { const BinIndexType * featIdx = indexedFeature + aIdx[i] * nFeatures; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (RowIndexType j = 0; j < nFeatures; j++) { const size_t idx = 4 * (UniquesArr[j] + (size_t)featIdx[j]); @@ -285,19 +284,19 @@ struct MergeGHSums algorithmFPType * cur = (algorithmFPType *)res.ghSums; algorithmFPType * ptr = results[0] + 4 * iStart; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < 4 * nUnique; i++) cur[i] = ptr[i]; for (size_t iB = 1; iB < nBlocks; ++iB) { algorithmFPType * ptr = results[iB] + 4 * iStart; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < 4 * nUnique; i++) cur[i] += ptr[i]; } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nUnique; ++i) { @@ -340,7 +339,7 @@ struct ComputeGHSumByRows addsPtr[3] = 0.0f; RowIndexType i = iStart; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (; i < iEndWithPrefetch; ++i) { DAAL_PREFETCH_READ_T0(pgh + 2 * aIdx[i + prefetchOffset]); @@ -351,7 +350,7 @@ struct ComputeGHSumByRows addsPtr[0] = pgh[2 * aIdx[i]]; addsPtr[1] = pgh[2 * aIdx[i] + 1]; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (IndexType j = 0; j < nFeatures; j++) { @@ -362,14 +361,14 @@ struct ComputeGHSumByRows } } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (; i < iEnd; ++i) { const BinIndexType * featIdx = indexedFeature + aIdx[i] * nFeatures; addsPtr[0] = pgh[2 * aIdx[i]]; addsPtr[1] = pgh[2 * aIdx[i] + 1]; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (IndexType j = 0; j < nFeatures; j++) { const size_t idx = 4 * (UniquesArr[j] + (size_t)featIdx[j]); diff --git a/cpp/daal/src/algorithms/dtrees/gbt/gbt_train_partition.i b/cpp/daal/src/algorithms/dtrees/gbt/gbt_train_partition.i index 760bbb9debe..a8ea7d33c92 100644 --- a/cpp/daal/src/algorithms/dtrees/gbt/gbt_train_partition.i +++ b/cpp/daal/src/algorithms/dtrees/gbt/gbt_train_partition.i @@ -185,7 +185,7 @@ protected: if (featureUnordered) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (IndexType i = iStart; i < iEnd; ++i) { @@ -197,7 +197,7 @@ protected: } else { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (IndexType i = iStart; i < iEnd; ++i) { diff --git a/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_predict_dense_default_batch_impl.i b/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_predict_dense_default_batch_impl.i index ec3c3969dfd..a5b124b254d 100644 --- a/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_predict_dense_default_batch_impl.i +++ b/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_predict_dense_default_batch_impl.i @@ -397,7 +397,7 @@ services::Status PredictRegressionTask::predictContributio algorithmFPType * contribsOn = buffer + 2 * elementsInMatrix; // Copy nominal values (for bias term) to the condition = 0 buffer - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0ul; i < nRowsData; ++i) { @@ -557,7 +557,7 @@ void PredictRegressionTask::predictByTreesVector(size_t iF gbt::prediction::internal::predictForTreeVector( *_aTree[iTree], _featHelper, x, v, dispatcher); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t row = 0ul; row < vectorBlockSize; ++row) { diff --git a/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_train_dense_default_impl.i b/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_train_dense_default_impl.i index d5a2081818c..0fd670a2994 100755 --- a/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_train_dense_default_impl.i +++ b/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_train_dense_default_impl.i @@ -65,7 +65,7 @@ public: const size_t end = iBlock + 1 > nSurplus ? start + nPerBlock : start + (nPerBlock + 1); if (sampleInd) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = start; i < end; i++) { @@ -75,7 +75,7 @@ public: } else { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = start; i < end; i++) { diff --git a/cpp/daal/src/algorithms/elastic_net/elastic_net_train_dense_default_impl.i b/cpp/daal/src/algorithms/elastic_net/elastic_net_train_dense_default_impl.i index 2d6fe05d0fb..2701f05a12a 100755 --- a/cpp/daal/src/algorithms/elastic_net/elastic_net_train_dense_default_impl.i +++ b/cpp/daal/src/algorithms/elastic_net/elastic_net_train_dense_default_impl.i @@ -132,7 +132,7 @@ services::Status TrainBatchKernel::compute( for (size_t i = 0; i < numRowsInBlock; ++i) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t id = 0; id < nDependentVariables; ++id) { @@ -142,7 +142,7 @@ services::Status TrainBatchKernel::compute( }); yTlsData.reduceTo(yMeansPtr, nDependentVariables); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nDependentVariables; ++i) { @@ -160,7 +160,7 @@ services::Status TrainBatchKernel::compute( for (size_t i = 0; i < numRowsInBlock; ++i) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t id = 0; id < nDependentVariables; ++id) { @@ -190,7 +190,7 @@ services::Status TrainBatchKernel::compute( for (size_t i = 0; i < numRowsInBlock; ++i) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nFeatures; ++j) { diff --git a/cpp/daal/src/algorithms/em/em_gmm_dense_default_batch_impl.i b/cpp/daal/src/algorithms/em/em_gmm_dense_default_batch_impl.i index 64757061fee..f0890cb3555 100755 --- a/cpp/daal/src/algorithms/em/em_gmm_dense_default_batch_impl.i +++ b/cpp/daal/src/algorithms/em/em_gmm_dense_default_batch_impl.i @@ -178,7 +178,7 @@ void EMKernelTask::stepE(const size_t nVectorsInCu for (size_t i = 0; i < nVectorsInCurrentBlock; i++) { algorithmFPType tp = 0; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nFeatures; j++) { @@ -198,7 +198,7 @@ void EMKernelTask::stepE(const size_t nVectorsInCu { for (size_t j = 0; j < nFeatures; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nVectorsInCurrentBlock; i++) { @@ -211,7 +211,7 @@ void EMKernelTask::stepE(const size_t nVectorsInCu algorithmFPType addition = t.logAlpha[k] + t.logSqrtInvDetSigma[k]; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nVectorsInCurrentBlock; i++) { @@ -220,7 +220,7 @@ void EMKernelTask::stepE(const size_t nVectorsInCu for (size_t j = 0; j < nFeatures; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nVectorsInCurrentBlock; i++) { @@ -228,7 +228,7 @@ void EMKernelTask::stepE(const size_t nVectorsInCu } } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nVectorsInCurrentBlock; i++) { @@ -239,7 +239,7 @@ void EMKernelTask::stepE(const size_t nVectorsInCu t.partLogLikelyhood = 0; algorithmFPType * maxInRow = t.rowSum; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nVectorsInCurrentBlock; i++) { @@ -248,7 +248,7 @@ void EMKernelTask::stepE(const size_t nVectorsInCu for (size_t k = 1; k < nComponents; k++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nVectorsInCurrentBlock; i++) { @@ -261,7 +261,7 @@ void EMKernelTask::stepE(const size_t nVectorsInCu for (size_t k = 0; k < nComponents; k++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nVectorsInCurrentBlock; i++) { @@ -273,14 +273,14 @@ void EMKernelTask::stepE(const size_t nVectorsInCu } } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nVectorsInCurrentBlock; i++) { t.partLogLikelyhood += maxInRow[i]; } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nVectorsInCurrentBlock; i++) { @@ -291,7 +291,7 @@ void EMKernelTask::stepE(const size_t nVectorsInCu for (size_t k = 0; k < nComponents; k++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nVectorsInCurrentBlock; i++) { @@ -301,7 +301,7 @@ void EMKernelTask::stepE(const size_t nVectorsInCu t.rowSumInv = t.rowSum; algorithmFPType one = 1.0; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nVectorsInCurrentBlock; i++) { @@ -310,7 +310,7 @@ void EMKernelTask::stepE(const size_t nVectorsInCu for (size_t k = 0; k < nComponents; k++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nVectorsInCurrentBlock; i++) { @@ -334,7 +334,7 @@ algorithmFPType EMKernelTask::computePartialLogLik MathInst::vLog(nVectorsInCurrentBlock, t.rowSumInv, logRowSumInv); algorithmFPType loglikPartial = t.partLogLikelyhood; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nVectorsInCurrentBlock; i++) { diff --git a/cpp/daal/src/algorithms/implicit_als/implicit_als_train_init_csr_default_batch_impl.i b/cpp/daal/src/algorithms/implicit_als/implicit_als_train_init_csr_default_batch_impl.i index 1d21ff4965f..43439535a6e 100644 --- a/cpp/daal/src/algorithms/implicit_als/implicit_als_train_init_csr_default_batch_impl.i +++ b/cpp/daal/src/algorithms/implicit_als/implicit_als_train_init_csr_default_batch_impl.i @@ -113,7 +113,7 @@ services::Status ImplicitALSInitKernel::reduceSum { algorithmFPType * const s = arrSum[k]; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = start; j < end; ++j) { @@ -164,14 +164,14 @@ services::Status ImplicitALSInitKernel::compute(c s |= this->randFactors(nItems, nFactors, itemsFactors, engine); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nItems; i++) // if number of not null elems is equal 0 { notNullElemSum[i] = (notNullElemSum[i] == algorithmFPType(0.0) ? algorithmFPType(1.0) : notNullElemSum[i]); } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nItems; i++) { diff --git a/cpp/daal/src/algorithms/implicit_als/implicit_als_train_init_csr_default_distr_impl.i b/cpp/daal/src/algorithms/implicit_als/implicit_als_train_init_csr_default_distr_impl.i index 09c3f2b9b77..dc02e9d1ab8 100644 --- a/cpp/daal/src/algorithms/implicit_als/implicit_als_train_init_csr_default_distr_impl.i +++ b/cpp/daal/src/algorithms/implicit_als/implicit_als_train_init_csr_default_distr_impl.i @@ -279,7 +279,7 @@ Status ImplicitALSInitDistrKernel::computePartial algorithmFPType itemsSum = 0; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t k = start; k < end; ++k) { diff --git a/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_train_dense_default_impl.i b/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_train_dense_default_impl.i index 5cc08923966..4241682084b 100644 --- a/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_train_dense_default_impl.i +++ b/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_train_dense_default_impl.i @@ -350,7 +350,7 @@ Status KNNClassificationTrainBatchKernel dx[indexes[i]]) @@ -416,7 +416,7 @@ size_t KNNClassificationTrainBatchKernel(x).getBlockOfColumnValues(j, 0, xRowCount, readOnly, columnBD); const algorithmFpType * const dx = columnBD.getBlockPtr(); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (size_t i = 0; i < elementCount; ++i) { sampleValues[i] = dx[indexes[start + i]]; @@ -580,7 +580,7 @@ algorithmFpType KNNClassificationTrainBatchKernel void { if (v) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < sampleCount; ++j) { @@ -734,8 +734,6 @@ size_t KNNClassificationTrainBatchKernel::computeDotProduct(const size_t double partialSum[8]; _mm512_storeu_pd(partialSum, vSum); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (int i = 0; i < 8; i++) { @@ -289,7 +289,7 @@ inline float KernelCSRImplBase::computeDotProduct(const size_t st double partialSum[8]; _mm512_storeu_pd(partialSum, vSum); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (int i = 0; i < 8; i++) { diff --git a/cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_dense_default_impl.i b/cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_dense_default_impl.i index f8e5925ce22..5c6cc4aa933 100755 --- a/cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_dense_default_impl.i +++ b/cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_dense_default_impl.i @@ -65,7 +65,7 @@ services::Status KernelImplRBF::computeInter //compute const algorithmFPType invSqrSigma = (algorithmFPType)(1.0 / (par->sigma * par->sigma)); algorithmFPType factor = 0.0; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nFeatures; i++) { @@ -102,7 +102,7 @@ services::Status KernelImplRBF::computeInter for (size_t i = 0; i < nVectors1; i++) { algorithmFPType factor = 0.0; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nFeatures; j++) { diff --git a/cpp/daal/src/algorithms/kernel_function/polynomial/kernel_function_polynomial_csr_fast_impl.i b/cpp/daal/src/algorithms/kernel_function/polynomial/kernel_function_polynomial_csr_fast_impl.i index bff8c1d7a30..14fab573d52 100644 --- a/cpp/daal/src/algorithms/kernel_function/polynomial/kernel_function_polynomial_csr_fast_impl.i +++ b/cpp/daal/src/algorithms/kernel_function/polynomial/kernel_function_polynomial_csr_fast_impl.i @@ -139,7 +139,7 @@ services::Status KernelImplPolynomial::computeInt if (k != one || b != zero) { daal::threader_for_optional(nVectors1, nVectors1, [=](size_t i) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j <= i; j++) { @@ -158,7 +158,7 @@ services::Status KernelImplPolynomial::computeInt } daal::threader_for_optional(nVectors1, nVectors1, [=](size_t i) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = i + 1; j < nVectors1; j++) { diff --git a/cpp/daal/src/algorithms/kernel_function/polynomial/kernel_function_polynomial_dense_default_impl.i b/cpp/daal/src/algorithms/kernel_function/polynomial/kernel_function_polynomial_dense_default_impl.i index b95ee30d666..153e80cd7a7 100644 --- a/cpp/daal/src/algorithms/kernel_function/polynomial/kernel_function_polynomial_dense_default_impl.i +++ b/cpp/daal/src/algorithms/kernel_function/polynomial/kernel_function_polynomial_dense_default_impl.i @@ -63,7 +63,7 @@ services::Status KernelImplPolynomial::compu //compute dataR[0] = 0.0; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nFeatures; i++) { @@ -102,7 +102,7 @@ services::Status KernelImplPolynomial::compu services::internal::service_memset_seq(dataR, b, nVectors1); for (size_t i = 0; i < nVectors1; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nFeatures; j++) { @@ -183,7 +183,7 @@ services::Status KernelImplPolynomial::compu BlasInst::xxgemm(&trans, ¬rans, &nRowsInBlock2, &nRowsInBlock1, (DAAL_INT *)&nFeatures, &alpha, dataA2, (DAAL_INT *)&nFeatures, dataA1, (DAAL_INT *)&nFeatures, &beta, dataR, (DAAL_INT *)&nVectors2); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nRowsInBlock1; ++i) { @@ -218,7 +218,7 @@ services::Status KernelImplPolynomial::compu BlasInst::xxgemm(&trans, ¬rans, &nRowsInBlock1, &nRowsInBlock2, (DAAL_INT *)&nFeatures, &alpha, dataA1, (DAAL_INT *)&nFeatures, dataA2, (DAAL_INT *)&nFeatures, &beta, mklBuff, &ldc2); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < blockSize * blockSize; ++i) { diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_batch_impl.i b/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_batch_impl.i index 7640be0cb51..08c9b38a55b 100644 --- a/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_batch_impl.i +++ b/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_batch_impl.i @@ -150,7 +150,7 @@ Status KMeansBatchKernel::compute(const NumericTab { const algorithmFPType coeff = 1.0 / clusterS0[i]; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < p; j++) { @@ -167,7 +167,7 @@ Status KMeansBatchKernel::compute(const NumericTab ReadRows mtRow(ntData, cIndices[cPos], 1); const algorithmFPType * row = mtRow.get(); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < p; j++) { diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_distr_step1_impl.i b/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_distr_step1_impl.i index c612cf4f9a3..c96580fe3f4 100644 --- a/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_distr_step1_impl.i +++ b/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_distr_step1_impl.i @@ -180,7 +180,7 @@ Status KMeansDistributedStep1Kernel::finalizeCompu DAAL_CHECK_BLOCK_STATUS(outBlock); int * outAssignments = outBlock.get(); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (size_t i = 0; i < n; i++) { outAssignments[i] = inAssignments[i]; diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_impl.i b/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_impl.i index c8297ddb336..8e366e8e2fc 100644 --- a/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_impl.i +++ b/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_impl.i @@ -68,7 +68,7 @@ struct TaskKMeansLloyd for (size_t k = 0; k < clNum; k++) { algorithmFPType sum = algorithmFPType(0); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_ICC_NO16(omp simd reduction(+ : sum)) for (size_t j = 0; j < dim; j++) { @@ -185,7 +185,7 @@ Status TaskKMeansLloyd::addNTToTaskThreadedDense(const Num for (size_t j = 0; j < nClusters; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < blockSize; i++) { @@ -223,7 +223,7 @@ Status TaskKMeansLloyd::addNTToTaskThreadedDense(const Num const size_t minIdx = *((algIntType *)&(x_clusters[i])); algorithmFPType minGoalVal = x_clusters[i + blockSize]; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (size_t j = 0; j < p; j++) { cS1[minIdx * p + j] += data[i * p + j]; @@ -372,7 +372,7 @@ int TaskKMeansLloyd::kmeansUpdateCluster(int jidx, centroi tls_task->reduce([=](TlsTask * tt) -> void { int j; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (j = 0; j < dim; j++) { s1[j] += tt->cS1[idx * dim + j]; @@ -392,7 +392,7 @@ void TaskKMeansLloyd::kmeansComputeCentroids(int * cluster service_memset_seq(auxData, 0.0, dim); clusterS0[i] = kmeansUpdateCluster(i, auxData); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < dim; j++) { diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_postprocessing.h b/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_postprocessing.h index 598bd40e7a6..131a23af549 100644 --- a/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_postprocessing.h +++ b/cpp/daal/src/algorithms/kmeans/kmeans_lloyd_postprocessing.h @@ -76,7 +76,7 @@ struct PostProcessing for (size_t k = 0; k < nClusters; k++) { algorithmFPType sum = algorithmFPType(0); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_ICC_NO16(omp simd reduction(+ : sum)) for (size_t j = 0; j < p; j++) { @@ -203,7 +203,7 @@ struct PostProcessing { clSq[k] = 0; algorithmFPType sum = algorithmFPType(0); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_ICC_NO16(omp simd reduction(+ : sum)) for (size_t j = 0; j < p; j++) { diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i b/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i index be8859690b3..71e826c0260 100644 --- a/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i +++ b/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i @@ -198,7 +198,7 @@ public: for (size_t iRow = 0u; iRow < nRowsToProcess; iRow++) { algorithmFPType dist2 = algorithmFPType(0); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0u; i < dim; i++) { @@ -224,7 +224,7 @@ public: const algorithmFPType * pData = ntDataBD.get(); algorithmFPType res(0.); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < dim; ++i) { diff --git a/cpp/daal/src/algorithms/lasso_regression/lasso_regression_train_dense_default_impl.i b/cpp/daal/src/algorithms/lasso_regression/lasso_regression_train_dense_default_impl.i index 4c1a5e599a6..96e1cc062e4 100755 --- a/cpp/daal/src/algorithms/lasso_regression/lasso_regression_train_dense_default_impl.i +++ b/cpp/daal/src/algorithms/lasso_regression/lasso_regression_train_dense_default_impl.i @@ -132,7 +132,7 @@ services::Status TrainBatchKernel::compute( for (size_t i = 0; i < numRowsInBlock; ++i) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t id = 0; id < nDependentVariables; ++id) { @@ -142,7 +142,7 @@ services::Status TrainBatchKernel::compute( }); yTlsData.reduceTo(yMeansPtr, nDependentVariables); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nDependentVariables; ++i) { @@ -160,7 +160,7 @@ services::Status TrainBatchKernel::compute( for (size_t i = 0; i < numRowsInBlock; ++i) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t id = 0; id < nDependentVariables; ++id) { @@ -188,7 +188,7 @@ services::Status TrainBatchKernel::compute( for (size_t i = 0; i < numRowsInBlock; ++i) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nFeatures; ++j) { diff --git a/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_finalize_impl.i b/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_finalize_impl.i index e3601d74439..18c76fb752d 100644 --- a/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_finalize_impl.i +++ b/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_finalize_impl.i @@ -127,7 +127,7 @@ Status FinalizeKernel::compute(const NumericTable & xtxTab { for (size_t i = 0; i < nResponses; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 1; j < nBetas; j++) { @@ -140,7 +140,7 @@ Status FinalizeKernel::compute(const NumericTable & xtxTab { for (size_t i = 0; i < nResponses; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nBetas - 1; j++) { diff --git a/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_update_impl.i b/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_update_impl.i index e9698ddc674..5628e3b64e0 100644 --- a/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_update_impl.i +++ b/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_update_impl.i @@ -101,7 +101,7 @@ Status ThreadingTask::update(DAAL_INT startRow, DAAL_INT n for (DAAL_INT i = 0; i < nRows; i++, xPtr += nFeatures) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (DAAL_INT j = 0; j < nFeatures; j++) { @@ -124,7 +124,7 @@ Status ThreadingTask::update(DAAL_INT startRow, DAAL_INT n const algorithmFPType * yPtr = y; for (DAAL_INT i = 0; i < nRows; i++, yPtr += _nResponses) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (DAAL_INT j = 0; j < _nResponses; j++) { @@ -140,7 +140,7 @@ void ThreadingTask::reduce(algorithmFPType * xtx, algorith { { DAAL_PROFILER_THREADING_TASK(reduce.syrkX); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < (_nBetasIntercept * _nBetasIntercept); i++) { @@ -150,7 +150,7 @@ void ThreadingTask::reduce(algorithmFPType * xtx, algorith { DAAL_PROFILER_THREADING_TASK(reduce.gemmXY); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < (_nBetasIntercept * _nResponses); i++) { diff --git a/cpp/daal/src/algorithms/linear_model/linear_model_train_qr_common_impl.i b/cpp/daal/src/algorithms/linear_model/linear_model_train_qr_common_impl.i index a7fd526f13e..b497a8933e5 100755 --- a/cpp/daal/src/algorithms/linear_model/linear_model_train_qr_common_impl.i +++ b/cpp/daal/src/algorithms/linear_model/linear_model_train_qr_common_impl.i @@ -87,7 +87,7 @@ Status CommonKernel::computeQRForBlock(DAAL_INT p, DAAL_IN for (size_t i = 0; i < nRowsInR; ++i) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j <= i + jOffset; ++j) { @@ -112,7 +112,7 @@ Status CommonKernel::computeQRForBlock(DAAL_INT p, DAAL_IN for (size_t i = 0; i < p - n; ++i) { r[i * p + i] = one; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < i; ++j) { diff --git a/cpp/daal/src/algorithms/linear_model/linear_model_train_qr_finalize_impl.i b/cpp/daal/src/algorithms/linear_model/linear_model_train_qr_finalize_impl.i index 5c9068cbb22..2a260573bb5 100755 --- a/cpp/daal/src/algorithms/linear_model/linear_model_train_qr_finalize_impl.i +++ b/cpp/daal/src/algorithms/linear_model/linear_model_train_qr_finalize_impl.i @@ -87,7 +87,7 @@ Status FinalizeKernel::compute(const NumericTable & rTable for (size_t i = 0; i < nResponses; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nBetasIntercept; j++) { @@ -115,7 +115,7 @@ Status FinalizeKernel::compute(const NumericTable & rTable { for (size_t i = 0; i < nResponses; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 1; j < nBetas; j++) { @@ -128,7 +128,7 @@ Status FinalizeKernel::compute(const NumericTable & rTable { for (size_t i = 0; i < nResponses; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nBetas - 1; j++) { diff --git a/cpp/daal/src/algorithms/logistic_regression/logistic_regression_predict_dense_default_batch_impl.i b/cpp/daal/src/algorithms/logistic_regression/logistic_regression_predict_dense_default_batch_impl.i index cbaa63d87a1..9a1562172ac 100644 --- a/cpp/daal/src/algorithms/logistic_regression/logistic_regression_predict_dense_default_batch_impl.i +++ b/cpp/daal/src/algorithms/logistic_regression/logistic_regression_predict_dense_default_batch_impl.i @@ -180,7 +180,7 @@ protected: s |= gemvSoa(x, beta + 1, xb, nRows, nCols, xOffset); if (bIntercept) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nRows; ++i) xb[i] += beta[0]; } diff --git a/cpp/daal/src/algorithms/logitboost/logitboost_impl.i b/cpp/daal/src/algorithms/logitboost/logitboost_impl.i index 479101ccf21..b6fdee988c7 100644 --- a/cpp/daal/src/algorithms/logitboost/logitboost_impl.i +++ b/cpp/daal/src/algorithms/logitboost/logitboost_impl.i @@ -110,7 +110,7 @@ void UpdateP(size_t nc, size_t n, algorithmFPType * F, algorithmFPType * P, algo algorithmFPType invs = (algorithmFPType)1.0 / s; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nc; j++) { diff --git a/cpp/daal/src/algorithms/logitboost/logitboost_train_friedman_impl.i b/cpp/daal/src/algorithms/logitboost/logitboost_train_friedman_impl.i index 89a8f6f9e09..a517725e500 100755 --- a/cpp/daal/src/algorithms/logitboost/logitboost_train_friedman_impl.i +++ b/cpp/daal/src/algorithms/logitboost/logitboost_train_friedman_impl.i @@ -172,7 +172,7 @@ services::Status UpdateFPNew(size_t nc, size_t n, algorithmFPType * F, algorithm /* Update additive function's values Step 2.b) of the Algorithm 6 from [1] */ /* i-row contains Fi() for all classes in i-th point x */ - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = start; i < start + size; i++) { @@ -206,7 +206,7 @@ services::Status UpdateFPNew(size_t nc, size_t n, algorithmFPType * F, algorithm Step 2.c) of the Algorithm 6 from [1] */ const bool useFullBuffer = size * nc <= n; if (useFullBuffer) daal::internal::MathInst::vExp(nc * size, F + start * nc, buffer); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < size; i++) { diff --git a/cpp/daal/src/algorithms/low_order_moments/low_order_moments_estimates_batch.i b/cpp/daal/src/algorithms/low_order_moments/low_order_moments_estimates_batch.i index 3c071e5ac73..ee573b95204 100644 --- a/cpp/daal/src/algorithms/low_order_moments/low_order_moments_estimates_batch.i +++ b/cpp/daal/src/algorithms/low_order_moments/low_order_moments_estimates_batch.i @@ -278,7 +278,7 @@ Status compute_estimates(NumericTable * dataTable, Result * result) const algorithmFPType * const argi = _dataArray_block + i * _cd.nFeatures; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < _cd.nFeatures; j++) { @@ -373,7 +373,7 @@ Status compute_estimates(NumericTable * dataTable, Result * result) size_t _jstart = iFeatureBlock * numFeaturesInBlock; size_t _jend = _jstart + ((iFeatureBlock < (numFeatureBlocks - 1)) ? numFeaturesInBlock : numFeaturesInLastBlock); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = _jstart; j < _jend; j++) { @@ -408,7 +408,7 @@ Status compute_estimates(NumericTable * dataTable, Result * result) } /* if(_cd.nFeatures >= _THREAD_REDUCTION_MIN_SIZE_) */ else /* if(_cd.nFeatures < _THREAD_REDUCTION_MIN_SIZE_) */ { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < _cd.nFeatures; j++) { @@ -458,7 +458,7 @@ Status compute_estimates(NumericTable * dataTable, Result * result) #if (defined _VART_ENABLE_ || defined _SORM_ENABLE_) const algorithmFPType _invN = algorithmFPType(1.0) / algorithmFPType(_cd.nVectors); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < _cd.nFeatures; j++) { diff --git a/cpp/daal/src/algorithms/low_order_moments/low_order_moments_estimates_online.i b/cpp/daal/src/algorithms/low_order_moments/low_order_moments_estimates_online.i index 93f4ef1d30d..72ec07a02a7 100644 --- a/cpp/daal/src/algorithms/low_order_moments/low_order_moments_estimates_online.i +++ b/cpp/daal/src/algorithms/low_order_moments/low_order_moments_estimates_online.i @@ -311,7 +311,7 @@ Status compute_estimates(NumericTable * dataTable, PartialResult * partialResult #if defined _MEAN_ENABLE_ || defined _SORM_ENABLE_ const algorithmFPType _invN = algorithmFPType(1.0) / algorithmFPType(_td->nvectors + 1); #endif - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < _cd.nFeatures; j++) { @@ -376,7 +376,7 @@ Status compute_estimates(NumericTable * dataTable, PartialResult * partialResult algorithmFPType mean_scale = algorithmFPType(1.0) / (n1_p_n2); algorithmFPType variance_scale = algorithmFPType(1.0) / (n1_p_n2 - algorithmFPType(1.0)); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < _cd.nFeatures; j++) { @@ -415,7 +415,7 @@ Status compute_estimates(NumericTable * dataTable, PartialResult * partialResult if (isOnline) { #if (defined _SUM_ENABLE_) || (defined _MEAN_ENABLE_) - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < _cd.nFeatures; i++) { @@ -430,7 +430,7 @@ Status compute_estimates(NumericTable * dataTable, PartialResult * partialResult algorithmFPType nVectorsM1 = (algorithmFPType)(_cd.nVectors - 1); if (!isOnline) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < _cd.nFeatures; i++) { @@ -441,7 +441,7 @@ Status compute_estimates(NumericTable * dataTable, PartialResult * partialResult { if (nObs == 0) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < _cd.nFeatures; i++) { @@ -454,7 +454,7 @@ Status compute_estimates(NumericTable * dataTable, PartialResult * partialResult algorithmFPType invNVectors = 1.0 / (algorithmFPType)_cd.nVectors; algorithmFPType coeff = (algorithmFPType)(nObs * _cd.nVectors) / (algorithmFPType)(nObs + _cd.nVectors); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < _cd.nFeatures; i++) { diff --git a/cpp/daal/src/algorithms/low_order_moments/low_order_moments_impl.i b/cpp/daal/src/algorithms/low_order_moments/low_order_moments_impl.i index b85d3261374..0ed711dea1c 100755 --- a/cpp/daal/src/algorithms/low_order_moments/low_order_moments_impl.i +++ b/cpp/daal/src/algorithms/low_order_moments/low_order_moments_impl.i @@ -383,7 +383,7 @@ Status retrievePrecomputedStatsIfPossible(const size_t nFeatures, const size_t n const algorithmFPType invNVectors = 1.0 / (algorithmFPType)nVectors; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nFeatures; i++) { @@ -408,7 +408,7 @@ Status computeSumAndVariance(size_t nFeatures, size_t nVectors, algorithmFPType if (isOnline) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nFeatures; i++) { @@ -488,7 +488,7 @@ Status computeMinMaxAndSumOfSquared(const size_t nFeatures, const size_t nVector max = &_array[nfeatures * 1]; sumSq = &_array[nfeatures * 2]; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nfeatures; j++) { @@ -524,7 +524,7 @@ Status computeMinMaxAndSumOfSquared(const size_t nFeatures, const size_t nVector for (size_t i = startRows; i < startRows + chunkRows; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nFeatures; j++) { @@ -538,7 +538,7 @@ Status computeMinMaxAndSumOfSquared(const size_t nFeatures, const size_t nVector }); tslData.reduce([&](TslData * localTslData) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nFeatures; j++) { @@ -569,7 +569,7 @@ void computeSumOfSquaredDiffsFromMean(size_t nFeatures, size_t nVectors, size_t const algorithmFPType nVectorsM1 = (algorithmFPType)(nVectors - 1); if (!isOnline) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nFeatures; i++) { @@ -580,7 +580,7 @@ void computeSumOfSquaredDiffsFromMean(size_t nFeatures, size_t nVectors, size_t if (prevNVectors == 0) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nFeatures; i++) { @@ -860,7 +860,7 @@ void finalize(LowOrderMomentsFinalizeTask & task) size_t nFeatures = task.nFeatures; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nFeatures; i++) { diff --git a/cpp/daal/src/algorithms/multiclassclassifier/multiclassclassifier_predict_votebased_impl.i b/cpp/daal/src/algorithms/multiclassclassifier/multiclassclassifier_predict_votebased_impl.i index fafd8e3cd5d..f24604c8e83 100644 --- a/cpp/daal/src/algorithms/multiclassclassifier/multiclassclassifier_predict_votebased_impl.i +++ b/cpp/daal/src/algorithms/multiclassclassifier/multiclassclassifier_predict_votebased_impl.i @@ -112,7 +112,7 @@ public: if (!s) return Status(ErrorMultiClassFailedToComputeTwoClassPrediction).add(s); /* Compute votes for the block of input observations */ - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nRows; ++i) { diff --git a/cpp/daal/src/algorithms/multiclassclassifier/multiclassclassifier_train_oneagainstone_impl.i b/cpp/daal/src/algorithms/multiclassclassifier/multiclassclassifier_train_oneagainstone_impl.i index f92189adbfb..3845952ca6f 100644 --- a/cpp/daal/src/algorithms/multiclassclassifier/multiclassclassifier_train_oneagainstone_impl.i +++ b/cpp/daal/src/algorithms/multiclassclassifier/multiclassclassifier_train_oneagainstone_impl.i @@ -337,7 +337,7 @@ Status SubTaskDense::copyDataIntoSubtable(size_t nFeatures originalIndicesMap[nRows] = ix; _mtX.next(ix, 1); DAAL_CHECK_BLOCK_STATUS(_mtX); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t jx = 0; jx < nFeatures; jx++) this->_subsetX.get()[nRows * nFeatures + jx] = _mtX.get()[jx]; this->_subsetY[nRows] = label; @@ -362,14 +362,14 @@ Status SubTaskCSR::copyDataIntoSubtable(size_t nFeatures, originalIndicesMap[nRows] = ix; _mtX.next(ix, 1); DAAL_CHECK_BLOCK_STATUS(_mtX); - const size_t nNonZeroValuesInRow = _mtX.rows()[1] - _mtX.rows()[0]; - const size_t * colIndices = _mtX.cols(); - PRAGMA_IVDEP - PRAGMA_VECTOR_ALWAYS + const size_t nNonZeroValuesInRow = _mtX.rows()[1] - _mtX.rows()[0]; + const size_t * colIndices = _mtX.cols(); + const algorithmFPType * mtXValues = _mtX.values(); + algorithmFPType * subsetXData = this->_subsetX.get(); for (size_t jx = 0; jx < nNonZeroValuesInRow; ++jx, ++dataIndex) { - this->_subsetX.get()[dataIndex] = _mtX.values()[jx]; - _colIndicesX[dataIndex] = colIndices[jx]; + subsetXData[dataIndex] = mtXValues[jx]; + _colIndicesX[dataIndex] = colIndices[jx]; } _rowOffsetsX[nRows + 1] = _rowOffsetsX[nRows] + nNonZeroValuesInRow; this->_subsetY[nRows] = label; diff --git a/cpp/daal/src/algorithms/naivebayes/naivebayes_predict_fast_impl.i b/cpp/daal/src/algorithms/naivebayes/naivebayes_predict_fast_impl.i index b3385013635..1e8d028bac6 100644 --- a/cpp/daal/src/algorithms/naivebayes/naivebayes_predict_fast_impl.i +++ b/cpp/daal/src/algorithms/naivebayes/naivebayes_predict_fast_impl.i @@ -173,7 +173,7 @@ services::Status methodSpecific::getPredicti int max_c = 0; algorithmFPType max_c_val = -(services::internal::MaxVal::get()); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (size_t cl = 0; cl < c; cl++) { algorithmFPType val = buff[j * c + cl]; @@ -221,7 +221,7 @@ services::Status methodSpecific::getPredictionDat int max_c = 0; algorithmFPType max_c_val = -(services::internal::MaxVal::get()); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (size_t cl = 0; cl < c; cl++) { algorithmFPType val = buff[j + cl * n]; diff --git a/cpp/daal/src/algorithms/naivebayes/naivebayes_train_impl.i b/cpp/daal/src/algorithms/naivebayes/naivebayes_train_impl.i index 74a6732140d..71c5079834d 100644 --- a/cpp/daal/src/algorithms/naivebayes/naivebayes_train_impl.i +++ b/cpp/daal/src/algorithms/naivebayes/naivebayes_train_impl.i @@ -197,7 +197,7 @@ Status collectCounters(const Parameter * nbPar, NumericTable * ntData, NumericTa tls_n_ci.reduce([=](algorithmFPType * v) { if (!v) return; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS PRAGMA_VECTOR_ALIGNED for (size_t j = 0; j < c; j++) @@ -230,13 +230,13 @@ Status mergeModels(const Parameter * nbPar, size_t p, size_t nModels, PartialMod const algorithmFPType * in_n_ci = rrCi.set(models[i]->getClassGroupSum().get(), 0, c); DAAL_CHECK_BLOCK_STATUS(rrCi); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (size_t j = 0; j < c; j++) { n_c[j] += in_n_c[j]; } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (size_t j = 0; j < p * c; j++) { n_ci[j] += in_n_ci[j]; @@ -361,13 +361,13 @@ services::Status NaiveBayesBatchTrainKernel::compu return Status(ErrorMemoryAllocationFailed); } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (size_t j = 0; j < c; j++) { n_c[j] = 0; } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (size_t j = 0; j < p * c; j++) { n_ci[j] = 0; diff --git a/cpp/daal/src/algorithms/normalization/minmax/minmax_impl.i b/cpp/daal/src/algorithms/normalization/minmax/minmax_impl.i index e9c07f25333..6828a3507f6 100644 --- a/cpp/daal/src/algorithms/normalization/minmax/minmax_impl.i +++ b/cpp/daal/src/algorithms/normalization/minmax/minmax_impl.i @@ -106,7 +106,7 @@ Status MinMaxKernel::processBlock(const NumericTab for (size_t i = 0; i < blockSize; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nColumns; j++) { diff --git a/cpp/daal/src/algorithms/normalization/zscore/zscore_dense_sum_impl.i b/cpp/daal/src/algorithms/normalization/zscore/zscore_dense_sum_impl.i index dff721e281e..d9669e6c1d2 100644 --- a/cpp/daal/src/algorithms/normalization/zscore/zscore_dense_sum_impl.i +++ b/cpp/daal/src/algorithms/normalization/zscore/zscore_dense_sum_impl.i @@ -52,7 +52,7 @@ Status ZScoreKernel::computeMeanVariance_thr(Num algorithmFPType invNm1 = algorithmFPType(1.0) / (algorithmFPType(nVectors) - algorithmFPType(1.0)); /* Compute means from sums */ - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nFeatures; j++) { @@ -89,7 +89,7 @@ Status ZScoreKernel::computeMeanVariance_thr(Num for (size_t i = 0; i < _nRows; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nFeatures; j++) { @@ -103,7 +103,7 @@ Status ZScoreKernel::computeMeanVariance_thr(Num tls_data.reduce([&](algorithmFPType * pVariances) { if (pVariances) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nFeatures; j++) { @@ -114,7 +114,7 @@ Status ZScoreKernel::computeMeanVariance_thr(Num }); /* Convert array of variances to unbiased */ - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nFeatures; j++) { diff --git a/cpp/daal/src/algorithms/normalization/zscore/zscore_impl.i b/cpp/daal/src/algorithms/normalization/zscore/zscore_impl.i index 7000f806cb6..bb6daa34365 100644 --- a/cpp/daal/src/algorithms/normalization/zscore/zscore_impl.i +++ b/cpp/daal/src/algorithms/normalization/zscore/zscore_impl.i @@ -71,7 +71,7 @@ Status ZScoreKernelBase::common_compute(NumericTable & inp for (size_t i = 0; i < _nRows; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < _nFeatures; j++) { @@ -119,7 +119,7 @@ Status ZScoreKernelBase::common_compute(NumericTable & inp for (size_t i = 0; i < _nRows; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < _nFeatures; j++) { @@ -145,7 +145,7 @@ Status ZScoreKernelBase::common_compute(NumericTable & inp for (size_t i = 0; i < _nRows; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < _nFeatures; j++) { diff --git a/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/cross_entropy_loss_dense_default_batch_impl.i b/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/cross_entropy_loss_dense_default_batch_impl.i index d6e51a93669..57b3bb82ced 100644 --- a/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/cross_entropy_loss_dense_default_batch_impl.i +++ b/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/cross_entropy_loss_dense_default_batch_impl.i @@ -61,7 +61,7 @@ static void applyBetaImpl(const algorithmFPType * x, const algorithmFPType * bet BlasInst::xxgemm(&trans, ¬rans, &m, &n, &k, &one, beta + 1, &ldb, x, &k, &zero, xb, &m); if (bIntercept) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nRows; ++i) { @@ -97,13 +97,13 @@ void CrossEntropyLossKernel::softmax(const algorit const algorithmFPType * const pArg = arg + iRow * nCols; algorithmFPType * const pRes = res + iRow * nCols; algorithmFPType maxArg = pArg[0]; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 1; i < nCols; ++i) { if (maxArg < pArg[i]) maxArg = pArg[i]; } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nCols; ++i) { @@ -120,14 +120,14 @@ void CrossEntropyLossKernel::softmax(const algorit { algorithmFPType * const pRes = res + iRow * nCols; algorithmFPType sum(0.); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nCols; ++i) { sum += pRes[i]; } sum = static_cast(1.) / sum; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nCols; ++i) { @@ -143,14 +143,14 @@ void CrossEntropyLossKernel::softmax(const algorit { algorithmFPType * const pRes = res + iRow * nCols; algorithmFPType sum(0.); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nCols; ++i) { sum += pRes[i]; } sum = static_cast(1.) / sum; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nCols; ++i) { @@ -291,7 +291,7 @@ services::Status CrossEntropyLossKernel::doCompute { curentNorm = 0; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < p; j++) { diff --git a/cpp/daal/src/algorithms/objective_function/logistic_loss/logistic_loss_dense_default_batch_impl.i b/cpp/daal/src/algorithms/objective_function/logistic_loss/logistic_loss_dense_default_batch_impl.i index 625d99a104a..62dc7007e42 100644 --- a/cpp/daal/src/algorithms/objective_function/logistic_loss/logistic_loss_dense_default_batch_impl.i +++ b/cpp/daal/src/algorithms/objective_function/logistic_loss/logistic_loss_dense_default_batch_impl.i @@ -61,7 +61,7 @@ static void applyBetaImpl(const algorithmFPType * x, const algorithmFPType * bet } if (bIntercept) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < n; ++i) { @@ -81,7 +81,7 @@ template static void vexp(const algorithmFPType * f, algorithmFPType * exp, size_t n) { const algorithmFPType expThreshold = daal::internal::MathInst::vExpThreshold(); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < n; ++i) { @@ -96,7 +96,7 @@ static void vexp(const algorithmFPType * f, algorithmFPType * exp, size_t n) template static void sigmoids(algorithmFPType * exp, size_t n, size_t offset) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < n; ++i) { @@ -112,7 +112,7 @@ void LogLossKernel::sigmoid(const algorithmFPType //s = exp(-f) vexp(f, s, n); //s = sigm(f) - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < n; ++i) { @@ -370,7 +370,7 @@ services::Status LogLossKernel::doCompute(const Nu const DAAL_INT nN = static_cast(nRowsToProcess); algorithmFPType * const pg = grads.get() + iBlock * p; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nRowsToProcess; ++i) { @@ -380,7 +380,7 @@ services::Status LogLossKernel::doCompute(const Nu daal::internal::BlasInst::xxgemm(¬rans, ¬rans, &dim, &yDim, &nN, &one, xLocal, &dim, sgPtrLocal, &nN, &zero, pg, &dim); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nRowsToProcess; ++i) { diff --git a/cpp/daal/src/algorithms/objective_function/mse/mse_dense_default_batch_impl.i b/cpp/daal/src/algorithms/objective_function/mse/mse_dense_default_batch_impl.i index b6691f2bcc6..9bd98333e72 100644 --- a/cpp/daal/src/algorithms/objective_function/mse/mse_dense_default_batch_impl.i +++ b/cpp/daal/src/algorithms/objective_function/mse/mse_dense_default_batch_impl.i @@ -129,7 +129,7 @@ inline services::Status MSEKernel::compute(Numeric result |= daal::services::internal::daal_memcpy_s(residualPtr, n * yDim * sizeof(algorithmFPType), Y, n * yDim * sizeof(algorithmFPType)); size_t compute_matrix = 0; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < (nTheta + 1) * yDim; i++) { @@ -237,7 +237,7 @@ inline services::Status MSEKernel::compute(Numeric tlsData.reduceTo(hessianDiagonalPtr, nTheta); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nTheta; ++j) { @@ -278,7 +278,7 @@ inline services::Status MSEKernel::compute(Numeric { if (previousFeatureId == 0 && parameter->interceptFlag) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nDataRows; i++) /*threader for*/ { @@ -305,7 +305,7 @@ inline services::Status MSEKernel::compute(Numeric { for (size_t i = 0; i < nDataRows; i++) /*threader for*/ { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t ic = 0; ic < yDim; ic++) dotPtr[ic] += residualPtr[i * yDim + ic]; } @@ -366,11 +366,11 @@ inline services::Status MSEKernel::compute(Numeric XY.reset(dim * yDim); XYPtr = XY.get(); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < dim * yDim; i++) XYPtr[i] = 0; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < dim * dim; i++) gramMatrixPtr[i] = 0; char uplo = 'L'; @@ -415,13 +415,13 @@ inline services::Status MSEKernel::compute(Numeric }); } tlsData.reduce([&](algorithmFPType * local) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < dim * yDim; j++) { XYPtr[j] += local[j]; } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < dim * dim; j++) { @@ -431,7 +431,7 @@ inline services::Status MSEKernel::compute(Numeric const size_t dimension = dim; for (size_t i = 0; i < dimension; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = i; j < dimension; j++) gramMatrixPtr[j * dim + i] = gramMatrixPtr[i * dim + j]; } @@ -574,7 +574,7 @@ inline services::Status MSEKernel::compute(Numeric } tlsData.reduceTo(hessianDiagonalPtr, nTheta); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nTheta; ++j) { diff --git a/cpp/daal/src/algorithms/optimization_solver/adagrad/adagrad_dense_default_impl.i b/cpp/daal/src/algorithms/optimization_solver/adagrad/adagrad_dense_default_impl.i index 92d15fc7a9a..cd17049eedc 100755 --- a/cpp/daal/src/algorithms/optimization_solver/adagrad/adagrad_dense_default_impl.i +++ b/cpp/daal/src/algorithms/optimization_solver/adagrad/adagrad_dense_default_impl.i @@ -209,7 +209,7 @@ services::Status AdagradKernel::compute(HostAppIfa processByBlocks( nRows, [=](size_t startOffset, size_t nRowsInBlock) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = startOffset; j < startOffset + nRowsInBlock; j++) { @@ -225,7 +225,7 @@ services::Status AdagradKernel::compute(HostAppIfa processByBlocks( nRows, [=](size_t startOffset, size_t nRowsInBlock) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = startOffset; j < startOffset + nRowsInBlock; j++) { diff --git a/cpp/daal/src/algorithms/optimization_solver/iterative_solver_kernel.h b/cpp/daal/src/algorithms/optimization_solver/iterative_solver_kernel.h index 8a6683ed517..e94ef7884b8 100755 --- a/cpp/daal/src/algorithms/optimization_solver/iterative_solver_kernel.h +++ b/cpp/daal/src/algorithms/optimization_solver/iterative_solver_kernel.h @@ -95,7 +95,7 @@ class IterativeSolverKernel : public Kernel algorithmFPType * normPtr = normTls.local(); DAAL_CHECK_THR(normPtr, services::ErrorMemoryAllocationFailed); PRAGMA_VECTOR_ALWAYS - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (size_t j = 0; j < nRowsInBlock; j++) { *normPtr += vecLocal[j] * vecLocal[j]; @@ -117,7 +117,7 @@ class IterativeSolverKernel : public Kernel res = 0; if (nElements < blockStartThreshold) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nElements; j++) { diff --git a/cpp/daal/src/algorithms/optimization_solver/lbfgs/lbfgs_dense_default_impl.i b/cpp/daal/src/algorithms/optimization_solver/lbfgs/lbfgs_dense_default_impl.i index 23ef6bdf7a6..5f57ed231c5 100755 --- a/cpp/daal/src/algorithms/optimization_solver/lbfgs/lbfgs_dense_default_impl.i +++ b/cpp/daal/src/algorithms/optimization_solver/lbfgs/lbfgs_dense_default_impl.i @@ -550,7 +550,7 @@ template void LBFGSTask::computeCorrectionPairImpl(size_t correctionIndex, const algorithmFPType * hessian, bool useWolfeConditions) { algorithmFPType * s = correctionS + correctionIndex * this->argumentSize; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < this->argumentSize; j++) { @@ -568,7 +568,7 @@ void LBFGSTask::computeCorrectionPairImpl(size_t correctio algorithmFPType * gradientPrev = (algorithmFPType *)_gradientPrevPtr.get(); algorithmFPType * gradientCurr = (algorithmFPType *)_gradientCurrPtr.get(); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < this->argumentSize; j++) { diff --git a/cpp/daal/src/algorithms/optimization_solver/saga/saga_dense_default_impl.i b/cpp/daal/src/algorithms/optimization_solver/saga/saga_dense_default_impl.i index 97639f68d93..04705edc5c8 100755 --- a/cpp/daal/src/algorithms/optimization_solver/saga/saga_dense_default_impl.i +++ b/cpp/daal/src/algorithms/optimization_solver/saga/saga_dense_default_impl.i @@ -284,7 +284,7 @@ services::Status SagaKernel::compute(HostAppIface result |= daal::services::internal::daal_memcpy_s(previous, sizeArgument * sizeof(algorithmFPType), workValue, sizeArgument * sizeof(algorithmFPType)); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t k = 0; k < sizeArgument; k++) { @@ -313,7 +313,7 @@ services::Status SagaKernel::compute(HostAppIface } } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t k = 0; k < sizeArgument; k++) { diff --git a/cpp/daal/src/algorithms/outlierdetection_multivariate/outlierdetection_multivariate_dense_default_impl.i b/cpp/daal/src/algorithms/outlierdetection_multivariate/outlierdetection_multivariate_dense_default_impl.i index 93fbf23a684..bb8fffaad90 100644 --- a/cpp/daal/src/algorithms/outlierdetection_multivariate/outlierdetection_multivariate_dense_default_impl.i +++ b/cpp/daal/src/algorithms/outlierdetection_multivariate/outlierdetection_multivariate_dense_default_impl.i @@ -64,7 +64,7 @@ inline void OutlierDetectionKernel::mahalanobisDis algorithmFPType * dataCenPtr = dataCen; for (size_t i = 0; i < nVectors; i++, dataPtr += nFeatures, dataCenPtr += nFeatures) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nFeatures; j++) { @@ -79,7 +79,7 @@ inline void OutlierDetectionKernel::mahalanobisDis for (size_t i = 0; i < nVectors; i++, dataCenPtr += nFeatures, dataCenInvScatterPtr += nFeatures) { distance[i] = zero; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nFeatures; j++) { diff --git a/cpp/daal/src/algorithms/outlierdetection_univariate/outlierdetection_univariate_kernel.h b/cpp/daal/src/algorithms/outlierdetection_univariate/outlierdetection_univariate_kernel.h index a4ba5865f17..cf985c6e16f 100644 --- a/cpp/daal/src/algorithms/outlierdetection_univariate/outlierdetection_univariate_kernel.h +++ b/cpp/daal/src/algorithms/outlierdetection_univariate/outlierdetection_univariate_kernel.h @@ -60,7 +60,7 @@ struct OutlierDetectionKernel : public Kernel const algorithmFPType zero(0.0); const algorithmFPType one(1.0); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nFeatures; j++) { @@ -97,7 +97,7 @@ struct OutlierDetectionKernel : public Kernel algorithmFPType diff; for (size_t i = 0; i < nRowsInBlock; i++, dataPtr += nFeatures, weightPtr += nFeatures) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nFeatures; j++) { diff --git a/cpp/daal/src/algorithms/pca/metrics/pca_explained_variance_default_batch_impl.i b/cpp/daal/src/algorithms/pca/metrics/pca_explained_variance_default_batch_impl.i index 7baf293d87e..0b60e6784d9 100644 --- a/cpp/daal/src/algorithms/pca/metrics/pca_explained_variance_default_batch_impl.i +++ b/cpp/daal/src/algorithms/pca/metrics/pca_explained_variance_default_batch_impl.i @@ -75,7 +75,7 @@ Status ExplainedVarianceKernel::compute(const Nume if (id >= nComponents) noiseSum += pEigenvalues[id]; } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t id = 0; id < nComponents; ++id) { diff --git a/cpp/daal/src/algorithms/pca/pca_dense_svd_batch_impl.i b/cpp/daal/src/algorithms/pca/pca_dense_svd_batch_impl.i index 352dd0b229c..cbed9a6d78b 100644 --- a/cpp/daal/src/algorithms/pca/pca_dense_svd_batch_impl.i +++ b/cpp/daal/src/algorithms/pca/pca_dense_svd_batch_impl.i @@ -230,7 +230,7 @@ services::Status PCASVDBatchKernel::normali { const algorithmFPType _invN = algorithmFPType(1.0) / algorithmFPType(tls_data_local->nvectors + 1); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nFeatures; j++) { @@ -258,7 +258,7 @@ services::Status PCASVDBatchKernel::normali const algorithmFPType inv_n1_p_n2 = algorithmFPType(1.0) / (n1_p_n2); const algorithmFPType inv_n1_p_n2_m1 = algorithmFPType(1.0) / (n1_p_n2 - algorithmFPType(1.0)); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nFeatures; j++) { @@ -280,7 +280,7 @@ services::Status PCASVDBatchKernel::normali if (!safeStat) return safeStat.detach(); /* Convert array of variances to inverse sigma's */ - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nFeatures; j++) { @@ -297,7 +297,7 @@ services::Status PCASVDBatchKernel::normali for (size_t i = 0; i < nVectors_local; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nFeatures; j++) { diff --git a/cpp/daal/src/algorithms/pca/pca_dense_svd_online_impl.i b/cpp/daal/src/algorithms/pca/pca_dense_svd_online_impl.i index 09b932c4965..4d03f27fe84 100644 --- a/cpp/daal/src/algorithms/pca/pca_dense_svd_online_impl.i +++ b/cpp/daal/src/algorithms/pca/pca_dense_svd_online_impl.i @@ -156,7 +156,7 @@ inline void normalizeData(const size_t nObservations, const size_t nFeatures, co { for (size_t i = 0; i < nObservations; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nFeatures; j++) { diff --git a/cpp/daal/src/algorithms/pca/transform/pca_transform_dense_default_batch_impl.i b/cpp/daal/src/algorithms/pca/transform/pca_transform_dense_default_batch_impl.i index b8947cc856d..a5e8b4f1524 100644 --- a/cpp/daal/src/algorithms/pca/transform/pca_transform_dense_default_batch_impl.i +++ b/cpp/daal/src/algorithms/pca/transform/pca_transform_dense_default_batch_impl.i @@ -78,7 +78,7 @@ services::Status ComputeInvSigmas(NumericTable * pVariances, TArray::compute(NumericT for (size_t rowId = 0; rowId < numRows; ++rowId) { /* compute centering if numMeans != 0 */ - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t colId = 0; colId < numMeans; ++colId) { pCopyBlock[rowId * numMeans + colId] = pDataBlock[rowId * numMeans + colId] - pRawMeans[colId]; } /* compute normalization to unit variance if numInvSigmas!= 0 */ - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t colId = 0; colId < numInvSigmas; ++colId) { @@ -203,7 +203,7 @@ services::Status TransformKernel::compute(NumericT { for (size_t rowId = 0; rowId < numRows; ++rowId) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t colId = 0; colId < numComponents; ++colId) { diff --git a/cpp/daal/src/algorithms/qr/qr_dense_default_batch_impl.i b/cpp/daal/src/algorithms/qr/qr_dense_default_batch_impl.i index 3e052f7b0af..b28d40b6167 100644 --- a/cpp/daal/src/algorithms/qr/qr_dense_default_batch_impl.i +++ b/cpp/daal/src/algorithms/qr/qr_dense_default_batch_impl.i @@ -252,7 +252,7 @@ Status QRBatchKernel::compute_thr(const size_t na, /* Get transposed Q from A */ for (size_t i = 0; i < cols_local; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (size_t j = 0; j < brows_local; j++) { QT_local[i * brows_local + j] = A_block[i + j * cols_local]; @@ -266,7 +266,7 @@ Status QRBatchKernel::compute_thr(const size_t na, /* Transpose Q */ for (size_t i = 0; i < cols_local; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (size_t j = 0; j < brows_local; j++) { Q_block[i + j * cols_local] = QT_local[i * brows_local + j]; @@ -277,16 +277,13 @@ Status QRBatchKernel::compute_thr(const size_t na, for (size_t i = 0; i < cols_local; i++) { size_t j; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (j = 0; j <= i; j++) { RT_buff[k * cols_local + i * cols_local * blocks + j] = RT_local[i * cols_local + j]; } - PRAGMA_IVDEP - for (; j < cols_local; j++) - { - RT_buff[k * cols_local + i * cols_local * blocks + j] = 0.0; - } + + service_memset(&RT_buff[k * cols_local + i * cols_local * blocks + i + 1], 0.0, cols_local - i - 1); } }); } @@ -307,7 +304,7 @@ Status QRBatchKernel::compute_thr(const size_t na, algorithmFPType * R_output = bkR_output.get(); for (size_t i = 0; i < cols; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (size_t j = 0; j < cols; j++) { R_output[i + j * cols] = R_buff[i * cols + j]; @@ -336,7 +333,7 @@ Status QRBatchKernel::compute_thr(const size_t na, /* Transpose RB */ for (size_t i = 0; i < cols_local; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (size_t j = 0; j < cols_local; j++) { RT_local[j * cols_local + i] = RT_buff[j * cols_local * blocks + k * cols_local + i]; @@ -346,7 +343,7 @@ Status QRBatchKernel::compute_thr(const size_t na, /* Transpose Q to QT */ for (size_t i = 0; i < cols_local; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (size_t j = 0; j < brows_local; j++) { QT_local[i * brows_local + j] = Q_block[i + j * cols_local]; @@ -360,7 +357,7 @@ Status QRBatchKernel::compute_thr(const size_t na, /* Transpose result Q */ for (size_t i = 0; i < cols_local; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (size_t j = 0; j < brows_local; j++) { Q_block[i + j * cols_local] = QT_result_local[i * brows_local + j]; diff --git a/cpp/daal/src/algorithms/qr/qr_dense_default_pcl_impl.i b/cpp/daal/src/algorithms/qr/qr_dense_default_pcl_impl.i index ce438f8b1fa..1659cf96a0a 100755 --- a/cpp/daal/src/algorithms/qr/qr_dense_default_pcl_impl.i +++ b/cpp/daal/src/algorithms/qr/qr_dense_default_pcl_impl.i @@ -339,7 +339,7 @@ static void tsqr(algorithmFPType * A, const size_t nrows, const size_t ncols, al A_local[i * ncols + j] = a_local[i + j * local_tiles * ncols]; } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = j + 1; i < ncols; i++) { @@ -352,7 +352,7 @@ static void tsqr(algorithmFPType * A, const size_t nrows, const size_t ncols, al // If onlyV then no needs to save to A array (inplace) for (size_t j = 0; j < ncols; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = j + 1; i < ncols; i++) { @@ -406,7 +406,7 @@ static void tsqr(algorithmFPType * A, const size_t nrows, const size_t ncols, al // Reset area under upper triangle to 0. Just in case Intel(R) MKL set them. for (size_t j = 0; j < ncols; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = j + 1; i < ncols; i++) { @@ -418,14 +418,14 @@ static void tsqr(algorithmFPType * A, const size_t nrows, const size_t ncols, al for (size_t j = 0; j < ncols; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < j + 1; i++) { R_local[i + Rda * j] = a_local[i + local_tiles * ncols * j]; } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = j + 1; i < ncols; i++) { @@ -456,7 +456,7 @@ static void tsqr(algorithmFPType * A, const size_t nrows, const size_t ncols, al for (size_t j = 0; j < ncols; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < j + 1; i++) { @@ -470,7 +470,7 @@ static void tsqr(algorithmFPType * A, const size_t nrows, const size_t ncols, al // of only V required - save only upper part of R array for (size_t j = 0; j < ncols; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < j + 1; i++) { @@ -532,13 +532,13 @@ static void tsgetq(algorithmFPType * A, const size_t nrows, const size_t ncols, // Copy triangles from A for (size_t j = 0; j < ncols; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < j + 1; i++) { R_local[i + Rda * j] = A_local[i * ncols + j]; } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = j + 1; i < ncols; i++) { @@ -591,13 +591,13 @@ static void tsgetq(algorithmFPType * A, const size_t nrows, const size_t ncols, // Copy stacked triangle to top of "a" buffer for (size_t j = 0; j < ncols; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < j + 1; i++) { a[i + j * local_tiles * ncols] = R_local[i + Rda * j]; } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = j + 1; i < local_tiles * ncols; i++) { @@ -608,7 +608,7 @@ static void tsgetq(algorithmFPType * A, const size_t nrows, const size_t ncols, // Zero out top of "b" buffer for (size_t j = 0; j < ncols; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < ncols; i++) { @@ -632,7 +632,7 @@ static void tsgetq(algorithmFPType * A, const size_t nrows, const size_t ncols, // Copy Q into bottom portion of "b" buffer for (size_t j = 0; j < ncols; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < height; i++) { @@ -643,7 +643,7 @@ static void tsgetq(algorithmFPType * A, const size_t nrows, const size_t ncols, // Zero out bottom portion of "a" buffer for (size_t j = 0; j < ncols; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < height; i++) { @@ -672,7 +672,7 @@ static void tsgetq(algorithmFPType * A, const size_t nrows, const size_t ncols, // Just in case Intel(R) MKL wrote something here for (size_t j = 0; j < ncols; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = j + 1; i < ncols; i++) { @@ -696,7 +696,7 @@ static void tsgetq(algorithmFPType * A, const size_t nrows, const size_t ncols, // Copy entire Q factor into "b" buffer for (size_t j = 0; j < ncols; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = j + 1; i < first_height; i++) { @@ -708,7 +708,7 @@ static void tsgetq(algorithmFPType * A, const size_t nrows, const size_t ncols, // Only apply Q to upper triangle of "a". for (size_t j = 0; j < ncols; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = ncols; i < first_height; i++) { @@ -807,14 +807,14 @@ static void tsapplyq(algorithmFPType * A, const size_t nrows, const size_t ncols // Fill "R_local" buffer with stacked upper triangular matrices for (size_t j = 0; j < ncols; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < j + 1; i++) { R_local[i + Rda * j] = A_local[i * ncols + j]; } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = j + 1; i < ncols; i++) { @@ -827,7 +827,7 @@ static void tsapplyq(algorithmFPType * A, const size_t nrows, const size_t ncols // Fill "R2_local" top square with top square of matrix being multiplied for (size_t j = 0; j < ncols; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < ncols; i++) { @@ -840,7 +840,7 @@ static void tsapplyq(algorithmFPType * A, const size_t nrows, const size_t ncols // memset 0 for (size_t j = 0; j < ncols; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < ncols; i++) { @@ -900,7 +900,7 @@ static void tsapplyq(algorithmFPType * A, const size_t nrows, const size_t ncols // Copy my square of R2 to top of "a" buffer for (size_t j = 0; j < ncols; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < ncols; i++) { @@ -911,7 +911,7 @@ static void tsapplyq(algorithmFPType * A, const size_t nrows, const size_t ncols // Zero out top of "b" buffer for (size_t j = 0; j < ncols; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < ncols; i++) { @@ -936,7 +936,7 @@ static void tsapplyq(algorithmFPType * A, const size_t nrows, const size_t ncols // Copy Q factor into bottom portion of "b" buffer for (size_t j = 0; j < ncols; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < height; i++) { @@ -947,7 +947,7 @@ static void tsapplyq(algorithmFPType * A, const size_t nrows, const size_t ncols // Zero out bottom portion of "a" buffer for (size_t j = 0; j < ncols; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < height; i++) { @@ -974,7 +974,7 @@ static void tsapplyq(algorithmFPType * A, const size_t nrows, const size_t ncols // Copy bottom portion of "a" buffer to output for (size_t j = 0; j < ncols; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < height; i++) { @@ -987,7 +987,7 @@ static void tsapplyq(algorithmFPType * A, const size_t nrows, const size_t ncols // Fill "b" buffer with entire Q factor for (size_t j = 0; j < ncols; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = j + 1; i < first_height; i++) { @@ -998,7 +998,7 @@ static void tsapplyq(algorithmFPType * A, const size_t nrows, const size_t ncols // Zero out bottom portion of "a" buffer for (size_t j = 0; j < ncols; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = ncols; i < first_height; i++) { @@ -1023,7 +1023,7 @@ static void tsapplyq(algorithmFPType * A, const size_t nrows, const size_t ncols // Write result from "a" buffer to output for (size_t j = 0; j < ncols; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < first_height; i++) { @@ -1090,7 +1090,7 @@ static int qr_pcl(const algorithmFPType * A_in, /* nrows e = num; } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = b; i < e; i++) { @@ -1109,14 +1109,14 @@ static int qr_pcl(const algorithmFPType * A_in, /* nrows for (size_t i = 0; i < ncols; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < i; j++) { R_out[i * ncols + j] = 0; } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = i; j < ncols; j++) { @@ -1191,7 +1191,7 @@ static int svd_pcl(algorithmFPType * A_in, e = num; } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = b; i < e; i++) { @@ -1231,14 +1231,14 @@ static int svd_pcl(algorithmFPType * A_in, for (size_t j = 0; j < ncols; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < j + 1; i++) { R[j * ncols + i] = R_out[i * ncols + j]; } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = j + 1; i < ncols; i++) { @@ -1256,14 +1256,14 @@ static int svd_pcl(algorithmFPType * A_in, for (size_t j = 0; j < ncols; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < j + 1; i++) { R[j * ncols + i] = V[i * ncols + j]; } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = j + 1; i < ncols; i++) { @@ -1290,7 +1290,7 @@ static int svd_pcl(algorithmFPType * A_in, { for (size_t i = 0; i < ncols; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < ncols; j++) { diff --git a/cpp/daal/src/algorithms/service_kernel_math.h b/cpp/daal/src/algorithms/service_kernel_math.h index 0fcb2b0eb94..d029656c32c 100644 --- a/cpp/daal/src/algorithms/service_kernel_math.h +++ b/cpp/daal/src/algorithms/service_kernel_math.h @@ -169,7 +169,7 @@ class EuclideanDistances : public PairwiseDistances const FPType * const aa = normBufferA.get() + aOffset; const FPType * const bb = (&_a == &_b) ? normBufferA.get() + bOffset : normBufferB.get() + bOffset; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nRowsC; i++) { @@ -276,7 +276,7 @@ class EuclideanDistances : public PairwiseDistances for (size_t i = 0; i < end - begin; i++) { FPType sum = FPType(0); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_ICC_NO16(omp simd reduction(+ : sum)) for (size_t j = 0; j < nCols; j++) { @@ -354,7 +354,7 @@ class CosineDistances : public EuclideanDistances for (size_t i = 0; i < nRowsC; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nColsC; j++) { @@ -775,7 +775,7 @@ bool solveEquationsSystemWithSpectralDecomposition(FPType * a, FPType * b, size_ DAAL_INT num_taken = static_cast(n) - num_discarded; daal::internal::MathInst::vSqrt(num_taken, eigenvalues.get() + num_discarded, eigenvalues.get() + num_discarded); DAAL_INT one = 1; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (size_t col = num_discarded; col < n; col++) { const FPType scale = eigenvalues[col]; diff --git a/cpp/daal/src/algorithms/svd/svd_dense_default_batch_impl.i b/cpp/daal/src/algorithms/svd/svd_dense_default_batch_impl.i index 94d2df1e118..955a48f0eff 100644 --- a/cpp/daal/src/algorithms/svd/svd_dense_default_batch_impl.i +++ b/cpp/daal/src/algorithms/svd/svd_dense_default_batch_impl.i @@ -365,7 +365,7 @@ Status SVDBatchKernel::compute_thr(const size_t na /* Get transposed Q from A */ for (size_t i = 0; i < cols_local; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (size_t j = 0; j < brows_local; j++) { QT_local[i * brows_local + j] = A_block[i + j * cols_local]; @@ -383,7 +383,7 @@ Status SVDBatchKernel::compute_thr(const size_t na /* Transpose Q */ for (size_t i = 0; i < cols_local; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (size_t j = 0; j < brows_local; j++) { Q_block[i + j * cols_local] = QT_local[i * brows_local + j]; @@ -394,16 +394,13 @@ Status SVDBatchKernel::compute_thr(const size_t na for (size_t i = 0; i < cols_local; i++) { size_t j; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (j = 0; j <= i; j++) { RT_buff[k * cols_local + i * cols_local * blocks + j] = RT_local[i * cols_local + j]; } - PRAGMA_IVDEP - for (; j < cols_local; j++) - { - RT_buff[k * cols_local + i * cols_local * blocks + j] = 0.0; - } + + service_memset(&RT_buff[k * cols_local + i * cols_local * blocks + i + 1], 0.0, cols_local - i - 1); } }); } @@ -446,7 +443,7 @@ Status SVDBatchKernel::compute_thr(const size_t na /* Transpose result R and save to V output */ for (size_t i = 0; i < cols; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (size_t j = 0; j < nComponents; j++) { V_output[i + j * cols] = V_buff[i * cols + j]; @@ -484,7 +481,7 @@ Status SVDBatchKernel::compute_thr(const size_t na /* Transpose RB */ for (size_t i = 0; i < cols_local; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (size_t j = 0; j < cols_local; j++) { RT_block[i * cols_local + j] = R_buff[j * cols_local * blocks + k * cols_local + i]; @@ -494,7 +491,7 @@ Status SVDBatchKernel::compute_thr(const size_t na /* Transpose Q to QT */ for (size_t i = 0; i < cols_local; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (size_t j = 0; j < brows_local; j++) { QT_local[i * brows_local + j] = Q_block[i + j * cols_local]; @@ -504,7 +501,7 @@ Status SVDBatchKernel::compute_thr(const size_t na /* Transpose R to RT */ for (size_t i = 0; i < cols_local; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (size_t j = 0; j < cols_local; j++) { RT_local[i * cols_local + j] = RT_block[i + j * cols_local]; @@ -518,7 +515,7 @@ Status SVDBatchKernel::compute_thr(const size_t na /* Transpose result Q */ for (size_t i = 0; i < cols_local; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (size_t j = 0; j < brows_local; j++) { U_block[i + j * cols_local] = QT_result_local[i * brows_local + j]; @@ -594,7 +591,7 @@ Status SVDBatchKernel::compute_pcl(const size_t na algorithmFPType * tV = vBlock.get(); for (size_t i = 0; i < nComponents; i++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (size_t j = 0; j < n; j++) { tV[i * n + j] = V[i * n + j]; diff --git a/cpp/daal/src/algorithms/svd/svd_dense_default_impl.i b/cpp/daal/src/algorithms/svd/svd_dense_default_impl.i index c94b145a802..1c5662cf433 100755 --- a/cpp/daal/src/algorithms/svd/svd_dense_default_impl.i +++ b/cpp/daal/src/algorithms/svd/svd_dense_default_impl.i @@ -181,7 +181,7 @@ Status compute_QR_on_one_node(DAAL_INT m, DAAL_INT n, algorithmFPType * a_q, DAA // Get R of the QR factorization formed by xgeqrf for (DAAL_INT i = 0; i < nColumnsInQ; ++i) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (DAAL_INT j = 0; j <= i; ++j) { @@ -194,14 +194,14 @@ Status compute_QR_on_one_node(DAAL_INT m, DAAL_INT n, algorithmFPType * a_q, DAA const algorithmFPType zero(0.0); for (size_t i = m; i < n; ++i) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < m; ++j) { r[i * ldr + j] = a_q[i * lda_q + j]; } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = m; j <= i; ++j) { diff --git a/cpp/daal/src/algorithms/svm/svm_train_thunder_impl.i b/cpp/daal/src/algorithms/svm/svm_train_thunder_impl.i index 104fd10f801..0d94b813afb 100644 --- a/cpp/daal/src/algorithms/svm/svm_train_thunder_impl.i +++ b/cpp/daal/src/algorithms/svm/svm_train_thunder_impl.i @@ -357,7 +357,7 @@ services::Status SVMTrainImpl::SMOBlockSolver( daal::threader_for(nBlocks, nBlocks, [&](const size_t iBlock) { const size_t startRow = iBlock * blockSizeWS; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = startRow; i < startRow + blockSizeWS; ++i) { @@ -375,7 +375,7 @@ services::Status SVMTrainImpl::SMOBlockSolver( Ii |= (yLocal[i] > 0) ? positive : negative; I[i] = Ii; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nWS; ++j) { @@ -508,7 +508,7 @@ services::Status SVMTrainImpl::SMOBlockSolver( const algorithmFPType * const KBjBlock = &kernelLocal[Bj * nWS]; /* Update gradient */ - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nWS; i++) { @@ -521,7 +521,7 @@ services::Status SVMTrainImpl::SMOBlockSolver( localDiff = firstDiff; /* Compute diff and scatter to alpha vector */ - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nWS; ++i) { @@ -557,7 +557,7 @@ services::Status SVMTrainImpl::updateGrad(algorit if (startRowGrad < nVectors && startRowGrad + nRowsInBlockGrad > nVectors) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nRowsInBlockGrad; ++j) { diff --git a/cpp/daal/src/algorithms/tsne/tsne_gradient_descent_impl.i b/cpp/daal/src/algorithms/tsne/tsne_gradient_descent_impl.i index 2122c8c532e..0062954904e 100644 --- a/cpp/daal/src/algorithms/tsne/tsne_gradient_descent_impl.i +++ b/cpp/daal/src/algorithms/tsne/tsne_gradient_descent_impl.i @@ -115,7 +115,7 @@ struct MemoryCtxType const DataType * xInit = xInitDataBlock.get(); const DataType * yInit = yInitDataBlock.get(); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < capacity; i++) { @@ -133,7 +133,7 @@ struct MemoryCtxType DataType * xInit = xInitDataBlock.get(); DataType * yInit = yInitDataBlock.get(); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < _capacity; i++) { diff --git a/cpp/daal/src/data_management/roc_auc_score.cpp b/cpp/daal/src/data_management/roc_auc_score.cpp index 059c641eb49..93690366883 100644 --- a/cpp/daal/src/data_management/roc_auc_score.cpp +++ b/cpp/daal/src/data_management/roc_auc_score.cpp @@ -79,7 +79,7 @@ services::Status rocAucScoreImpl(const NumericTablePtr & truePrediction, const N } elementsInBlock = j - i + 1; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < elementsInBlock; ++j) { diff --git a/cpp/daal/src/data_management/train_test_split.cpp b/cpp/daal/src/data_management/train_test_split.cpp index df7ad0d6d1e..16f567974ee 100755 --- a/cpp/daal/src/data_management/train_test_split.cpp +++ b/cpp/daal/src/data_management/train_test_split.cpp @@ -170,7 +170,7 @@ services::Status assignColumnValues(const DataType * origDataPtr, const NumericT DataType * dataPtr = dataBlock.get(); DAAL_CHECK_MALLOC(dataPtr); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < nRows; ++i) { @@ -232,7 +232,7 @@ services::Status assignRows(const DataType * origDataPtr, const NumericTablePtr for (size_t i = 0; i < nRows; ++i) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t j = 0; j < nColumns; ++j) { diff --git a/cpp/daal/src/externals/service_memory.h b/cpp/daal/src/externals/service_memory.h index 8cb4bc1b9f0..2858c111366 100644 --- a/cpp/daal/src/externals/service_memory.h +++ b/cpp/daal/src/externals/service_memory.h @@ -60,7 +60,7 @@ void service_memset_seq(T * const ptr, const T value, const size_t num) { /// Use aligned stores const unsigned int num32 = static_cast(num); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS PRAGMA_VECTOR_ALIGNED for (unsigned int i = 0; i < num32; i++) @@ -70,7 +70,7 @@ void service_memset_seq(T * const ptr, const T value, const size_t num) } else { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < num; i++) { @@ -250,7 +250,7 @@ T * service_memset(T * const ptr, const T value, const size_t num) end = num; } - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = block * blockSize; i < end; i++) { @@ -264,7 +264,7 @@ T * service_memset(T * const ptr, const T value, const size_t num) template void service_memset_incrementing(T * const ptr, const T startValue, const size_t num) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < num; i++) { diff --git a/cpp/daal/src/externals/service_spblas.h b/cpp/daal/src/externals/service_spblas.h index 4d959bc6cc9..71a0640c08d 100644 --- a/cpp/daal/src/externals/service_spblas.h +++ b/cpp/daal/src/externals/service_spblas.h @@ -163,7 +163,7 @@ struct SpBlas for (size_t ind1 = 0; ind1 < nnzCol1; ++ind1) { fpType * ptr_ = res.ptr + rowPtr1[ind1] * res.stride; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t ind2 = 0; ind2 < nnzCol2; ++ind2) { diff --git a/cpp/daal/src/externals/service_spblas_ref.h b/cpp/daal/src/externals/service_spblas_ref.h index 2106e689a0e..a5aa495f002 100644 --- a/cpp/daal/src/externals/service_spblas_ref.h +++ b/cpp/daal/src/externals/service_spblas_ref.h @@ -180,14 +180,13 @@ struct RefSpBlas DAAL_INT offset = pntrb[row] - 1; DAAL_INT nnz = pntrb[row + 1] - pntrb[row]; DAAL_INT csrcol = col + 1; -#pragma omp simd + + for (DAAL_INT i = 0; i < nnz; ++i) { - for (DAAL_INT i = 0; i < nnz; ++i) - { - if (csrcol < indx[offset + i]) break; - if (csrcol == indx[offset + i]) return val[offset + i]; - } + if (csrcol < indx[offset + i]) break; + if (csrcol == indx[offset + i]) return val[offset + i]; } + return fpType(0); } static void csrmm(const DAAL_INT * m, const DAAL_INT * n, const DAAL_INT * k, const fpType * alpha, const fpType * a, const DAAL_INT * indx, diff --git a/cpp/daal/src/externals/service_stat.h b/cpp/daal/src/externals/service_stat.h index 4f091f08d2b..34fb242d276 100644 --- a/cpp/daal/src/externals/service_stat.h +++ b/cpp/daal/src/externals/service_stat.h @@ -109,7 +109,7 @@ struct Statistics fpType wsum = 0; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS PRAGMA_ICC_NO16(omp simd reduction(+ : wsum)) for (size_t i = 0; i < nRows; i++) diff --git a/cpp/daal/src/services/service_defines.h b/cpp/daal/src/services/service_defines.h index 54e510afef4..cdb7541783f 100644 --- a/cpp/daal/src/services/service_defines.h +++ b/cpp/daal/src/services/service_defines.h @@ -45,7 +45,7 @@ DAAL_EXPORT bool daal_check_is_intel_cpu(); #define DAAL_CHECK_CPU_ENVIRONMENT (daal_check_is_intel_cpu()) #if defined(__INTEL_COMPILER) - #define PRAGMA_IVDEP _Pragma("ivdep") + #define PRAGMA_FORCE_SIMD _Pragma("ivdep") #define PRAGMA_NOVECTOR _Pragma("novector") #define PRAGMA_VECTOR_ALIGNED _Pragma("vector aligned") #define PRAGMA_VECTOR_UNALIGNED _Pragma("vector unaligned") @@ -55,8 +55,11 @@ DAAL_EXPORT bool daal_check_is_intel_cpu(); #define PRAGMA_ICC_NO16(ARGS) PRAGMA_ICC_TO_STR(ARGS) #define DAAL_TYPENAME typename #elif defined(__GNUC__) - #define PRAGMA_IVDEP - #define PRAGMA_NOVECTOR + #if defined(TARGET_ARM) + #define PRAGMA_FORCE_SIMD _Pragma("omp simd") + #else + #define PRAGMA_FORCE_SIMD + #endif #define PRAGMA_VECTOR_ALIGNED #define PRAGMA_VECTOR_UNALIGNED #define PRAGMA_VECTOR_ALWAYS @@ -65,7 +68,7 @@ DAAL_EXPORT bool daal_check_is_intel_cpu(); #define PRAGMA_ICC_NO16(ARGS) #define DAAL_TYPENAME typename #elif defined(_MSC_VER) - #define PRAGMA_IVDEP + #define PRAGMA_FORCE_SIMD #define PRAGMA_NOVECTOR #define PRAGMA_VECTOR_ALIGNED #define PRAGMA_VECTOR_UNALIGNED @@ -75,7 +78,7 @@ DAAL_EXPORT bool daal_check_is_intel_cpu(); #define PRAGMA_ICC_NO16(ARGS) #define DAAL_TYPENAME typename #else - #define PRAGMA_IVDEP + #define PRAGMA_FORCE_SIMD #define PRAGMA_NOVECTOR #define PRAGMA_VECTOR_ALIGNED #define PRAGMA_VECTOR_UNALIGNED diff --git a/cpp/daal/src/services/service_utils.h b/cpp/daal/src/services/service_utils.h index fb766339a86..c5f8e6091dd 100644 --- a/cpp/daal/src/services/service_utils.h +++ b/cpp/daal/src/services/service_utils.h @@ -272,7 +272,7 @@ void transpose(const algorithmFPType * src, size_t rows, size_t cols, algorithmF { for (size_t j = 0; j < cols; j++) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (size_t i = 0; i < rows; i++) { diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_model_manager.hpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_model_manager.hpp index c0dd4cca565..ef82f821e30 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_model_manager.hpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_model_manager.hpp @@ -116,7 +116,7 @@ class infer_model_manager { Index* const lc = lc_list_host.get_mutable_data() + tree_idx * max_tree_size_; Float* const fv = fv_list_host.get_mutable_data() + tree_idx * max_tree_size_; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (Index i = 0; i < tree_size; i++) { fi[i] = static_cast(dt_node_list[i].featureIndex); @@ -128,7 +128,7 @@ class infer_model_manager { const double* probas = daal_model_ptr->getProbas(tree_idx); Float* pv = probas_list_host.get_mutable_data() + tree_idx * max_tree_size_ * ctx_.class_count; - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (Index i = 0; i < tree_size * ctx_.class_count; i++) { pv[i] = static_cast(probas[i]); diff --git a/cpp/oneapi/dal/backend/common.hpp b/cpp/oneapi/dal/backend/common.hpp index d3bd2ec3ed5..1563b774bec 100644 --- a/cpp/oneapi/dal/backend/common.hpp +++ b/cpp/oneapi/dal/backend/common.hpp @@ -26,10 +26,14 @@ #include "oneapi/dal/detail/common.hpp" #if defined(__INTEL_COMPILER) -#define PRAGMA_IVDEP _Pragma("ivdep") +#define PRAGMA_FORCE_SIMD _Pragma("ivdep") #define PRAGMA_VECTOR_ALWAYS _Pragma("vector always") #else -#define PRAGMA_IVDEP +#if defined(TARGET_ARM) +#define PRAGMA_FORCE_SIMD _Pragma("omp simd") +#else +#define PRAGMA_FORCE_SIMD +#endif #define PRAGMA_VECTOR_ALWAYS #endif diff --git a/cpp/oneapi/dal/table/backend/convert/common.hpp b/cpp/oneapi/dal/table/backend/convert/common.hpp index b84ad724d2a..2965117bab4 100644 --- a/cpp/oneapi/dal/table/backend/convert/common.hpp +++ b/cpp/oneapi/dal/table/backend/convert/common.hpp @@ -39,7 +39,7 @@ inline dal::array extract_by_indices(const Index* indices, auto result = dal::array::empty(count); auto* const output = result.get_mutable_data(); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (std::int64_t i = 0l; i < count; ++i) { const Index idx = indices[i]; output[i] = values[idx]; diff --git a/cpp/oneapi/dal/table/backend/convert/common_convert.cpp b/cpp/oneapi/dal/table/backend/convert/common_convert.cpp index ab20a0f6060..84b55132ca8 100644 --- a/cpp/oneapi/dal/table/backend/convert/common_convert.cpp +++ b/cpp/oneapi/dal/table/backend/convert/common_convert.cpp @@ -45,7 +45,7 @@ dal::array compute_pointers(const dal::array& data, auto pointers = dal::array::empty(count); ptr_t* raw_pointers = pointers.get_mutable_data(); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (std::int64_t row = 0l; row < count; ++row) { raw_pointers[row] = source + raw_offsets[row]; } @@ -65,7 +65,7 @@ dal::array compute_output_offsets(data_type output_type, detail::check_mul_overflow(row_count, row_stride_in_bytes); std::int64_t* const raw_offsets = offsets.get_mutable_data(); - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (std::int64_t row = 0l; row < row_count; ++row) { raw_offsets[row] = row * row_stride_in_bytes; } diff --git a/cpp/oneapi/dal/table/backend/convert/copy_convert_impl_cpu.cpp b/cpp/oneapi/dal/table/backend/convert/copy_convert_impl_cpu.cpp index 533ad9026f4..8b75dddcc42 100644 --- a/cpp/oneapi/dal/table/backend/convert/copy_convert_impl_cpu.cpp +++ b/cpp/oneapi/dal/table/backend/convert/copy_convert_impl_cpu.cpp @@ -69,7 +69,7 @@ struct copy_converter_impl { backend::copy(out, inp, count); } else { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (std::int64_t i = 0l; i < count; ++i) { out[i] = static_cast(inp[i]); @@ -81,7 +81,7 @@ struct copy_converter_impl { std::int64_t out_stride, const inp_t* inp, std::int64_t count) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (std::int64_t i = 0l; i < count; ++i) { const std::int64_t out_offset = i * out_stride; @@ -94,7 +94,7 @@ struct copy_converter_impl { const inp_t* inp, std::int64_t inp_stride, std::int64_t count) { - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD PRAGMA_VECTOR_ALWAYS for (std::int64_t i = 0l; i < count; ++i) { const std::int64_t inp_offset = i * inp_stride; @@ -110,7 +110,7 @@ struct copy_converter_impl { // Let's trust compiler to decide if the loop should be // vectorized or not. It can be suboptimal if strides are // too large - PRAGMA_IVDEP + PRAGMA_FORCE_SIMD for (std::int64_t i = 0l; i < count; ++i) { const std::int64_t out_offset = i * out_stride; const std::int64_t inp_offset = i * inp_stride; diff --git a/dev/make/compiler_definitions/gnu.ref.arm.mk b/dev/make/compiler_definitions/gnu.ref.arm.mk index a78f213183a..fd16d394fda 100644 --- a/dev/make/compiler_definitions/gnu.ref.arm.mk +++ b/dev/make/compiler_definitions/gnu.ref.arm.mk @@ -22,7 +22,7 @@ include dev/make/compiler_definitions/gnu.mk PLATs.gnu = lnxarm -COMPILER.all.gnu = ${CXX} -march=armv8-a+sve -fwrapv -fno-strict-overflow -fno-delete-null-pointer-checks \ +COMPILER.all.gnu = ${CXX} -march=armv8-a+sve -fopenmp-simd -ftree-vectorize -fwrapv -fno-strict-overflow -fno-delete-null-pointer-checks \ -DDAAL_REF -DONEDAL_REF -DDAAL_CPU=sve -Werror -Wreturn-type $(if $(RNG_OPENRNG), -DOPENRNG_BACKEND) link.dynamic.all.gnu = ${CXX} -march=native diff --git a/docs/source/contribution/cpu_features.rst b/docs/source/contribution/cpu_features.rst index f0615e0b2eb..0c82774f054 100644 --- a/docs/source/contribution/cpu_features.rst +++ b/docs/source/contribution/cpu_features.rst @@ -180,7 +180,7 @@ instruction set specific code. The implementation is located in the file `abc_cl Although the implementation of the ``method1`` does not contain any instruction set specific code, it is expected that the developers leverage SIMD related macros available in |short_name|. -For example, ``PRAGMA_IVDEP``, ``PRAGMA_VECTOR_ALWAYS``, ``PRAGMA_VECTOR_ALIGNED`` and other pragmas defined in +For example, ``PRAGMA_FORCE_SIMD``, ``PRAGMA_VECTOR_ALWAYS``, ``PRAGMA_VECTOR_ALIGNED`` and other pragmas defined in `service_defines.h `_. This will guide the compiler to generate more efficient code for the target architecture.