diff --git a/CHANGELOG.md b/CHANGELOG.md index afa4465ce..581dcc412 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). ## [Unreleased] +- Converted an FDIV to an FMUL in the hot loops of CPU layer normalization and RMS normalization. - Fixed compilation with clang 16.0.6 - Added Threads::Threads to `EXT_LIBS` - Updates to pymarian: building for multiple python versions; disabling tcmalloc; hosting gated COMETs on HuggingFace diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp index 9d5c8166d..efdb16efe 100755 --- a/src/tensors/cpu/tensor_operators.cpp +++ b/src/tensors/cpu/tensor_operators.cpp @@ -1110,11 +1110,11 @@ void LayerNormalizationImpl(float* out, sqSum += ex * ex; } - float sigma = std::sqrt(sqSum / cols + eps); + float invSigma = 1.0f / std::sqrt(sqSum / cols + eps); /* reciprocal computed once so the loop below multiplies instead of dividing */ #pragma omp simd for(int i = 0; i < cols; ++i) { - float t = alpha[alphaStride * i] * ((sp[i] - mean) / sigma); + float t = alpha[alphaStride * i] * ((sp[i] - mean) * invSigma); if(hasBeta) t += beta[betaStride * i]; @@ -1295,11 +1295,11 @@ void RMSNormalizationImpl(float* out, sqSum += sp[i] * sp[i]; } - float rms = std::sqrt(sqSum / cols + eps); + float invRms = 1.0f / std::sqrt(sqSum / cols + eps); /* reciprocal computed once so the loop below multiplies instead of dividing */ #pragma omp simd for(int i = 0; i < cols; ++i) { - float t = alpha[alphaStride * i] * (sp[i] / rms); + float t = alpha[alphaStride * i] * (sp[i] * invRms); if(hasBeta) t += beta[betaStride * i];