From 472f5d9020486bf4bbeaa4e4359bd427c5157178 Mon Sep 17 00:00:00 2001
From: Stefan Weil
Date: Wed, 30 Jun 2021 07:35:36 +0200
Subject: [PATCH 1/4] Add TFloat data type for neural network

Up to now Tesseract used double for training and recognition with
"best" models. This commit replaces double with a new data type
TFloat, which is double by default but float if FAST_FLOAT is
defined. Ideally this should allow faster training.

Signed-off-by: Stefan Weil
---
 src/arch/dotproductavx.cpp     |  23 +++
 src/arch/dotproductfma.cpp     |  29 ++++
 src/arch/dotproductsse.cpp     |  10 ++
 src/arch/intsimdmatrixavx2.cpp | 255 ++++++++++++++++++++++++++++++++-
 src/arch/simddetect.cpp        |   4 +
 src/ccstruct/blobbox.h         |   7 +-
 src/ccstruct/matrix.h          |   2 +-
 src/ccutil/tesstypes.h         |   4 +
 src/lstm/weightmatrix.cpp      | 106 ++++++++++----
 unittest/intsimdmatrix_test.cc |  16 ++-
 10 files changed, 419 insertions(+), 37 deletions(-)

diff --git a/src/arch/dotproductavx.cpp b/src/arch/dotproductavx.cpp
index 3f243906db..98e829959a 100644
--- a/src/arch/dotproductavx.cpp
+++ b/src/arch/dotproductavx.cpp
@@ -29,6 +29,28 @@ namespace tesseract {
 
 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel AVX intrinsics to access the SIMD instruction set.
+#if defined(FAST_FLOAT)
+float DotProductAVX(const float *u, const float *v, int n) {
+  const unsigned quot = n / 8;
+  const unsigned rem = n % 8;
+  __m256 t0 = _mm256_setzero_ps();
+  for (unsigned k = 0; k < quot; k++) {
+    __m256 f0 = _mm256_loadu_ps(u);
+    __m256 f1 = _mm256_loadu_ps(v);
+    f0 = _mm256_mul_ps(f0, f1);
+    t0 = _mm256_add_ps(t0, f0);
+    u += 8;
+    v += 8;
+  }
+  alignas(32) float tmp[8];
+  _mm256_store_ps(tmp, t0);
+  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
+  for (unsigned k = 0; k < rem; k++) {
+    result += *u++ * *v++;
+  }
+  return result;
+}
+#else
 double DotProductAVX(const double *u, const double *v, int n) {
   const unsigned quot = n / 8;
   const unsigned rem = n % 8;
@@ -57,6 +79,7 @@ double DotProductAVX(const double *u, const double *v, int n) {
   }
   return result;
 }
+#endif
 
 } // namespace tesseract.

diff --git a/src/arch/dotproductfma.cpp b/src/arch/dotproductfma.cpp
index ede46298e8..1e9f3a7975 100644
--- a/src/arch/dotproductfma.cpp
+++ b/src/arch/dotproductfma.cpp
@@ -29,6 +29,34 @@ namespace tesseract {
 
 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel FMA intrinsics to access the SIMD instruction set.
+#if defined(FAST_FLOAT)
+float DotProductFMA(const float *u, const float *v, int n) {
+  const unsigned quot = n / 8;
+  const unsigned rem = n % 8;
+  __m256 t0 = _mm256_setzero_ps();
+  __m256 t1 = _mm256_setzero_ps();
+  for (unsigned k = 0; k < quot; k++) {
+    __m256 f0 = _mm256_loadu_ps(u);
+    __m256 f1 = _mm256_loadu_ps(v);
+    t0 = _mm256_fmadd_ps(f0, f1, t0);
+    u += 4;
+    v += 4;
+    __m256 f2 = _mm256_loadu_ps(u);
+    __m256 f3 = _mm256_loadu_ps(v);
+    t1 = _mm256_fmadd_ps(f2, f3, t1);
+    u += 4;
+    v += 4;
+  }
+  t0 = _mm256_hadd_ps(t0, t1);
+  alignas(32) float tmp[4];
+  _mm256_store_ps(tmp, t0);
+  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
+  for (unsigned k = 0; k < rem; k++) {
+    result += *u++ * *v++;
+  }
+  return result;
+}
+#else
 double DotProductFMA(const double *u, const double *v, int n) {
   const unsigned quot = n / 8;
   const unsigned rem = n % 8;
@@ -55,6 +83,7 @@ double DotProductFMA(const double *u, const double *v, int n) {
   }
   return result;
 }
+#endif
 
 } // namespace tesseract.
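Note: every SIMD variant in this series has to agree, up to floating-point
rounding, with a plain scalar loop over the same data. As a point of
reference, a minimal scalar sketch (not part of the patch; the name
DotProductScalar is hypothetical) would be:

    // Scalar reference for the DotProductAVX/FMA/SSE kernels above.
    // With FAST_FLOAT (T = float) the SIMD kernels can differ from this
    // loop in the last bits, because they add partial sums in a different
    // order; the unit test in PATCH 1/4 therefore uses a different
    // expected total for the float build.
    template <class T>
    static T DotProductScalar(const T *u, const T *v, int n) {
      T total = 0;
      for (int k = 0; k < n; ++k) {
        total += u[k] * v[k];
      }
      return total;
    }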
diff --git a/src/arch/dotproductsse.cpp b/src/arch/dotproductsse.cpp
index 1dbd18fb8e..ec94f50341 100644
--- a/src/arch/dotproductsse.cpp
+++ b/src/arch/dotproductsse.cpp
@@ -30,6 +30,15 @@ namespace tesseract {
 
 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel SSE intrinsics to access the SIMD instruction set.
+#if defined(FAST_FLOAT)
+TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n) {
+  TFloat total = 0.0;
+  for (int k = 0; k < n; ++k) {
+    total += u[k] * v[k];
+  }
+  return total;
+}
+#else
 double DotProductSSE(const double *u, const double *v, int n) {
   int max_offset = n - 2;
   int offset = 0;
@@ -78,6 +87,7 @@ double DotProductSSE(const double *u, const double *v, int n) {
   }
   return result;
 }
+#endif
 
 } // namespace tesseract.

diff --git a/src/arch/intsimdmatrixavx2.cpp b/src/arch/intsimdmatrixavx2.cpp
index ce5d8ea9fe..1754d6e6cf 100644
--- a/src/arch/intsimdmatrixavx2.cpp
+++ b/src/arch/intsimdmatrixavx2.cpp
@@ -15,14 +15,13 @@
 // limitations under the License.
 ///////////////////////////////////////////////////////////////////////
 
+#include "intsimdmatrix.h"
+
 #if !defined(__AVX2__)
 #  if defined(__i686__) || defined(__x86_64__)
 #    error Implementation only for AVX2 capable architectures
 #  endif
 #else
-
-#  include "intsimdmatrix.h"
-
 #  include <immintrin.h>
 #  include <algorithm>
 #  include <cstdint>
@@ -86,6 +85,252 @@ static inline __m128i load64_to_128(const int8_t *wi_) {
   return _mm_set_epi64x(0, wi[0]);
 }
 
+#if defined(FAST_FLOAT)
+static inline void ExtractResults8(__m256i result, const int8_t *wi, const float *scales,
+                                   float *v) {
+  __m128i w128 = load64_to_128(wi); // 8x8bit vals in bottom of 128bit reg
+  __m256i w256 = _mm256_cvtepi8_epi32(w128); // 8x32bit vals in 256bit reg
+  __m256i bias_scale = _mm256_set_epi32(127, 127, 127, 127, 127, 127, 127, 127);
+  __m256d scale01234567 = _mm256_loadu_ps(scales);
+  //~ __m256d scale4567 = _mm256_loadu_ps(scales + 8);
+  w256 = _mm256_mullo_epi32(w256, bias_scale); // 8x32
+  result = _mm256_add_epi32(result, w256);     // result += bias * 127
+  __m256 res01234567 = _mm256_cvtepi32_ps(_mm256_castsi256_si128(result));
+  result = _mm256_permute4x64_epi64(result, 2 + (3 << 2));
+  __m256d res4567 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result));
+  res01234567 = _mm256_mul_pd(res01234567, scale01234567);
+  //~ res4567 = _mm256_mul_pd(res4567, scale4567);
+  _mm256_storeu_ps(v, res01234567);
+  //~ _mm256_storeu_pd(v + 4, res4567);
+}
+
+static inline void ExtractResults16(__m256i result0, __m256i result1, const int8_t *&wi,
+                                    const float *&scales, float *&v) {
+  __m128i w8 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(wi));
+  // 8x8bit vals in bottom of 128bit reg
+  const __m256i bias_scale = _mm256_set_epi32(127, 127, 127, 127, 127, 127, 127, 127);
+  __m256i w256 = _mm256_cvtepi8_epi32(w8); // 8x32bit vals in 256bit reg
+  __m256d scale0123 = _mm256_loadu_ps(scales);
+  __m256d scale4567 = _mm256_loadu_ps(scales + 8);
+  w256 = _mm256_mullo_epi32(w256, bias_scale); // 8x32
+  result0 = _mm256_add_epi32(result0, w256);   // result += bias * 127
+  __m256d res0123 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result0));
+  result0 = _mm256_permute4x64_epi64(result0, 2 + (3 << 2));
+  __m256d res4567 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result0));
+  res0123 = _mm256_mul_pd(res0123, scale0123);
+  res4567 = _mm256_mul_pd(res4567, scale4567);
+  _mm256_storeu_ps(v, res0123);
+  _mm256_storeu_ps(v + 8, res4567);
+  w8 = _mm_shuffle_epi32(w8, 2 + (3 << 2));
+  w256 = _mm256_cvtepi8_epi32(w8); // 8x32bit vals in 256bit reg
+  scale0123 = _mm256_loadu_ps(scales + 16);
+  scale4567 = _mm256_loadu_ps(scales + 24);
+  w256 = _mm256_mullo_epi32(w256, bias_scale); // 8x32
+  result1 = _mm256_add_epi32(result1, w256);   // result += bias * 127
+  res0123 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result1));
+  result1 = _mm256_permute4x64_epi64(result1, 2 + (3 << 2));
+  res4567 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result1));
+  res0123 = _mm256_mul_pd(res0123, scale0123);
+  res4567 = _mm256_mul_pd(res4567, scale4567);
+  _mm256_storeu_ps(v + 16, res0123);
+  _mm256_storeu_ps(v + 24, res4567);
+  wi += 16;
+  scales += 16;
+  v += 16;
+}
+
+// Computes part of matrix.vector v = Wu. Computes N=64 results.
+// The weights *must* be arranged so that consecutive reads from wi
+// provide (num_in/kNumInputsPerGroup groups of (N output dim groups of
+// (kNumInputsPerGroup inputs))). After that there must be N consecutive
+// bias weights, before continuing with any more weights.
+// u must be padded out with zeros to
+// kNumInputsPerGroup*ceil(num_in/kNumInputsPerGroup) elements.
+static void PartialMatrixDotVector64(const int8_t *wi, const float *scales, const int8_t *u,
+                                     int num_in, float *v) {
+  // Register containing 16-bit ones for horizontal add with 16->32 bit
+  // conversion.
+  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
+  // Initialize all the results to 0.
+  __m256i result0 = _mm256_setzero_si256();
+  __m256i result1 = _mm256_setzero_si256();
+  __m256i result2 = _mm256_setzero_si256();
+  __m256i result3 = _mm256_setzero_si256();
+  __m256i result4 = _mm256_setzero_si256();
+  __m256i result5 = _mm256_setzero_si256();
+  __m256i result6 = _mm256_setzero_si256();
+  __m256i result7 = _mm256_setzero_si256();
+  // Iterate over the input (u), one registerful at a time.
+  for (int j = 0; j < num_in;) {
+    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));
+    // Inputs are processed in groups of kNumInputsPerGroup, replicated
+    // kNumInputGroups times.
+    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
+      // Replicate the low 32 bits (4 inputs) 8 times.
+      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
+      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
+      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
+      __m256i weights, reps;
+      // Mul-add, with horizontal add of the 4 inputs to each of the results.
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result1);
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result2);
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result3);
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result4);
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result5);
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result6);
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result7);
+    }
+  }
+  ExtractResults16(result0, result1, wi, scales, v);
+  ExtractResults16(result2, result3, wi, scales, v);
+  ExtractResults16(result4, result5, wi, scales, v);
+  ExtractResults16(result6, result7, wi, scales, v);
+}
+
+// Computes part of matrix.vector v = Wu. Computes N=32 results.
+// For details see PartialMatrixDotVector64 with N=32.
+static void PartialMatrixDotVector32(const int8_t *wi, const float *scales, const int8_t *u,
+                                     int num_in, float *v) {
+  // Register containing 16-bit ones for horizontal add with 16->32 bit
+  // conversion.
+  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
+  // Initialize all the results to 0.
+  __m256i result0 = _mm256_setzero_si256();
+  __m256i result1 = _mm256_setzero_si256();
+  __m256i result2 = _mm256_setzero_si256();
+  __m256i result3 = _mm256_setzero_si256();
+  // Iterate over the input (u), one registerful at a time.
+  for (int j = 0; j < num_in;) {
+    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));
+    // Inputs are processed in groups of kNumInputsPerGroup, replicated
+    // kNumInputGroups times.
+    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
+      // Replicate the low 32 bits (4 inputs) 8 times.
+      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
+      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
+      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
+      __m256i weights, reps;
+      // Mul-add, with horizontal add of the 4 inputs to each of the results.
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result1);
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result2);
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result3);
+    }
+  }
+  ExtractResults16(result0, result1, wi, scales, v);
+  ExtractResults16(result2, result3, wi, scales, v);
+}
+
+// Computes part of matrix.vector v = Wu. Computes N=16 results.
+// For details see PartialMatrixDotVector64 with N=16.
+static void PartialMatrixDotVector16(const int8_t *wi, const float *scales, const int8_t *u,
+                                     int num_in, float *v) {
+  // Register containing 16-bit ones for horizontal add with 16->32 bit
+  // conversion.
+  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
+  // Initialize all the results to 0.
+  __m256i result0 = _mm256_setzero_si256();
+  __m256i result1 = _mm256_setzero_si256();
+  // Iterate over the input (u), one registerful at a time.
+  for (int j = 0; j < num_in;) {
+    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));
+    // Inputs are processed in groups of kNumInputsPerGroup, replicated
+    // kNumInputGroups times.
+    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
+      // Replicate the low 32 bits (4 inputs) 8 times.
+      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
+      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
+      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
+      __m256i weights, reps;
+      // Mul-add, with horizontal add of the 4 inputs to each of the results.
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result1);
+    }
+  }
+  ExtractResults16(result0, result1, wi, scales, v);
+}
+
+// Computes part of matrix.vector v = Wu. Computes N=8 results.
+// For details see PartialMatrixDotVector64 with N=8.
+static inline void PartialMatrixDotVector8(const int8_t *wi, const float *scales, const int8_t *u,
+                                           int num_in, float *v) {
+  // Register containing 16-bit ones for horizontal add with 16->32 bit
+  // conversion.
+  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
+  // Initialize all the results to 0.
+  __m256i result0 = _mm256_setzero_si256();
+  // Iterate over the input (u), one registerful at a time.
+  for (int j = 0; j < num_in;) {
+    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));
+    // Inputs are processed in groups of kNumInputsPerGroup, replicated
+    // kNumInputGroups times.
+    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
+      // Replicate the low 32 bits (4 inputs) 8 times.
+      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
+      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
+      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
+      __m256i weights, reps;
+      // Mul-add, with horizontal add of the 4 inputs to each of the results.
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
+    }
+  }
+  ExtractResults8(result0, wi, scales, v);
+}
+
+static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const float *scales,
+                            const int8_t *u, float *v) {
+  const int num_out = dim1;
+  const int num_in = dim2 - 1;
+  // Each call to a partial_func_ produces group_size outputs, except the
+  // last one, which can produce less.
+  const int rounded_num_in = IntSimdMatrix::Roundup(num_in, kNumInputsPerGroup);
+  const int rounded_num_out = IntSimdMatrix::Roundup(num_out, kNumOutputsPerRegister);
+  int group_size = kNumOutputsPerRegister * kMaxOutputRegisters;
+  int output = 0;
+
+  int w_step = (rounded_num_in + 1) * group_size;
+
+  // Run with this group size, until it would produce too much output, then
+  // switch to a smaller size.
+  for (; output + group_size <= rounded_num_out; output += group_size) {
+    PartialMatrixDotVector64(wi, scales, u, rounded_num_in, v);
+    wi += w_step;
+    scales += group_size;
+    v += group_size;
+  }
+  group_size /= 2;
+  w_step /= 2;
+
+  if (output + group_size <= rounded_num_out) {
+    PartialMatrixDotVector32(wi, scales, u, rounded_num_in, v);
+    wi += w_step;
+    scales += group_size;
+    v += group_size;
+    output += group_size;
+  }
+  group_size /= 2;
+  w_step /= 2;
+
+  if (output + group_size <= rounded_num_out) {
+    PartialMatrixDotVector16(wi, scales, u, rounded_num_in, v);
+    wi += w_step;
+    scales += group_size;
+    v += group_size;
+    output += group_size;
+  }
+  group_size /= 2;
+  w_step /= 2;
+
+  if (output + group_size <= rounded_num_out) {
+    PartialMatrixDotVector8(wi, scales, u, rounded_num_in, v);
+  }
+}
+#else
 static inline void ExtractResults8(__m256i result, const int8_t *wi, const double *scales,
                                    double *v) {
   __m128i w128 = load64_to_128(wi); // 8x8bit vals in bottom of 128bit reg
@@ -330,6 +575,7 @@ static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const double *
     PartialMatrixDotVector8(wi, scales, u, rounded_num_in, v);
   }
 }
+#endif
 
 const IntSimdMatrix IntSimdMatrix::intSimdMatrixAVX2 = {
     // Function.
@@ -341,7 +587,8 @@ const IntSimdMatrix IntSimdMatrix::intSimdMatrixAVX2 = {
     // Number of 8 bit inputs in the inputs register.
     kNumInputsPerRegister,
     // Number of inputs in each weight group.
-    kNumInputsPerGroup};
+    kNumInputsPerGroup
+};
 
 } // namespace tesseract.
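Note: the 64/32/16/8 cascade in matrixDotVector above is easiest to check
with a worked example. The standalone sketch below (hard-coding the
constants this file uses: 8 outputs per register, at most 8 output
registers per call) prints the block sizes chosen for num_out = 100:

    #include <cstdio>

    int main() {
      const int kNumOutputsPerRegister = 8;
      const int kMaxOutputRegisters = 8;
      const int num_out = 100;
      // Round up to a whole number of output registers: 104.
      const int rounded_num_out =
          (num_out + kNumOutputsPerRegister - 1) / kNumOutputsPerRegister *
          kNumOutputsPerRegister;
      int group_size = kNumOutputsPerRegister * kMaxOutputRegisters; // 64
      int output = 0;
      // The largest block size runs repeatedly, each smaller one at most
      // once, mirroring the control flow of matrixDotVector.
      for (; output + group_size <= rounded_num_out; output += group_size) {
        printf("block of %d outputs\n", group_size);
      }
      for (group_size /= 2; group_size >= kNumOutputsPerRegister;
           group_size /= 2) {
        if (output + group_size <= rounded_num_out) {
          printf("block of %d outputs\n", group_size);
          output += group_size;
        }
      }
      return 0; // prints 64, 32, 8: the last 4 of the 104 outputs are padding
    }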
diff --git a/src/arch/simddetect.cpp b/src/arch/simddetect.cpp
index 514bee3cbc..e729640297 100644
--- a/src/arch/simddetect.cpp
+++ b/src/arch/simddetect.cpp
@@ -96,7 +96,11 @@ bool SIMDDetect::sse_available_;
 static TFloat DotProductAccelerate(const TFloat* u, const TFloat* v, int n) {
   TFloat total = 0;
   const int stride = 1;
+#if defined(FAST_FLOAT)
+  vDSP_dotpr(u, stride, v, stride, &total, n);
+#else
   vDSP_dotprD(u, stride, v, stride, &total, n);
+#endif
   return total;
 }
 #endif

diff --git a/src/ccstruct/blobbox.h b/src/ccstruct/blobbox.h
index d003062fe4..5fafb3da48 100644
--- a/src/ccstruct/blobbox.h
+++ b/src/ccstruct/blobbox.h
@@ -740,8 +740,11 @@ class TESS_API TO_BLOCK : public ELIST_LINK {
     TO_ROW_IT row_it = &row_list;
     for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
       auto row = row_it.data();
-      tprintf("Row range (%g,%g), para_c=%g, blobcount=%" PRId32 "\n", row->min_y(), row->max_y(),
-              row->parallel_c(), row->blob_list()->length());
+      tprintf("Row range (%g,%g), para_c=%g, blobcount=%" PRId32 "\n",
+              static_cast<double>(row->min_y()),
+              static_cast<double>(row->max_y()),
+              static_cast<double>(row->parallel_c()),
+              row->blob_list()->length());
     }
   }

diff --git a/src/ccstruct/matrix.h b/src/ccstruct/matrix.h
index 0741967127..a97912ad78 100644
--- a/src/ccstruct/matrix.h
+++ b/src/ccstruct/matrix.h
@@ -417,7 +417,7 @@ class GENERIC_2D_ARRAY {
   // Accumulates the element-wise sums of squares of src into *this.
   void SumSquares(const GENERIC_2D_ARRAY<T> &src, const T &decay_factor) {
-    T update_factor = 1.0 - decay_factor;
+    T update_factor = 1 - decay_factor;
     int size = num_elements();
     for (int i = 0; i < size; ++i) {
       array_[i] = array_[i] * decay_factor + update_factor * src.array_[i] * src.array_[i];

diff --git a/src/ccutil/tesstypes.h b/src/ccutil/tesstypes.h
index 9e0c71cb0f..390fa32667 100644
--- a/src/ccutil/tesstypes.h
+++ b/src/ccutil/tesstypes.h
@@ -25,7 +25,11 @@ namespace tesseract {
 using TDimension = int16_t;
 
 // Floating point data type used for LSTM calculations.
+#if defined(FAST_FLOAT)
+using TFloat = float;
+#else
 using TFloat = double;
+#endif
 
 }

diff --git a/src/lstm/weightmatrix.cpp b/src/lstm/weightmatrix.cpp
index f3eb503147..57a07dcb91 100644
--- a/src/lstm/weightmatrix.cpp
+++ b/src/lstm/weightmatrix.cpp
@@ -21,7 +21,7 @@
 #include "intsimdmatrix.h"
 #include "simddetect.h" // for DotProduct
 #include "statistc.h"
-#include "tprintf.h"
+#include "tprintf.h" // for TFloat
 
 namespace tesseract {
 
@@ -36,8 +36,22 @@ const int kAdamCorrectionIterations = 200000;
 // Epsilon in Adam to prevent division by zero.
 const TFloat kAdamEpsilon = 1e-8;
 
-// Utility function converts an array of float to the corresponding array
-// of double.
+// Utility functions convert between double and float arrays.
+#ifdef FAST_FLOAT
+static void DoubleToFloat(const GENERIC_2D_ARRAY<double> &src, GENERIC_2D_ARRAY<float> &dst) {
+  const auto dim1 = src.dim1();
+  const auto dim2 = src.dim2();
+  dst.ResizeNoInit(dim1, dim2);
+  for (int i = 0; i < dim1; ++i) {
+    const auto *src_i = src[i];
+    auto *dst_i = dst[i];
+    for (int j = 0; j < dim2; ++j) {
+      dst_i[j] = static_cast<float>(src_i[j]);
+    }
+  }
+}
+#endif
+
 static void FloatToDouble(const GENERIC_2D_ARRAY<float> &src, GENERIC_2D_ARRAY<double> &dst) {
   const auto dim1 = src.dim1();
   const auto dim2 = src.dim2();
@@ -51,6 +65,29 @@ static void FloatToDouble(const GENERIC_2D_ARRAY<float> &src, GENERIC_2D_ARRAY<
   }
 }
 
+static bool DeSerialize(TFile *fp, GENERIC_2D_ARRAY<TFloat> &tfloat_array) {
+#ifdef FAST_FLOAT
+  GENERIC_2D_ARRAY<double> double_array;
+  if (!double_array.DeSerialize(fp)) {
+    return false;
+  }
+  DoubleToFloat(double_array, tfloat_array);
+  return true;
+#else
+  return tfloat_array.DeSerialize(fp);
+#endif
+}
+
+static bool Serialize(TFile *fp, const GENERIC_2D_ARRAY<TFloat> &tfloat_array) {
+#ifdef FAST_FLOAT
+  GENERIC_2D_ARRAY<double> double_array;
+  FloatToDouble(tfloat_array, double_array);
+  return double_array.Serialize(fp);
+#else
+  return tfloat_array.Serialize(fp);
+#endif
+}
+
 // Computes matrix.vector v = Wu.
 // u is of size W.dim2() - add_bias_fwd and the output v is of size
 // W.dim1() - skip_bias_back.
@@ -209,29 +246,30 @@ bool WeightMatrix::Serialize(bool training, TFile *fp) const {
     if (!wi_.Serialize(fp)) {
       return false;
     }
-    // The scales stored in memory have an extra factor applied to them
-    // to allow faster operation. We have to remove that factor here
-    // before writing to disc.
-    auto scales = scales_;
-    for (auto &scale : scales) {
-      scale *= INT8_MAX;
-    }
-    uint32_t size = scales.size();
+    uint32_t size = scales_.size();
     if (!fp->Serialize(&size)) {
       return false;
     }
-    if (!fp->Serialize(&scales[0], size)) {
-      return false;
+    for (auto scale : scales_) {
+      // The scales stored in memory have an extra factor applied to them
+      // to allow faster operation. We have to remove that factor here
+      // before writing to disc.
+      double value = scale * INT8_MAX;
+      if (!fp->Serialize(&value)) {
+        return false;
+      }
     }
   } else {
-    if (!wf_.Serialize(fp)) {
+    if (!tesseract::Serialize(fp, wf_)) {
       return false;
     }
-    if (training && !updates_.Serialize(fp)) {
-      return false;
-    }
-    if (training && use_adam_ && !dw_sq_sum_.Serialize(fp)) {
-      return false;
+    if (training) {
+      if (!tesseract::Serialize(fp, updates_)) {
+        return false;
+      }
+      if (use_adam_ && !tesseract::Serialize(fp, dw_sq_sum_)) {
+        return false;
+      }
     }
   }
   return true;
@@ -257,6 +295,16 @@ bool WeightMatrix::DeSerialize(bool training, TFile *fp) {
     if (!fp->DeSerialize(&size)) {
       return false;
     }
+#ifdef FAST_FLOAT
+    scales_.reserve(size);
+    for (auto n = size; n > 0; n--) {
+      double val;
+      if (!fp->DeSerialize(&val)) {
+        return false;
+      }
+      scales_.push_back(val / INT8_MAX);
+    }
+#else
     scales_.resize(size);
     if (!fp->DeSerialize(&scales_[0], size)) {
       return false;
@@ -264,22 +312,25 @@ bool WeightMatrix::DeSerialize(bool training, TFile *fp) {
     }
     for (auto &scale : scales_) {
      scale /= INT8_MAX;
     }
+#endif
     if (IntSimdMatrix::intSimdMatrix) {
       int32_t rounded_num_out;
       IntSimdMatrix::intSimdMatrix->Init(wi_, shaped_w_, rounded_num_out);
       scales_.resize(rounded_num_out);
     }
   } else {
-    if (!wf_.DeSerialize(fp)) {
+    if (!tesseract::DeSerialize(fp, wf_)) {
       return false;
     }
     if (training) {
       InitBackward();
-      if (!updates_.DeSerialize(fp)) {
+      if (!tesseract::DeSerialize(fp, updates_)) {
         return false;
       }
-      if (use_adam_ && !dw_sq_sum_.DeSerialize(fp)) {
-        return false;
+      if (use_adam_) {
+        if (!tesseract::DeSerialize(fp, dw_sq_sum_)) {
+          return false;
+        }
       }
     }
   }
@@ -289,7 +340,11 @@ bool WeightMatrix::DeSerialize(bool training, TFile *fp) {
 // As DeSerialize, but reads an old (float) format WeightMatrix for
 // backward compatibility.
 bool WeightMatrix::DeSerializeOld(bool training, TFile *fp) {
-  GENERIC_2D_ARRAY<float> float_array;
+#ifdef FAST_FLOAT
+  // Not implemented.
+  ASSERT_HOST(!"not implemented");
+  return false;
+#else
   if (int_mode_) {
     if (!wi_.DeSerialize(fp)) {
       return false;
@@ -303,6 +358,7 @@ bool WeightMatrix::DeSerializeOld(bool training, TFile *fp) {
       scales_.push_back(old_scale);
     }
   } else {
+    GENERIC_2D_ARRAY<float> float_array;
     if (!float_array.DeSerialize(fp)) {
       return false;
     }
@@ -310,6 +366,7 @@ bool WeightMatrix::DeSerializeOld(bool training, TFile *fp) {
   }
   if (training) {
     InitBackward();
+    GENERIC_2D_ARRAY<float> float_array;
     if (!float_array.DeSerialize(fp)) {
       return false;
     }
@@ -320,6 +377,7 @@ bool WeightMatrix::DeSerializeOld(bool training, TFile *fp) {
     }
   }
   return true;
+#endif
 }
 
 // Computes matrix.vector v = Wu.

diff --git a/unittest/intsimdmatrix_test.cc b/unittest/intsimdmatrix_test.cc
index a1411a0a10..95688eed5a 100644
--- a/unittest/intsimdmatrix_test.cc
+++ b/unittest/intsimdmatrix_test.cc
@@ -52,8 +52,8 @@ class IntSimdMatrixTest : public ::testing::Test {
     return v;
   }
   // Makes a random scales vector of the given size.
-  std::vector<double> RandomScales(int size) {
-    std::vector<double> v(size);
+  std::vector<TFloat> RandomScales(int size) {
+    std::vector<TFloat> v(size);
     for (int i = 0; i < size; ++i) {
       v[i] = (1.0 + random_.SignedRand(1.0)) / INT8_MAX;
     }
     return v;
   }
   // Tests a range of sizes and compares the results against the generic version.
   void ExpectEqualResults(const IntSimdMatrix &matrix) {
-    double total = 0.0;
+    TFloat total = 0.0;
     for (int num_out = 1; num_out < 130; ++num_out) {
       for (int num_in = 1; num_in < 130; ++num_in) {
         GENERIC_2D_ARRAY<int8_t> w = InitRandom(num_out, num_in + 1);
         std::vector<int8_t> u = RandomVector(num_in, matrix);
-        std::vector<double> scales = RandomScales(num_out);
+        std::vector<TFloat> scales = RandomScales(num_out);
         int ro = num_out;
         if (IntSimdMatrix::intSimdMatrix) {
           ro = IntSimdMatrix::intSimdMatrix->RoundOutputs(ro);
         }
-        std::vector<double> base_result(num_out);
+        std::vector<TFloat> base_result(num_out);
         IntSimdMatrix::MatrixDotVector(w, scales, u.data(), base_result.data());
-        std::vector<double> test_result(ro);
+        std::vector<TFloat> test_result(ro);
         std::vector<int8_t> shaped_wi;
         int32_t rounded_num_out;
         matrix.Init(w, shaped_wi, rounded_num_out);
@@ -91,7 +91,11 @@ class IntSimdMatrixTest : public ::testing::Test {
       }
     }
     // Compare sum of all results with expected value.
+#ifdef FAST_FLOAT
+    EXPECT_FLOAT_EQ(total, 337852.16f);
+#else
     EXPECT_FLOAT_EQ(total, 337849.39354684710);
+#endif
   }
 
   TRand random_;

From 24a29b79e5811b4d8a7ae9d81bc2ed7c83c8222a Mon Sep 17 00:00:00 2001
From: Ger Hobbelt
Date: Tue, 13 Jul 2021 08:59:26 +0200
Subject: [PATCH 2/4] bugfix of FMA port to FAST_FLOAT

Eight floats fit in a single 256-bit vector (8x32 bits), in contrast
to four doubles (4x64 bits).

[sw] Format commit message and use float instead of TFloat
---
 src/arch/dotproductfma.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/arch/dotproductfma.cpp b/src/arch/dotproductfma.cpp
index 1e9f3a7975..01a788948a 100644
--- a/src/arch/dotproductfma.cpp
+++ b/src/arch/dotproductfma.cpp
@@ -31,26 +31,26 @@ namespace tesseract {
 // Uses Intel FMA intrinsics to access the SIMD instruction set.
 #if defined(FAST_FLOAT)
 float DotProductFMA(const float *u, const float *v, int n) {
-  const unsigned quot = n / 8;
-  const unsigned rem = n % 8;
+  const unsigned quot = n / 16;
+  const unsigned rem = n % 16;
   __m256 t0 = _mm256_setzero_ps();
   __m256 t1 = _mm256_setzero_ps();
   for (unsigned k = 0; k < quot; k++) {
     __m256 f0 = _mm256_loadu_ps(u);
     __m256 f1 = _mm256_loadu_ps(v);
     t0 = _mm256_fmadd_ps(f0, f1, t0);
-    u += 4;
-    v += 4;
+    u += 8;
+    v += 8;
     __m256 f2 = _mm256_loadu_ps(u);
     __m256 f3 = _mm256_loadu_ps(v);
     t1 = _mm256_fmadd_ps(f2, f3, t1);
-    u += 4;
-    v += 4;
+    u += 8;
+    v += 8;
   }
   t0 = _mm256_hadd_ps(t0, t1);
-  alignas(32) float tmp[4];
+  alignas(32) float tmp[8];
   _mm256_store_ps(tmp, t0);
-  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
+  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
   for (unsigned k = 0; k < rem; k++) {
     result += *u++ * *v++;
   }

From 79e8b4f3446973fb97d1712bff705f8e2fa52174 Mon Sep 17 00:00:00 2001
From: Ger Hobbelt
Date: Tue, 13 Jul 2021 09:56:11 +0200
Subject: [PATCH 3/4] bugfixing the AVX2 Extract8+16 codes

There are lines like `__m256d scale01234567 = _mm256_loadu_ps(scales)`,
i.e. loading float vectors into double vector types.
[sw] Formatted commit message
---
 src/arch/intsimdmatrixavx2.cpp | 49 ++++++++++++++--------------------
 1 file changed, 20 insertions(+), 29 deletions(-)

diff --git a/src/arch/intsimdmatrixavx2.cpp b/src/arch/intsimdmatrixavx2.cpp
index 1754d6e6cf..b333cf51a9 100644
--- a/src/arch/intsimdmatrixavx2.cpp
+++ b/src/arch/intsimdmatrixavx2.cpp
@@ -86,54 +86,45 @@ static inline __m128i load64_to_128(const int8_t *wi_) {
 }
 
 #if defined(FAST_FLOAT)
-static inline void ExtractResults8(__m256i result, const int8_t *wi, const float *scales,
-                                   float *v) {
-  __m128i w128 = load64_to_128(wi); // 8x8bit vals in bottom of 128bit reg
+
+static inline void ExtractResults8(__m256i result, const int8_t *wi,
+                                   const float *scales, float *v) {
+  __m128i w128 = load64_to_128(wi);          // 8x8bit vals in bottom of 128bit reg
   __m256i w256 = _mm256_cvtepi8_epi32(w128); // 8x32bit vals in 256bit reg
   __m256i bias_scale = _mm256_set_epi32(127, 127, 127, 127, 127, 127, 127, 127);
-  __m256d scale01234567 = _mm256_loadu_ps(scales);
-  //~ __m256d scale4567 = _mm256_loadu_ps(scales + 8);
+  __m256 scale01234567 = _mm256_loadu_ps(scales);
   w256 = _mm256_mullo_epi32(w256, bias_scale); // 8x32
   result = _mm256_add_epi32(result, w256);     // result += bias * 127
-  __m256 res01234567 = _mm256_cvtepi32_ps(_mm256_castsi256_si128(result));
+  __m256 res01234567 = _mm256_cvtepi32_ps(result);
   result = _mm256_permute4x64_epi64(result, 2 + (3 << 2));
-  __m256d res4567 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result));
-  res01234567 = _mm256_mul_pd(res01234567, scale01234567);
-  //~ res4567 = _mm256_mul_pd(res4567, scale4567);
+  res01234567 = _mm256_mul_ps(res01234567, scale01234567);
   _mm256_storeu_ps(v, res01234567);
-  //~ _mm256_storeu_pd(v + 4, res4567);
 }
 
-static inline void ExtractResults16(__m256i result0, __m256i result1, const int8_t *&wi,
-                                    const float *&scales, float *&v) {
+static inline void ExtractResults16(__m256i result0, __m256i result1,
+                                    const int8_t *&wi, const float *&scales,
+                                    float *&v) {
   __m128i w8 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(wi));
   // 8x8bit vals in bottom of 128bit reg
-  const __m256i bias_scale = _mm256_set_epi32(127, 127, 127, 127, 127, 127, 127, 127);
+  const __m256i bias_scale =
+      _mm256_set_epi32(127, 127, 127, 127, 127, 127, 127, 127);
   __m256i w256 = _mm256_cvtepi8_epi32(w8); // 8x32bit vals in 256bit reg
-  __m256d scale0123 = _mm256_loadu_ps(scales);
-  __m256d scale4567 = _mm256_loadu_ps(scales + 8);
+  __m256 scale01234567 = _mm256_loadu_ps(scales);
   w256 = _mm256_mullo_epi32(w256, bias_scale); // 8x32
   result0 = _mm256_add_epi32(result0, w256);   // result += bias * 127
-  __m256d res0123 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result0));
+  __m256 res01234567 = _mm256_cvtepi32_ps(result0);
   result0 = _mm256_permute4x64_epi64(result0, 2 + (3 << 2));
-  __m256d res4567 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result0));
-  res0123 = _mm256_mul_pd(res0123, scale0123);
-  res4567 = _mm256_mul_pd(res4567, scale4567);
-  _mm256_storeu_ps(v, res0123);
-  _mm256_storeu_ps(v + 8, res4567);
+  res01234567 = _mm256_mul_ps(res01234567, scale01234567);
+  _mm256_storeu_ps(v, res01234567);
   w8 = _mm_shuffle_epi32(w8, 2 + (3 << 2));
   w256 = _mm256_cvtepi8_epi32(w8); // 8x32bit vals in 256bit reg
-  scale0123 = _mm256_loadu_ps(scales + 16);
-  scale4567 = _mm256_loadu_ps(scales + 24);
+  scale01234567 = _mm256_loadu_ps(scales + 8);
   w256 = _mm256_mullo_epi32(w256, bias_scale); // 8x32
   result1 = _mm256_add_epi32(result1, w256);   // result += bias * 127
-  res0123 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result1));
+  res01234567 = _mm256_cvtepi32_ps(result1);
   result1 = _mm256_permute4x64_epi64(result1, 2 + (3 << 2));
-  res4567 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result1));
-  res0123 = _mm256_mul_pd(res0123, scale0123);
-  res4567 = _mm256_mul_pd(res4567, scale4567);
-  _mm256_storeu_ps(v + 16, res0123);
-  _mm256_storeu_ps(v + 24, res4567);
+  res01234567 = _mm256_mul_ps(res01234567, scale01234567);
+  _mm256_storeu_ps(v + 8, res01234567);
   wi += 16;
   scales += 16;
   v += 16;

From 27597883db813fdd914e840170b95f1b40688d58 Mon Sep 17 00:00:00 2001
From: Ger Hobbelt
Date: Tue, 13 Jul 2021 09:20:39 +0200
Subject: [PATCH 4/4] Implement DotProductSSE() for FAST_FLOAT

[sw] Formatted commit message
---
 src/arch/dotproductsse.cpp | 64 ++++++++++++++++++++++++++++++++++----
 src/arch/intsimdmatrix.h   |  2 +-
 2 files changed, 59 insertions(+), 7 deletions(-)

diff --git a/src/arch/dotproductsse.cpp b/src/arch/dotproductsse.cpp
index ec94f50341..9122e9d1b1 100644
--- a/src/arch/dotproductsse.cpp
+++ b/src/arch/dotproductsse.cpp
@@ -31,12 +31,63 @@ namespace tesseract {
 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel SSE intrinsics to access the SIMD instruction set.
 #if defined(FAST_FLOAT)
-TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n) {
-  TFloat total = 0.0;
-  for (int k = 0; k < n; ++k) {
-    total += u[k] * v[k];
+float DotProductSSE(const float *u, const float *v, int n) {
+  int max_offset = n - 4;
+  int offset = 0;
+  // Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and
+  // v, and multiplying them together in parallel.
+  __m128 sum = _mm_setzero_ps();
+  if (offset <= max_offset) {
+    offset = 4;
+    // Aligned load is reputedly faster but requires 16 byte aligned input.
+    if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 &&
+        (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
+      // Use aligned load.
+      sum = _mm_load_ps(u);
+      __m128 floats2 = _mm_load_ps(v);
+      // Multiply.
+      sum = _mm_mul_ps(sum, floats2);
+      while (offset <= max_offset) {
+        __m128 floats1 = _mm_load_ps(u + offset);
+        floats2 = _mm_load_ps(v + offset);
+        floats1 = _mm_mul_ps(floats1, floats2);
+        sum = _mm_add_ps(sum, floats1);
+        offset += 4;
+      }
+    } else {
+      // Use unaligned load.
+      sum = _mm_loadu_ps(u);
+      __m128 floats2 = _mm_loadu_ps(v);
+      // Multiply.
+      sum = _mm_mul_ps(sum, floats2);
+      while (offset <= max_offset) {
+        __m128 floats1 = _mm_loadu_ps(u + offset);
+        floats2 = _mm_loadu_ps(v + offset);
+        floats1 = _mm_mul_ps(floats1, floats2);
+        sum = _mm_add_ps(sum, floats1);
+        offset += 4;
+      }
+    }
   }
-  return total;
+  // Add the 4 sums in sum horizontally.
+#if 0
+  alignas(32) float tmp[4];
+  _mm_store_ps(tmp, sum);
+  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
+#else
+  __m128 zero = _mm_setzero_ps();
+  // https://www.felixcloutier.com/x86/haddps
+  sum = _mm_hadd_ps(sum, zero);
+  sum = _mm_hadd_ps(sum, zero);
+  // Extract the low result.
+  float result = _mm_cvtss_f32(sum);
+#endif
+  // Add on any left-over products.
+  while (offset < n) {
+    result += u[offset] * v[offset];
+    ++offset;
+  }
+  return result;
 }
 #else
 double DotProductSSE(const double *u, const double *v, int n) {
@@ -48,7 +99,8 @@ double DotProductSSE(const double *u, const double *v, int n) {
   if (offset <= max_offset) {
     offset = 2;
     // Aligned load is reputedly faster but requires 16 byte aligned input.
-    if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 && (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
+    if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 &&
+        (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
       // Use aligned load.
       sum = _mm_load_pd(u);
       __m128d floats2 = _mm_load_pd(v);

diff --git a/src/arch/intsimdmatrix.h b/src/arch/intsimdmatrix.h
index 7b09577cc9..d93f928dbc 100644
--- a/src/arch/intsimdmatrix.h
+++ b/src/arch/intsimdmatrix.h
@@ -115,7 +115,7 @@ struct TESS_API IntSimdMatrix {
   static const IntSimdMatrix *intSimdMatrix;
   // Only available with NEON.
   static const IntSimdMatrix intSimdMatrixNEON;
-  // Only available with AVX2 / SSE.
+  // Only available with AVX2 / AVX / FMA / SSE.
   static const IntSimdMatrix intSimdMatrixAVX2;
   static const IntSimdMatrix intSimdMatrixSSE;
 };
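Note: taken together, the series leaves the model file format unchanged:
tesstypes.h selects the arithmetic type (double by default, float when
FAST_FLOAT is defined at build time, e.g. with -DFAST_FLOAT), and
weightmatrix.cpp converts through double when reading and writing, so
models remain interchangeable between float and double builds. A sketch
of the intended pattern (the helper below is illustrative, not part of
the patch):

    #include "tesstypes.h" // TFloat: float under FAST_FLOAT, else double

    #include <vector>

    // Hypothetical helper: compute in TFloat, but always exchange double
    // with the outside world (model files), as WeightMatrix now does.
    static std::vector<double> ToDiskFormat(const std::vector<tesseract::TFloat> &w) {
      return std::vector<double>(w.begin(), w.end());
    }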