From d1304c80e550e436e8b8a781eb6084e11a5e921f Mon Sep 17 00:00:00 2001
From: Wouter <9594229+DifferentialityDevelopment@users.noreply.github.com>
Date: Wed, 15 May 2024 09:49:43 +0200
Subject: [PATCH] feat: use avx2 to speedup matmulQ40 (#54)

---
 src/funcs.cpp | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/funcs.cpp b/src/funcs.cpp
index 26e8989..3e15e4a 100644
--- a/src/funcs.cpp
+++ b/src/funcs.cpp
@@ -218,6 +218,21 @@ void matmulQ40(MatmulThreadInfo* a) {
         }
         a->output[d] = vaddvq_f32(u);
     }
+#elif defined(__AVX2__)
+    assert(k % 32 == 0);
+    __m256 a0, b0, u;
+    for (int d = a->ds; d < a->de; d++) {
+        u = _mm256_set1_ps(0.0f);
+        for (int j = 0; j < n; j++) {
+            dequantizeQ40Row(&w[d * n * blocksPerRow + j * blocksPerRow], group, k);
+            for (int z = 0; z < k; z += 8) {
+                a0 = _mm256_loadu_ps(&input[j * k + z]);
+                b0 = _mm256_loadu_ps(&group[z]);
+                u = _mm256_fmadd_ps(a0, b0, u);
+            }
+        }
+        a->output[d] = hsum_float_8(u);
+    }
 #else
     for (int d = a->ds; d < a->de; d++) {
         float val = 0.0f;
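
Note (not part of the patch): the new AVX2 branch keeps eight partial dot-product sums in a single __m256 register and reduces them to a scalar with hsum_float_8(), a helper that is not shown in this hunk and is presumably defined elsewhere in funcs.cpp. A minimal sketch of such a horizontal-sum helper, assuming the common llama.cpp-style reduction, could look like this:

#include <immintrin.h>

// Sketch of a horizontal sum of the 8 floats in a __m256 (assumed helper,
// not taken from this patch).
static inline float hsum_float_8(const __m256 x) {
    // Add the upper 128-bit lane onto the lower one: 8 sums -> 4 sums.
    __m128 res = _mm256_extractf128_ps(x, 1);
    res = _mm_add_ps(res, _mm256_castps256_ps128(x));
    // Fold the upper pair onto the lower pair: 4 sums -> 2 sums.
    res = _mm_add_ps(res, _mm_movehl_ps(res, res));
    // Fold the remaining pair: 2 sums -> 1 sum in the low element.
    res = _mm_add_ss(res, _mm_movehdup_ps(res));
    return _mm_cvtss_f32(res);
}

With a helper of this shape, each output element a->output[d] is the sum of the eight FMA accumulator lanes built up over the row, which is why the inner loop can advance by 8 floats per iteration.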