diff --git a/src/containers/run.c b/src/containers/run.c index 75030bf8f..b572f8e61 100644 --- a/src/containers/run.c +++ b/src/containers/run.c @@ -1017,25 +1017,33 @@ int _avx2_run_container_to_uint32_array(void *vout, const run_container_t *cont, for (int i = 0; i < cont->n_runs; ++i) { uint32_t run_start = base + cont->runs[i].value; uint16_t le = cont->runs[i].length; - int j = 0; - __m256i run_start_v = _mm256_set1_epi32(run_start); - // [8,8,8,8....] - __m256i inc = _mm256_set1_epi32(8); - // used for generate sequence: - // [0, 1, 2, 3...], [8, 9, 10,...] - __m256i delta = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); - for (j = 0; j + 8 <= le; j += 8) { - __m256i val_v = _mm256_add_epi32(run_start_v, delta); - _mm256_storeu_si256((__m256i *)(out + outpos), val_v); - delta = _mm256_add_epi32(inc, delta); - outpos += 8; - } - - for (; j <= le; ++j) { - uint32_t val = run_start + j; - memcpy(out + outpos, &val, - sizeof(uint32_t)); // should be compiled as a MOV on x64 - outpos++; + if (__builtin_expect(le < 8, 1)) { + for (int j = 0; j <= le; ++j) { + uint32_t val = run_start + j; + memcpy(out + outpos, &val, + sizeof(uint32_t)); // should be compiled as a MOV on x64 + outpos++; + } + } else { + int j = 0; + __m256i run_start_v = _mm256_set1_epi32(run_start); + // [8,8,8,8....] + __m256i inc = _mm256_set1_epi32(8); + // used for generate sequence: + // [0, 1, 2, 3...], [8, 9, 10,...] + __m256i delta = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + for (j = 0; j + 8 <= le; j += 8) { + __m256i val_v = _mm256_add_epi32(run_start_v, delta); + _mm256_storeu_si256((__m256i *)(out + outpos), val_v); + delta = _mm256_add_epi32(inc, delta); + outpos += 8; + } + for (; j <= le; ++j) { + uint32_t val = run_start + j; + memcpy(out + outpos, &val, + sizeof(uint32_t)); // should be compiled as a MOV on x64 + outpos++; + } } } return outpos;