Skip to content

Commit

Permalink
Add 12-byte sse2 unshuffle
Browse files Browse the repository at this point in the history
  • Loading branch information
tom-neara authored and FrancescAlted committed Feb 5, 2025
1 parent 5588bb6 commit b74a749
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 1 deletion.
71 changes: 71 additions & 0 deletions blosc/shuffle-sse2.c
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,74 @@ unshuffle8_sse2(uint8_t* const dest, const uint8_t* const src,
}
}

/* Routine optimized for unshuffling a buffer for a type size of 12 bytes.
 *
 * Based on the 16-byte implementation: the 12 byte-streams of the shuffled
 * buffer are treated as the first 12 rows of a 16x16 byte matrix whose last
 * 4 rows are zero padding.  The matrix is transposed with four rounds of
 * unpack instructions, after which every register holds one 12-byte element
 * followed by 4 padding bytes.
 *
 * dest                  - output buffer; 16 * 12 bytes are written per
 *                         loop iteration, starting at dest + i * 12.
 * src                   - shuffled input; byte-stream j starts at
 *                         src + j * total_elements.
 * vectorizable_elements - number of elements to process here (the loop
 *                         advances 16 elements at a time; presumably a
 *                         multiple of 16 -- verify against the caller).
 * total_elements        - stride, in bytes, between consecutive byte-streams.
 */
static void
unshuffle12_sse2(uint8_t* const dest, const uint8_t* const src,
                 const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 12;
  /* After the four unpack rounds, element k of the current group lives in
     xmm1[store_order[k]]. */
  static const int store_order[16] = {0, 8, 4, 12, 2, 10, 6, 14,
                                      1, 9, 5, 13, 3, 11, 7, 15};
  int32_t i;
  int j;
  int k;
  __m128i xmm1[16], xmm2[16];

  /* Selects the low 12 bytes of a register (_mm_set_epi8 takes its
     arguments from byte 15 down to byte 0). */
  const __m128i mask = _mm_set_epi8(0x0, 0x0, 0x0, 0x0,
                                    0xff, 0xff, 0xff, 0xff,
                                    0xff, 0xff, 0xff, 0xff,
                                    0xff, 0xff, 0xff, 0xff);

  for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
    uint8_t* const dest_for_ith_element = dest + (i * bytesoftype);
    const uint8_t* const src_for_ith_element = src + i;

    /* Load 16 bytes from each of the 12 byte-streams (192 bytes)
       into 12 XMM registers. */
    for (j = 0; j < bytesoftype; j++) {
      xmm1[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements)));
    }
    /* The unpack rounds below read all 16 registers.  Zero the 4 that have
       no source stream so no uninitialized value is ever read; this must be
       redone on every iteration because xmm1[8..15] are overwritten by the
       2-byte round. */
    for (j = bytesoftype; j < 16; j++) {
      xmm1[j] = _mm_setzero_si128();
    }

    /* Shuffle bytes */
    for (j = 0; j < 8; j++) {
      /* Interleave the low halves of each register pair */
      xmm2[j] = _mm_unpacklo_epi8(xmm1[j * 2], xmm1[j * 2 + 1]);
      /* Interleave the high halves of each register pair */
      xmm2[8 + j] = _mm_unpackhi_epi8(xmm1[j * 2], xmm1[j * 2 + 1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 8; j++) {
      xmm1[j] = _mm_unpacklo_epi16(xmm2[j * 2], xmm2[j * 2 + 1]);
      xmm1[8 + j] = _mm_unpackhi_epi16(xmm2[j * 2], xmm2[j * 2 + 1]);
    }
    /* Shuffle 4-byte dwords */
    for (j = 0; j < 8; j++) {
      xmm2[j] = _mm_unpacklo_epi32(xmm1[j * 2], xmm1[j * 2 + 1]);
      xmm2[8 + j] = _mm_unpackhi_epi32(xmm1[j * 2], xmm1[j * 2 + 1]);
    }
    /* Shuffle 8-byte qwords */
    for (j = 0; j < 8; j++) {
      xmm1[j] = _mm_unpacklo_epi64(xmm2[j * 2], xmm2[j * 2 + 1]);
      xmm1[8 + j] = _mm_unpackhi_epi64(xmm2[j * 2], xmm2[j * 2 + 1]);
    }

    /* Store the result vectors in proper order.  Each full 16-byte store
       spills 4 padding bytes into the slot of the following element; the
       next store (issued in ascending address order) overwrites them. */
    for (k = 0; k < 15; k++) {
      _mm_storeu_si128((__m128i*)(dest_for_ith_element + (k * 12)), xmm1[store_order[k]]);
    }
    /* Nothing follows the last element of the group, so write exactly its
       12 payload bytes and leave the bytes after it untouched. */
    _mm_maskmoveu_si128(xmm1[store_order[15]], mask, (char *)(dest_for_ith_element + (15 * 12)));
  }
}

/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */
static void
unshuffle16_sse2(uint8_t* const dest, const uint8_t* const src,
Expand Down Expand Up @@ -592,6 +660,9 @@ unshuffle_sse2(const int32_t bytesoftype, const int32_t blocksize,
case 8:
unshuffle8_sse2(_dest, _src, vectorizable_elements, total_elements);
break;
case 12:
unshuffle12_sse2(_dest, _src, vectorizable_elements, total_elements);
break;
case 16:
unshuffle16_sse2(_dest, _src, vectorizable_elements, total_elements);
break;
Expand Down
23 changes: 22 additions & 1 deletion tests/test_shuffle_roundtrip_sse2.csv
Original file line number Diff line number Diff line change
Expand Up @@ -397,4 +397,25 @@
80,100000,32,2
80,702713,32,0
80,702713,32,1
80,702713,32,2
80,702713,32,2
12,100000,32,0
12,100000,32,1
12,100000,32,2
12,1792,32,0
12,1792,32,1
12,1792,32,2
12,192,32,0
12,192,32,1
12,192,32,2
12,500,32,0
12,500,32,1
12,500,32,2
12,702713,32,0
12,702713,32,1
12,702713,32,2
12,7,32,0
12,7,32,1
12,7,32,2
12,8000,32,0
12,8000,32,1
12,8000,32,2

0 comments on commit b74a749

Please sign in to comment.