Skip to content

Commit

Permalink
Fix Code Style
Browse files Browse the repository at this point in the history
  • Loading branch information
tszumski committed Sep 5, 2024
1 parent 342c687 commit ececcd9
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 69 deletions.
34 changes: 20 additions & 14 deletions src/lib/openjp2/dwt.c
Original file line number Diff line number Diff line change
Expand Up @@ -336,8 +336,9 @@ static void opj_dwt_decode_1(const opj_dwt_t *v)

#if defined(__AVX512F__)
static int32_t loop_short_sse(int32_t len, const int32_t** lf_ptr,
const int32_t** hf_ptr, int32_t** out_ptr,
int32_t* prev_even) {
const int32_t** hf_ptr, int32_t** out_ptr,
int32_t* prev_even)
{
int32_t next_even;
__m128i odd, even_m1, unpack1, unpack2;
const int32_t batch = (len - 2) / 8;
Expand Down Expand Up @@ -416,9 +417,12 @@ static void opj_idwt53_h_cas0(OPJ_INT32* tmp,
int32_t prev_even = in_even[0] - ((in_odd[0] + 1) >> 1);

const __m512i permutevar_mask = _mm512_setr_epi32(
0x10, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e);
const __m512i store1_perm = _mm512_setr_epi64(0x00, 0x01, 0x08, 0x09, 0x02, 0x03, 0x0a, 0x0b);
const __m512i store2_perm = _mm512_setr_epi64(0x04, 0x05, 0x0c, 0x0d, 0x06, 0x07, 0x0e, 0x0f);
0x10, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
0x0c, 0x0d, 0x0e);
const __m512i store1_perm = _mm512_setr_epi64(0x00, 0x01, 0x08, 0x09, 0x02,
0x03, 0x0a, 0x0b);
const __m512i store2_perm = _mm512_setr_epi64(0x04, 0x05, 0x0c, 0x0d, 0x06,
0x07, 0x0e, 0x0f);

const __m512i two = _mm512_set1_epi32(2);

Expand Down Expand Up @@ -465,7 +469,8 @@ static void opj_idwt53_h_cas0(OPJ_INT32* tmp,

leftover = len - simd_batch_512 * 32;
if (leftover > 8) {
leftover -= 8 * loop_short_sse(leftover, &in_even, &in_odd, &out_ptr, &prev_even);
leftover -= 8 * loop_short_sse(leftover, &in_even, &in_odd, &out_ptr,
&prev_even);
}
out_ptr[0] = prev_even;

Expand All @@ -475,20 +480,20 @@ static void opj_idwt53_h_cas0(OPJ_INT32* tmp,
in_even++;
in_odd++;
out_ptr += 2;
}
}

if (len & 1) {
out_ptr[2] = in_even[1] - ((in_odd[0] + 1) >> 1);
out_ptr[1] = in_odd[0] + ((out_ptr[0] + out_ptr[2]) >> 1);
}
else { //!(len & 1)
} else { //!(len & 1)
out_ptr[1] = in_odd[0] + out_ptr[0];
}
#elif defined(__AVX2__)
OPJ_INT32* out_ptr = tmp;
int32_t prev_even = in_even[0] - ((in_odd[0] + 1) >> 1);

const __m256i reg_permutevar_mask_move_right = _mm256_setr_epi32(0x00, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
const __m256i reg_permutevar_mask_move_right = _mm256_setr_epi32(0x00, 0x00,
0x01, 0x02, 0x03, 0x04, 0x05, 0x06);
const __m256i two = _mm256_set1_epi32(2);

int32_t simd_batch = (len - 2) / 16;
Expand Down Expand Up @@ -519,8 +524,10 @@ static void opj_idwt53_h_cas0(OPJ_INT32* tmp,

_mm_storeu_si128((__m128i*)(out_ptr + 0), _mm256_castsi256_si128(unpack1_avx2));
_mm_storeu_si128((__m128i*)(out_ptr + 4), _mm256_castsi256_si128(unpack2_avx2));
_mm_storeu_si128((__m128i*)(out_ptr + 8), _mm256_extracti128_si256(unpack1_avx2, 0x1));
_mm_storeu_si128((__m128i*)(out_ptr + 12), _mm256_extracti128_si256(unpack2_avx2, 0x1));
_mm_storeu_si128((__m128i*)(out_ptr + 8), _mm256_extracti128_si256(unpack1_avx2,
0x1));
_mm_storeu_si128((__m128i*)(out_ptr + 12),
_mm256_extracti128_si256(unpack2_avx2, 0x1));

prev_even = next_even;

Expand All @@ -540,8 +547,7 @@ static void opj_idwt53_h_cas0(OPJ_INT32* tmp,
if (len & 1) {
out_ptr[2] = in_even[1] - ((in_odd[0] + 1) >> 1);
out_ptr[1] = in_odd[0] + ((out_ptr[0] + out_ptr[2]) >> 1);
}
else { //!(len & 1)
} else { //!(len & 1)
out_ptr[1] = in_odd[0] + out_ptr[0];
}
#else
Expand Down
124 changes: 69 additions & 55 deletions src/lib/openjp2/t1.c
Original file line number Diff line number Diff line change
Expand Up @@ -2271,55 +2271,63 @@ static void opj_t1_cblk_encode_processor(void* user_data, opj_tls_t* tls)
/* Change from "natural" order to "zigzag" order of T1 passes */
for (j = 0; j < (cblk_h & ~3U); j += 4) {
#if defined(__AVX512F__)
const __m512i perm1 = _mm512_setr_epi64(2, 3, 10, 11, 4, 5, 12, 13);
const __m512i perm2 = _mm512_setr_epi64(6, 7, 14, 15, 0, 0, 0, 0);
OPJ_UINT32* ptr = tiledp_u;
for (i = 0; i < cblk_w / 16; ++i) {
// INPUT OUTPUT
// 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
// 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
// 20 21 22 23 24 25 26 27 28 29 2A 2B 2C 2D 2E 2F 08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
// 30 31 32 33 34 35 36 37 38 39 3A 3B 3C 3D 3E 3F 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
__m512i in1 = _mm512_slli_epi32(_mm512_loadu_si512((__m512i*)(ptr + (j + 0) * tile_w)), T1_NMSEDEC_FRACBITS);
__m512i in2 = _mm512_slli_epi32(_mm512_loadu_si512((__m512i*)(ptr + (j + 1) * tile_w)), T1_NMSEDEC_FRACBITS);
__m512i in3 = _mm512_slli_epi32(_mm512_loadu_si512((__m512i*)(ptr + (j + 2) * tile_w)), T1_NMSEDEC_FRACBITS);
__m512i in4 = _mm512_slli_epi32(_mm512_loadu_si512((__m512i*)(ptr + (j + 3) * tile_w)), T1_NMSEDEC_FRACBITS);

__m512i tmp1 = _mm512_unpacklo_epi32(in1, in2);
__m512i tmp2 = _mm512_unpacklo_epi32(in3, in4);
__m512i tmp3 = _mm512_unpackhi_epi32(in1, in2);
__m512i tmp4 = _mm512_unpackhi_epi32(in3, in4);

in1 = _mm512_unpacklo_epi64(tmp1, tmp2);
in2 = _mm512_unpacklo_epi64(tmp3, tmp4);
in3 = _mm512_unpackhi_epi64(tmp1, tmp2);
in4 = _mm512_unpackhi_epi64(tmp3, tmp4);

_mm_storeu_si128((__m128i*)(t1data + 0), _mm512_castsi512_si128(in1));
_mm_storeu_si128((__m128i*)(t1data + 4), _mm512_castsi512_si128(in3));
_mm_storeu_si128((__m128i*)(t1data + 8), _mm512_castsi512_si128(in2));
_mm_storeu_si128((__m128i*)(t1data + 12), _mm512_castsi512_si128(in4));

tmp1 = _mm512_permutex2var_epi64(in1, perm1, in3);
tmp2 = _mm512_permutex2var_epi64(in2, perm1, in4);

_mm256_storeu_si256((__m256i*)(t1data + 16), _mm512_castsi512_si256(tmp1));
_mm256_storeu_si256((__m256i*)(t1data + 24), _mm512_castsi512_si256(tmp2));
_mm256_storeu_si256((__m256i*)(t1data + 32), _mm512_extracti64x4_epi64(tmp1, 0x1));
_mm256_storeu_si256((__m256i*)(t1data + 40), _mm512_extracti64x4_epi64(tmp2, 0x1));
_mm256_storeu_si256((__m256i*)(t1data + 48), _mm512_castsi512_si256(_mm512_permutex2var_epi64(in1, perm2, in3)));
_mm256_storeu_si256((__m256i*)(t1data + 56), _mm512_castsi512_si256(_mm512_permutex2var_epi64(in2, perm2, in4)));
t1data += 64;
ptr += 16;
}
for (i = 0; i < cblk_w % 16; ++i) {
t1data[0] = ptr[(j + 0) * tile_w] << T1_NMSEDEC_FRACBITS;
t1data[1] = ptr[(j + 1) * tile_w] << T1_NMSEDEC_FRACBITS;
t1data[2] = ptr[(j + 2) * tile_w] << T1_NMSEDEC_FRACBITS;
t1data[3] = ptr[(j + 3) * tile_w] << T1_NMSEDEC_FRACBITS;
t1data += 4;
ptr += 1;
}
const __m512i perm1 = _mm512_setr_epi64(2, 3, 10, 11, 4, 5, 12, 13);
const __m512i perm2 = _mm512_setr_epi64(6, 7, 14, 15, 0, 0, 0, 0);
OPJ_UINT32* ptr = tiledp_u;
for (i = 0; i < cblk_w / 16; ++i) {
// INPUT OUTPUT
// 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
// 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
// 20 21 22 23 24 25 26 27 28 29 2A 2B 2C 2D 2E 2F 08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
// 30 31 32 33 34 35 36 37 38 39 3A 3B 3C 3D 3E 3F 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
__m512i in1 = _mm512_slli_epi32(_mm512_loadu_si512((__m512i*)(ptr +
(j + 0) * tile_w)), T1_NMSEDEC_FRACBITS);
__m512i in2 = _mm512_slli_epi32(_mm512_loadu_si512((__m512i*)(ptr +
(j + 1) * tile_w)), T1_NMSEDEC_FRACBITS);
__m512i in3 = _mm512_slli_epi32(_mm512_loadu_si512((__m512i*)(ptr +
(j + 2) * tile_w)), T1_NMSEDEC_FRACBITS);
__m512i in4 = _mm512_slli_epi32(_mm512_loadu_si512((__m512i*)(ptr +
(j + 3) * tile_w)), T1_NMSEDEC_FRACBITS);

__m512i tmp1 = _mm512_unpacklo_epi32(in1, in2);
__m512i tmp2 = _mm512_unpacklo_epi32(in3, in4);
__m512i tmp3 = _mm512_unpackhi_epi32(in1, in2);
__m512i tmp4 = _mm512_unpackhi_epi32(in3, in4);

in1 = _mm512_unpacklo_epi64(tmp1, tmp2);
in2 = _mm512_unpacklo_epi64(tmp3, tmp4);
in3 = _mm512_unpackhi_epi64(tmp1, tmp2);
in4 = _mm512_unpackhi_epi64(tmp3, tmp4);

_mm_storeu_si128((__m128i*)(t1data + 0), _mm512_castsi512_si128(in1));
_mm_storeu_si128((__m128i*)(t1data + 4), _mm512_castsi512_si128(in3));
_mm_storeu_si128((__m128i*)(t1data + 8), _mm512_castsi512_si128(in2));
_mm_storeu_si128((__m128i*)(t1data + 12), _mm512_castsi512_si128(in4));

tmp1 = _mm512_permutex2var_epi64(in1, perm1, in3);
tmp2 = _mm512_permutex2var_epi64(in2, perm1, in4);

_mm256_storeu_si256((__m256i*)(t1data + 16), _mm512_castsi512_si256(tmp1));
_mm256_storeu_si256((__m256i*)(t1data + 24), _mm512_castsi512_si256(tmp2));
_mm256_storeu_si256((__m256i*)(t1data + 32), _mm512_extracti64x4_epi64(tmp1,
0x1));
_mm256_storeu_si256((__m256i*)(t1data + 40), _mm512_extracti64x4_epi64(tmp2,
0x1));
_mm256_storeu_si256((__m256i*)(t1data + 48),
_mm512_castsi512_si256(_mm512_permutex2var_epi64(in1, perm2, in3)));
_mm256_storeu_si256((__m256i*)(t1data + 56),
_mm512_castsi512_si256(_mm512_permutex2var_epi64(in2, perm2, in4)));
t1data += 64;
ptr += 16;
}
for (i = 0; i < cblk_w % 16; ++i) {
t1data[0] = ptr[(j + 0) * tile_w] << T1_NMSEDEC_FRACBITS;
t1data[1] = ptr[(j + 1) * tile_w] << T1_NMSEDEC_FRACBITS;
t1data[2] = ptr[(j + 2) * tile_w] << T1_NMSEDEC_FRACBITS;
t1data[3] = ptr[(j + 3) * tile_w] << T1_NMSEDEC_FRACBITS;
t1data += 4;
ptr += 1;
}
#elif defined(__AVX2__)
OPJ_UINT32* ptr = tiledp_u;
for (i = 0; i < cblk_w / 8; ++i) {
Expand All @@ -2328,10 +2336,14 @@ static void opj_t1_cblk_encode_processor(void* user_data, opj_tls_t* tls)
// 10 11 12 13 14 15 16 17 02 12 22 32 03 13 23 33
// 20 21 22 23 24 25 26 27 04 14 24 34 05 15 25 35
// 30 31 32 33 34 35 36 37 06 16 26 36 07 17 27 37
__m256i in1 = _mm256_slli_epi32(_mm256_loadu_si256((__m256i*)(ptr + (j + 0) * tile_w)), T1_NMSEDEC_FRACBITS);
__m256i in2 = _mm256_slli_epi32(_mm256_loadu_si256((__m256i*)(ptr + (j + 1) * tile_w)), T1_NMSEDEC_FRACBITS);
__m256i in3 = _mm256_slli_epi32(_mm256_loadu_si256((__m256i*)(ptr + (j + 2) * tile_w)), T1_NMSEDEC_FRACBITS);
__m256i in4 = _mm256_slli_epi32(_mm256_loadu_si256((__m256i*)(ptr + (j + 3) * tile_w)), T1_NMSEDEC_FRACBITS);
__m256i in1 = _mm256_slli_epi32(_mm256_loadu_si256((__m256i*)(ptr +
(j + 0) * tile_w)), T1_NMSEDEC_FRACBITS);
__m256i in2 = _mm256_slli_epi32(_mm256_loadu_si256((__m256i*)(ptr +
(j + 1) * tile_w)), T1_NMSEDEC_FRACBITS);
__m256i in3 = _mm256_slli_epi32(_mm256_loadu_si256((__m256i*)(ptr +
(j + 2) * tile_w)), T1_NMSEDEC_FRACBITS);
__m256i in4 = _mm256_slli_epi32(_mm256_loadu_si256((__m256i*)(ptr +
(j + 3) * tile_w)), T1_NMSEDEC_FRACBITS);

__m256i tmp1 = _mm256_unpacklo_epi32(in1, in2);
__m256i tmp2 = _mm256_unpacklo_epi32(in3, in4);
Expand All @@ -2347,8 +2359,10 @@ static void opj_t1_cblk_encode_processor(void* user_data, opj_tls_t* tls)
_mm_storeu_si128((__m128i*)(t1data + 4), _mm256_castsi256_si128(in3));
_mm_storeu_si128((__m128i*)(t1data + 8), _mm256_castsi256_si128(in2));
_mm_storeu_si128((__m128i*)(t1data + 12), _mm256_castsi256_si128(in4));
_mm256_storeu_si256((__m256i*)(t1data + 16), _mm256_permute2x128_si256(in1, in3, 0x31));
_mm256_storeu_si256((__m256i*)(t1data + 24), _mm256_permute2x128_si256(in2, in4, 0x31));
_mm256_storeu_si256((__m256i*)(t1data + 16), _mm256_permute2x128_si256(in1, in3,
0x31));
_mm256_storeu_si256((__m256i*)(t1data + 24), _mm256_permute2x128_si256(in2, in4,
0x31));
t1data += 32;
ptr += 8;
}
Expand Down

0 comments on commit ececcd9

Please sign in to comment.