Skip to content

Commit

Permalink
Optimized SRTP stream lookup with SSE2.
Browse files Browse the repository at this point in the history
Stream lookup by SSRC is now performed using SSE2 intrinsics, which
is considerably faster when there are many streams in the list. Although
the lookup still has linear complexity, its absolute times are reduced
and, with tens to hundreds of elements, are lower than or comparable to
those of a typical rb-tree equivalent.

Expected stream lookup performance of the scalar array-based implementation
and its SSE2 version, compared to the list-based implementation that was
used previously:

SSRCs    speedup (scalar)   speedup (SSE2)

1        0.39x              0.22x
3        0.57x              0.23x
5        0.69x              0.62x
10       0.77x              1.43x
20       0.86x              2.38x
30       0.87x              3.44x
50       1.13x              6.21x
100      1.25x              8.51x
200      1.30x              9.83x

Performance tested on an Intel Core i7 2600K CPU.
  • Loading branch information
Lastique committed Jan 21, 2022
1 parent 4303c1a commit 0cec15e
Showing 1 changed file with 63 additions and 0 deletions.
63 changes: 63 additions & 0 deletions srtp/stream_list.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,13 @@
#include <stdint.h>
#include <string.h>

#if defined(__SSE2__)
#include <emmintrin.h>
#if defined(_MSC_VER)
#include <intrin.h>
#endif
#endif

#include "srtp_priv.h"
#include "err.h"
#include "alloc.h"
Expand All @@ -69,6 +76,61 @@ void srtp_stream_list_init(srtp_stream_list_t *streams)
*/
uint32_t srtp_stream_list_find(const srtp_stream_list_t *streams, uint32_t ssrc)
{
#if defined(__SSE2__)
/*
 * SSE2 path: compare the wanted SSRC against 16 (main loop) or 8 (tail)
 * array entries at a time and return the index of the first equal entry.
 * If nothing matches, the returned value is the list size rounded up to a
 * multiple of 8, i.e. a value that is >= streams->size.
 */
const uint32_t *const ssrcs = streams->ssrcs;
/* Broadcast the wanted SSRC into all four 32-bit lanes. */
const __m128i mm_ssrc = _mm_set1_epi32(ssrc);
/*
 * n is streams->size rounded up to the next multiple of 8.
 * NOTE(review): the loads below read entries up to index n - 1, which can
 * be past streams->size; presumably the ssrcs array is always allocated
 * with capacity padded to a multiple of 8 -- confirm in the allocation
 * code. Entries at indices >= size are compared as well, so a match there
 * yields a position >= size; presumably callers treat that as "not found"
 * -- confirm.
 */
uint32_t pos = 0u, n = (streams->size + 7u) & ~(uint32_t)(7u);
/* Main loop: m is n rounded down to a multiple of 16; 16 entries per pass. */
for (uint32_t m = n & ~(uint32_t)(15u); pos < m; pos += 16u) {
__m128i mm1 = _mm_loadu_si128((const __m128i *)(ssrcs + pos));
__m128i mm2 = _mm_loadu_si128((const __m128i *)(ssrcs + pos + 4u));
__m128i mm3 = _mm_loadu_si128((const __m128i *)(ssrcs + pos + 8u));
__m128i mm4 = _mm_loadu_si128((const __m128i *)(ssrcs + pos + 12u));
/* Each 32-bit lane becomes all-ones where the entry equals ssrc, else 0. */
mm1 = _mm_cmpeq_epi32(mm1, mm_ssrc);
mm2 = _mm_cmpeq_epi32(mm2, mm_ssrc);
mm3 = _mm_cmpeq_epi32(mm3, mm_ssrc);
mm4 = _mm_cmpeq_epi32(mm4, mm_ssrc);
/*
 * Narrow the lane masks 32 -> 16 -> 8 bits with signed saturation (-1 stays
 * -1, 0 stays 0). The pack order keeps entries in array order, so byte i of
 * mm1 corresponds to entry pos + i.
 */
mm1 = _mm_packs_epi32(mm1, mm2);
mm3 = _mm_packs_epi32(mm3, mm4);
mm1 = _mm_packs_epi16(mm1, mm3);
/* One mask bit per entry: bit i is set when entry pos + i matched. */
uint32_t mask = _mm_movemask_epi8(mm1);
if (mask) {
/* Index of the lowest set bit is the offset of the first match. */
#if defined(_MSC_VER)
unsigned long bit_pos;
_BitScanForward(&bit_pos, mask);
pos += bit_pos;
#else
pos += __builtin_ctz(mask);
#endif

goto done;
}
}

/*
 * Tail: pos is a multiple of 16 and n a multiple of 8, so when pos < n
 * exactly 8 entries remain to be checked.
 */
if (pos < n) {
__m128i mm1 = _mm_loadu_si128((const __m128i *)(ssrcs + pos));
__m128i mm2 = _mm_loadu_si128((const __m128i *)(ssrcs + pos + 4u));
mm1 = _mm_cmpeq_epi32(mm1, mm_ssrc);
mm2 = _mm_cmpeq_epi32(mm2, mm_ssrc);
/* Only narrowed to 16-bit lanes here, so each entry occupies two bytes. */
mm1 = _mm_packs_epi32(mm1, mm2);

/* Two mask bits per entry, hence the division by 2 below. */
uint32_t mask = _mm_movemask_epi8(mm1);
if (mask) {
#if defined(_MSC_VER)
unsigned long bit_pos;
_BitScanForward(&bit_pos, mask);
pos += bit_pos / 2u;
#else
pos += __builtin_ctz(mask) / 2u;
#endif
goto done;
}

/* No match in the tail either: advance so that pos == n. */
pos += 8u;
}

done:
return pos;
#else /* defined(__SSE2__) */
/* walk down list until ssrc is found */
/*
 * Scalar fallback: scans indices [0, streams->size).
 * NOTE(review): the loop body is not visible in this view; presumably it
 * stops at the first matching entry so that pos == size means "not found"
 * -- confirm.
 */
uint32_t pos = 0u, n = streams->size;
for (; pos < n; ++pos) {
Expand All @@ -77,6 +139,7 @@ uint32_t srtp_stream_list_find(const srtp_stream_list_t *streams, uint32_t ssrc)
}

return pos;
#endif /* defined(__SSE2__) */
}

/*
Expand Down

0 comments on commit 0cec15e

Please sign in to comment.