diff --git a/lib/os/prf.c b/lib/os/prf.c
index 0e42491d47dcd5..5d48d067554081 100644
--- a/lib/os/prf.c
+++ b/lib/os/prf.c
@@ -130,37 +130,56 @@ static void _rlrshift(uint64_t *v)
  * sense to define this much smaller special case here to avoid
  * including it just for printf.
  *
- * It works by iteratively dividing the most significant 32 bits of
- * the 64 bit value by 5. This will leave a remainder of 0-4
- * (i.e. three significant bits), ensuring that the top 29 bits of the
- * remainder are zero for the next iteration. Thus in the second
- * iteration only 35 significant bits remain, and in the third only
- * six. This was tested exhaustively through the first ~10B values in
- * the input space, and for ~2e12 (4 hours runtime) random inputs
- * taken from the full 64 bit space.
+ * It works by multiplying v by the reciprocal of 5 i.e.:
+ *
+ *	result = v * ((1 << 64) / 5) / (1 << 64)
+ *
+ * This produces a 128-bit result, but we drop the bottom 64 bits which
+ * accounts for the division by (1 << 64). The product is kept to 64 bits
+ * by summing partial multiplications and shifting right by 32 which on
+ * most 32-bit architectures means only a register drop.
+ *
+ * Here the multiplier is: (1 << 64) / 5 = 0x3333333333333333
+ * i.e. a 62 bits value. To compensate for the reduced precision, we
+ * add an initial bias of 1 to v. Enlarging the multiplier to 64 bits
+ * would also work but a final right shift would be needed, and carry
+ * handling on the summing of partial mults would be necessary, requiring
+ * more instructions. Given that we already want to add bias of 2 for
+ * the result to be rounded to nearest and not truncated, we might as well
+ * combine those together into a bias of 3. This also conveniently allows
+ * for keeping the multiplier in a single 32-bit register given its pattern.
  */
 static void _ldiv5(uint64_t *v)
 {
-	uint32_t hi;
-	uint64_t rem = *v, quot = 0U, q;
-	int i;
+	uint32_t v_lo = *v;
+	uint32_t v_hi = *v >> 32;
+	uint32_t m = 0x33333333;
+	uint64_t result;
 
-	static const char shifts[] = { 32, 3, 0 };
+	/*
+	 * Force the multiplier constant into a register and make it
+	 * opaque to the compiler, otherwise gcc tries to be too smart
+	 * for its own good with a large expansion of adds and shifts.
+	 */
+	__asm__ ("" : "+r" (m));
 
 	/*
-	 * Usage in this file wants rounded behavior, not truncation. So add
-	 * two to get the threshold right.
+	 * Apply the bias of 3. We can't add it to v as this would overflow
+	 * it when at max range. Factor it out with the multiplier upfront.
+	 * Here we multiply the low and high parts separately to avoid an
+	 * unnecessary 64-bit add-with-carry.
 	 */
-	rem += 2U;
+	result = ((uint64_t)(m * 3U) << 32) | (m * 3U);
 
-	for (i = 0; i < 3; i++) {
-		hi = rem >> shifts[i];
-		q = (uint64_t)(hi / 5U) << shifts[i];
-		rem -= q * 5U;
-		quot += q;
-	}
+	/* The actual multiplication. */
+	result += (uint64_t)v_lo * m;
+	result >>= 32;
+	result += (uint64_t)v_lo * m;
+	result += (uint64_t)v_hi * m;
+	result >>= 32;
+	result += (uint64_t)v_hi * m;
 
-	*v = quot;
+	*v = result;
 }
 
 static char _get_digit(uint64_t *fr, int *digit_count)
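
Not part of the patch: below is a minimal standalone sketch that mirrors the reciprocal-multiplication sequence added above and checks it against a rounded reference division by 5. The function names (ldiv5_recip, ldiv5_ref) and the test harness are illustrative only; the __asm__ barrier from the patch is omitted here since it only steers code generation and does not change the result. It assumes a hosted C toolchain, not a Zephyr build.

#include <stdint.h>
#include <stdio.h>

/* Same multiply-accumulate sequence as the patched _ldiv5(). */
static uint64_t ldiv5_recip(uint64_t v)
{
	uint32_t v_lo = (uint32_t)v;
	uint32_t v_hi = (uint32_t)(v >> 32);
	uint32_t m = 0x33333333;	/* each 32-bit half of (1 << 64) / 5 */
	uint64_t result;

	/* bias of 3 folded into the multiplier: 3 * 0x3333333333333333 */
	result = ((uint64_t)(m * 3U) << 32) | (m * 3U);

	result += (uint64_t)v_lo * m;
	result >>= 32;
	result += (uint64_t)v_lo * m;
	result += (uint64_t)v_hi * m;
	result >>= 32;
	result += (uint64_t)v_hi * m;

	return result;
}

/* Round-to-nearest division by 5, i.e. (v + 2) / 5, written so that
 * v + 2 cannot overflow near UINT64_MAX.
 */
static uint64_t ldiv5_ref(uint64_t v)
{
	return v / 5U + ((v % 5U >= 3U) ? 1U : 0U);
}

int main(void)
{
	uint64_t seed = 88172645463325252ULL;

	/* edge values around 0, UINT64_MAX and the 32-bit boundary */
	for (int i = -8; i <= 8; i++) {
		uint64_t edges[] = { (uint64_t)i, UINT64_MAX + (uint64_t)i,
				     (1ULL << 32) + (uint64_t)i };
		for (int j = 0; j < 3; j++) {
			if (ldiv5_recip(edges[j]) != ldiv5_ref(edges[j])) {
				printf("mismatch at %llu\n",
				       (unsigned long long)edges[j]);
				return 1;
			}
		}
	}

	/* a few million xorshift64 samples from the full 64-bit space */
	for (uint32_t i = 0; i < 10000000U; i++) {
		seed ^= seed << 13;
		seed ^= seed >> 7;
		seed ^= seed << 17;
		if (ldiv5_recip(seed) != ldiv5_ref(seed)) {
			printf("mismatch at %llu\n", (unsigned long long)seed);
			return 1;
		}
	}

	printf("all checks passed\n");
	return 0;
}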