-
Notifications
You must be signed in to change notification settings - Fork 546
/
bfloat16.cpp
42 lines (33 loc) · 960 Bytes
/
bfloat16.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
/*
* SPDX-License-Identifier: Apache-2.0
*/
#include "bfloat16.hpp"
#include <cstring>
namespace onnx2trt
{
// Widen the stored 16-bit representation to FP32.
// The 16 stored bits become the upper half of the 32-bit pattern and the
// lower half is zero-filled.
BFloat16::operator float() const
{
    static_assert(sizeof(float) == sizeof(uint32_t), "");

    // Place the 16-bit representation in the high half of a 32-bit word.
    uint32_t const widened = static_cast<uint32_t>(mRep) << 16;

    // Reinterpret the word as a float via memcpy (no pointer-cast type punning).
    float result{0.F};
    std::memcpy(&result, &widened, sizeof(float));
    return result;
}
// Construct a BFloat16 from an FP32 value by narrowing its bit pattern to the
// upper 16 bits:
//   - finite values are rounded to nearest, ties to even (may round up to infinity);
//   - infinities are passed through unchanged;
//   - NaNs are quieted so that they remain NaN after narrowing.
BFloat16::BFloat16(float x)
{
    static_assert(sizeof(uint32_t) == sizeof(float), "");
    uint32_t bits{0};
    std::memcpy(&bits, &x, sizeof(float));

    // FP32 format: 1 sign bit, 8 bit exponent, 23 bit mantissa
    // BF16 format: 1 sign bit, 8 bit exponent, 7 bit mantissa

    // Mask for exponent
    constexpr uint32_t exponent = 0xFFU << 23;
    // Mask for mantissa
    constexpr uint32_t mantissa = (1U << 23) - 1U;
    // Most significant mantissa bit (the quiet-NaN bit); it lies in the upper
    // 16 bits and therefore survives the narrowing.
    constexpr uint32_t quietBit = 1U << 22;

    if ((bits & exponent) != exponent)
    {
        // x is finite - round to even: add just under half of the discarded
        // range, plus one more when the lowest kept bit is odd.
        bits += 0x7FFFU + ((bits >> 16) & 1U);
    }
    else if ((bits & mantissa) != 0U)
    {
        // x is NaN. If its payload sits only in the 16 bits that get dropped,
        // plain truncation would leave a zero mantissa, i.e. infinity. Setting
        // the quiet bit keeps the result a NaN while preserving the sign and
        // the upper payload bits.
        bits |= quietBit;
    }
    // Otherwise x is +/- infinity, whose upper 16 bits are already exact.

    mRep = static_cast<uint16_t>(bits >> 16);
}
} // namespace onnx2trt