math.c
#include <platform.h>
#define EXP_MASK 0x7F800000
#define EXP_ZERO 0x3F800000 // bit pattern of (float)1.0f, i.e. unbiased exponent 0
#define MANT_MASK 0x007FFFFF
#define SIGN_BIT 0x80000000
#define F32_BIT_CAST(raw_x) (*(float*) &raw_x)
#define F32_UNBIASED_EXPONENT(raw_x) \
(((raw_x & 0xFF800000) >> 23) - /*exponent bias*/ 127)
#define F32_HIGH_MANTISSA_BITS(raw_x) \
((raw_x >> 16) & 0x7F) // 7 most significant mantissa bits
#define F32_LOW_MANTISSA_BITS(raw_x) \
(raw_x & 0xFFFF) // remaining 16 mantissa bits
#define LN2 0.6931472f // natural logarithm of 2
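/*
Quick sanity check of the bit-field helpers above (for illustration, values
verified by hand): 10.0f has the raw bit pattern 0x41200000, so
F32_UNBIASED_EXPONENT(0x41200000) = 0x82 - 127 = 3
F32_HIGH_MANTISSA_BITS(0x41200000) = 0x20 = 32
F32_LOW_MANTISSA_BITS(0x41200000) = 0x0000
which matches the decomposition 10.0 = 2^3 * (1 + 32/128) = 2^3 * 1.25.
*/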
// TODO: use standard library constants, because these definitions seem to be
// platform dependent
float float_nan = 0.0 / 0.0;
float float_inf = 1.0e100 * 1.0e100;
// __ln_F[n] = ln(1.0 + n/128) for n = 0 .. 128
const float __ln_F[0x81] = {
0.0f, 0.0077821403f, 0.015504186f, 0.023167059f, 0.030771658f,
0.038318865f, 0.045809537f, 0.053244516f, 0.06062462f, 0.06795066f,
0.07522342f, 0.08244367f, 0.089612156f, 0.09672963f, 0.103796795f,
0.11081436f, 0.11778303f, 0.12470348f, 0.13157636f, 0.13840233f,
0.14518201f, 0.15191604f, 0.15860502f, 0.16524957f, 0.17185026f,
0.17840765f, 0.18492234f, 0.19139485f, 0.19782574f, 0.20421554f,
0.21056476f, 0.21687394f, 0.22314355f, 0.2293741f, 0.23556606f,
0.24171993f, 0.24783616f, 0.25391522f, 0.25995752f, 0.26596355f,
0.2719337f, 0.27786845f, 0.28376818f, 0.2896333f, 0.29546422f,
0.30126134f, 0.30702505f, 0.3127557f, 0.31845373f, 0.32411948f,
0.32975328f, 0.33535555f, 0.3409266f, 0.34646678f, 0.35197642f,
0.35745588f, 0.3629055f, 0.36832556f, 0.3737164f, 0.37907836f,
0.3844117f, 0.38971674f, 0.3949938f, 0.40024316f, 0.4054651f,
0.41065994f, 0.4158279f, 0.4209693f, 0.4260844f, 0.43117347f,
0.43623677f, 0.44127455f, 0.4462871f, 0.45127463f, 0.45623744f,
0.4611757f, 0.46608973f, 0.47097972f, 0.4758459f, 0.48068854f,
0.48550782f, 0.490304f, 0.49507725f, 0.49982786f, 0.504556f,
0.5092619f, 0.51394576f, 0.51860774f, 0.52324814f, 0.5278671f,
0.5324648f, 0.5370415f, 0.5415973f, 0.54613245f, 0.55064714f,
0.5551415f, 0.5596158f, 0.56407017f, 0.56850475f, 0.5729197f,
0.5773154f, 0.58169174f, 0.586049f, 0.59038746f, 0.59470713f,
0.5990082f, 0.60329086f, 0.60755527f, 0.61180156f, 0.61602986f,
0.6202404f, 0.6244333f, 0.62860864f, 0.63276666f, 0.63690746f,
0.6410312f, 0.64513797f, 0.649228f, 0.6533013f, 0.65735805f,
0.6613985f, 0.6654226f, 0.6694307f, 0.6734227f, 0.6773988f,
0.68135923f, 0.685304f, 0.6892333f, 0.6931472f,
}; /* const */
// __one_over_F[n] = 1.0 / (1.0 + n/128) for n = 0..128
const float __one_over_F[0x81] = {
1.0f, 0.99224806f, 0.9846154f, 0.97709924f, 0.969697f,
0.96240604f, 0.95522386f, 0.94814813f, 0.9411765f, 0.93430656f,
0.92753625f, 0.92086333f, 0.9142857f, 0.9078014f, 0.90140843f,
0.8951049f, 0.8888889f, 0.8827586f, 0.8767123f, 0.8707483f,
0.8648649f, 0.8590604f, 0.85333335f, 0.8476821f, 0.84210527f,
0.8366013f, 0.83116883f, 0.82580644f, 0.82051283f, 0.81528664f,
0.8101266f, 0.8050314f, 0.8f, 0.7950311f, 0.79012346f,
0.78527606f, 0.7804878f, 0.77575755f, 0.7710843f, 0.7664671f,
0.7619048f, 0.75739646f, 0.7529412f, 0.748538f, 0.74418604f,
0.7398844f, 0.7356322f, 0.73142856f, 0.72727275f, 0.72316384f,
0.71910113f, 0.7150838f, 0.7111111f, 0.70718235f, 0.7032967f,
0.69945353f, 0.6956522f, 0.6918919f, 0.68817204f, 0.684492f,
0.68085104f, 0.67724866f, 0.67368424f, 0.6701571f, 0.6666667f,
0.6632124f, 0.6597938f, 0.6564103f, 0.6530612f, 0.6497462f,
0.64646465f, 0.6432161f, 0.64f, 0.6368159f, 0.63366336f,
0.63054186f, 0.627451f, 0.62439024f, 0.6213592f, 0.6183575f,
0.61538464f, 0.61244017f, 0.60952383f, 0.6066351f, 0.6037736f,
0.600939f, 0.5981308f, 0.59534883f, 0.5925926f, 0.58986175f,
0.58715594f, 0.58447486f, 0.58181816f, 0.57918555f, 0.5765766f,
0.57399106f, 0.5714286f, 0.5688889f, 0.5663717f, 0.5638766f,
0.5614035f, 0.558952f, 0.5565217f, 0.55411255f, 0.55172414f,
0.5493562f, 0.5470086f, 0.54468083f, 0.5423729f, 0.54008436f,
0.53781515f, 0.53556484f, 0.53333336f, 0.53112036f, 0.5289256f,
0.52674896f, 0.52459013f, 0.52244896f, 0.5203252f, 0.51821864f,
0.516129f, 0.5140562f, 0.512f, 0.5099602f, 0.50793654f,
0.5059289f, 0.503937f, 0.50196075f, 0.5f,
}; /* const */
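/*
For reference (an illustrative sketch, not matched against anything): the two
tables above can be regenerated from their defining formulas
__ln_F[n] = ln(1 + n/128) and __one_over_F[n] = 1/(1 + n/128) with a small
standalone program like the following:

#include <stdio.h>
#include <math.h>

int main(void)
{
    int n;
    for (n = 0; n <= 0x80; n++) {
        float f = 1.0f + n / 128.0f;            // the table node 1.m for m = n
        printf("%d: ln = %.9gf, 1/x = %.9gf\n", // print both table entries
               n, (float) log(f), 1.0f / f);
    }
    return 0;
}
*/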
/*
Computes the natural logarithm of x.
Algorithm:
- Decompose `x = 2^E * 1.M`, where `E` is the exponent, `M` are the mantissa
bits and `1.M` denotes the binary number with integer part 1 and fractional
bits `M`
- Observe that `ln(x) = ln(2)*E + ln(1.M)`. It remains to compute `ln(1.M)`.
- Choose a 7-bit mantissa `m` such that `1.m` best approximates `1.M`. Details:
- Let `m` be the 7 most significant bits of `M`; then `1.m` already is a good
approximation. But when the 8th most significant bit of `M` is set, we
increment `m` by 1 to get an equal or better approximation (this rounds `1.M`
to the nearest multiple of 2^-7).
- Next, approximate `ln(1.M) = ln(y + h)` with `y = 1.m` and `h = 1.M - 1.m` using a
Taylor expansion at `y`. Maths detour:
- We have `ln(y + h) = ln(y) + h/y - (h/y)^2 / 2 + (h/y)^3 / 3 - ...`
- So for `H := h/y` we have `ln(y + h) = ln(y) + H + H^2 * (-0.5 + H/3 - ...)`
- The algorithm truncates this series and uses the approximation formula
`ln(1.M) = ln(y + h) = ln(y) + H + H^2 * (H*0.000005086 - 0.500003)`,
where -0.500003 plays the role of the -0.5 Taylor coefficient; how the cubic
constant was chosen is not understood.
- In the formula above, the algorithm reads `ln(y) = ln(1.m)` from the lookup
table `__ln_F`, and computes `H = h * 1/y = h * 1/1.m` using the second lookup
table `__one_over_F` for `1/1.m`. (A worked numerical example follows the
function below.)
*/
float logf(float x /*@f1*/)
{
u32 raw_x /*@r4*/ =
*(u32*) &x; // @8 implemented via: x@sp8 = x@f1; x@r4 = x@sp8
// @14 4:r0=0x7f80_0000; 10:r3=x@r4 & EXP_MASK; 14:cmpw r3,0x7f80_0000@r0
switch (raw_x & EXP_MASK) {
default: /*x is normal*/
{
float C[2] = { -0.500003, 0.000005086 }; //@28
// C[0] = 0xbf000030@(r2-0x1830=0x804DE1B0), stored in sp10 @28,30
// C[1] = 0x3eaaaa36@(r2-0x182C=0x804DE1B4), stored in sp14 @34,40
// decompose x = 2^E * 1.M in exponent E and mantissa M
s32 E = F32_UNBIASED_EXPONENT(raw_x);
s32 m = F32_HIGH_MANTISSA_BITS(
raw_x); // @4c the 7 most significant mantissa bits
float H;
if (F32_LOW_MANTISSA_BITS(raw_x)) // condition @28, if @50
{
//@54
u32 raw_fm = (raw_x & 0x7F0000) | EXP_ZERO; // = 1.m
u32 raw_fM = (raw_x & MANT_MASK) | EXP_ZERO; // = 1.M
// float fm, fM, h;
// is the 8th most significant mantissa bit set?
if (raw_x & 0x8000) // condition @34, if @6c
{
//@70
raw_fm += 0x10000; // @74
m += 1; // @70,@78,@7c
}
//@80
// fM = F32_BIT_CAST(raw_fM); // @84 result fM@f2
// fm = F32_BIT_CAST(raw_fm); // @88 result fm@f0
// h = fM - fm; // @a0 result h@f6
// reciprocal_table_803B9154e1 base pointer: @80,90
// load table item into f0 with index m@r8: @8c,98,a4, result in f0
// H = h * reciprocal_table_803B9154[m]; // = h / 1.m code @b4,
// result in H@f6
H = (F32_BIT_CAST(raw_fM) - F32_BIT_CAST(raw_fm)) *
__one_over_F[m];
// return ln(2)*E + ln(1.m) + H + H*H * (H * 0.000005086 - 0.500003),
// i.e. the truncated Taylor series approximation described above.
// code positions:
// convert the E@r7 integer to float@f3: @9c,ac,b0,bc,c4,c8,dc
// lookup __ln_F[m@r4{index already multiplied by 4}]: @a8,c0,cc,e0, result in f1
// load 0.6931472f = 0x3f317218@(r2-0x1820=0x804DE1C0) into f5: @d4
// compute 0.6931472f*E + __ln_F[m]: @e8
return LN2 * E + __ln_F[m] + H + H * H * (H * C[1] + C[0]);
}
return LN2 * E + __ln_F[m]; //@f8
}
case EXP_MASK: //@134
{
if (raw_x & MANT_MASK) {
// @13c
return x; // x is nan
} else {
// @144
return (raw_x & SIGN_BIT) ? /*x=-inf*/ float_nan
: /*x=inf*/ float_inf;
}
}
case 0: /* @15c x is subnormal or zero*/
return -float_inf;
}
}
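/*
Worked example of the algorithm above (for illustration, values rounded):
logf(1.3f):
- 1.3f has the raw bit pattern 0x3FA66666, so E = 0 and M = 0x266666
- m = 0x26 = 38; the low 16 mantissa bits are 0x6666 (nonzero) and bit 0x8000
  is clear, so m is not incremented; hence 1.m = 1 + 38/128 = 1.296875
- h = 1.M - 1.m = 1.2999999 - 1.296875 ~ 0.0031250
- H = h * __one_over_F[38] = 0.0031250 * 0.7710843 ~ 0.0024096
- result = 0.6931472*0 + __ln_F[38] + H + H*H*(H*0.000005086 - 0.500003)
  = 0.25995752 + 0.0024096 - 0.0000029 ~ 0.2623642
which agrees with ln(1.3) = 0.2623643 up to float rounding.
*/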
/*
matching the `__one_over_F[m]` lookup (currently line 72 in the C code above):
a comparison of the target assembly with the assembly generated from my C code
target code:
r8 = array index
80: lis r3,lbl_803B9154@ha // load array base pointer 0x803C0000-28848
8c: rlwinm r4,r8,2,0,0x1d // multiply index by 4 = sizeof(float)
90: addi r0,r3,lbl_803B9154@l
98: add r3,r0,r4 // r3 = base_pointer + 4*index
a4: lfs f1,0(r3) // deref the pointer
a8: lis r3, 0x803C // load array base pointer 0x803C0000-28848
c0: subi r0, r3, 28848
cc: add r3,r0,r4 // r3 = base_pointer + 4*index
e0: lfs f1,0(r3) // deref the pointer
my code:
r8 = array index
4: lis r3,...data.0@ha // load array base pointer
10: addi r5,r3,...data.0@l
80: rlwinm r4,r8,2,0,0x1d // multiply index by 4 = sizeof(float)
90: add r3,r5,r4 // r3 = base_pointer + 4*index
a0: lfs f3, 0x204(r3) // load __one_over_F[m]; 0x204 = 0x81 * sizeof(float) = sizeof(__ln_F), i.e. __one_over_F is addressed relative to the __ln_F base
ac: lfsx f0,r5,r4 // load float32 from address r5+r4, = __ln_F[m]
*/