-
Notifications
You must be signed in to change notification settings - Fork 0
/
running_sum_array.s
253 lines (189 loc) · 4.28 KB
/
running_sum_array.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
.data
# Input values as 32-bit words holding IEEE-754 single-precision bit patterns
# (0x411d0000 = 9.8125, 0xc2c80000 = -100.0, 0x426b0000 = 58.75).
# main overwrites each word in place with its bf16 conversion.
array: .word 0x411d0000, 0xc2c80000, 0x426b0000
# Number of elements in `array`; stored in the word directly after the array.
length: .word 3
# String printed between output values.
print_newline: .string "\n"
.text
#-----------------------------------------------------------------------
# main
# Converts every word of `array` from an fp32 bit pattern to bf16 (in
# place), replaces the array with its running (prefix) sum computed with
# bf16_add, prints each element in hex separated by newlines, and exits.
#
# Register roles:
#   s0 = &array[0]
#   s2 = element count (loaded from `length`)
#   s3 = &array[length] (one past the last element)
#   s1 = cursor into the array
# Callee conventions used here:
#   fp32_to_bf16 : in a0, out a1
#   bf16_add     : in t0, t1, out t1
# Environment calls (a7): 34, 4 and 10 -- assumed to be the RARS/Ripes
# "print int as hex", "print string" and "exit" services.
#-----------------------------------------------------------------------
main:
    la      s0, array               # s0 = base of the array
    lw      s2, length              # s2 = number of elements
    slli    s3, s2, 2               # byte size of the array
    add     s3, s0, s3              # s3 = one-past-the-end pointer

#################### convert fp32 to bf16 ####################
    mv      s1, s0
main_convert_loop:
    bgeu    s1, s3, main_convert_done
    lw      a0, 0(s1)
    jal     ra, fp32_to_bf16
    sw      a1, 0(s1)               # overwrite element with its bf16 form
    addi    s1, s1, 4
    j       main_convert_loop
main_convert_done:

#################### do running sum ####################
    # array[i] = array[i-1] + array[i] for i = 1 .. length-1.
    # The loop is bounded by s3 so it never reads or writes past the
    # last element (the word after the array is `length`).
    addi    s1, s0, 4
main_sum_loop:
    bgeu    s1, s3, main_sum_done
    lw      t0, -4(s1)              # running sum so far
    lw      t1, 0(s1)               # next element
    jal     ra, bf16_add
    sw      t1, 0(s1)
    addi    s1, s1, 4
    j       main_sum_loop
main_sum_done:

#################### print and finish ####################
    mv      s1, s0
main_print_loop:
    bgeu    s1, s3, main_print_done
    beq     s1, s0, main_print_value    # no separator before the first value
    la      a0, print_newline
    li      a7, 4
    ecall
main_print_value:
    lw      a0, 0(s1)
    li      a7, 34
    ecall
    addi    s1, s1, 4
    j       main_print_loop
main_print_done:
    li      a7, 10
    ecall
#-----------------------------------------------------------------------
# fp32_to_bf16
# Converts an IEEE-754 single-precision bit pattern to bfloat16.
#   In:    a0 = fp32 bit pattern
#   Out:   a1 = bf16 bit pattern in the low 16 bits
#   Clobb: t0, t1, t2
# NaN inputs are truncated and bit 6 of the result is forced to 1.
# All other inputs are rounded to nearest, ties to even, by adding
# 0x7fff plus the lowest kept bit before dropping the low 16 bits.
#
# This is a leaf routine that never modifies ra or a0, so it uses no
# stack frame; both exit paths therefore leave sp exactly as on entry.
#-----------------------------------------------------------------------
fp32_to_bf16:
    mv      t0, a0                  # t0 = x
    li      t1, 0x7fffffff
    and     t1, t0, t1              # t1 = x with the sign bit cleared
    li      t2, 0x7f800000          # bit pattern of +Inf
    bge     t2, t1, Else            # |x| <= Inf: not a NaN, go round
    srli    t1, t0, 16              # NaN: keep the upper half ...
    ori     t1, t1, 64              # ... and force mantissa bit 6 on
    j       Exit
Else:
    srli    t1, t0, 16
    andi    t1, t1, 1               # t1 = lowest bit that will be kept
    li      t2, 0x7fff
    add     t1, t1, t2              # rounding bias = 0x7fff + kept lsb
    add     t1, t1, t0              # x + bias
    srli    t1, t1, 16              # drop the low 16 bits
Exit:
    mv      a1, t1                  # return value
    ret
#-----------------------------------------------------------------------
# clz
# Counts the leading zero bits of a 16-bit value.
#   In:    t0 = value (only the low 16 bits are meaningful to the
#               smear/popcount masks used below)
#   Out:   t1 = 16 - popcount(smeared value), i.e. the number of leading
#               zeros within 16 bits (16 when t0 is 0)
#   Preserved: t0, t2 (saved and restored as full 32-bit words)
#   Clobb: t1
# Method: OR the value with itself shifted right by 1, 2, 4 and 8 so that
# every bit below the highest set bit becomes 1, count the set bits with
# the usual parallel popcount, and subtract the count from 16.
#-----------------------------------------------------------------------
clz:
    addi    sp, sp, -8
    sw      t0, 0(sp)               # full-word saves: halfword save/restore
    sw      t2, 4(sp)               # would truncate and sign-extend
    # x |= (x >> 1)
    srli    t1, t0, 1
    or      t0, t0, t1
    # x |= (x >> 2)
    srli    t1, t0, 2
    or      t0, t0, t1
    # x |= (x >> 4)
    srli    t1, t0, 4
    or      t0, t0, t1
    # x |= (x >> 8)
    srli    t1, t0, 8
    or      t0, t0, t1
    # x -= ((x >> 1) & 0x5555)
    srli    t1, t0, 1
    li      t2, 0x5555
    and     t1, t1, t2
    sub     t0, t0, t1
    # x = ((x >> 2) & 0x3333) + (x & 0x3333)
    srli    t1, t0, 2
    li      t2, 0x3333
    and     t1, t1, t2
    and     t0, t0, t2
    add     t0, t0, t1
    # x = ((x >> 4) + x) & 0x0f0f
    srli    t1, t0, 4
    add     t0, t0, t1
    li      t2, 0x0f0f
    and     t0, t0, t2
    # x += (x >> 8)
    srli    t1, t0, 8
    add     t0, t0, t1
    # return (16 - (x & 0x7f))
    andi    t0, t0, 0x7f
    li      t1, 16
    sub     t1, t1, t0
    # restore the caller's registers and release the frame
    lw      t0, 0(sp)
    lw      t2, 4(sp)
    addi    sp, sp, 8
    ret
#-----------------------------------------------------------------------
# bf16_add
# Adds two bfloat16 values.
#   In:    t0, t1 = operands (bf16 bit pattern in the low 16 bits)
#   Out:   t1     = sum (bf16 bit pattern in the low 16 bits)
#   Clobb: t0, t2, t3, t4, t5
#   Calls: clz (in t0, out t1; t3 must survive the call)
#   Stack: 4 bytes, used to keep ra across the call
#
# Steps:
#   1. Order the operands so t0 holds the one with the larger magnitude
#      (comparison of the low 15 bits). That operand supplies the result
#      sign and exponent, and the mantissa subtraction below can never
#      go negative.
#   2. If the smaller operand has magnitude 0, return the larger one.
#   3. Align the smaller mantissa, add or subtract, and renormalize with
#      clz so the leading 1 sits at bit 7 again.
#   4. If the mantissas cancel exactly, return 0.
# The shifted-out bits are discarded (no rounding). Inf, NaN, subnormal
# inputs and exponent overflow/underflow of the result get no special
# handling.
#-----------------------------------------------------------------------
bf16_add:
    addi    sp, sp, -4
    sw      ra, 0(sp)

    # --- order by magnitude: t2 = |t0|, t3 = |t1| (sign bit masked off)
    li      t5, 0x7fff
    and     t2, t0, t5
    and     t3, t1, t5
    bgeu    t2, t3, bf16_add_ordered
    mv      t4, t0                  # swap the operands ...
    mv      t0, t1
    mv      t1, t4
    mv      t4, t2                  # ... and their magnitudes
    mv      t2, t3
    mv      t3, t4
bf16_add_ordered:
    # --- x + (+/-0) = x
    bnez    t3, bf16_add_unpack
    mv      t1, t0
    j       bf16_add_done

bf16_add_unpack:
    srli    t2, t2, 7               # t2 = exponent of the larger operand
    srli    t3, t3, 7               # t3 = exponent of the smaller operand
    sub     t4, t2, t3              # t4 = alignment shift, always >= 0
    li      t3, 31
    bgeu    t3, t4, cal
    mv      t4, t3                  # clamp: srl only uses the low 5 bits
cal:
    # t3 = sign of the result (sign of the larger operand)
    srli    t3, t0, 15
    andi    t3, t3, 1
    # t5 = 0 -> add, t5 = 1 -> sub (signs differ)
    xor     t5, t0, t1
    srli    t5, t5, 15
    andi    t5, t5, 1
    # t0 = mantissa of the larger operand with the implicit leading 1
    andi    t0, t0, 127
    ori     t0, t0, 128
    # t1 = mantissa of the smaller operand, aligned to the larger exponent
    andi    t1, t1, 127
    ori     t1, t1, 128
    srl     t1, t1, t4
    # decide add or sub
    beqz    t5, add_operation
    sub     t0, t0, t1
    beqz    t0, bf16_add_zero       # exact cancellation
    j       normalize
add_operation:
    add     t0, t0, t1
normalize:
    # t1 = leading zeros of the mantissa within 16 bits; 8 means the
    # leading 1 is already at bit 7
    call    clz
    addi    t1, t1, -8              # > 0: shift left, < 0: shift right
    sub     t2, t2, t1              # adjust the exponent to match
    blt     t1, zero, bf16_add_shift_right
    sll     t0, t0, t1
    j       finish
bf16_add_shift_right:
    sub     t1, zero, t1            # t1 = |t1|
    srl     t0, t0, t1
finish:
    andi    t0, t0, 127             # drop the implicit leading 1
    slli    t3, t3, 15
    slli    t2, t2, 7
    or      t1, t0, t2
    or      t1, t1, t3              # sign | exponent | mantissa
bf16_add_done:
    lw      ra, 0(sp)
    addi    sp, sp, 4
    ret
bf16_add_zero:
    li      t1, 0                   # x + (-x) = +0
    j       bf16_add_done