-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathquadratic.asm
1589 lines (1530 loc) · 62.4 KB
/
quadratic.asm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
; * This is the source code for the program used in
; https://powdertoy.co.uk/Browse/View.html?ID=2303519
; * This program makes use of the stack pointer, so it is first initialised to 0
; and is rarely (never?) written to explicitly.
; * The calling convention used in this program is as follows:
; * r0 through r9 are saved by the caller,
; * r10 through r13 are saved by the callee,
; * arguments are passed and results are returned in caller-saved registers.
; * Denormals are not supported right now, and I'm not sure how difficult it'd
; be to add support. Probably very difficult so I pass.
;
; * Some comments only make sense if you realise that this code was written by a
; living, breathing person. Yes, I'm one, however difficult that may be to
; believe.
;
; -- LBPHacker
%include "common"
; * Entry point.
; * It's nothing special, it's not like the assembler looks for it or anything,
; it's just what's at address 0 so it's the entry point.
; * I wasn't even alive back when this was the norm.
start:
; * One-time initialisation: set up the stack, reset the terminal and print the
; formula. Execution then falls through into .demo, which loops forever.
mov sp, 0 ; * Initialise stack pointer.
mov r10, 0 ; * r10 holds the address of the port
; the terminal is connected to.
bump r10 ; * Reset terminal.
send r10, 0x1012 ; * Presumably a cursor position code; 0x10xx
; values are used as cursor positions
; elsewhere in this file.
mov r0, .string_formula
call write_string ; * Print formula for fun.
; * One round of the demo: read A, B and C, then solve A*X**2+B*X+C=0.
; * This part handles the case of two distinct real roots and branches off to
; .demo_emit_linear, .demo_emit_single or .demo_emit_complex for the others.
; * Judging from their use here, the float_* routines take their operands in
; r0_32 and r2_32 and return their result in r0_32, while float_to_string
; takes the buffer in r0 and the number in r2_32.
; * r12 and r13 are used across calls, relying on them being callee-saved.
.demo:
mov r12, .inputdata_prompt ; * r12 walks the table of prompt records.
.inputdata_loop:
mov r0, r12
call write_string ; * Print prompts, one for each of A, B and C.
mov r0, 14
call clear_continuous ; * Clear previous input.
mov r0, global_str_buf
mov r1, 14 ; * Buffer size for read_string.
mov r7, 0x200F ; * Default cursor colour for read_string.
mov r11, [r12] ; * The first cell of a prompt record is its
add r11, 2 ; cursor position code; the input field starts
; two cells further, right after e.g. "A=".
call read_string ; * Read string.
mov r0, global_str_buf
call float_from_string ; * Convert to float.
test r1, r1
jnz .inputdata_loop ; * Try again if the conversion failed.
push r3 ; * Push number to stack.
push r2
add r12, 6 ; * Every prompt record is 6 cells long.
test [r12], 0xFFFF
jnz .inputdata_loop ; * Keep looping while there are prompts left; the
; table ends with a single 0 cell.
; * At this point the stack is ($, A, B, C) (where
; $ is the bottom of the stack in this context).
; * Every float takes two cells with the low word
; at the lower offset, so C is at [sp+0], B is
; at [sp+2] and A is at [sp+4].
send r10, 0x2080
send r10, 0x1072
mov r0, .working_string
call write_string ; * Draw empty progress bar.
mov r13, 0x1072 ; * Prepare for bumping the progress bar
mov r12, .working_string ; continuously throughout the calculation.
mov r0, [sp+4] ; * Move A into r0_32.
mov r1, [sp+5]
mov r2, r1 ; * Check if A is 0, ...
and r2, 0x7FFF ; (the sign bit is masked off so that -0 counts)
or r2, r0
jz .demo_emit_linear ; ... branch off if it is.
pop r2 ; * Pop C, stack is ($, A, B).
pop r3
call float_multiply ; * Multiply A and C, yielding AC.
call .bump_progress_bar
mov r2, 0x0000 ; * This is 4.0.
mov r3, 0x4080
call float_multiply ; * Multiply AC and 4.0, yielding 4AC.
push r1
push r0 ; * Push 4AC back, stack is ($, A, B, 4AC).
call .bump_progress_bar
mov r0, [sp+4] ; * Move A into r0_32.
mov r1, [sp+5]
mov r2, 0x0000 ; * This is 2.0.
mov r3, 0x4000
call float_multiply ; * Multiply A and 2.0, yielding 2A.
mov [sp+4], r0
mov [sp+5], r1 ; * Write 2A back, stack is ($, 2A, B, 4AC).
call .bump_progress_bar
mov r0, [sp+2] ; * Move B into r0_32.
mov r1, [sp+3]
mov r2, 0x8000
xor [sp+3], r2 ; * The old B on the stack becomes -B. Stack is
mov r2, r0 ; ($, 2A, -B, 4AC).
mov r3, r1 ; * Copy B into r2_32.
call float_multiply ; * Multiply B with itself, yielding B**2.
call .bump_progress_bar
pop r2
pop r3 ; * Pop 4AC into r2_32, stack is ($, 2A, -B).
call float_subtract ; * Subtract 4AC from B**2, yielding B**2-4AC.
call .bump_progress_bar
mov r2, r1 ; * Check if B**2-4AC is 0, ...
and r2, 0x7FFF
or r2, r0
jz .demo_emit_single ; ... branch off if it is.
test r1, 0x8000 ; * Check if B**2-4AC is negative,
jnz .demo_emit_complex ; branch off if it is.
call float_sqrt
push r1 ; * Calculate sqrt(B**2-4AC) and push it onto the
push r0 ; stack, stack is ($, 2A, -B, sqrt(B**2-4AC)).
call .bump_progress_bar
mov r0, [sp+2] ; * Move -B into r0_32.
mov r1, [sp+3]
mov r2, [sp+0] ; * Move sqrt(B**2-4AC) into r2_32.
mov r3, [sp+1]
call float_subtract ; * Subtract sqrt(B**2-4AC) from -B, push the
push r1 ; result back, stack is ($, 2A,
push r0 ; -B, sqrt(B**2-4AC), -B-sqrt(B**2-4AC)).
call .bump_progress_bar
mov r0, [sp+4] ; * Move -B into r0_32.
mov r1, [sp+5]
mov r2, [sp+2] ; * Move sqrt(B**2-4AC) into r2_32.
mov r3, [sp+3]
call float_add ; * Add sqrt(B**2-4AC) to -B, update the old -B on
mov [sp+4], r0 ; the stack, stack is ($, 2A, -B+sqrt(B**2-4AC),
mov [sp+5], r1 ; sqrt(B**2-4AC), -B-sqrt(B**2-4AC)).
call .bump_progress_bar
mov r2, [sp+6] ; * Move 2A into r2_32.
mov r3, [sp+7]
pop r0 ; * Pop -B-sqrt(B**2-4AC) into r0_32, stack is
pop r1 ; ($, 2A, -B+sqrt(B**2-4AC), sqrt(B**2-4AC)).
call float_divide ; * Divide...
call .bump_progress_bar
mov r2, r0
mov r3, r1
mov r0, global_str_buf
call float_to_string ; ... and then convert,...
send r10, 0x1090
mov r0, .string_x1
call write_string ; ... then tell the user what we're printing,...
mov r0, 13
call clear_continuous ; ... then clear the previous solution...
send r10, 0x1093
mov r0, global_str_buf
call write_string ; ... and print the current solution.
call .bump_progress_bar
mov r0, [sp+2] ; * Move -B+sqrt(B**2-4AC) into r0_32.
mov r1, [sp+3]
mov r2, [sp+4] ; * Move 2A into r2_32.
mov r3, [sp+5]
call float_divide ; * Divide...
call .bump_progress_bar
mov r2, r0
mov r3, r1
mov r0, global_str_buf
call float_to_string ; ... and then convert,...
send r10, 0x10A0
mov r0, .string_x2
call write_string ; ... then tell the user what we're printing,...
mov r0, 13
call clear_continuous ; ... then clear the previous solution...
send r10, 0x10A3
mov r0, global_str_buf
call write_string ; ... and print the current solution.
call .bump_progress_bar
add sp, 6 ; * Pop everything, stack is ($).
jmp .demo_wrapup ; * Then branch off.
; * Complex root case, entered when B**2-4AC is negative.
; * On entry r0_32 is B**2-4AC and the stack is ($, 2A, -B).
; * Prints the shared real part -B/2A on one line and the magnitude of the
; imaginary part, sqrt(4AC-B**2)/2A, followed by an i on the next.
.demo_emit_complex:
call .bump_progress_bar ; * We can skip a few steps here.
call .bump_progress_bar ; * Reminder: stack is ($, 2A, -B).
xor r1, 0x8000 ; * r0_32 is B**2-4AC, and it's negative. Make
call float_sqrt ; it positive and take the square root.
push r1
push r0 ; * Push it back,
call .bump_progress_bar ; stack is ($, 2A, -B, sqrt(4AC-B**2)).
mov r0, [sp+2] ; * Move -B into r0_32.
mov r1, [sp+3]
mov r2, [sp+4] ; * Move 2A into r2_32.
mov r3, [sp+5]
call float_divide ; * Divide -B by 2A, yielding -B/2A, the real part
call .bump_progress_bar ; of both solutions.
mov r2, r0
mov r3, r1
mov r0, global_str_buf
call float_to_string ; * And then convert,...
send r10, 0x1090
mov r0, .string_xc
call write_string ; ... then tell the user what we're printing,...
mov r0, 14
call clear_continuous ; ... then clear the previous solution...
send r10, 0x1092
mov r0, global_str_buf
call write_string ; ... and print the real part.
call .bump_progress_bar
pop r0 ; * Pop sqrt(4AC-B**2) into r0_32,
pop r1 ; stack is ($, 2A, -B).
mov r2, [sp+2] ; * Move 2A into r2_32.
mov r3, [sp+3]
call float_divide ; * Divide...
call .bump_progress_bar
mov r2, r0
mov r3, r1
mov r0, global_str_buf
call float_to_string ; ... and then convert,...
send r10, 0x10A0
mov r0, .string_xpm
call write_string ; ... then tell the user what we're printing,...
mov r0, 14
call clear_continuous ; ... then clear the previous solution...
send r10, 0x10A2
mov r0, global_str_buf
call write_string ; ... and print the imaginary part of both
send r10, 'i' ; solutions with an i after it.
call .bump_progress_bar
add sp, 4 ; * Pop everything, stack is ($).
jmp .demo_wrapup ; * Then branch off.
; * Linear case, entered when A is 0, before any progress has been shown.
; * On entry the stack is ($, A, B, C).
; * Prints the single solution X = C/(-B), or branches off to
; .demo_emit_constant if B is 0 too.
.demo_emit_linear:
call .bump_progress_bar ; * We can skip a few steps here.
call .bump_progress_bar
call .bump_progress_bar
call .bump_progress_bar
call .bump_progress_bar
call .bump_progress_bar
call .bump_progress_bar
call .bump_progress_bar
call .bump_progress_bar
call .bump_progress_bar ; * Reminder: stack is ($, A, B, C).
pop r0
pop r1 ; * Pop C into r0_32, stack is ($, A, B).
pop r2
pop r3 ; * Pop B into r2_32, stack is ($, A).
mov r4, r3 ; * Check if B is 0, ...
and r4, 0x7FFF
or r4, r2
jz .demo_emit_constant ; ... branch off if it is.
xor r3, 0x8000 ; * Negate B, yielding -B.
call float_divide ; * Divide C by -B...
call .bump_progress_bar
mov r2, r0
mov r3, r1
mov r0, global_str_buf
call float_to_string ; ... and then convert,...
send r10, 0x1090
mov r0, .string_xc
call write_string ; ... then tell the user what we're printing,...
mov r0, 14
call clear_continuous ; ... then clear the previous solution...
send r10, 0x1092
mov r0, global_str_buf
call write_string ; ... and print the only solution.
send r10, 0x10A0
mov r0, .string_xl
call write_string ; * State that this is the linear equation case.
call .bump_progress_bar
add sp, 2 ; * Pop everything, stack is ($).
jmp .demo_wrapup ; * Then branch off.
; * Constant case, entered from .demo_emit_linear when both A and B are 0.
; * On entry r0_32 is C and the stack is ($, A).
; * Prints the zeroes case message if C is 0 too and the no solution message
; otherwise.
.demo_emit_constant:
call .bump_progress_bar
mov r4, r1 ; * Check if C is 0, ...
and r4, 0x7FFF
or r4, r0
jz .demo_emit_zeroes ; ... it's the all-zeroes case if it is ...
mov r0, .string_xn ; ... and the no solution case otherwise.
jmp .demo_emit_zeroes_or_none
.demo_emit_zeroes:
mov r0, .string_xz
.demo_emit_zeroes_or_none:
send r10, 0x1090
call write_string ; * State which of the two cases this is.
send r10, 0x10A0
mov r0, 16
call clear_continuous ; * Clear the previous solution.
call .bump_progress_bar ; * Reminder: stack is ($, A).
add sp, 2 ; * Pop everything, stack is ($).
jmp .demo_wrapup ; * Then branch off.
; * Double root case, entered when B**2-4AC is 0.
; * On entry the stack is ($, 2A, -B).
; * Prints the single solution X = -B/2A and then falls through into
; .demo_wrapup, so this block has to stay right in front of it.
.demo_emit_single:
call .bump_progress_bar ; * We can skip a few steps here.
call .bump_progress_bar
call .bump_progress_bar
call .bump_progress_bar
call .bump_progress_bar ; * Reminder: stack is ($, 2A, -B).
pop r0
pop r1 ; * Pop -B into r0_32, stack is ($, 2A).
pop r2
pop r3 ; * Pop 2A into r2_32, stack is ($).
call float_divide ; * Divide...
call .bump_progress_bar
mov r2, r0
mov r3, r1
mov r0, global_str_buf
call float_to_string ; ... and then convert,...
send r10, 0x1090
mov r0, .string_xc
call write_string ; ... then tell the user what we're printing,...
mov r0, 14
call clear_continuous ; ... then clear the previous solution...
send r10, 0x1092
mov r0, global_str_buf
call write_string ; ... and print the only solution.
send r10, 0x10A0
mov r0, .string_xs
call write_string ; * State that this is the single solution case.
call .bump_progress_bar
; * Common tail of every case: show a Press any key message, wait for a key
; while blinking a cursor, erase the message and start another round.
; * The stack is expected to be ($) on entry.
.demo_wrapup:
send r10, 0x2003
send r10, 0x1071
mov r0, .press_any_key_string
call write_string ; * Display nice Press any key message.
bump r10 ; * Drop whatever is in the input buffer.
mov r6, 0x2003 ; * Cursor colour, same as that of the message.
mov r11, 0x107E ; * Cursor position for read_character_blink.
call read_character_blink ; * Wait for a key press while blinking.
send r10, 0x2000
send r10, 0x1071
mov r0, .press_any_key_string
call write_string ; * Clear Press any key message by printing it
; again with colour code 0x2000.
send r10, 32 ; * One more space; presumably this erases the
; cursor left behind right after the message.
jmp .demo ; * Start over.
; * Advances the progress bar by one cell by reprinting the next character of
; the progress bar string with colour code 0x20F0.
; * r10 is terminal port address.
; * r13 is the position code of the next cell to repaint, r12 points to the
; character that belongs there. Both are incremented.
.bump_progress_bar:
send r10, 0x20F0 ; * Select the colour of the filled part.
send r10, r13 ; * Move to the cell to repaint ...
send r10, [r12] ; ... and print its character again.
add r13, 1 ; * Step both the position and the character
add r12, 1 ; pointer to the next cell.
ret
; * Strings and tables used by the demo. All strings are zero-terminated and
; are sent to the terminal cell by cell by write_string, so they may contain
; colour codes (0x20xx) and cursor position codes (0x10xx) between the
; characters.
.string_formula:
dw 0x200C, "A", 0x200F, "X**2+"
dw 0x200E, "B", 0x200F, "X+"
dw 0x200A, "C", 0x200F, "=0", 0
; * Table of prompt records, one for each of A, B and C. Every record is a
; 6 cell string (position, colour, letter, colour, "=", 0), the table itself
; is terminated by a 0 cell. The demo relies on both the record size and on
; the first cell being the position code.
.inputdata_prompt:
dw 0x1030, 0x200C, "A", 0x200F, "=", 0
dw 0x1040, 0x200E, "B", 0x200F, "=", 0
dw 0x1050, 0x200A, "C", 0x200F, "=", 0
dw 0
.string_x1:
dw 0x200F, "X1=", 0
.string_x2:
dw 0x200F, "X2=", 0
.string_xc:
dw 0x200F, "X=", 0
; * Prefix of the imaginary part; 0xB5 is presumably the plus-minus sign in the
; character set of the terminal.
.string_xpm:
dw 0x200F, " ", 0xB5, 0
.string_xs:
dw 0x2007, " (double root) ", 0
.string_xn:
dw 0x2007, " (no solution) ", 0
.string_xz:
dw 0x2007, " (zeroes case) ", 0
.string_xl:
dw 0x2007, " (linear case) ", 0
; * The progress bar. It's 12 characters long and .bump_progress_bar is called
; exactly 12 times on every path through the calculation.
.working_string:
dw " Hold on... ", 0
.press_any_key_string:
dw "Press any key", 0
global_str_buf:
dw "              " ; * Global string buffer for use with functions
; that operate on strings. 14 cells, so it
; holds 13 characters and a terminating zero.
; Don't worry, it's thread-safe (that's a
; joke, there are no threads here).
; * Reads a single character from the terminal, blocking until one arrives.
; * Character code is returned in r0.
; * r10 is terminal port address.
; * r3 is overwritten by the wait.
read_character:
.wait_loop:
wait r3 ; * Wait for a bump. r3 should be checked but
; as in this demo there's no other peripheral,
; it's fine this way.
js .wait_loop ; * The sign flag stays set until a bump arrives.
bump r10 ; * Ask for character code.
.recv_loop:
recv r0, r10 ; * Receive character code.
jnc .recv_loop ; * The carry bit is set if something is received.
ret
; * Sends spaces to the terminal.
; * r0 holds the number of spaces to send. It is 0 on return.
; * r10 is terminal port address.
; * A count of 0 sends nothing.
clear_continuous:
test r0, r0 ; * Bail out on a count of 0. Without this check
jz .done ; the counter would wrap around in the loop
; below and 65536 spaces would be sent.
.loop:
send r10, 32
sub r0, 1
jnz .loop
.done:
ret
; * Reads a single character from the terminal while blinking a cursor.
; * r6 is cursor colour.
; * r10 is terminal port address.
; * r11 is cursor position.
; * Character read is returned in r3.
; * r2 and r4 are overwritten. The cursor is left on screen in whatever state
; it was in when the key press arrived, it's up to the caller to clean up.
read_character_blink:
mov r4, 0x7F ; * r4 holds the current cursor character.
mov r2, 8 ; * r2 is the counter for the blink loop.
send r10, r6
send r10, r11
send r10, r4 ; * Display cursor.
.wait_loop:
wait r3 ; * Wait for a bump. r3 should be checked but
; as in this demo there's no other peripheral,
; it's fine this way.
jns .got_bump ; * The sign flag is cleared if a bump arrives.
sub r2, 1
jnz .wait_loop ; * Back to waiting if it's not time to blink yet.
xor r4, 0x5F ; * Turn a 0x20 into a 0x7F or vice versa.
send r10, r6 ; Those are ' ' and a box, respectively.
send r10, r11
send r10, r4 ; * Display cursor.
mov r2, 8 ; * Restart the blink counter.
jmp .wait_loop ; * Back to waiting, unconditionally this time.
.got_bump:
bump r10 ; * Ask for character code.
.recv_loop:
recv r3, r10 ; * Receive character code.
jnc .recv_loop ; * The carry bit is set if something is received.
ret
; * Reads zero-terminated strings from the terminal.
; * r0 points to buffer to read into and r1 is the size of the buffer,
; including the zero that terminates the string. If you have a 15 cell
; buffer, do pass 15 in r1, but expect only 14 characters to be read at most.
; * r7 is the default cursor colour (the one used when the buffer is not about
; to overflow; when it is, the cursor changes to yellow, 0x200E).
; * r10 is terminal port address.
; * r11 is cursor position.
; * On return r1 holds the number of characters read, not counting the
; terminating zero, and r11 has followed the cursor.
; * r2 through r6 are overwritten, r2 through r4 by read_character_blink.
; * Reading ends with the Return key (13); Backspace (8) deletes the last
; character; anything else is stored as is.
; * NOTE(review): a size of 0 in r1 makes the character limit wrap around to
; 0xFFFF, so callers presumably have to pass at least 1 -- confirm.
read_string:
bump r10 ; * Drop whatever is in the input buffer.
mov r5, r1
sub r5, 1 ; * The size of the buffer includes the
; terminating zero, so the character limit
; should be one less than this size.
mov r6, r7 ; * Reset the default cursor colour.
mov r1, 0 ; * r1 holds the number of characters read.
.read_character:
call read_character_blink
cmp r3, 13 ; * Check for the Return key.
je .got_return
cmp r3, 8 ; * Check for the Backspace key.
je .got_backspace
cmp r5, r1 ; * Check if whatever else we got fits the buffer.
je .read_character ; * Silently ignore it if it doesn't.
send r10, r11 ; * If it does, display it and add it to the
send r10, r3 ; buffer.
add r11, 1
mov [r0+r1], r3
add r1, 1
cmp r5, r1
ja .read_character ; * Change cursor colour to yellow if the buffer
mov r6, 0x200E ; is full.
jmp .read_character ; * Back to waiting.
.got_backspace:
cmp r1, 0 ; * Only delete a character if there is at least
je .read_character ; one to delete.
mov r6, r7 ; * Reset the default cursor colour.
send r10, r11
send r10, 0x20 ; * Clear the previous position of the cursor.
sub r11, 1 ; * The cursor drawn by the next
sub r1, 1 ; read_character_blink call covers the deleted
; character on screen.
jmp .read_character ; * Back to waiting.
.got_return:
send r10, r11
send r10, 0x20 ; * Clear the previous position of the cursor.
mov [r0+r1], 0 ; * Terminate string explicitly.
ret
; * Sends a zero-terminated string to the terminal, one cell at a time.
; * r0 points to the first cell of the string. On return it points to the
; terminating zero.
; * r10 is terminal port address.
; * r11 grows by the number of cells sent. Colour and cursor codes embedded in
; the string are counted too, so the result is only a valid cursor position
; for plain text.
; * r1 and r2 are overwritten.
write_string:
mov r2, r0 ; * Remember where the string started.
jmp .loop ; * Enter the loop at the fetch.
.emit:
send r10, r1 ; * Send the cell fetched below and step over it.
add r0, 1
.loop:
mov r1, [r0] ; * Fetch the next cell. The branch below acts on
jnz .emit ; the zero flag left by this mov.
.exit:
add r11, r0 ; * r11 += r0 - r2, the number of cells sent.
sub r11, r2
ret
; * Parses a floating point number from a zero-terminated string.
; * r0 points to the string buffer. It is advanced while parsing and is not
; restored.
; * r1 is 0 if the conversion succeeds or 1 if it fails.
; * NOTE: Proper nan and inf input should be implemented eventually, which
; should override default success detection (i.e. the conversion may
; actually succeed even though the result is nan).
; * r2_32 is the result of the conversion or nan if the conversion fails.
; * The format accepted is: optional spaces, optional sign, digits with an
; optional decimal dot anywhere among them (at least one digit is required,
; so "0.", ".0" and ".5" are all fine but "." is not), then an optional
; explicit exponent made up of an e or an E, an optional sign and one or two
; digits. Whatever follows the part that could be parsed is ignored.
; * Explicit exponents above 38 in magnitude and ones with 3 or more digits
; are not really parsed; the result is inf or zero depending on their sign,
; or zero regardless of their sign if all the digits of the number are 0.
; * Results too small to be represented as normalised numbers are flushed to
; zero as denormals are not supported.
; * r4 through r9 are overwritten, r10 and r11 are saved and restored.
; * Requires a working stack.
float_from_string:
mov r2, 0 ; * r2_32 holds the digits read. In other
mov r3, 0 ; words, it's the digit buffer.
mov r4, 0 ; * r4 holds the number of digits read. Its upper
; 8 bits hold the number of digits in the
; explicit exponent.
mov r5, 0 ; * r5 holds the relative base-10 exponent of the
; number entered. The number entered is
; basically r2_32 * (10 ** r5), e.g. for
; 3.141592 r2_32 would be 3141592 and r5 would
; be -6 after parsing.
mov r8, 0 ; * r8 holds the parser state:
; * bit 0: parsed explicit base-10 exponent,
; * bit 1: sign of explicit base-10 exponent,
; * bit 2: leading zeroes have been ignored,
; * bit 15: sign of the number.
mov r9, 1 ; * r9 is 1 until the decimal dot is read and
; becomes 0 afterwards. It's used to increment
; r5 for every digit read.
.ignore_spaces:
cmp [r0], ' '
jne .parse_sign
add r0, 1 ; * Ignore leading spaces.
jmp .ignore_spaces
.parse_sign:
cmp [r0], '+'
je .sign_positive
cmp [r0], '-'
jne .ignore_zeroes
xor r8, 0x8000 ; * Flip the sign of the number.
.sign_positive:
add r0, 1
.ignore_zeroes:
cmp [r0], '0'
jne .parse_digits
or r8, 0x0004
add r0, 1 ; * Ignore leading zeroes.
jmp .ignore_zeroes
.parse_digits:
mov r1, [r0]
sub r1, '0'
jnz .no_2nd_ignore_zeroes ; * It's possible that we get this far even though
; there are still leading zeroes to be ignored.
; * The reason might be a dot that breaks the
; first streak of zeroes. If this is the case,
; continue skipping zeroes here.
ors r9, r4 ; * The ors here clears the zero flag if either
jnz .no_3rd_ignore_zeroes ; the dot hasn't been read yet or if all the
; leading zeroes have been skipped, derived from
; the fact that r4 is more than 0.
or r8, 0x0004 ; * These are ignored leading zeroes too and have
; to be recorded as such, otherwise input such
; as ".0" would be rejected for having no
; digits even though "0.0" is accepted.
sub r5, 1 ; * Decrement r5 if the dot has already been read.
add r0, 1 ; * The trick is that we don't increase r4, the
jmp .parse_digits ; significant digit counter.
.no_2nd_ignore_zeroes:
jb .parse_dot ; * It's not a digit but it may still be a dot.
.no_3rd_ignore_zeroes: ; * We may arrive to this label after the ors
; check above. This branch skips the jb above
; because it'd act based on the flags we get
; from the ors.
cmp r1, 9 ; * That's a '9' (we subtracted 0x30 earlier).
ja .parse_dot ; * It's not a digit but it may still be a dot.
cmp r4, 7 ; * The parser only guarantees a precision of
ja .ignore_last_digit ; 7 digits, any more than that is truncated.
je .truncate_last_digit
mov r6, r2 ; * Multiply r2_32 by 10.
mov r7, r3 ; * Basically the following happens:
shl r6, 2 ; * r6_32 = r2_32,
scl r7, 2 ; * r6_32 <<= 2,
add r2, r6 ; * r2_32 += r6_32,
adc r3, r7 ; * r2_32 <<= 1.
shl r2, 1 ; * So in the end (r2_32 * (4 + 1)) * 2 or
scl r3, 1 ; r2_32 * 10 is assigned to r2_32.
add r2, r1 ; * Add r1 to r2_32, merging in the last digit.
adc r3, 0
sub r5, 1 ; * Decrement r5 if the dot has already been read.
add r5, r9
jmp .back_to_digit_loop
.truncate_last_digit:
cmp r1, 5 ; * At this point the digit cannot be merged
jb .ignore_last_digit ; into the buffer but it can help make whatever
add r2, 1 ; is in the buffer a closer approximation of
adc r3, 0 ; the number entered.
.ignore_last_digit:
add r5, r9 ; * This is a bit tricky. Increment r5 if the
; dot hasn't been read yet. This is needed
; because even though the digit read is ignored,
; it still does influence the base-10 exponent
; of the number if it's not after the decimal
; dot.
.back_to_digit_loop:
add r4, 1 ; * Increment digit counter.
add r0, 1
jmp .parse_digits
.parse_dot:
cmp r1, 0xFFFE ; * That's a '.' (we subtracted 0x30 earlier,
; 0xFFFE = -2).
jne .parse_exponent ; * If it's not even a dot, move on.
test r9, r9 ; * Check if the dot has already been read.
jz .parse_exponent ; Move on if it has.
mov r9, 0 ; * Well, it certainly has now.
add r0, 1
jmp .parse_digits
.parse_exponent:
mov r9, 0 ; * r9 will hold the explicit base-10 exponent.
cmp [r0], 'e'
je .seen_exponent_e
cmp [r0], 'E'
jne .parse_done
.seen_exponent_e:
xor r8, 0x0001 ; * Flip explicit exponent bit in parser state.
add r0, 1
cmp [r0], '+'
je .sign_exponent_positive
cmp [r0], '-'
jne .read_exp_digits
xor r8, 0x0002 ; * Flip exponent sign bit in parser state.
.sign_exponent_positive:
add r0, 1
.read_exp_digits:
mov r1, [r0]
sub r1, '0'
jb .read_exp_done ; * It's not a digit, move on.
cmp r1, 9 ; * That's a '9' (we subtracted 0x30 earlier).
ja .read_exp_done ; * It's not a digit, move on.
add r0, 1
mov r7, r9 ; * The same thing happens here as earlier,
shl r7, 2 ; except it's 16-bit arithmetic now and it's
add r9, r7 ; much easier to follow:
shl r9, 1 ; * r9 = (r9 * (4 + 1)) * 2.
add r9, r1 ; * Merge digit into buffer.
add r4, 0x100 ; * Increment digit counter for the exponent.
jmp .read_exp_digits
.read_exp_done:
mov r1, r8 ; * Load final sign into r1 for the early return
and r1, 0x8000 ; code paths.
cmp r9, 38 ; * In case of an overflow this may not jump, but
ja .huge_exponent ; the later check of the number of digits in the
test r8, 0x0002 ; exponent will. Otherwise it works fine.
jz .no_negate_exp
xor r9, 0xFFFF ; * Trick to negate r9 if the exponent had a
add r9, 1 ; negative sign.
.no_negate_exp:
.parse_done:
mov r1, r8 ; * Load final sign into r1 for the early return
and r1, 0x8000 ; code paths.
mov r6, r8
and r6, 0x0004
add r4, r6 ; * Merge leading zero bit into r4.
test r4, 0xFF ; * No digits, not a valid result.
jz .result_is_nan
test r8, 0x0001 ; * Skip checking the explicit exponent if it
jz .no_check_exponent ; doesn't exist.
test r4, 0xFF00 ; * No digits, not a valid result.
jz .result_is_nan
cmp r4, 0x0300 ; * Too many digits in the exponent.
jae .huge_exponent
.no_check_exponent:
add r9, r5 ; * Merge base-10 exponent with the relative
; base-10 exponent from earlier.
mov r4, 158 ; * r4 will hold the base-2 logarithm of the
test r3, r3 ; number. The bias would originally be 127 but
jnz .no_shift16_dbuffer ; since our 32-bit digit buffer is not yet
mov r3, r2 ; normalised, another bias of 31 is added.
mov r2, 0
sub r4, 16
test r3, r3 ; * If there's nothing in the digit buffer, there
jz .result_is_zero ; is no point in trying to shift anything.
.no_shift16_dbuffer: ; * The conditional shift above and the loop here
mov r5, 0xFF00 ; normalise the digit buffer. The goal is to
mov r7, 8 ; shift it until the MSB is set.
.shift_dbuffer_loop:
test r3, r5
jnz .shift_dbuffer_skip
sub r4, r7 ; * This shifting of course affects the base-2
shl r2, r7 ; logarithm.
scl r3, r7
.shift_dbuffer_skip:
shr r7, 1
jz .shift_dbuffer_done
shl r5, r7
jmp .shift_dbuffer_loop
.shift_dbuffer_done:
mov r6, 0x4D42 ; * Load log10(2) << 32 into r6_32.
mov r7, 0x4D10
and r8, 0x8000 ; * Only preserve the parser state from r8
push r8 ; and push it.
mov r8, 0 ; * Load r9 << 25 into r8_32.
shl r9, 9 ; * This is going to get technical.
jns .no_mult_2_128 ; * If the base-10 exponent is negative, quickly
sub r4, 128 ; divide the digit buffer by 2 ** 128 (through
add r8, r6 ; the base-2 logarithm). Of course this means
adc r9, r7 ; the base-10 exponent has to be increased by
.no_mult_2_128: ; log10(2 ** 128).
shr r7, 1
scr r6, 1 ; * r6_32 is now log10(2) << 31.
test r9, r9
jns .no_mult_2_64 ; * If the base-10 exponent is negative, quickly
sub r4, 64 ; divide the digit buffer by 2 ** 64 (through
add r8, r6 ; the base-2 logarithm). Of course this means
adc r9, r7 ; the base-10 exponent has to be increased by
.no_mult_2_64: ; log10(2 ** 64).
push r11 ; * Incredibly, we're going to need to use these
push r10 ; registers as locals.
mov r5, 0x40 ; * Loop with 7 iterations (loop condition
.cordic_coarse_loop: ; is jnz).
mov r10, r8 ; * Bring the base-10 exponent as close to
mov r11, r9 ; log10(3/2) as possible as that's the value
sub r8, r6 ; we can reliably reduce to 0 with the finer
sbb r9, r7 ; CORDIC loop that uses an actual lookup table.
jc .ccl_restore_r8_32
add r4, r5
jmp .ccl_success
.ccl_restore_r8_32:
mov r8, r10
mov r9, r11
.ccl_success:
shr r7, 1
scr r6, 1
shr r5, 1
jnz .cordic_coarse_loop
shl r8, 7 ; * The base-10 exponent is now quite reduced,
scl r9, 7 ; shifting it up gives us more precision.
push r0
mov r0, 30 ; * Loop with 31 iterations (loop condition
.cordic_fine_loop: ; is jnc).
mov r10, r8 ; * CORDIC time. Subtract entries in the lookup
mov r11, r9 ; table from the base-10 exponent and increment
sub r8, [r0+.cordic_table_low]
sbb r9, [r0+.cordic_table_high]
jc .cfl_restore_r8_32 ; the base-2 logarithm if the subtraction yields
mov r1, 31 ; a positive result.
sub r1, r0 ; * The constants in the lookup table are
mov r10, r2 ; log10(1 + 2 ** (-n)) where n is in the range
mov r11, r3 ; [31, 1]. 1 + 2 ** (-n) is easily doable with
test r1, 0x10 ; shifts and adds and that's what happens here.
jz .cfl_no_shift16
mov r10, r11
mov r11, 0
.cfl_no_shift16:
shr r11, r1 ; * r10_32 is shifted down n bits.
scr r10, r1
add r2, r10 ; * And with this r2_32 is multiplied by (roughly)
adc r3, r11 ; 1 + 2 ** (-n).
jnc .cfl_success
shr r3, 1
scr r2, 1
or r3, 0x8000 ; * Restore lost bit and increment base-2
add r4, 1 ; logarithm if the addition yields a carry.
jmp .cfl_success
.cfl_restore_r8_32:
mov r8, r10
mov r9, r11
.cfl_success:
shl r8, 1
scl r9, 1
sub r0, 1
jnc .cordic_fine_loop
shr r3, 8 ; * Shift digit buffer down by 8.
scr r2, 8
pop r0 ; * Pop stuff saved earlier.
pop r10
pop r11
pop r1 ; * r1 now holds the sign bit of the result.
cmp r4, 1 ; * An exponent field of 0 would make the result
jl .result_is_zero ; a denormal, which we don't support, so flush
; to zero anything below 1, not just negatives.
cmp r4, 254
jg .result_is_inf
and r3, 0x7F ; * Pack into IEEE-754 single precision format.
shl r4, 7
or r3, r4
mov r4, 0
.encode_and_exit:
xor r3, r1 ; * Merge sign bit.
mov r1, r4
ret
.result_is_nan:
mov r3, 0x7FFF
mov r2, 0xFFFF
mov r4, 1
jmp .encode_and_exit
.result_is_inf:
mov r3, 0x7F80
mov r2, 0x0000
mov r4, 0
jmp .encode_and_exit
.result_is_zero:
mov r3, 0x0000
mov r2, 0x0000
mov r4, 0
jmp .encode_and_exit
.huge_exponent:
mov r6, r8 ; * We may get here before the number of digits
and r6, 0x0004 ; is checked, so check it here too, counting
add r6, r4 ; ignored leading zeroes as digits. Without
test r6, 0xFF ; this, input such as "e99" would successfully
jz .result_is_nan ; be parsed as inf.
mov r6, r2 ; * If all the digits are 0, so is the result, no
or r6, r3 ; matter how huge the exponent is.
jz .result_is_zero
test r8, 0x0002 ; * A huge explicit exponent means inf if it's
jz .result_is_inf ; positive, zero if it's negative.
jmp .result_is_zero
.cordic_table_low:
dw 0xF62A, 0xF629, 0xF629, 0xF628 ; * Low words of log10(1 + 2 ** (-n))
dw 0xF626, 0xF623, 0xF61C, 0xF60E ; values with the MSB being
dw 0xF5F2, 0xF5BB, 0xF54B, 0xF46D ; 2 ** (-32 - n), n ranging from
dw 0xF2B0, 0xEF37, 0xE844, 0xDA5E ; 31 to 1.
dw 0xBE93, 0x86FD, 0x17D3, 0x3985
dw 0x7D05, 0x0473, 0x150C, 0x3D29
dw 0xA8E4, 0xED49, 0x211D, 0xF256
dw 0x53AC, 0x3071, 0x5116
.cordic_table_high:
dw 0x3796, 0x3796, 0x3796, 0x3796 ; * High words of log10(1 + 2 ** (-n))
dw 0x3796, 0x3796, 0x3796, 0x3796 ; values with the MSB being
dw 0x3796, 0x3796, 0x3796, 0x3796 ; 2 ** (-32 - n), n ranging from
dw 0x3796, 0x3796, 0x3796, 0x3796 ; 31 to 1.
dw 0x3796, 0x3796, 0x3796, 0x3795 ; * I desperately want to reduce the size
dw 0x3793, 0x3790, 0x3789, 0x377B ; of this table. Literally more than
dw 0x375F, 0x3728, 0x36BD, 0x35EB ; half of it is just 0x3796. Any ideas?
dw 0x3461, 0x319E, 0x2D14
; * Renders a floating point number as a zero-terminated string.
; * r0 points to the string buffer.
; * r1 is the number of characters written, including the trailing zero.
; * r2_32 is the number to be converted.
; * Requires a working stack.
float_to_string:
push r0 ; * Save string buffer pointer.
test r3, r3 ; * Check sign bit.
jns .sign_positive
mov [r0], '-'
add r0, 1
.sign_positive:
mov r1, r3 ; * Extract base-2 exponent into r1.
shr r1, 7
and r1, 0xFF
jz .result_is_zero ; * Handle special cases early on.
cmp r1, 255
je .result_is_inf
sub r1, 127 ; * Remove bias.
and r3, 0x007F ; * Extract mantissa into r2_32.
or r3, 0x0080
push r3 ; * Save these for later. We really do need these
push r2 ; registers for the coarse loop.
mov r2, 0 ; * We store 0 in r2_48, which is going to hold
mov r3, 0 ; the base-10 logarithm.
mov r4, 0
mov r5, 0x7DE8 ; * We store 0x4D104D427DE8 in r5_48, which is
mov r6, 0x4D42 ; log10(2) << 48. Yes, we're going to use
mov r7, 0x4D10 ; 48-bit arithmetic. Fun stuff.
test r1, r1
jns .base2_log_nonnegative
sub r2, r5 ; * This way we only have to deal with positive
sbb r3, r6 ; base-2 exponents.
sbb r4, r7
add r1, 128
.base2_log_nonnegative:
mov r8, 0x40 ; * The coarse loop iterates 7 times. See exit
.coarse_loop: ; condition later.
shr r7, 1 ; * This loop basically gives an upper estimate of
scr r6, 1 ; the base-10 logarithm of the number by
scr r5, 1 ; reducing the base-2 exponent.
test r1, r8 ; * The base-10 exponent is stored as a fixed
jz .cl_skip_bit ; point number with the LSB being 2 ** -41.
add r2, r5
adc r3, r6
adc r4, r7
.cl_skip_bit:
shr r8, 1
jnz .coarse_loop
mov r6, 0
mov r5, r4 ; * r5 now holds the integer part of the base-10
jns .r5_se_no_sign ; logarithm.
mov r6, 0xFF80 ; * The bit fiddling here is basically an
.r5_se_no_sign: ; arithmetical right shift through r6.
shr r5, 9
or r5, r6
pop r6 ; * Restore the mantissa into r6_32.
pop r7
shl r6, 4 ; * Shift r6_32 up for use with the digit buffer
scl r7, 4 ; loops later.
shl r2, 7 ; * We discard the 7 MSB of the base-10 logarithm
scl r3, 7 ; from r2_48, thus only leaving the fraction
scl r4, 7 ; part in it.
.db_preshift_loop: ; * The idea is that we take the fraction part of
cmp r4, 0x6099 ; the base-10 logarithm later and reduce it to 0
jnae .db_preshift_done ; while also adjusting the digit buffer, much
sub r2, 0x7DE8 ; the same way it's done in float_from_string.
sbb r3, 0x4D42 ; * One problem is that the CORDIC table used
sbb r4, 0x4D10 ; there can only reduce the base-10 logarithm
shl r6, 1 ; by log10(2.384) or so which is about 0.377.
scl r7, 1 ; * The fraction part of the base-10 logarithm may
jmp .db_preshift_loop ; be anywhere in the range [0; 1). Multiplying
.db_preshift_done: ; the digit buffer by 2 and subtracting log10(2)
; from the base-10 is one way of getting it
; inside the desired range.
; * The 0x6099 is a lower estimate of the
; log10(2.384) mentioned earlier.
; * From this point onward r2 is free and we'll
; consider r3_32 to be the base-10 exponent with
; the MSB being 2 ** -25 so it can be used with
; our trusty log10(1 + 2 ** (-n)) CORDIC table.
push r11 ; * Incredibly, we're going to need to use these
push r10 ; registers as locals.
mov r2, 30 ; * Loop with 31 iterations (loop condition
.cordic_fine_loop: ; is jnc).
mov r8, r3 ; * CORDIC time. Subtract entries in the lookup
mov r9, r4 ; table from the base-10 exponent and increment
sub r3, [r2+float_from_string.cordic_table_low]
sbb r4, [r2+float_from_string.cordic_table_high]
jc .cfl_restore_r3_32 ; the base-2 logarithm if the subtraction yields
mov r1, 31 ; a positive result.
sub r1, r2 ; * The constants in the lookup table are
mov r10, r6 ; log10(1 + 2 ** (-n)) where n is in the range
mov r11, r7 ; [31, 1]. 1 + 2 ** (-n) is easily doable with
test r1, 0x10 ; shifts and adds and that's what happens here.
jz .cfl_no_shift16
mov r10, r11
mov r11, 0
.cfl_no_shift16:
shr r11, r1 ; * r10_32 is shifted down n bits.
scr r10, r1
add r6, r10 ; * And with this r6_32 is multiplied by (roughly)
adc r7, r11 ; 1 + 2 ** (-n).
jmp .cfl_success
.cfl_restore_r3_32:
mov r3, r8
mov r4, r9
.cfl_success:
shl r3, 1
scl r4, 1
sub r2, 1
jnc .cordic_fine_loop
pop r10
pop r11
mov r8, 0 ; * r8_32 will hold the BCD representation.
mov r9, 0
mov r2, 7 ; * The BCD extraction loop iterates 7 times,
cmp r7, 0x5000 ; except in the special case when the mantissa
jnae .extract_bcd_loop ; exceeds 10 in the CORDIC loop.
sub r2, 1 ; * This special case is handled here. The base-10
add r5, 1 ; logarithm is also bumped.
sub r7, 0x5000
add r8, 1
cmp r7, 0x5000
jnae .extract_bcd_loop
sub r7, 0x5000
add r8, 1
.extract_bcd_loop:
mov r3, r7 ; * Extract MSD from digit buffer,
shr r3, 11 ; push it into the BCD buffer.
shl r8, 4
scl r9, 4
or r8, r3
and r7, 0x7FF ; * Discard MSD.
mov r3, r6 ; * Multiply r6_32 by 10.
mov r4, r7 ; * Basically the following happens:
shl r3, 2 ; * r3_32 = r6_32,
scl r4, 2 ; * r3_32 <<= 2,
add r6, r3 ; * r6_32 += r3_32,
adc r7, r4 ; * r6_32 <<= 1.
shl r6, 1 ; * So in the end (r6_32 * (4 + 1)) * 2 or
scl r7, 1 ; r6_32 * 10 is assigned to r6_32.
sub r2, 1
jnz .extract_bcd_loop
add r8, 0x6666 ; * Rig BCD buffer so that when 1 is added,
adc r9, 0x666 ; possible carries propagate correctly.
cmp r7, 0x2800 ; * See if the next digit that could be
jnae .no_bump_bcd ; extracted from the digit buffer is 5
add r8, 1 ; or more and bump BCD buffer if it is.
adc r9, 0
test r9, 0x1000 ; * With this we might overflow the BCD
jz .no_bcd_bump_overflow ; buffer, so that needs to be handled.
mov r9, 0x700
add r5, 1
.no_bcd_bump_overflow:
.no_bump_bcd:
mov r2, 4 ; * The BCD cleanup loop iterates 4 times.
.clean_bcd_buffer_loop:
test r8, 0xF000 ; * Due to the previous rigging with 0x6666, all
jz .no_sub_6000_r8 ; the digits in the BCD buffer are off by 6,
sub r8, 0x6000 ; except the ones that overflowed to 0, which
.no_sub_6000_r8: ; should be left alone.
rol r8, 4 ; * Correcting the ones that aren't 0 is what
test r9, 0xF000 ; happens here.
jz .no_sub_6000_r9 ; * The four nibbles of the two 16-bit buffer
sub r9, 0x6000 ; registers are cleaned up in four iterations
.no_sub_6000_r9: ; of the loop.
rol r9, 4 ; * Believe me, these ugly conditional jumps
sub r2, 1 ; provide the fastest way to do this.
jnz .clean_bcd_buffer_loop
mov r4, 7 ; * This is where the fancy printing of numbers
mov r6, 0 ; happens. They are printed in scientific
mov r7, 1 ; notation unless the base-10 logarithm, held in
cmp r5, 0xFFFD ; r5, falls in the range [-3; 6] (0xFFFD = -3).
jl .emit_scientific ; * When printing in scientific notation, 7 digits