-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
151 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
# GoAT | ||
|
||
Go assembly transpiler for C programming languages. | ||
|
||
It help to utilize optimization from C compiler in Go projects. For example, generate SIMD vectorized functions for Go (refer to How to Use AVX512 in Golang). | ||
|
||
## Install | ||
|
||
go install github.com/gorse-io/goat@latest | ||
|
||
## Usage | ||
|
||
```bash | ||
cd example | ||
|
||
goat src/mul_to.c -O3 -mavx -mfma -mavx512f -mavx512dq | ||
GoAT transpiles example/src/mul_to.c to two files. | ||
``` | ||
|
||
Go function definition file mul_to.go: | ||
|
||
```go | ||
//go:build !noasm && amd64 | ||
// AUTO-GENERATED BY GOAT -- DO NOT EDIT | ||
|
||
package example | ||
|
||
import "unsafe" | ||
|
||
//go:noescape | ||
func mul_to(a, b, c, n unsafe.Pointer) | ||
Go assembly file mul_to.s: | ||
//go:build !noasm && amd64 | ||
// AUTO-GENERATED BY GOAT -- DO NOT EDIT | ||
|
||
TEXT ·mul_to(SB), $0-32 | ||
MOVQ a+0(FP), DI | ||
MOVQ b+8(FP), SI | ||
MOVQ c+16(FP), DX | ||
MOVQ n+24(FP), CX | ||
BYTE $0x55 // pushq %rbp | ||
WORD $0x8948; BYTE $0xe5 // movq %rsp, %rbp | ||
LONG $0xf8e48348 // andq $-8, %rsp | ||
WORD $0x8548; BYTE $0xc9 // testq %rcx, %rcx | ||
JLE LBB0_12 | ||
LONG $0x3ff98348 // cmpq $63, %rcx | ||
JA LBB0_7 | ||
WORD $0xc031 // xorl %eax, %eax | ||
JMP LBB0_3 | ||
|
||
LBB0_7: | ||
LONG $0x8a0c8d4c // leaq (%rdx,%rcx,4), %r9 | ||
LONG $0x8f048d48 // leaq (%rdi,%rcx,4), %rax | ||
WORD $0x3948; BYTE $0xd0 // cmpq %rdx, %rax | ||
LONG $0xc2970f41 // seta %r10b | ||
LONG $0x8e048d48 // leaq (%rsi,%rcx,4), %rax | ||
WORD $0x3949; BYTE $0xf9 // cmpq %rdi, %r9 | ||
LONG $0xc3970f41 // seta %r11b | ||
WORD $0x3948; BYTE $0xd0 // cmpq %rdx, %rax | ||
LONG $0xc0970f41 // seta %r8b | ||
WORD $0x3949; BYTE $0xf1 // cmpq %rsi, %r9 | ||
LONG $0xc1970f41 // seta %r9b | ||
WORD $0xc031 // xorl %eax, %eax | ||
WORD $0x8445; BYTE $0xda // testb %r11b, %r10b | ||
JNE LBB0_3 | ||
WORD $0x2045; BYTE $0xc8 // andb %r9b, %r8b | ||
JNE LBB0_3 | ||
WORD $0x8948; BYTE $0xc8 // movq %rcx, %rax | ||
LONG $0xc0e08348 // andq $-64, %rax | ||
WORD $0x3145; BYTE $0xc0 // xorl %r8d, %r8d | ||
|
||
LBB0_10: | ||
LONG $0x487cb162; WORD $0x0410; BYTE $0x87 // vmovups (%rdi,%r8,4), %zmm0 | ||
QUAD $0x01874c10487cb162 // vmovups 64(%rdi,%r8,4), %zmm1 | ||
QUAD $0x02875410487cb162 // vmovups 128(%rdi,%r8,4), %zmm2 | ||
QUAD $0x03875c10487cb162 // vmovups 192(%rdi,%r8,4), %zmm3 | ||
LONG $0x487cb162; WORD $0x0459; BYTE $0x86 // vmulps (%rsi,%r8,4), %zmm0, %zmm0 | ||
QUAD $0x01864c594874b162 // vmulps 64(%rsi,%r8,4), %zmm1, %zmm1 | ||
QUAD $0x02865459486cb162 // vmulps 128(%rsi,%r8,4), %zmm2, %zmm2 | ||
QUAD $0x03865c594864b162 // vmulps 192(%rsi,%r8,4), %zmm3, %zmm3 | ||
LONG $0x487cb162; WORD $0x0411; BYTE $0x82 // vmovups %zmm0, (%rdx,%r8,4) | ||
QUAD $0x01824c11487cb162 // vmovups %zmm1, 64(%rdx,%r8,4) | ||
QUAD $0x02825411487cb162 // vmovups %zmm2, 128(%rdx,%r8,4) | ||
QUAD $0x03825c11487cb162 // vmovups %zmm3, 192(%rdx,%r8,4) | ||
LONG $0x40c08349 // addq $64, %r8 | ||
WORD $0x394c; BYTE $0xc0 // cmpq %r8, %rax | ||
JNE LBB0_10 | ||
WORD $0x3948; BYTE $0xc8 // cmpq %rcx, %rax | ||
JE LBB0_12 | ||
|
||
LBB0_3: | ||
WORD $0x8949; BYTE $0xc0 // movq %rax, %r8 | ||
WORD $0xf749; BYTE $0xd0 // notq %r8 | ||
WORD $0x0149; BYTE $0xc8 // addq %rcx, %r8 | ||
WORD $0x8949; BYTE $0xc9 // movq %rcx, %r9 | ||
LONG $0x03e18349 // andq $3, %r9 | ||
JE LBB0_5 | ||
|
||
LBB0_4: | ||
LONG $0x0410fac5; BYTE $0x87 // vmovss (%rdi,%rax,4), %xmm0 | ||
LONG $0x0459fac5; BYTE $0x86 // vmulss (%rsi,%rax,4), %xmm0, %xmm0 | ||
LONG $0x0411fac5; BYTE $0x82 // vmovss %xmm0, (%rdx,%rax,4) | ||
LONG $0x01c08348 // addq $1, %rax | ||
LONG $0xffc18349 // addq $-1, %r9 | ||
JNE LBB0_4 | ||
|
||
LBB0_5: | ||
LONG $0x03f88349 // cmpq $3, %r8 | ||
JB LBB0_12 | ||
|
||
LBB0_6: | ||
LONG $0x0410fac5; BYTE $0x87 // vmovss (%rdi,%rax,4), %xmm0 | ||
LONG $0x0459fac5; BYTE $0x86 // vmulss (%rsi,%rax,4), %xmm0, %xmm0 | ||
LONG $0x0411fac5; BYTE $0x82 // vmovss %xmm0, (%rdx,%rax,4) | ||
LONG $0x4410fac5; WORD $0x0487 // vmovss 4(%rdi,%rax,4), %xmm0 | ||
LONG $0x4459fac5; WORD $0x0486 // vmulss 4(%rsi,%rax,4), %xmm0, %xmm0 | ||
LONG $0x4411fac5; WORD $0x0482 // vmovss %xmm0, 4(%rdx,%rax,4) | ||
LONG $0x4410fac5; WORD $0x0887 // vmovss 8(%rdi,%rax,4), %xmm0 | ||
LONG $0x4459fac5; WORD $0x0886 // vmulss 8(%rsi,%rax,4), %xmm0, %xmm0 | ||
LONG $0x4411fac5; WORD $0x0882 // vmovss %xmm0, 8(%rdx,%rax,4) | ||
LONG $0x4410fac5; WORD $0x0c87 // vmovss 12(%rdi,%rax,4), %xmm0 | ||
LONG $0x4459fac5; WORD $0x0c86 // vmulss 12(%rsi,%rax,4), %xmm0, %xmm0 | ||
LONG $0x4411fac5; WORD $0x0c82 // vmovss %xmm0, 12(%rdx,%rax,4) | ||
LONG $0x04c08348 // addq $4, %rax | ||
WORD $0x3948; BYTE $0xc1 // cmpq %rax, %rcx | ||
JNE LBB0_6 | ||
|
||
LBB0_12: | ||
WORD $0x8948; BYTE $0xec // movq %rbp, %rsp | ||
BYTE $0x5d // popq %rbp | ||
WORD $0xf8c5; BYTE $0x77 // vzeroupper | ||
BYTE $0xc3 // retq | ||
Finally, the mul_to function can be called by: | ||
|
||
func MulTo(a, b, c []float32) { | ||
if len(a) ! = len(b) || len(a) ! = len(c) { | ||
panic("floats: slice lengths do not match") | ||
} | ||
mul_to(unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&c[0]), unsafe.Pointer(uintptr(len(a)))) | ||
} | ||
``` | ||
|
||
## Limitations | ||
|
||
- Arguments need (for now) to be 64-bit size, meaning either a value or a pointer | ||
- Maximum number of 4 arguments | ||
- Generally no call statements | ||
|
||
## Acknowledgments | ||
|
||
GoAT is inspired by [c2goasm](https://github.com/minio/c2goasm). |