From dcfd453734f75e151155c5829462a60347086241 Mon Sep 17 00:00:00 2001 From: zhenghaoz Date: Sun, 22 Dec 2024 22:37:45 +0800 Subject: [PATCH] implement SIMD-accelerated euclidean (#903) --- base/floats/floats.go | 20 +- base/floats/floats_amd64.go | 22 +- base/floats/floats_amd64_test.go | 39 ++ base/floats/floats_arm64.go | 12 +- base/floats/floats_arm64_test.go | 25 + base/floats/floats_avx.go | 6 +- base/floats/floats_avx.s | 855 +++++++++++++++++-------------- base/floats/floats_avx512.go | 6 +- base/floats/floats_avx512.s | 681 ++++++++++++++---------- base/floats/floats_neon.go | 6 +- base/floats/floats_neon.s | 565 ++++++++++++-------- base/floats/floats_noasm.go | 4 + base/floats/floats_test.go | 13 + base/floats/src/Makefile | 6 +- base/floats/src/floats_avx.c | 43 ++ base/floats/src/floats_avx512.c | 70 +++ base/floats/src/floats_neon.c | 34 ++ base/floats/src/floats_test.c | 54 +- cmd/goat/main.go | 325 ------------ cmd/goat/parser_amd64.go | 211 -------- cmd/goat/parser_arm64.go | 183 ------- 21 files changed, 1586 insertions(+), 1594 deletions(-) delete mode 100644 cmd/goat/main.go delete mode 100644 cmd/goat/parser_amd64.go delete mode 100644 cmd/goat/parser_arm64.go diff --git a/base/floats/floats.go b/base/floats/floats.go index c3d923bb9..615a87f5e 100644 --- a/base/floats/floats.go +++ b/base/floats/floats.go @@ -14,7 +14,11 @@ package floats -import "math" +import ( + "math" + + "github.com/chewxy/math32" +) func dot(a, b []float32) (ret float32) { for i := range a { @@ -23,6 +27,13 @@ func dot(a, b []float32) (ret float32) { return } +func euclidean(a, b []float32) (ret float32) { + for i := range a { + ret += (a[i] - b[i]) * (a[i] - b[i]) + } + return math32.Sqrt(ret) +} + func mulTo(a, b, c []float32) { for i := range a { c[i] = a[i] * b[i] @@ -170,3 +181,10 @@ func Dot(a, b []float32) (ret float32) { } return impl.dot(a, b) } + +func Euclidean(a, b []float32) float32 { + if len(a) != len(b) { + panic("floats: slice lengths do not match") + } + return impl.euclidean(a, b) +} diff --git a/base/floats/floats_amd64.go b/base/floats/floats_amd64.go index 6e3aecc72..357b3bf52 100644 --- a/base/floats/floats_amd64.go +++ b/base/floats/floats_amd64.go @@ -17,12 +17,13 @@ package floats import ( - "github.com/klauspost/cpuid/v2" "unsafe" + + "github.com/klauspost/cpuid/v2" ) -//go:generate go run ../../cmd/goat src/floats_avx.c -O3 -mavx -//go:generate go run ../../cmd/goat src/floats_avx512.c -O3 -mavx -mfma -mavx512f -mavx512dq +//go:generate goat src/floats_avx.c -O3 -mavx +//go:generate goat src/floats_avx512.c -O3 -mavx -mfma -mavx512f -mavx512dq var impl = Default @@ -111,3 +112,18 @@ func (i implementation) dot(a, b []float32) float32 { return dot(a, b) } } + +func (i implementation) euclidean(a, b []float32) float32 { + switch i { + case AVX: + var ret float32 + _mm256_euclidean(unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(uintptr(len(a))), unsafe.Pointer(&ret)) + return ret + case AVX512: + var ret float32 + _mm512_euclidean(unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(uintptr(len(a))), unsafe.Pointer(&ret)) + return ret + default: + return euclidean(a, b) + } +} diff --git a/base/floats/floats_amd64_test.go b/base/floats/floats_amd64_test.go index 31bcccb08..d721a4f97 100644 --- a/base/floats/floats_amd64_test.go +++ b/base/floats/floats_amd64_test.go @@ -83,6 +83,17 @@ func TestAVX_Dot(t *testing.T) { assert.Equal(t, expected, actual) } +func TestAVX_Euclidean(t *testing.T) { + if !cpuid.CPU.Supports(cpuid.AVX) || !cpuid.CPU.Supports(cpuid.FMA3) { + t.Skip("AVX and FMA3 are not supported in the current CPU") + } + a := []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9} + b := []float32{0, 2, 4, 6, 8, 10, 12, 14, 16, 18} + actual := AVX.euclidean(a, b) + expected := Default.euclidean(a, b) + assert.InDelta(t, expected, actual, 1e-6) +} + func TestAVX512_MulConstAddTo(t *testing.T) { if !cpuid.CPU.Supports(cpuid.AVX512F) || !cpuid.CPU.Supports(cpuid.AVX512DQ) { t.Skip("AVX512F and AVX512DQ are not supported in the current CPU") @@ -141,6 +152,17 @@ func TestAVX512_Dot(t *testing.T) { assert.Equal(t, expected, actual) } +func TestAVX512_Euclidean(t *testing.T) { + if !cpuid.CPU.Supports(cpuid.AVX512F) || !cpuid.CPU.Supports(cpuid.AVX512DQ) { + t.Skip("AVX512F and AVX512DQ are not supported in the current CPU") + } + a := []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10} + b := []float32{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20} + actual := AVX512.euclidean(a, b) + expected := Default.euclidean(a, b) + assert.InDelta(t, expected, actual, 1e-6) +} + func initializeFloat32Array(n int) []float32 { x := make([]float32, n) for i := 0; i < n; i++ { @@ -166,6 +188,23 @@ func BenchmarkDot(b *testing.B) { } } +func BenchmarkEuclidean(b *testing.B) { + for _, impl := range []implementation{Default, AVX, AVX512} { + b.Run(impl.String(), func(b *testing.B) { + for i := 16; i <= 128; i *= 2 { + b.Run(strconv.Itoa(i), func(b *testing.B) { + v1 := initializeFloat32Array(i) + v2 := initializeFloat32Array(i) + b.ResetTimer() + for i := 0; i < b.N; i++ { + impl.euclidean(v1, v2) + } + }) + } + }) + } +} + func BenchmarkMulConstAddTo(b *testing.B) { for _, impl := range []implementation{Default, AVX, AVX512} { b.Run(impl.String(), func(b *testing.B) { diff --git a/base/floats/floats_arm64.go b/base/floats/floats_arm64.go index abb94da14..2fd6e1d77 100644 --- a/base/floats/floats_arm64.go +++ b/base/floats/floats_arm64.go @@ -18,7 +18,7 @@ package floats import "unsafe" -//go:generate go run ../../cmd/goat src/floats_neon.c -O3 +//go:generate goat src/floats_neon.c -O3 var impl = Neon @@ -79,3 +79,13 @@ func (i implementation) dot(a, b []float32) float32 { return dot(a, b) } } + +func (i implementation) euclidean(a, b []float32) float32 { + if i == Neon { + var ret float32 + veuclidean(unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(uintptr(len(a))), unsafe.Pointer(&ret)) + return ret + } else { + return euclidean(a, b) + } +} diff --git a/base/floats/floats_arm64_test.go b/base/floats/floats_arm64_test.go index b28b3cf03..9b5047fe9 100644 --- a/base/floats/floats_arm64_test.go +++ b/base/floats/floats_arm64_test.go @@ -67,6 +67,14 @@ func TestNEON_Dot(t *testing.T) { assert.Equal(t, expected, actual) } +func TestNEON_Euclidean(t *testing.T) { + a := []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10} + b := []float32{10, 20, 30, 40, 50, 60, 70, 80, 90, 100} + actual := Neon.euclidean(a, b) + expected := Default.euclidean(a, b) + assert.Equal(t, expected, actual) +} + func initializeFloat32Array(n int) []float32 { x := make([]float32, n) for i := 0; i < n; i++ { @@ -92,6 +100,23 @@ func BenchmarkDot(b *testing.B) { } } +func BenchmarkEuclidean(b *testing.B) { + for _, impl := range []implementation{Default, Neon} { + b.Run(impl.String(), func(b *testing.B) { + for i := 16; i <= 128; i *= 2 { + b.Run(strconv.Itoa(i), func(b *testing.B) { + v1 := initializeFloat32Array(i) + v2 := initializeFloat32Array(i) + b.ResetTimer() + for i := 0; i < b.N; i++ { + impl.euclidean(v1, v2) + } + }) + } + }) + } +} + func BenchmarkMulConstAddTo(b *testing.B) { for _, impl := range []implementation{Default, Neon} { b.Run(impl.String(), func(b *testing.B) { diff --git a/base/floats/floats_avx.go b/base/floats/floats_avx.go index cb7988966..db2a40760 100644 --- a/base/floats/floats_avx.go +++ b/base/floats/floats_avx.go @@ -1,6 +1,5 @@ //go:build !noasm && amd64 - -// AUTO-GENERATED BY GOAT -- DO NOT EDIT +// Code generated by GoAT. DO NOT EDIT. package floats @@ -20,3 +19,6 @@ func _mm256_mul_to(a, b, c, n unsafe.Pointer) //go:noescape func _mm256_dot(a, b, n, ret unsafe.Pointer) + +//go:noescape +func _mm256_euclidean(a, b, n, ret unsafe.Pointer) diff --git a/base/floats/floats_avx.s b/base/floats/floats_avx.s index 1607e912c..e155f2903 100644 --- a/base/floats/floats_avx.s +++ b/base/floats/floats_avx.s @@ -1,30 +1,28 @@ //go:build !noasm && amd64 -// AUTO-GENERATED BY GOAT -- DO NOT EDIT +// Code generated by GoAT. DO NOT EDIT. TEXT ·_mm256_mul_const_add_to(SB), $0-32 MOVQ a+0(FP), DI MOVQ b+8(FP), SI MOVQ c+16(FP), DX MOVQ n+24(FP), CX - BYTE $0x55 // pushq %rbp - WORD $0x8948; BYTE $0xe5 // movq %rsp, %rbp - WORD $0x5641 // pushq %r14 - BYTE $0x53 // pushq %rbx - LONG $0xf8e48348 // andq $-8, %rsp - LONG $0x07418d48 // leaq 7(%rcx), %rax - WORD $0x8548; BYTE $0xc9 // testq %rcx, %rcx - LONG $0xc1490f48 // cmovnsq %rcx, %rax - WORD $0x8949; BYTE $0xc0 // movq %rax, %r8 - LONG $0x03f8c149 // sarq $3, %r8 - LONG $0xf8e08348 // andq $-8, %rax - WORD $0x2948; BYTE $0xc1 // subq %rax, %rcx - WORD $0x8545; BYTE $0xc0 // testl %r8d, %r8d + BYTE $0x55 // pushq %rbp + WORD $0x8948; BYTE $0xe5 // movq %rsp, %rbp + BYTE $0x53 // pushq %rbx + LONG $0xf8e48348 // andq $-8, %rsp + LONG $0x07418d4c // leaq 7(%rcx), %r8 + WORD $0x8548; BYTE $0xc9 // testq %rcx, %rcx + LONG $0xc1490f4c // cmovnsq %rcx, %r8 + WORD $0x894c; BYTE $0xc0 // movq %r8, %rax + LONG $0x03e8c148 // shrq $3, %rax + LONG $0xf8e08349 // andq $-8, %r8 + WORD $0x294c; BYTE $0xc1 // subq %r8, %rcx + WORD $0xc085 // testl %eax, %eax JLE LBB0_6 - LONG $0x01f88341 // cmpl $1, %r8d + WORD $0xf883; BYTE $0x01 // cmpl $1, %eax JE LBB0_4 - WORD $0x8944; BYTE $0xc0 // movl %r8d, %eax - WORD $0xe083; BYTE $0xfe // andl $-2, %eax - WORD $0xd8f7 // negl %eax + WORD $0x8941; BYTE $0xc0 // movl %eax, %r8d + LONG $0xfee08141; WORD $0xffff; BYTE $0x7f // andl $2147483646, %r8d # imm = 0x7FFFFFFE LBB0_3: LONG $0x187de2c4; BYTE $0x06 // vbroadcastss (%rsi), %ymm0 @@ -37,103 +35,102 @@ LBB0_3: LONG $0x4211fcc5; BYTE $0x20 // vmovups %ymm0, 32(%rdx) LONG $0x40c78348 // addq $64, %rdi LONG $0x40c28348 // addq $64, %rdx - WORD $0xc083; BYTE $0x02 // addl $2, %eax + LONG $0xfec08341 // addl $-2, %r8d JNE LBB0_3 LBB0_4: - LONG $0x01c0f641 // testb $1, %r8b + WORD $0x01a8 // testb $1, %al JE LBB0_6 LONG $0x187de2c4; BYTE $0x06 // vbroadcastss (%rsi), %ymm0 LONG $0x0759fcc5 // vmulps (%rdi), %ymm0, %ymm0 LONG $0x0258fcc5 // vaddps (%rdx), %ymm0, %ymm0 LONG $0x0211fcc5 // vmovups %ymm0, (%rdx) - LONG $0x20c28348 // addq $32, %rdx LONG $0x20c78348 // addq $32, %rdi + LONG $0x20c28348 // addq $32, %rdx LBB0_6: WORD $0xc985 // testl %ecx, %ecx JLE LBB0_18 - WORD $0x8941; BYTE $0xc8 // movl %ecx, %r8d - LONG $0x20f88349 // cmpq $32, %r8 + WORD $0xc889 // movl %ecx, %eax + LONG $0x20f88348 // cmpq $32, %rax JAE LBB0_9 - WORD $0x3145; BYTE $0xd2 // xorl %r10d, %r10d + WORD $0x3145; BYTE $0xc0 // xorl %r8d, %r8d JMP LBB0_14 LBB0_9: - LONG $0x82048d4a // leaq (%rdx,%r8,4), %rax - LONG $0x870c8d4e // leaq (%rdi,%r8,4), %r9 + LONG $0x82048d4c // leaq (%rdx,%rax,4), %r8 + LONG $0x870c8d4c // leaq (%rdi,%rax,4), %r9 LONG $0x04568d4c // leaq 4(%rsi), %r10 WORD $0x394c; BYTE $0xca // cmpq %r9, %rdx - LONG $0xc6920f41 // setb %r14b - WORD $0x3948; BYTE $0xc7 // cmpq %rax, %rdi + LONG $0xc3920f41 // setb %r11b + WORD $0x394c; BYTE $0xc7 // cmpq %r8, %rdi WORD $0x920f; BYTE $0xc3 // setb %bl WORD $0x394c; BYTE $0xd2 // cmpq %r10, %rdx - LONG $0xc3920f41 // setb %r11b - WORD $0x3948; BYTE $0xf0 // cmpq %rsi, %rax - LONG $0xc1970f41 // seta %r9b - WORD $0x3145; BYTE $0xd2 // xorl %r10d, %r10d - WORD $0x8441; BYTE $0xde // testb %bl, %r14b + LONG $0xc1920f41 // setb %r9b + WORD $0x3949; BYTE $0xf0 // cmpq %rsi, %r8 + LONG $0xc2970f41 // seta %r10b + WORD $0x3145; BYTE $0xc0 // xorl %r8d, %r8d + WORD $0x8441; BYTE $0xdb // testb %bl, %r11b JNE LBB0_14 - WORD $0x2045; BYTE $0xcb // andb %r9b, %r11b + WORD $0x2045; BYTE $0xd1 // andb %r10b, %r9b JNE LBB0_14 WORD $0x8941; BYTE $0xc9 // movl %ecx, %r9d LONG $0x1fe18341 // andl $31, %r9d - WORD $0x894d; BYTE $0xc2 // movq %r8, %r10 - WORD $0x294d; BYTE $0xca // subq %r9, %r10 + WORD $0x8949; BYTE $0xc0 // movq %rax, %r8 + WORD $0x294d; BYTE $0xc8 // subq %r9, %r8 LONG $0x187de2c4; BYTE $0x06 // vbroadcastss (%rsi), %ymm0 - WORD $0xc031 // xorl %eax, %eax + WORD $0x3145; BYTE $0xd2 // xorl %r10d, %r10d LBB0_12: - LONG $0x0c59fcc5; BYTE $0x87 // vmulps (%rdi,%rax,4), %ymm0, %ymm1 - LONG $0x5459fcc5; WORD $0x2087 // vmulps 32(%rdi,%rax,4), %ymm0, %ymm2 - LONG $0x5c59fcc5; WORD $0x4087 // vmulps 64(%rdi,%rax,4), %ymm0, %ymm3 - LONG $0x6459fcc5; WORD $0x6087 // vmulps 96(%rdi,%rax,4), %ymm0, %ymm4 - LONG $0x0c58f4c5; BYTE $0x82 // vaddps (%rdx,%rax,4), %ymm1, %ymm1 - LONG $0x5458ecc5; WORD $0x2082 // vaddps 32(%rdx,%rax,4), %ymm2, %ymm2 - LONG $0x5c58e4c5; WORD $0x4082 // vaddps 64(%rdx,%rax,4), %ymm3, %ymm3 - LONG $0x6458dcc5; WORD $0x6082 // vaddps 96(%rdx,%rax,4), %ymm4, %ymm4 - LONG $0x0c11fcc5; BYTE $0x82 // vmovups %ymm1, (%rdx,%rax,4) - LONG $0x5411fcc5; WORD $0x2082 // vmovups %ymm2, 32(%rdx,%rax,4) - LONG $0x5c11fcc5; WORD $0x4082 // vmovups %ymm3, 64(%rdx,%rax,4) - LONG $0x6411fcc5; WORD $0x6082 // vmovups %ymm4, 96(%rdx,%rax,4) - LONG $0x20c08348 // addq $32, %rax - WORD $0x3949; BYTE $0xc2 // cmpq %rax, %r10 + LONG $0x597ca1c4; WORD $0x970c // vmulps (%rdi,%r10,4), %ymm0, %ymm1 + LONG $0x5874a1c4; WORD $0x920c // vaddps (%rdx,%r10,4), %ymm1, %ymm1 + LONG $0x597ca1c4; WORD $0x9754; BYTE $0x20 // vmulps 32(%rdi,%r10,4), %ymm0, %ymm2 + LONG $0x586ca1c4; WORD $0x9254; BYTE $0x20 // vaddps 32(%rdx,%r10,4), %ymm2, %ymm2 + LONG $0x597ca1c4; WORD $0x975c; BYTE $0x40 // vmulps 64(%rdi,%r10,4), %ymm0, %ymm3 + LONG $0x5864a1c4; WORD $0x925c; BYTE $0x40 // vaddps 64(%rdx,%r10,4), %ymm3, %ymm3 + LONG $0x597ca1c4; WORD $0x9764; BYTE $0x60 // vmulps 96(%rdi,%r10,4), %ymm0, %ymm4 + LONG $0x585ca1c4; WORD $0x9264; BYTE $0x60 // vaddps 96(%rdx,%r10,4), %ymm4, %ymm4 + LONG $0x117ca1c4; WORD $0x920c // vmovups %ymm1, (%rdx,%r10,4) + LONG $0x117ca1c4; WORD $0x9254; BYTE $0x20 // vmovups %ymm2, 32(%rdx,%r10,4) + LONG $0x117ca1c4; WORD $0x925c; BYTE $0x40 // vmovups %ymm3, 64(%rdx,%r10,4) + LONG $0x117ca1c4; WORD $0x9264; BYTE $0x60 // vmovups %ymm4, 96(%rdx,%r10,4) + LONG $0x20c28349 // addq $32, %r10 + WORD $0x394d; BYTE $0xd0 // cmpq %r10, %r8 JNE LBB0_12 - WORD $0x854d; BYTE $0xc9 // testq %r9, %r9 + WORD $0x854d; BYTE $0xc9 // testq %r9, %r9 JE LBB0_18 LBB0_14: - WORD $0x2944; BYTE $0xd1 // subl %r10d, %ecx - LONG $0x01428d49 // leaq 1(%r10), %rax + WORD $0x2944; BYTE $0xc1 // subl %r8d, %ecx + LONG $0x01488d4d // leaq 1(%r8), %r9 WORD $0xc1f6; BYTE $0x01 // testb $1, %cl JE LBB0_16 - LONG $0x107aa1c4; WORD $0x9704 // vmovss (%rdi,%r10,4), %xmm0 + LONG $0x107aa1c4; WORD $0x8704 // vmovss (%rdi,%r8,4), %xmm0 # xmm0 = mem[0],zero,zero,zero LONG $0x0659fac5 // vmulss (%rsi), %xmm0, %xmm0 - LONG $0x587aa1c4; WORD $0x9204 // vaddss (%rdx,%r10,4), %xmm0, %xmm0 - LONG $0x117aa1c4; WORD $0x9204 // vmovss %xmm0, (%rdx,%r10,4) - WORD $0x8949; BYTE $0xc2 // movq %rax, %r10 + LONG $0x587aa1c4; WORD $0x8204 // vaddss (%rdx,%r8,4), %xmm0, %xmm0 + LONG $0x117aa1c4; WORD $0x8204 // vmovss %xmm0, (%rdx,%r8,4) + WORD $0x894d; BYTE $0xc8 // movq %r9, %r8 LBB0_16: - WORD $0x3949; BYTE $0xc0 // cmpq %rax, %r8 + WORD $0x394c; BYTE $0xc8 // cmpq %r9, %rax JE LBB0_18 LBB0_17: - LONG $0x107aa1c4; WORD $0x9704 // vmovss (%rdi,%r10,4), %xmm0 + LONG $0x107aa1c4; WORD $0x8704 // vmovss (%rdi,%r8,4), %xmm0 # xmm0 = mem[0],zero,zero,zero LONG $0x0659fac5 // vmulss (%rsi), %xmm0, %xmm0 - LONG $0x587aa1c4; WORD $0x9204 // vaddss (%rdx,%r10,4), %xmm0, %xmm0 - LONG $0x117aa1c4; WORD $0x9204 // vmovss %xmm0, (%rdx,%r10,4) - LONG $0x107aa1c4; WORD $0x9744; BYTE $0x04 // vmovss 4(%rdi,%r10,4), %xmm0 + LONG $0x587aa1c4; WORD $0x8204 // vaddss (%rdx,%r8,4), %xmm0, %xmm0 + LONG $0x117aa1c4; WORD $0x8204 // vmovss %xmm0, (%rdx,%r8,4) + LONG $0x107aa1c4; WORD $0x8744; BYTE $0x04 // vmovss 4(%rdi,%r8,4), %xmm0 # xmm0 = mem[0],zero,zero,zero LONG $0x0659fac5 // vmulss (%rsi), %xmm0, %xmm0 - LONG $0x587aa1c4; WORD $0x9244; BYTE $0x04 // vaddss 4(%rdx,%r10,4), %xmm0, %xmm0 - LONG $0x117aa1c4; WORD $0x9244; BYTE $0x04 // vmovss %xmm0, 4(%rdx,%r10,4) - LONG $0x02c28349 // addq $2, %r10 - WORD $0x394d; BYTE $0xd0 // cmpq %r10, %r8 + LONG $0x587aa1c4; WORD $0x8244; BYTE $0x04 // vaddss 4(%rdx,%r8,4), %xmm0, %xmm0 + LONG $0x117aa1c4; WORD $0x8244; BYTE $0x04 // vmovss %xmm0, 4(%rdx,%r8,4) + LONG $0x02c08349 // addq $2, %r8 + WORD $0x394c; BYTE $0xc0 // cmpq %r8, %rax JNE LBB0_17 LBB0_18: - LONG $0xf0658d48 // leaq -16(%rbp), %rsp + LONG $0xf8658d48 // leaq -8(%rbp), %rsp BYTE $0x5b // popq %rbx - WORD $0x5e41 // popq %r14 BYTE $0x5d // popq %rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // retq @@ -143,24 +140,23 @@ TEXT ·_mm256_mul_const_to(SB), $0-32 MOVQ b+8(FP), SI MOVQ c+16(FP), DX MOVQ n+24(FP), CX - BYTE $0x55 // pushq %rbp - WORD $0x8948; BYTE $0xe5 // movq %rsp, %rbp - BYTE $0x53 // pushq %rbx - LONG $0xf8e48348 // andq $-8, %rsp - LONG $0x07418d48 // leaq 7(%rcx), %rax - WORD $0x8548; BYTE $0xc9 // testq %rcx, %rcx - LONG $0xc1490f48 // cmovnsq %rcx, %rax - WORD $0x8949; BYTE $0xc0 // movq %rax, %r8 - LONG $0x03f8c149 // sarq $3, %r8 - LONG $0xf8e08348 // andq $-8, %rax - WORD $0x2948; BYTE $0xc1 // subq %rax, %rcx - WORD $0x8545; BYTE $0xc0 // testl %r8d, %r8d + BYTE $0x55 // pushq %rbp + WORD $0x8948; BYTE $0xe5 // movq %rsp, %rbp + BYTE $0x53 // pushq %rbx + LONG $0xf8e48348 // andq $-8, %rsp + LONG $0x07418d4c // leaq 7(%rcx), %r8 + WORD $0x8548; BYTE $0xc9 // testq %rcx, %rcx + LONG $0xc1490f4c // cmovnsq %rcx, %r8 + WORD $0x894c; BYTE $0xc0 // movq %r8, %rax + LONG $0x03e8c148 // shrq $3, %rax + LONG $0xf8e08349 // andq $-8, %r8 + WORD $0x294c; BYTE $0xc1 // subq %r8, %rcx + WORD $0xc085 // testl %eax, %eax JLE LBB1_6 - LONG $0x01f88341 // cmpl $1, %r8d + WORD $0xf883; BYTE $0x01 // cmpl $1, %eax JE LBB1_4 - WORD $0x8944; BYTE $0xc0 // movl %r8d, %eax - WORD $0xe083; BYTE $0xfe // andl $-2, %eax - WORD $0xd8f7 // negl %eax + WORD $0x8941; BYTE $0xc0 // movl %eax, %r8d + LONG $0xfee08141; WORD $0xffff; BYTE $0x7f // andl $2147483646, %r8d # imm = 0x7FFFFFFE LBB1_3: LONG $0x187de2c4; BYTE $0x06 // vbroadcastss (%rsi), %ymm0 @@ -171,48 +167,48 @@ LBB1_3: LONG $0x4211fcc5; BYTE $0x20 // vmovups %ymm0, 32(%rdx) LONG $0x40c78348 // addq $64, %rdi LONG $0x40c28348 // addq $64, %rdx - WORD $0xc083; BYTE $0x02 // addl $2, %eax + LONG $0xfec08341 // addl $-2, %r8d JNE LBB1_3 LBB1_4: - LONG $0x01c0f641 // testb $1, %r8b + WORD $0x01a8 // testb $1, %al JE LBB1_6 LONG $0x187de2c4; BYTE $0x06 // vbroadcastss (%rsi), %ymm0 LONG $0x0759fcc5 // vmulps (%rdi), %ymm0, %ymm0 LONG $0x0211fcc5 // vmovups %ymm0, (%rdx) - LONG $0x20c28348 // addq $32, %rdx LONG $0x20c78348 // addq $32, %rdi + LONG $0x20c28348 // addq $32, %rdx LBB1_6: WORD $0xc985 // testl %ecx, %ecx - JLE LBB1_18 - WORD $0x8941; BYTE $0xc8 // movl %ecx, %r8d - LONG $0x20f88349 // cmpq $32, %r8 + JLE LBB1_19 + WORD $0xc889 // movl %ecx, %eax + LONG $0x20f88348 // cmpq $32, %rax JAE LBB1_9 - WORD $0xc031 // xorl %eax, %eax + WORD $0x3145; BYTE $0xc0 // xorl %r8d, %r8d JMP LBB1_14 LBB1_9: - LONG $0x82048d4a // leaq (%rdx,%r8,4), %rax - LONG $0x870c8d4e // leaq (%rdi,%r8,4), %r9 + LONG $0x82048d4c // leaq (%rdx,%rax,4), %r8 + LONG $0x870c8d4c // leaq (%rdi,%rax,4), %r9 LONG $0x04568d4c // leaq 4(%rsi), %r10 WORD $0x394c; BYTE $0xca // cmpq %r9, %rdx LONG $0xc3920f41 // setb %r11b - WORD $0x3948; BYTE $0xc7 // cmpq %rax, %rdi + WORD $0x394c; BYTE $0xc7 // cmpq %r8, %rdi WORD $0x920f; BYTE $0xc3 // setb %bl WORD $0x394c; BYTE $0xd2 // cmpq %r10, %rdx LONG $0xc1920f41 // setb %r9b - WORD $0x3948; BYTE $0xf0 // cmpq %rsi, %rax + WORD $0x3949; BYTE $0xf0 // cmpq %rsi, %r8 LONG $0xc2970f41 // seta %r10b - WORD $0xc031 // xorl %eax, %eax + WORD $0x3145; BYTE $0xc0 // xorl %r8d, %r8d WORD $0x8441; BYTE $0xdb // testb %bl, %r11b JNE LBB1_14 WORD $0x2045; BYTE $0xd1 // andb %r10b, %r9b JNE LBB1_14 WORD $0x8941; BYTE $0xc9 // movl %ecx, %r9d LONG $0x1fe18341 // andl $31, %r9d - WORD $0x894c; BYTE $0xc0 // movq %r8, %rax - WORD $0x294c; BYTE $0xc8 // subq %r9, %rax + WORD $0x8949; BYTE $0xc0 // movq %rax, %r8 + WORD $0x294d; BYTE $0xc8 // subq %r9, %r8 LONG $0x187de2c4; BYTE $0x06 // vbroadcastss (%rsi), %ymm0 WORD $0x3145; BYTE $0xd2 // xorl %r10d, %r10d @@ -226,49 +222,49 @@ LBB1_12: LONG $0x117ca1c4; WORD $0x925c; BYTE $0x40 // vmovups %ymm3, 64(%rdx,%r10,4) LONG $0x117ca1c4; WORD $0x9264; BYTE $0x60 // vmovups %ymm4, 96(%rdx,%r10,4) LONG $0x20c28349 // addq $32, %r10 - WORD $0x394c; BYTE $0xd0 // cmpq %r10, %rax + WORD $0x394d; BYTE $0xd0 // cmpq %r10, %r8 JNE LBB1_12 WORD $0x854d; BYTE $0xc9 // testq %r9, %r9 - JE LBB1_18 + JE LBB1_19 LBB1_14: - WORD $0xc129 // subl %eax, %ecx - WORD $0x8949; BYTE $0xc1 // movq %rax, %r9 - WORD $0xf749; BYTE $0xd1 // notq %r9 - WORD $0x014d; BYTE $0xc1 // addq %r8, %r9 - LONG $0x03e18348 // andq $3, %rcx - JE LBB1_16 - -LBB1_15: - LONG $0x0410fac5; BYTE $0x87 // vmovss (%rdi,%rax,4), %xmm0 - LONG $0x0659fac5 // vmulss (%rsi), %xmm0, %xmm0 - LONG $0x0411fac5; BYTE $0x82 // vmovss %xmm0, (%rdx,%rax,4) - LONG $0x01c08348 // addq $1, %rax - LONG $0xffc18348 // addq $-1, %rcx - JNE LBB1_15 + WORD $0x2944; BYTE $0xc1 // subl %r8d, %ecx + WORD $0x894d; BYTE $0xc1 // movq %r8, %r9 + WORD $0xe183; BYTE $0x03 // andl $3, %ecx + JE LBB1_17 + WORD $0x894d; BYTE $0xc1 // movq %r8, %r9 LBB1_16: - LONG $0x03f98349 // cmpq $3, %r9 - JB LBB1_18 + LONG $0x107aa1c4; WORD $0x8f04 // vmovss (%rdi,%r9,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + LONG $0x0659fac5 // vmulss (%rsi), %xmm0, %xmm0 + LONG $0x117aa1c4; WORD $0x8a04 // vmovss %xmm0, (%rdx,%r9,4) + WORD $0xff49; BYTE $0xc1 // incq %r9 + WORD $0xff48; BYTE $0xc9 // decq %rcx + JNE LBB1_16 LBB1_17: - LONG $0x0410fac5; BYTE $0x87 // vmovss (%rdi,%rax,4), %xmm0 - LONG $0x0659fac5 // vmulss (%rsi), %xmm0, %xmm0 - LONG $0x0411fac5; BYTE $0x82 // vmovss %xmm0, (%rdx,%rax,4) - LONG $0x4410fac5; WORD $0x0487 // vmovss 4(%rdi,%rax,4), %xmm0 - LONG $0x0659fac5 // vmulss (%rsi), %xmm0, %xmm0 - LONG $0x4411fac5; WORD $0x0482 // vmovss %xmm0, 4(%rdx,%rax,4) - LONG $0x4410fac5; WORD $0x0887 // vmovss 8(%rdi,%rax,4), %xmm0 - LONG $0x0659fac5 // vmulss (%rsi), %xmm0, %xmm0 - LONG $0x4411fac5; WORD $0x0882 // vmovss %xmm0, 8(%rdx,%rax,4) - LONG $0x4410fac5; WORD $0x0c87 // vmovss 12(%rdi,%rax,4), %xmm0 - LONG $0x0659fac5 // vmulss (%rsi), %xmm0, %xmm0 - LONG $0x4411fac5; WORD $0x0c82 // vmovss %xmm0, 12(%rdx,%rax,4) - LONG $0x04c08348 // addq $4, %rax - WORD $0x3949; BYTE $0xc0 // cmpq %rax, %r8 - JNE LBB1_17 + WORD $0x2949; BYTE $0xc0 // subq %rax, %r8 + LONG $0xfcf88349 // cmpq $-4, %r8 + JA LBB1_19 LBB1_18: + LONG $0x107aa1c4; WORD $0x8f04 // vmovss (%rdi,%r9,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + LONG $0x0659fac5 // vmulss (%rsi), %xmm0, %xmm0 + LONG $0x117aa1c4; WORD $0x8a04 // vmovss %xmm0, (%rdx,%r9,4) + LONG $0x107aa1c4; WORD $0x8f44; BYTE $0x04 // vmovss 4(%rdi,%r9,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + LONG $0x0659fac5 // vmulss (%rsi), %xmm0, %xmm0 + LONG $0x117aa1c4; WORD $0x8a44; BYTE $0x04 // vmovss %xmm0, 4(%rdx,%r9,4) + LONG $0x107aa1c4; WORD $0x8f44; BYTE $0x08 // vmovss 8(%rdi,%r9,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + LONG $0x0659fac5 // vmulss (%rsi), %xmm0, %xmm0 + LONG $0x117aa1c4; WORD $0x8a44; BYTE $0x08 // vmovss %xmm0, 8(%rdx,%r9,4) + LONG $0x107aa1c4; WORD $0x8f44; BYTE $0x0c // vmovss 12(%rdi,%r9,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + LONG $0x0659fac5 // vmulss (%rsi), %xmm0, %xmm0 + LONG $0x117aa1c4; WORD $0x8a44; BYTE $0x0c // vmovss %xmm0, 12(%rdx,%r9,4) + LONG $0x04c18349 // addq $4, %r9 + WORD $0x394c; BYTE $0xc8 // cmpq %r9, %rax + JNE LBB1_18 + +LBB1_19: LONG $0xf8658d48 // leaq -8(%rbp), %rsp BYTE $0x5b // popq %rbx BYTE $0x5d // popq %rbp @@ -279,23 +275,22 @@ TEXT ·_mm256_mul_const(SB), $0-32 MOVQ a+0(FP), DI MOVQ b+8(FP), SI MOVQ n+16(FP), DX - BYTE $0x55 // pushq %rbp - WORD $0x8948; BYTE $0xe5 // movq %rsp, %rbp - LONG $0xf8e48348 // andq $-8, %rsp - LONG $0x074a8d48 // leaq 7(%rdx), %rcx - WORD $0x8548; BYTE $0xd2 // testq %rdx, %rdx - LONG $0xca490f48 // cmovnsq %rdx, %rcx - WORD $0x8948; BYTE $0xc8 // movq %rcx, %rax - LONG $0x03f8c148 // sarq $3, %rax - LONG $0xf8e18348 // andq $-8, %rcx - WORD $0x2948; BYTE $0xca // subq %rcx, %rdx - WORD $0xc085 // testl %eax, %eax + BYTE $0x55 // pushq %rbp + WORD $0x8948; BYTE $0xe5 // movq %rsp, %rbp + LONG $0xf8e48348 // andq $-8, %rsp + LONG $0x074a8d48 // leaq 7(%rdx), %rcx + WORD $0x8548; BYTE $0xd2 // testq %rdx, %rdx + LONG $0xca490f48 // cmovnsq %rdx, %rcx + WORD $0x8948; BYTE $0xc8 // movq %rcx, %rax + LONG $0x03e8c148 // shrq $3, %rax + LONG $0xf8e18348 // andq $-8, %rcx + WORD $0x2948; BYTE $0xca // subq %rcx, %rdx + WORD $0xc085 // testl %eax, %eax JLE LBB2_6 - WORD $0xf883; BYTE $0x01 // cmpl $1, %eax + WORD $0xf883; BYTE $0x01 // cmpl $1, %eax JE LBB2_4 - WORD $0xc189 // movl %eax, %ecx - WORD $0xe183; BYTE $0xfe // andl $-2, %ecx - WORD $0xd9f7 // negl %ecx + WORD $0xc189 // movl %eax, %ecx + LONG $0xfffee181; WORD $0x7fff // andl $2147483646, %ecx # imm = 0x7FFFFFFE LBB2_3: LONG $0x187de2c4; BYTE $0x06 // vbroadcastss (%rsi), %ymm0 @@ -305,7 +300,7 @@ LBB2_3: LONG $0x4759fcc5; BYTE $0x20 // vmulps 32(%rdi), %ymm0, %ymm0 LONG $0x4711fcc5; BYTE $0x20 // vmovups %ymm0, 32(%rdi) LONG $0x40c78348 // addq $64, %rdi - WORD $0xc183; BYTE $0x02 // addl $2, %ecx + WORD $0xc183; BYTE $0xfe // addl $-2, %ecx JNE LBB2_3 LBB2_4: @@ -318,15 +313,15 @@ LBB2_4: LBB2_6: WORD $0xd285 // testl %edx, %edx - JLE LBB2_19 - WORD $0x8941; BYTE $0xd1 // movl %edx, %r9d - LONG $0x20f98349 // cmpq $32, %r9 + JLE LBB2_20 + WORD $0xd089 // movl %edx, %eax + LONG $0x20f88348 // cmpq $32, %rax JB LBB2_8 - LONG $0x04468d48 // leaq 4(%rsi), %rax - WORD $0x3948; BYTE $0xc7 // cmpq %rax, %rdi + LONG $0x044e8d48 // leaq 4(%rsi), %rcx + WORD $0x3948; BYTE $0xcf // cmpq %rcx, %rdi JAE LBB2_12 - LONG $0x8f048d4a // leaq (%rdi,%r9,4), %rax - WORD $0x3948; BYTE $0xf0 // cmpq %rsi, %rax + LONG $0x870c8d48 // leaq (%rdi,%rax,4), %rcx + WORD $0x3948; BYTE $0xf1 // cmpq %rsi, %rcx JBE LBB2_12 LBB2_8: @@ -335,95 +330,92 @@ LBB2_8: LBB2_15: WORD $0xca29 // subl %ecx, %edx WORD $0x8949; BYTE $0xc8 // movq %rcx, %r8 - WORD $0xf749; BYTE $0xd0 // notq %r8 - WORD $0x014d; BYTE $0xc8 // addq %r9, %r8 - LONG $0x03e28348 // andq $3, %rdx - JE LBB2_17 - -LBB2_16: - LONG $0x0610fac5 // vmovss (%rsi), %xmm0 - LONG $0x0459fac5; BYTE $0x8f // vmulss (%rdi,%rcx,4), %xmm0, %xmm0 - LONG $0x0411fac5; BYTE $0x8f // vmovss %xmm0, (%rdi,%rcx,4) - LONG $0x01c18348 // addq $1, %rcx - LONG $0xffc28348 // addq $-1, %rdx - JNE LBB2_16 + WORD $0xe283; BYTE $0x03 // andl $3, %edx + JE LBB2_18 + WORD $0x8949; BYTE $0xc8 // movq %rcx, %r8 LBB2_17: - LONG $0x03f88349 // cmpq $3, %r8 - JB LBB2_19 + LONG $0x0610fac5 // vmovss (%rsi), %xmm0 # xmm0 = mem[0],zero,zero,zero + LONG $0x597aa1c4; WORD $0x8704 // vmulss (%rdi,%r8,4), %xmm0, %xmm0 + LONG $0x117aa1c4; WORD $0x8704 // vmovss %xmm0, (%rdi,%r8,4) + WORD $0xff49; BYTE $0xc0 // incq %r8 + WORD $0xff48; BYTE $0xca // decq %rdx + JNE LBB2_17 LBB2_18: - LONG $0x0610fac5 // vmovss (%rsi), %xmm0 - LONG $0x0459fac5; BYTE $0x8f // vmulss (%rdi,%rcx,4), %xmm0, %xmm0 - LONG $0x0411fac5; BYTE $0x8f // vmovss %xmm0, (%rdi,%rcx,4) - LONG $0x0610fac5 // vmovss (%rsi), %xmm0 - LONG $0x4459fac5; WORD $0x048f // vmulss 4(%rdi,%rcx,4), %xmm0, %xmm0 - LONG $0x4411fac5; WORD $0x048f // vmovss %xmm0, 4(%rdi,%rcx,4) - LONG $0x0610fac5 // vmovss (%rsi), %xmm0 - LONG $0x4459fac5; WORD $0x088f // vmulss 8(%rdi,%rcx,4), %xmm0, %xmm0 - LONG $0x4411fac5; WORD $0x088f // vmovss %xmm0, 8(%rdi,%rcx,4) - LONG $0x0610fac5 // vmovss (%rsi), %xmm0 - LONG $0x4459fac5; WORD $0x0c8f // vmulss 12(%rdi,%rcx,4), %xmm0, %xmm0 - LONG $0x4411fac5; WORD $0x0c8f // vmovss %xmm0, 12(%rdi,%rcx,4) - LONG $0x04c18348 // addq $4, %rcx - WORD $0x3949; BYTE $0xc9 // cmpq %rcx, %r9 - JNE LBB2_18 - JMP LBB2_19 + WORD $0x2948; BYTE $0xc1 // subq %rax, %rcx + LONG $0xfcf98348 // cmpq $-4, %rcx + JA LBB2_20 + +LBB2_19: + LONG $0x0610fac5 // vmovss (%rsi), %xmm0 # xmm0 = mem[0],zero,zero,zero + LONG $0x597aa1c4; WORD $0x8704 // vmulss (%rdi,%r8,4), %xmm0, %xmm0 + LONG $0x117aa1c4; WORD $0x8704 // vmovss %xmm0, (%rdi,%r8,4) + LONG $0x0610fac5 // vmovss (%rsi), %xmm0 # xmm0 = mem[0],zero,zero,zero + LONG $0x597aa1c4; WORD $0x8744; BYTE $0x04 // vmulss 4(%rdi,%r8,4), %xmm0, %xmm0 + LONG $0x117aa1c4; WORD $0x8744; BYTE $0x04 // vmovss %xmm0, 4(%rdi,%r8,4) + LONG $0x0610fac5 // vmovss (%rsi), %xmm0 # xmm0 = mem[0],zero,zero,zero + LONG $0x597aa1c4; WORD $0x8744; BYTE $0x08 // vmulss 8(%rdi,%r8,4), %xmm0, %xmm0 + LONG $0x117aa1c4; WORD $0x8744; BYTE $0x08 // vmovss %xmm0, 8(%rdi,%r8,4) + LONG $0x0610fac5 // vmovss (%rsi), %xmm0 # xmm0 = mem[0],zero,zero,zero + LONG $0x597aa1c4; WORD $0x8744; BYTE $0x0c // vmulss 12(%rdi,%r8,4), %xmm0, %xmm0 + LONG $0x117aa1c4; WORD $0x8744; BYTE $0x0c // vmovss %xmm0, 12(%rdi,%r8,4) + LONG $0x04c08349 // addq $4, %r8 + WORD $0x394c; BYTE $0xc0 // cmpq %r8, %rax + JNE LBB2_19 + +LBB2_20: + WORD $0x8948; BYTE $0xec // movq %rbp, %rsp + BYTE $0x5d // popq %rbp + WORD $0xf8c5; BYTE $0x77 // vzeroupper + BYTE $0xc3 // retq LBB2_12: WORD $0x8941; BYTE $0xd0 // movl %edx, %r8d LONG $0x1fe08341 // andl $31, %r8d - WORD $0x894c; BYTE $0xc9 // movq %r9, %rcx + WORD $0x8948; BYTE $0xc1 // movq %rax, %rcx WORD $0x294c; BYTE $0xc1 // subq %r8, %rcx LONG $0x187de2c4; BYTE $0x06 // vbroadcastss (%rsi), %ymm0 - WORD $0xc031 // xorl %eax, %eax + WORD $0x3145; BYTE $0xc9 // xorl %r9d, %r9d LBB2_13: - LONG $0x0c59fcc5; BYTE $0x87 // vmulps (%rdi,%rax,4), %ymm0, %ymm1 - LONG $0x5459fcc5; WORD $0x2087 // vmulps 32(%rdi,%rax,4), %ymm0, %ymm2 - LONG $0x5c59fcc5; WORD $0x4087 // vmulps 64(%rdi,%rax,4), %ymm0, %ymm3 - LONG $0x6459fcc5; WORD $0x6087 // vmulps 96(%rdi,%rax,4), %ymm0, %ymm4 - LONG $0x0c11fcc5; BYTE $0x87 // vmovups %ymm1, (%rdi,%rax,4) - LONG $0x5411fcc5; WORD $0x2087 // vmovups %ymm2, 32(%rdi,%rax,4) - LONG $0x5c11fcc5; WORD $0x4087 // vmovups %ymm3, 64(%rdi,%rax,4) - LONG $0x6411fcc5; WORD $0x6087 // vmovups %ymm4, 96(%rdi,%rax,4) - LONG $0x20c08348 // addq $32, %rax - WORD $0x3948; BYTE $0xc1 // cmpq %rax, %rcx + LONG $0x597ca1c4; WORD $0x8f0c // vmulps (%rdi,%r9,4), %ymm0, %ymm1 + LONG $0x597ca1c4; WORD $0x8f54; BYTE $0x20 // vmulps 32(%rdi,%r9,4), %ymm0, %ymm2 + LONG $0x597ca1c4; WORD $0x8f5c; BYTE $0x40 // vmulps 64(%rdi,%r9,4), %ymm0, %ymm3 + LONG $0x597ca1c4; WORD $0x8f64; BYTE $0x60 // vmulps 96(%rdi,%r9,4), %ymm0, %ymm4 + LONG $0x117ca1c4; WORD $0x8f0c // vmovups %ymm1, (%rdi,%r9,4) + LONG $0x117ca1c4; WORD $0x8f54; BYTE $0x20 // vmovups %ymm2, 32(%rdi,%r9,4) + LONG $0x117ca1c4; WORD $0x8f5c; BYTE $0x40 // vmovups %ymm3, 64(%rdi,%r9,4) + LONG $0x117ca1c4; WORD $0x8f64; BYTE $0x60 // vmovups %ymm4, 96(%rdi,%r9,4) + LONG $0x20c18349 // addq $32, %r9 + WORD $0x394c; BYTE $0xc9 // cmpq %r9, %rcx JNE LBB2_13 - WORD $0x854d; BYTE $0xc0 // testq %r8, %r8 + WORD $0x854d; BYTE $0xc0 // testq %r8, %r8 JNE LBB2_15 - -LBB2_19: - WORD $0x8948; BYTE $0xec // movq %rbp, %rsp - BYTE $0x5d // popq %rbp - WORD $0xf8c5; BYTE $0x77 // vzeroupper - BYTE $0xc3 // retq + JMP LBB2_20 TEXT ·_mm256_mul_to(SB), $0-32 MOVQ a+0(FP), DI MOVQ b+8(FP), SI MOVQ c+16(FP), DX MOVQ n+24(FP), CX - BYTE $0x55 // pushq %rbp - WORD $0x8948; BYTE $0xe5 // movq %rsp, %rbp - BYTE $0x53 // pushq %rbx - LONG $0xf8e48348 // andq $-8, %rsp - LONG $0x07418d4c // leaq 7(%rcx), %r8 - WORD $0x8548; BYTE $0xc9 // testq %rcx, %rcx - LONG $0xc1490f4c // cmovnsq %rcx, %r8 - WORD $0x894c; BYTE $0xc0 // movq %r8, %rax - LONG $0x03f8c148 // sarq $3, %rax - LONG $0xf8e08349 // andq $-8, %r8 - WORD $0x294c; BYTE $0xc1 // subq %r8, %rcx - WORD $0xc085 // testl %eax, %eax + BYTE $0x55 // pushq %rbp + WORD $0x8948; BYTE $0xe5 // movq %rsp, %rbp + LONG $0xf8e48348 // andq $-8, %rsp + LONG $0x07418d48 // leaq 7(%rcx), %rax + WORD $0x8548; BYTE $0xc9 // testq %rcx, %rcx + LONG $0xc1490f48 // cmovnsq %rcx, %rax + WORD $0x8949; BYTE $0xc0 // movq %rax, %r8 + LONG $0x03e8c149 // shrq $3, %r8 + LONG $0xf8e08348 // andq $-8, %rax + WORD $0x2948; BYTE $0xc1 // subq %rax, %rcx + WORD $0x8545; BYTE $0xc0 // testl %r8d, %r8d JLE LBB3_6 - LONG $0xff488d44 // leal -1(%rax), %r9d - WORD $0x8941; BYTE $0xc0 // movl %eax, %r8d - LONG $0x03e08341 // andl $3, %r8d - LONG $0x03f98341 // cmpl $3, %r9d + WORD $0x8944; BYTE $0xc0 // movl %r8d, %eax + WORD $0xe083; BYTE $0x03 // andl $3, %eax + LONG $0x04f88341 // cmpl $4, %r8d JB LBB3_4 - WORD $0xe083; BYTE $0xfc // andl $-4, %eax - WORD $0xd8f7 // negl %eax + LONG $0xfce08141; WORD $0xffff; BYTE $0x7f // andl $2147483644, %r8d # imm = 0x7FFFFFFC LBB3_3: LONG $0x0710fcc5 // vmovups (%rdi), %ymm0 @@ -441,11 +433,11 @@ LBB3_3: LONG $0x80ef8348 // subq $-128, %rdi LONG $0x80ee8348 // subq $-128, %rsi LONG $0x80ea8348 // subq $-128, %rdx - WORD $0xc083; BYTE $0x04 // addl $4, %eax + LONG $0xfcc08341 // addl $-4, %r8d JNE LBB3_3 LBB3_4: - WORD $0x8545; BYTE $0xc0 // testl %r8d, %r8d + WORD $0xc085 // testl %eax, %eax JE LBB3_6 LBB3_5: @@ -455,40 +447,33 @@ LBB3_5: LONG $0x20c78348 // addq $32, %rdi LONG $0x20c68348 // addq $32, %rsi LONG $0x20c28348 // addq $32, %rdx - LONG $0xffc08341 // addl $-1, %r8d + WORD $0xc8ff // decl %eax JNE LBB3_5 LBB3_6: WORD $0xc985 // testl %ecx, %ecx - JLE LBB3_18 - WORD $0x8941; BYTE $0xc8 // movl %ecx, %r8d - LONG $0x20f88349 // cmpq $32, %r8 + JLE LBB3_19 + WORD $0xc889 // movl %ecx, %eax + LONG $0x20f88348 // cmpq $32, %rax JAE LBB3_9 - WORD $0xc031 // xorl %eax, %eax + WORD $0x3145; BYTE $0xc0 // xorl %r8d, %r8d JMP LBB3_14 LBB3_9: - LONG $0x82048d4a // leaq (%rdx,%r8,4), %rax - LONG $0x870c8d4e // leaq (%rdi,%r8,4), %r9 - LONG $0x86148d4e // leaq (%rsi,%r8,4), %r10 - WORD $0x394c; BYTE $0xca // cmpq %r9, %rdx - LONG $0xc3920f41 // setb %r11b - WORD $0x3948; BYTE $0xc7 // cmpq %rax, %rdi - WORD $0x920f; BYTE $0xc3 // setb %bl - WORD $0x394c; BYTE $0xd2 // cmpq %r10, %rdx - LONG $0xc1920f41 // setb %r9b - WORD $0x3948; BYTE $0xc6 // cmpq %rax, %rsi - LONG $0xc2920f41 // setb %r10b - WORD $0xc031 // xorl %eax, %eax - WORD $0x8441; BYTE $0xdb // testb %bl, %r11b - JNE LBB3_14 - WORD $0x2045; BYTE $0xd1 // andb %r10b, %r9b - JNE LBB3_14 - WORD $0x8941; BYTE $0xc9 // movl %ecx, %r9d - LONG $0x1fe18341 // andl $31, %r9d - WORD $0x894c; BYTE $0xc0 // movq %r8, %rax - WORD $0x294c; BYTE $0xc8 // subq %r9, %rax - WORD $0x3145; BYTE $0xd2 // xorl %r10d, %r10d + WORD $0x8949; BYTE $0xd1 // movq %rdx, %r9 + WORD $0x2949; BYTE $0xf9 // subq %rdi, %r9 + WORD $0x3145; BYTE $0xc0 // xorl %r8d, %r8d + LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmpq $128, %r9 + JB LBB3_14 + WORD $0x8949; BYTE $0xd1 // movq %rdx, %r9 + WORD $0x2949; BYTE $0xf1 // subq %rsi, %r9 + LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmpq $128, %r9 + JB LBB3_14 + WORD $0x8941; BYTE $0xc9 // movl %ecx, %r9d + LONG $0x1fe18341 // andl $31, %r9d + WORD $0x8949; BYTE $0xc0 // movq %rax, %r8 + WORD $0x294d; BYTE $0xc8 // subq %r9, %r8 + WORD $0x3145; BYTE $0xd2 // xorl %r10d, %r10d LBB3_12: LONG $0x107ca1c4; WORD $0x9704 // vmovups (%rdi,%r10,4), %ymm0 @@ -504,51 +489,50 @@ LBB3_12: LONG $0x117ca1c4; WORD $0x9254; BYTE $0x40 // vmovups %ymm2, 64(%rdx,%r10,4) LONG $0x117ca1c4; WORD $0x925c; BYTE $0x60 // vmovups %ymm3, 96(%rdx,%r10,4) LONG $0x20c28349 // addq $32, %r10 - WORD $0x394c; BYTE $0xd0 // cmpq %r10, %rax + WORD $0x394d; BYTE $0xd0 // cmpq %r10, %r8 JNE LBB3_12 WORD $0x854d; BYTE $0xc9 // testq %r9, %r9 - JE LBB3_18 + JE LBB3_19 LBB3_14: - WORD $0xc129 // subl %eax, %ecx - WORD $0x8949; BYTE $0xc1 // movq %rax, %r9 - WORD $0xf749; BYTE $0xd1 // notq %r9 - WORD $0x014d; BYTE $0xc1 // addq %r8, %r9 - LONG $0x03e18348 // andq $3, %rcx - JE LBB3_16 - -LBB3_15: - LONG $0x0410fac5; BYTE $0x87 // vmovss (%rdi,%rax,4), %xmm0 - LONG $0x0459fac5; BYTE $0x86 // vmulss (%rsi,%rax,4), %xmm0, %xmm0 - LONG $0x0411fac5; BYTE $0x82 // vmovss %xmm0, (%rdx,%rax,4) - LONG $0x01c08348 // addq $1, %rax - LONG $0xffc18348 // addq $-1, %rcx - JNE LBB3_15 + WORD $0x2944; BYTE $0xc1 // subl %r8d, %ecx + WORD $0x894d; BYTE $0xc1 // movq %r8, %r9 + WORD $0xe183; BYTE $0x03 // andl $3, %ecx + JE LBB3_17 + WORD $0x894d; BYTE $0xc1 // movq %r8, %r9 LBB3_16: - LONG $0x03f98349 // cmpq $3, %r9 - JB LBB3_18 + LONG $0x107aa1c4; WORD $0x8f04 // vmovss (%rdi,%r9,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + LONG $0x597aa1c4; WORD $0x8e04 // vmulss (%rsi,%r9,4), %xmm0, %xmm0 + LONG $0x117aa1c4; WORD $0x8a04 // vmovss %xmm0, (%rdx,%r9,4) + WORD $0xff49; BYTE $0xc1 // incq %r9 + WORD $0xff48; BYTE $0xc9 // decq %rcx + JNE LBB3_16 LBB3_17: - LONG $0x0410fac5; BYTE $0x87 // vmovss (%rdi,%rax,4), %xmm0 - LONG $0x0459fac5; BYTE $0x86 // vmulss (%rsi,%rax,4), %xmm0, %xmm0 - LONG $0x0411fac5; BYTE $0x82 // vmovss %xmm0, (%rdx,%rax,4) - LONG $0x4410fac5; WORD $0x0487 // vmovss 4(%rdi,%rax,4), %xmm0 - LONG $0x4459fac5; WORD $0x0486 // vmulss 4(%rsi,%rax,4), %xmm0, %xmm0 - LONG $0x4411fac5; WORD $0x0482 // vmovss %xmm0, 4(%rdx,%rax,4) - LONG $0x4410fac5; WORD $0x0887 // vmovss 8(%rdi,%rax,4), %xmm0 - LONG $0x4459fac5; WORD $0x0886 // vmulss 8(%rsi,%rax,4), %xmm0, %xmm0 - LONG $0x4411fac5; WORD $0x0882 // vmovss %xmm0, 8(%rdx,%rax,4) - LONG $0x4410fac5; WORD $0x0c87 // vmovss 12(%rdi,%rax,4), %xmm0 - LONG $0x4459fac5; WORD $0x0c86 // vmulss 12(%rsi,%rax,4), %xmm0, %xmm0 - LONG $0x4411fac5; WORD $0x0c82 // vmovss %xmm0, 12(%rdx,%rax,4) - LONG $0x04c08348 // addq $4, %rax - WORD $0x3949; BYTE $0xc0 // cmpq %rax, %r8 - JNE LBB3_17 + WORD $0x2949; BYTE $0xc0 // subq %rax, %r8 + LONG $0xfcf88349 // cmpq $-4, %r8 + JA LBB3_19 LBB3_18: - LONG $0xf8658d48 // leaq -8(%rbp), %rsp - BYTE $0x5b // popq %rbx + LONG $0x107aa1c4; WORD $0x8f04 // vmovss (%rdi,%r9,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + LONG $0x597aa1c4; WORD $0x8e04 // vmulss (%rsi,%r9,4), %xmm0, %xmm0 + LONG $0x117aa1c4; WORD $0x8a04 // vmovss %xmm0, (%rdx,%r9,4) + LONG $0x107aa1c4; WORD $0x8f44; BYTE $0x04 // vmovss 4(%rdi,%r9,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + LONG $0x597aa1c4; WORD $0x8e44; BYTE $0x04 // vmulss 4(%rsi,%r9,4), %xmm0, %xmm0 + LONG $0x117aa1c4; WORD $0x8a44; BYTE $0x04 // vmovss %xmm0, 4(%rdx,%r9,4) + LONG $0x107aa1c4; WORD $0x8f44; BYTE $0x08 // vmovss 8(%rdi,%r9,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + LONG $0x597aa1c4; WORD $0x8e44; BYTE $0x08 // vmulss 8(%rsi,%r9,4), %xmm0, %xmm0 + LONG $0x117aa1c4; WORD $0x8a44; BYTE $0x08 // vmovss %xmm0, 8(%rdx,%r9,4) + LONG $0x107aa1c4; WORD $0x8f44; BYTE $0x0c // vmovss 12(%rdi,%r9,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + LONG $0x597aa1c4; WORD $0x8e44; BYTE $0x0c // vmulss 12(%rsi,%r9,4), %xmm0, %xmm0 + LONG $0x117aa1c4; WORD $0x8a44; BYTE $0x0c // vmovss %xmm0, 12(%rdx,%r9,4) + LONG $0x04c18349 // addq $4, %r9 + WORD $0x394c; BYTE $0xc8 // cmpq %r9, %rax + JNE LBB3_18 + +LBB3_19: + WORD $0x8948; BYTE $0xec // movq %rbp, %rsp BYTE $0x5d // popq %rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // retq @@ -558,152 +542,249 @@ TEXT ·_mm256_dot(SB), $0-32 MOVQ b+8(FP), SI MOVQ n+16(FP), DX MOVQ ret+24(FP), CX - BYTE $0x55 // pushq %rbp - WORD $0x8948; BYTE $0xe5 // movq %rsp, %rbp - WORD $0x5641 // pushq %r14 - BYTE $0x53 // pushq %rbx - LONG $0xf8e48348 // andq $-8, %rsp - LONG $0x07428d48 // leaq 7(%rdx), %rax - WORD $0x8548; BYTE $0xd2 // testq %rdx, %rdx - LONG $0xc2490f48 // cmovnsq %rdx, %rax - WORD $0x8949; BYTE $0xc1 // movq %rax, %r9 - LONG $0x03f9c149 // sarq $3, %r9 - LONG $0xf8e08348 // andq $-8, %rax - WORD $0x2948; BYTE $0xc2 // subq %rax, %rdx - WORD $0x8545; BYTE $0xc9 // testl %r9d, %r9d + BYTE $0x55 // pushq %rbp + WORD $0x8948; BYTE $0xe5 // movq %rsp, %rbp + LONG $0xf8e48348 // andq $-8, %rsp + LONG $0x07428d48 // leaq 7(%rdx), %rax + WORD $0x8548; BYTE $0xd2 // testq %rdx, %rdx + LONG $0xc2490f48 // cmovnsq %rdx, %rax + WORD $0x8949; BYTE $0xc1 // movq %rax, %r9 + LONG $0x03e9c149 // shrq $3, %r9 + LONG $0xf8e08348 // andq $-8, %rax + WORD $0x2948; BYTE $0xc2 // subq %rax, %rdx + WORD $0x8545; BYTE $0xc9 // testl %r9d, %r9d JLE LBB4_1 - LONG $0x0710fcc5 // vmovups (%rdi), %ymm0 - LONG $0x0659fcc5 // vmulps (%rsi), %ymm0, %ymm0 - LONG $0x20c78348 // addq $32, %rdi - LONG $0x20c68348 // addq $32, %rsi - LONG $0x01f98341 // cmpl $1, %r9d - JE LBB4_9 - QUAD $0x0007fffffff0b849; WORD $0x0000 // movabsq $34359738352, %r8 - LONG $0xc8048d4b // leaq (%r8,%r9,8), %rax - LONG $0x08c88349 // orq $8, %r8 - WORD $0x2149; BYTE $0xc0 // andq %rax, %r8 - LONG $0xff598d45 // leal -1(%r9), %r11d - LONG $0xfe418d41 // leal -2(%r9), %eax - WORD $0xf883; BYTE $0x03 // cmpl $3, %eax - JAE LBB4_16 - WORD $0x8949; BYTE $0xfa // movq %rdi, %r10 - WORD $0x8948; BYTE $0xf0 // movq %rsi, %rax - JMP LBB4_5 - -LBB4_1: - JMP LBB4_9 - -LBB4_16: - WORD $0x8944; BYTE $0xdb // movl %r11d, %ebx - WORD $0xe383; BYTE $0xfc // andl $-4, %ebx - WORD $0xdbf7 // negl %ebx - WORD $0x8949; BYTE $0xfa // movq %rdi, %r10 - WORD $0x8948; BYTE $0xf0 // movq %rsi, %rax - -LBB4_17: - LONG $0x107cc1c4; BYTE $0x0a // vmovups (%r10), %ymm1 - LONG $0x107cc1c4; WORD $0x2052 // vmovups 32(%r10), %ymm2 - LONG $0x107cc1c4; WORD $0x405a // vmovups 64(%r10), %ymm3 - LONG $0x107cc1c4; WORD $0x6062 // vmovups 96(%r10), %ymm4 - LONG $0x0859f4c5 // vmulps (%rax), %ymm1, %ymm1 - LONG $0xc158fcc5 // vaddps %ymm1, %ymm0, %ymm0 - LONG $0x4859ecc5; BYTE $0x20 // vmulps 32(%rax), %ymm2, %ymm1 - LONG $0x5059e4c5; BYTE $0x40 // vmulps 64(%rax), %ymm3, %ymm2 - LONG $0xc158fcc5 // vaddps %ymm1, %ymm0, %ymm0 - LONG $0xc258fcc5 // vaddps %ymm2, %ymm0, %ymm0 - LONG $0x4859dcc5; BYTE $0x60 // vmulps 96(%rax), %ymm4, %ymm1 - LONG $0xc158fcc5 // vaddps %ymm1, %ymm0, %ymm0 - LONG $0x80ea8349 // subq $-128, %r10 - LONG $0x80e88348 // subq $-128, %rax - WORD $0xc383; BYTE $0x04 // addl $4, %ebx - JNE LBB4_17 + LONG $0x0710fcc5 // vmovups (%rdi), %ymm0 + LONG $0x0659fcc5 // vmulps (%rsi), %ymm0, %ymm0 + LONG $0x20c78348 // addq $32, %rdi + LONG $0x20c68348 // addq $32, %rsi + LONG $0x01f98341 // cmpl $1, %r9d + JE LBB4_8 + LONG $0xff418d45 // leal -1(%r9), %r8d + LONG $0xfec18341 // addl $-2, %r9d + WORD $0x8944; BYTE $0xc0 // movl %r8d, %eax + WORD $0xe083; BYTE $0x03 // andl $3, %eax + LONG $0x03f98341 // cmpl $3, %r9d + JB LBB4_6 + LONG $0xfce08341 // andl $-4, %r8d LBB4_5: - LONG $0x08708d4d // leaq 8(%r8), %r14 - LONG $0x03c3f641 // testb $3, %r11b + LONG $0x0f10fcc5 // vmovups (%rdi), %ymm1 + LONG $0x5710fcc5; BYTE $0x20 // vmovups 32(%rdi), %ymm2 + LONG $0x5f10fcc5; BYTE $0x40 // vmovups 64(%rdi), %ymm3 + LONG $0x6710fcc5; BYTE $0x60 // vmovups 96(%rdi), %ymm4 + LONG $0x0e59f4c5 // vmulps (%rsi), %ymm1, %ymm1 + LONG $0xc158fcc5 // vaddps %ymm1, %ymm0, %ymm0 + LONG $0x4e59ecc5; BYTE $0x20 // vmulps 32(%rsi), %ymm2, %ymm1 + LONG $0x5659e4c5; BYTE $0x40 // vmulps 64(%rsi), %ymm3, %ymm2 + LONG $0xc158fcc5 // vaddps %ymm1, %ymm0, %ymm0 + LONG $0xc258fcc5 // vaddps %ymm2, %ymm0, %ymm0 + LONG $0x4e59dcc5; BYTE $0x60 // vmulps 96(%rsi), %ymm4, %ymm1 + LONG $0xc158fcc5 // vaddps %ymm1, %ymm0, %ymm0 + LONG $0x80ef8348 // subq $-128, %rdi + LONG $0x80ee8348 // subq $-128, %rsi + LONG $0xfcc08341 // addl $-4, %r8d + JNE LBB4_5 + +LBB4_6: + WORD $0xc085 // testl %eax, %eax JE LBB4_8 - LONG $0xffc18041 // addb $-1, %r9b - LONG $0xc9b60f45 // movzbl %r9b, %r9d - LONG $0x03e18341 // andl $3, %r9d - LONG $0x05e1c149 // shlq $5, %r9 - WORD $0xdb31 // xorl %ebx, %ebx LBB4_7: - LONG $0x107cc1c4; WORD $0x1a0c // vmovups (%r10,%rbx), %ymm1 - LONG $0x0c59f4c5; BYTE $0x18 // vmulps (%rax,%rbx), %ymm1, %ymm1 - LONG $0xc158fcc5 // vaddps %ymm1, %ymm0, %ymm0 - LONG $0x20c38348 // addq $32, %rbx - WORD $0x3941; BYTE $0xd9 // cmpl %ebx, %r9d + LONG $0x0f10fcc5 // vmovups (%rdi), %ymm1 + LONG $0x0e59f4c5 // vmulps (%rsi), %ymm1, %ymm1 + LONG $0xc158fcc5 // vaddps %ymm1, %ymm0, %ymm0 + LONG $0x20c78348 // addq $32, %rdi + LONG $0x20c68348 // addq $32, %rsi + WORD $0xc8ff // decl %eax JNE LBB4_7 + JMP LBB4_8 +LBB4_1: LBB4_8: - LONG $0x873c8d4a // leaq (%rdi,%r8,4), %rdi - LONG $0x20c78348 // addq $32, %rdi - LONG $0xb6348d4a // leaq (%rsi,%r14,4), %rsi - -LBB4_9: LONG $0x197de3c4; WORD $0x01c1 // vextractf128 $1, %ymm0, %xmm1 LONG $0xc058f0c5 // vaddps %xmm0, %xmm1, %xmm0 - LONG $0x0579e3c4; WORD $0x01c8 // vpermilpd $1, %xmm0, %xmm1 + LONG $0xc8c6f9c5; BYTE $0x01 // vshufpd $1, %xmm0, %xmm0, %xmm1 # xmm1 = xmm0[1,0] LONG $0xc158f8c5 // vaddps %xmm1, %xmm0, %xmm0 - LONG $0xc816fac5 // vmovshdup %xmm0, %xmm1 + LONG $0xc816fac5 // vmovshdup %xmm0, %xmm1 # xmm1 = xmm0[1,1,3,3] LONG $0xc158fac5 // vaddss %xmm1, %xmm0, %xmm0 LONG $0x0111fac5 // vmovss %xmm0, (%rcx) WORD $0xd285 // testl %edx, %edx - JLE LBB4_15 + JLE LBB4_14 WORD $0x8941; BYTE $0xd0 // movl %edx, %r8d LONG $0xff408d49 // leaq -1(%r8), %rax WORD $0xe283; BYTE $0x03 // andl $3, %edx LONG $0x03f88348 // cmpq $3, %rax - JAE LBB4_18 + JAE LBB4_15 WORD $0xc031 // xorl %eax, %eax - JMP LBB4_12 + JMP LBB4_11 -LBB4_18: +LBB4_15: WORD $0x2949; BYTE $0xd0 // subq %rdx, %r8 WORD $0xc031 // xorl %eax, %eax -LBB4_19: - LONG $0x0c10fac5; BYTE $0x87 // vmovss (%rdi,%rax,4), %xmm1 +LBB4_16: + LONG $0x0c10fac5; BYTE $0x87 // vmovss (%rdi,%rax,4), %xmm1 # xmm1 = mem[0],zero,zero,zero LONG $0x0c59f2c5; BYTE $0x86 // vmulss (%rsi,%rax,4), %xmm1, %xmm1 - LONG $0xc158fac5 // vaddss %xmm1, %xmm0, %xmm0 + LONG $0xc058f2c5 // vaddss %xmm0, %xmm1, %xmm0 LONG $0x0111fac5 // vmovss %xmm0, (%rcx) - LONG $0x4c10fac5; WORD $0x0487 // vmovss 4(%rdi,%rax,4), %xmm1 + LONG $0x4c10fac5; WORD $0x0487 // vmovss 4(%rdi,%rax,4), %xmm1 # xmm1 = mem[0],zero,zero,zero LONG $0x4c59f2c5; WORD $0x0486 // vmulss 4(%rsi,%rax,4), %xmm1, %xmm1 - LONG $0xc158fac5 // vaddss %xmm1, %xmm0, %xmm0 + LONG $0xc058f2c5 // vaddss %xmm0, %xmm1, %xmm0 LONG $0x0111fac5 // vmovss %xmm0, (%rcx) - LONG $0x4c10fac5; WORD $0x0887 // vmovss 8(%rdi,%rax,4), %xmm1 + LONG $0x4c10fac5; WORD $0x0887 // vmovss 8(%rdi,%rax,4), %xmm1 # xmm1 = mem[0],zero,zero,zero LONG $0x4c59f2c5; WORD $0x0886 // vmulss 8(%rsi,%rax,4), %xmm1, %xmm1 - LONG $0xc158fac5 // vaddss %xmm1, %xmm0, %xmm0 + LONG $0xc058f2c5 // vaddss %xmm0, %xmm1, %xmm0 LONG $0x0111fac5 // vmovss %xmm0, (%rcx) - LONG $0x4c10fac5; WORD $0x0c87 // vmovss 12(%rdi,%rax,4), %xmm1 + LONG $0x4c10fac5; WORD $0x0c87 // vmovss 12(%rdi,%rax,4), %xmm1 # xmm1 = mem[0],zero,zero,zero LONG $0x4c59f2c5; WORD $0x0c86 // vmulss 12(%rsi,%rax,4), %xmm1, %xmm1 - LONG $0xc158fac5 // vaddss %xmm1, %xmm0, %xmm0 + LONG $0xc058f2c5 // vaddss %xmm0, %xmm1, %xmm0 LONG $0x0111fac5 // vmovss %xmm0, (%rcx) LONG $0x04c08348 // addq $4, %rax WORD $0x3949; BYTE $0xc0 // cmpq %rax, %r8 - JNE LBB4_19 + JNE LBB4_16 -LBB4_12: +LBB4_11: WORD $0x8548; BYTE $0xd2 // testq %rdx, %rdx - JE LBB4_15 + JE LBB4_14 LONG $0x86348d48 // leaq (%rsi,%rax,4), %rsi LONG $0x87048d48 // leaq (%rdi,%rax,4), %rax WORD $0xff31 // xorl %edi, %edi -LBB4_14: - LONG $0x0c10fac5; BYTE $0xb8 // vmovss (%rax,%rdi,4), %xmm1 +LBB4_13: + LONG $0x0c10fac5; BYTE $0xb8 // vmovss (%rax,%rdi,4), %xmm1 # xmm1 = mem[0],zero,zero,zero LONG $0x0c59f2c5; BYTE $0xbe // vmulss (%rsi,%rdi,4), %xmm1, %xmm1 - LONG $0xc158fac5 // vaddss %xmm1, %xmm0, %xmm0 + LONG $0xc058f2c5 // vaddss %xmm0, %xmm1, %xmm0 LONG $0x0111fac5 // vmovss %xmm0, (%rcx) - LONG $0x01c78348 // addq $1, %rdi + WORD $0xff48; BYTE $0xc7 // incq %rdi WORD $0x3948; BYTE $0xfa // cmpq %rdi, %rdx - JNE LBB4_14 + JNE LBB4_13 -LBB4_15: - LONG $0xf0658d48 // leaq -16(%rbp), %rsp - BYTE $0x5b // popq %rbx - WORD $0x5e41 // popq %r14 +LBB4_14: + WORD $0x8948; BYTE $0xec // movq %rbp, %rsp + BYTE $0x5d // popq %rbp + WORD $0xf8c5; BYTE $0x77 // vzeroupper + BYTE $0xc3 // retq + +TEXT ·_mm256_euclidean(SB), $0-32 + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + MOVQ n+16(FP), DX + MOVQ ret+24(FP), CX + BYTE $0x55 // pushq %rbp + WORD $0x8948; BYTE $0xe5 // movq %rsp, %rbp + LONG $0xf8e48348 // andq $-8, %rsp + LONG $0x07428d48 // leaq 7(%rdx), %rax + WORD $0x8548; BYTE $0xd2 // testq %rdx, %rdx + LONG $0xc2490f48 // cmovnsq %rdx, %rax + WORD $0x8949; BYTE $0xc1 // movq %rax, %r9 + LONG $0x03e9c149 // shrq $3, %r9 + LONG $0xf8e08348 // andq $-8, %rax + WORD $0x2948; BYTE $0xc2 // subq %rax, %rdx + WORD $0x8545; BYTE $0xc9 // testl %r9d, %r9d + JLE LBB5_1 + LONG $0x0710fcc5 // vmovups (%rdi), %ymm0 + LONG $0x065cfcc5 // vsubps (%rsi), %ymm0, %ymm0 + LONG $0xc059fcc5 // vmulps %ymm0, %ymm0, %ymm0 + LONG $0x20c78348 // addq $32, %rdi + LONG $0x20c68348 // addq $32, %rsi + LONG $0x01f98341 // cmpl $1, %r9d + JE LBB5_8 + LONG $0xff418d45 // leal -1(%r9), %r8d + LONG $0xfec18341 // addl $-2, %r9d + WORD $0x8944; BYTE $0xc0 // movl %r8d, %eax + WORD $0xe083; BYTE $0x03 // andl $3, %eax + LONG $0x03f98341 // cmpl $3, %r9d + JB LBB5_6 + LONG $0xfce08341 // andl $-4, %r8d + +LBB5_5: + LONG $0x0f10fcc5 // vmovups (%rdi), %ymm1 + LONG $0x5710fcc5; BYTE $0x20 // vmovups 32(%rdi), %ymm2 + LONG $0x5f10fcc5; BYTE $0x40 // vmovups 64(%rdi), %ymm3 + LONG $0x6710fcc5; BYTE $0x60 // vmovups 96(%rdi), %ymm4 + LONG $0x0e5cf4c5 // vsubps (%rsi), %ymm1, %ymm1 + LONG $0xc959f4c5 // vmulps %ymm1, %ymm1, %ymm1 + LONG $0xc158fcc5 // vaddps %ymm1, %ymm0, %ymm0 + LONG $0x4e5cecc5; BYTE $0x20 // vsubps 32(%rsi), %ymm2, %ymm1 + LONG $0xc959f4c5 // vmulps %ymm1, %ymm1, %ymm1 + LONG $0xc158fcc5 // vaddps %ymm1, %ymm0, %ymm0 + LONG $0x4e5ce4c5; BYTE $0x40 // vsubps 64(%rsi), %ymm3, %ymm1 + LONG $0xc959f4c5 // vmulps %ymm1, %ymm1, %ymm1 + LONG $0xc158fcc5 // vaddps %ymm1, %ymm0, %ymm0 + LONG $0x4e5cdcc5; BYTE $0x60 // vsubps 96(%rsi), %ymm4, %ymm1 + LONG $0xc959f4c5 // vmulps %ymm1, %ymm1, %ymm1 + LONG $0xc158fcc5 // vaddps %ymm1, %ymm0, %ymm0 + LONG $0x80ef8348 // subq $-128, %rdi + LONG $0x80ee8348 // subq $-128, %rsi + LONG $0xfcc08341 // addl $-4, %r8d + JNE LBB5_5 + +LBB5_6: + WORD $0xc085 // testl %eax, %eax + JE LBB5_8 + +LBB5_7: + LONG $0x0f10fcc5 // vmovups (%rdi), %ymm1 + LONG $0x0e5cf4c5 // vsubps (%rsi), %ymm1, %ymm1 + LONG $0xc959f4c5 // vmulps %ymm1, %ymm1, %ymm1 + LONG $0xc158fcc5 // vaddps %ymm1, %ymm0, %ymm0 + LONG $0x20c78348 // addq $32, %rdi + LONG $0x20c68348 // addq $32, %rsi + WORD $0xc8ff // decl %eax + JNE LBB5_7 + JMP LBB5_8 + +LBB5_1: +LBB5_8: + LONG $0x197de3c4; WORD $0x01c1 // vextractf128 $1, %ymm0, %xmm1 + LONG $0xc058f0c5 // vaddps %xmm0, %xmm1, %xmm0 + LONG $0xc8c6f9c5; BYTE $0x01 // vshufpd $1, %xmm0, %xmm0, %xmm1 # xmm1 = xmm0[1,0] + LONG $0xc158f8c5 // vaddps %xmm1, %xmm0, %xmm0 + LONG $0xc816fac5 // vmovshdup %xmm0, %xmm1 # xmm1 = xmm0[1,1,3,3] + LONG $0xc158fac5 // vaddss %xmm1, %xmm0, %xmm0 + LONG $0x0111fac5 // vmovss %xmm0, (%rcx) + WORD $0xd285 // testl %edx, %edx + JLE LBB5_13 + WORD $0x8941; BYTE $0xd0 // movl %edx, %r8d + WORD $0xe283; BYTE $0x01 // andl $1, %edx + LONG $0x01f88349 // cmpq $1, %r8 + JNE LBB5_14 + WORD $0xc031 // xorl %eax, %eax + JMP LBB5_11 + +LBB5_14: + WORD $0x2949; BYTE $0xd0 // subq %rdx, %r8 + WORD $0xc031 // xorl %eax, %eax + +LBB5_15: + LONG $0x0c10fac5; BYTE $0x87 // vmovss (%rdi,%rax,4), %xmm1 # xmm1 = mem[0],zero,zero,zero + LONG $0x0c5cf2c5; BYTE $0x86 // vsubss (%rsi,%rax,4), %xmm1, %xmm1 + LONG $0xc959f2c5 // vmulss %xmm1, %xmm1, %xmm1 + LONG $0xc058f2c5 // vaddss %xmm0, %xmm1, %xmm0 + LONG $0x0111fac5 // vmovss %xmm0, (%rcx) + LONG $0x4c10fac5; WORD $0x0487 // vmovss 4(%rdi,%rax,4), %xmm1 # xmm1 = mem[0],zero,zero,zero + LONG $0x4c5cf2c5; WORD $0x0486 // vsubss 4(%rsi,%rax,4), %xmm1, %xmm1 + LONG $0xc959f2c5 // vmulss %xmm1, %xmm1, %xmm1 + LONG $0xc058f2c5 // vaddss %xmm0, %xmm1, %xmm0 + LONG $0x0111fac5 // vmovss %xmm0, (%rcx) + LONG $0x02c08348 // addq $2, %rax + WORD $0x3949; BYTE $0xc0 // cmpq %rax, %r8 + JNE LBB5_15 + +LBB5_11: + WORD $0x8548; BYTE $0xd2 // testq %rdx, %rdx + JE LBB5_13 + LONG $0x0c10fac5; BYTE $0x87 // vmovss (%rdi,%rax,4), %xmm1 # xmm1 = mem[0],zero,zero,zero + LONG $0x0c5cf2c5; BYTE $0x86 // vsubss (%rsi,%rax,4), %xmm1, %xmm1 + LONG $0xc959f2c5 // vmulss %xmm1, %xmm1, %xmm1 + LONG $0xc058f2c5 // vaddss %xmm0, %xmm1, %xmm0 + LONG $0x0111fac5 // vmovss %xmm0, (%rcx) + +LBB5_13: + LONG $0xc051fac5 // vsqrtss %xmm0, %xmm0, %xmm0 + LONG $0x0111fac5 // vmovss %xmm0, (%rcx) + WORD $0x8948; BYTE $0xec // movq %rbp, %rsp BYTE $0x5d // popq %rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // retq diff --git a/base/floats/floats_avx512.go b/base/floats/floats_avx512.go index 1c65d3ef1..9fe9bf3c3 100644 --- a/base/floats/floats_avx512.go +++ b/base/floats/floats_avx512.go @@ -1,6 +1,5 @@ //go:build !noasm && amd64 - -// AUTO-GENERATED BY GOAT -- DO NOT EDIT +// Code generated by GoAT. DO NOT EDIT. package floats @@ -20,3 +19,6 @@ func _mm512_mul_to(a, b, c, n unsafe.Pointer) //go:noescape func _mm512_dot(a, b, n, ret unsafe.Pointer) + +//go:noescape +func _mm512_euclidean(a, b, n, ret unsafe.Pointer) diff --git a/base/floats/floats_avx512.s b/base/floats/floats_avx512.s index 21dbb4783..fd7ade661 100644 --- a/base/floats/floats_avx512.s +++ b/base/floats/floats_avx512.s @@ -1,52 +1,51 @@ //go:build !noasm && amd64 -// AUTO-GENERATED BY GOAT -- DO NOT EDIT +// Code generated by GoAT. DO NOT EDIT. TEXT ·_mm512_mul_const_add_to(SB), $0-32 MOVQ a+0(FP), DI MOVQ b+8(FP), SI MOVQ c+16(FP), DX MOVQ n+24(FP), CX - BYTE $0x55 // pushq %rbp - WORD $0x8948; BYTE $0xe5 // movq %rsp, %rbp - LONG $0xf8e48348 // andq $-8, %rsp - LONG $0x0f418d4c // leaq 15(%rcx), %r8 - WORD $0x8548; BYTE $0xc9 // testq %rcx, %rcx - LONG $0xc1490f4c // cmovnsq %rcx, %r8 - LONG $0x04e8c149 // shrq $4, %r8 - WORD $0x8944; BYTE $0xc0 // movl %r8d, %eax - WORD $0xe0c1; BYTE $0x04 // shll $4, %eax - WORD $0xc129 // subl %eax, %ecx - WORD $0x8545; BYTE $0xc0 // testl %r8d, %r8d + BYTE $0x55 // pushq %rbp + WORD $0x8948; BYTE $0xe5 // movq %rsp, %rbp + LONG $0xf8e48348 // andq $-8, %rsp + LONG $0x0f418d48 // leaq 15(%rcx), %rax + WORD $0x8548; BYTE $0xc9 // testq %rcx, %rcx + LONG $0xc1490f48 // cmovnsq %rcx, %rax + LONG $0x04e8c148 // shrq $4, %rax + WORD $0x8941; BYTE $0xc0 // movl %eax, %r8d + LONG $0x04e0c141 // shll $4, %r8d + WORD $0x2944; BYTE $0xc1 // subl %r8d, %ecx + WORD $0xc085 // testl %eax, %eax JLE LBB0_6 - LONG $0x01f88341 // cmpl $1, %r8d + WORD $0xf883; BYTE $0x01 // cmpl $1, %eax JE LBB0_4 - WORD $0x8944; BYTE $0xc0 // movl %r8d, %eax - WORD $0xe083; BYTE $0xfe // andl $-2, %eax - WORD $0xd8f7 // negl %eax + WORD $0x8941; BYTE $0xc0 // movl %eax, %r8d + LONG $0xfee08141; WORD $0xffff; BYTE $0x7f // andl $2147483646, %r8d # imm = 0x7FFFFFFE LBB0_3: LONG $0x487cf162; WORD $0x0710 // vmovups (%rdi), %zmm0 LONG $0x487df262; WORD $0x0e18 // vbroadcastss (%rsi), %zmm1 - LONG $0x487df262; WORD $0x0aa8 // vfmadd213ps (%rdx), %zmm0, %zmm1 + LONG $0x487df262; WORD $0x0aa8 // vfmadd213ps (%rdx), %zmm0, %zmm1 # zmm1 = (zmm0 * zmm1) + mem LONG $0x487cf162; WORD $0x0a11 // vmovups %zmm1, (%rdx) LONG $0x487cf162; WORD $0x4710; BYTE $0x01 // vmovups 64(%rdi), %zmm0 LONG $0x487df262; WORD $0x0e18 // vbroadcastss (%rsi), %zmm1 - LONG $0x487df262; WORD $0x4aa8; BYTE $0x01 // vfmadd213ps 64(%rdx), %zmm0, %zmm1 + LONG $0x487df262; WORD $0x4aa8; BYTE $0x01 // vfmadd213ps 64(%rdx), %zmm0, %zmm1 # zmm1 = (zmm0 * zmm1) + mem LONG $0x487cf162; WORD $0x4a11; BYTE $0x01 // vmovups %zmm1, 64(%rdx) LONG $0x80ef8348 // subq $-128, %rdi LONG $0x80ea8348 // subq $-128, %rdx - WORD $0xc083; BYTE $0x02 // addl $2, %eax + LONG $0xfec08341 // addl $-2, %r8d JNE LBB0_3 LBB0_4: - LONG $0x01c0f641 // testb $1, %r8b + WORD $0x01a8 // testb $1, %al JE LBB0_6 LONG $0x487cf162; WORD $0x0710 // vmovups (%rdi), %zmm0 LONG $0x487df262; WORD $0x0e18 // vbroadcastss (%rsi), %zmm1 - LONG $0x487df262; WORD $0x0aa8 // vfmadd213ps (%rdx), %zmm0, %zmm1 + LONG $0x487df262; WORD $0x0aa8 // vfmadd213ps (%rdx), %zmm0, %zmm1 # zmm1 = (zmm0 * zmm1) + mem LONG $0x487cf162; WORD $0x0a11 // vmovups %zmm1, (%rdx) - LONG $0x40c28348 // addq $64, %rdx LONG $0x40c78348 // addq $64, %rdi + LONG $0x40c28348 // addq $64, %rdx LBB0_6: WORD $0xf983; BYTE $0x07 // cmpl $7, %ecx @@ -62,37 +61,37 @@ LBB0_6: LBB0_8: WORD $0xc985 // testl %ecx, %ecx JLE LBB0_13 - WORD $0x8941; BYTE $0xc8 // movl %ecx, %r8d + WORD $0xc889 // movl %ecx, %eax WORD $0xf983; BYTE $0x01 // cmpl $1, %ecx JNE LBB0_14 WORD $0xc931 // xorl %ecx, %ecx JMP LBB0_11 LBB0_14: - WORD $0x8944; BYTE $0xc0 // movl %r8d, %eax - WORD $0xe083; BYTE $0xfe // andl $-2, %eax - WORD $0xc931 // xorl %ecx, %ecx + WORD $0x8941; BYTE $0xc0 // movl %eax, %r8d + LONG $0xfee08141; WORD $0xffff; BYTE $0x7f // andl $2147483646, %r8d # imm = 0x7FFFFFFE + WORD $0xc931 // xorl %ecx, %ecx LBB0_15: - LONG $0x0410fac5; BYTE $0x8f // vmovss (%rdi,%rcx,4), %xmm0 - LONG $0x0659fac5 // vmulss (%rsi), %xmm0, %xmm0 - LONG $0x0458fac5; BYTE $0x8a // vaddss (%rdx,%rcx,4), %xmm0, %xmm0 - LONG $0x0411fac5; BYTE $0x8a // vmovss %xmm0, (%rdx,%rcx,4) - LONG $0x4410fac5; WORD $0x048f // vmovss 4(%rdi,%rcx,4), %xmm0 - LONG $0x0659fac5 // vmulss (%rsi), %xmm0, %xmm0 - LONG $0x4458fac5; WORD $0x048a // vaddss 4(%rdx,%rcx,4), %xmm0, %xmm0 - LONG $0x4411fac5; WORD $0x048a // vmovss %xmm0, 4(%rdx,%rcx,4) - LONG $0x02c18348 // addq $2, %rcx - WORD $0x3948; BYTE $0xc8 // cmpq %rcx, %rax + LONG $0x0410fac5; BYTE $0x8f // vmovss (%rdi,%rcx,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + LONG $0x0e10fac5 // vmovss (%rsi), %xmm1 # xmm1 = mem[0],zero,zero,zero + LONG $0xa979e2c4; WORD $0x8a0c // vfmadd213ss (%rdx,%rcx,4), %xmm0, %xmm1 # xmm1 = (xmm0 * xmm1) + mem + LONG $0x0c11fac5; BYTE $0x8a // vmovss %xmm1, (%rdx,%rcx,4) + LONG $0x4410fac5; WORD $0x048f // vmovss 4(%rdi,%rcx,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + LONG $0x0e10fac5 // vmovss (%rsi), %xmm1 # xmm1 = mem[0],zero,zero,zero + LONG $0xa979e2c4; WORD $0x8a4c; BYTE $0x04 // vfmadd213ss 4(%rdx,%rcx,4), %xmm0, %xmm1 # xmm1 = (xmm0 * xmm1) + mem + LONG $0x4c11fac5; WORD $0x048a // vmovss %xmm1, 4(%rdx,%rcx,4) + LONG $0x02c18348 // addq $2, %rcx + WORD $0x3949; BYTE $0xc8 // cmpq %rcx, %r8 JNE LBB0_15 LBB0_11: - LONG $0x01c0f641 // testb $1, %r8b + WORD $0x01a8 // testb $1, %al JE LBB0_13 - LONG $0x0410fac5; BYTE $0x8f // vmovss (%rdi,%rcx,4), %xmm0 - LONG $0x0659fac5 // vmulss (%rsi), %xmm0, %xmm0 - LONG $0x0458fac5; BYTE $0x8a // vaddss (%rdx,%rcx,4), %xmm0, %xmm0 - LONG $0x0411fac5; BYTE $0x8a // vmovss %xmm0, (%rdx,%rcx,4) + LONG $0x0410fac5; BYTE $0x8f // vmovss (%rdi,%rcx,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + LONG $0x0e10fac5 // vmovss (%rsi), %xmm1 # xmm1 = mem[0],zero,zero,zero + LONG $0xa979e2c4; WORD $0x8a0c // vfmadd213ss (%rdx,%rcx,4), %xmm0, %xmm1 # xmm1 = (xmm0 * xmm1) + mem + LONG $0x0c11fac5; BYTE $0x8a // vmovss %xmm1, (%rdx,%rcx,4) LBB0_13: WORD $0x8948; BYTE $0xec // movq %rbp, %rsp @@ -105,25 +104,23 @@ TEXT ·_mm512_mul_const_to(SB), $0-32 MOVQ b+8(FP), SI MOVQ c+16(FP), DX MOVQ n+24(FP), CX - BYTE $0x55 // pushq %rbp - WORD $0x8948; BYTE $0xe5 // movq %rsp, %rbp - LONG $0xf8e48348 // andq $-8, %rsp - LONG $0x0f498d4c // leaq 15(%rcx), %r9 - WORD $0x8548; BYTE $0xc9 // testq %rcx, %rcx - LONG $0xc9490f4c // cmovnsq %rcx, %r9 - LONG $0x04e9c149 // shrq $4, %r9 - WORD $0x8944; BYTE $0xc8 // movl %r9d, %eax - WORD $0xe0c1; BYTE $0x04 // shll $4, %eax - WORD $0xc129 // subl %eax, %ecx - WORD $0x8545; BYTE $0xc9 // testl %r9d, %r9d + BYTE $0x55 // pushq %rbp + WORD $0x8948; BYTE $0xe5 // movq %rsp, %rbp + LONG $0xf8e48348 // andq $-8, %rsp + LONG $0x0f418d4c // leaq 15(%rcx), %r8 + WORD $0x8548; BYTE $0xc9 // testq %rcx, %rcx + LONG $0xc1490f4c // cmovnsq %rcx, %r8 + LONG $0x04e8c149 // shrq $4, %r8 + WORD $0x8944; BYTE $0xc0 // movl %r8d, %eax + WORD $0xe0c1; BYTE $0x04 // shll $4, %eax + WORD $0xc129 // subl %eax, %ecx + WORD $0x8545; BYTE $0xc0 // testl %r8d, %r8d JLE LBB1_6 - LONG $0xff418d41 // leal -1(%r9), %eax - WORD $0x8945; BYTE $0xc8 // movl %r9d, %r8d - LONG $0x03e08341 // andl $3, %r8d - WORD $0xf883; BYTE $0x03 // cmpl $3, %eax + WORD $0x8944; BYTE $0xc0 // movl %r8d, %eax + WORD $0xe083; BYTE $0x03 // andl $3, %eax + LONG $0x04f88341 // cmpl $4, %r8d JB LBB1_4 - LONG $0xfce18341 // andl $-4, %r9d - WORD $0xf741; BYTE $0xd9 // negl %r9d + LONG $0xfce08141; WORD $0xffff; BYTE $0x7f // andl $2147483644, %r8d # imm = 0x7FFFFFFC LBB1_3: LONG $0x487cf162; WORD $0x0710 // vmovups (%rdi), %zmm0 @@ -138,13 +135,13 @@ LBB1_3: LONG $0x487cf162; WORD $0x4710; BYTE $0x03 // vmovups 192(%rdi), %zmm0 LONG $0x587cf162; WORD $0x0659 // vmulps (%rsi){1to16}, %zmm0, %zmm0 LONG $0x487cf162; WORD $0x4211; BYTE $0x03 // vmovups %zmm0, 192(%rdx) - LONG $0x00c78148; WORD $0x0001; BYTE $0x00 // addq $256, %rdi - LONG $0x00c28148; WORD $0x0001; BYTE $0x00 // addq $256, %rdx - LONG $0x04c18341 // addl $4, %r9d + LONG $0x00c78148; WORD $0x0001; BYTE $0x00 // addq $256, %rdi # imm = 0x100 + LONG $0x00c28148; WORD $0x0001; BYTE $0x00 // addq $256, %rdx # imm = 0x100 + LONG $0xfcc08341 // addl $-4, %r8d JNE LBB1_3 LBB1_4: - WORD $0x8545; BYTE $0xc0 // testl %r8d, %r8d + WORD $0xc085 // testl %eax, %eax JE LBB1_6 LBB1_5: @@ -153,7 +150,7 @@ LBB1_5: LONG $0x487cf162; WORD $0x0211 // vmovups %zmm0, (%rdx) LONG $0x40c78348 // addq $64, %rdi LONG $0x40c28348 // addq $64, %rdx - LONG $0xffc08341 // addl $-1, %r8d + WORD $0xc8ff // decl %eax JNE LBB1_5 LBB1_6: @@ -169,49 +166,48 @@ LBB1_6: LBB1_8: WORD $0xc985 // testl %ecx, %ecx JLE LBB1_14 - WORD $0xc989 // movl %ecx, %ecx - LONG $0xff418d48 // leaq -1(%rcx), %rax WORD $0x8941; BYTE $0xc8 // movl %ecx, %r8d - LONG $0x03e08341 // andl $3, %r8d - LONG $0x03f88348 // cmpq $3, %rax + WORD $0x8944; BYTE $0xc0 // movl %r8d, %eax + WORD $0xe083; BYTE $0x03 // andl $3, %eax + WORD $0xf983; BYTE $0x04 // cmpl $4, %ecx JAE LBB1_15 - WORD $0xc031 // xorl %eax, %eax + WORD $0xc931 // xorl %ecx, %ecx JMP LBB1_11 LBB1_15: - WORD $0xe183; BYTE $0xfc // andl $-4, %ecx - WORD $0xc031 // xorl %eax, %eax + LONG $0xfce08141; WORD $0xffff; BYTE $0x7f // andl $2147483644, %r8d # imm = 0x7FFFFFFC + WORD $0xc931 // xorl %ecx, %ecx LBB1_16: - LONG $0x0410fac5; BYTE $0x87 // vmovss (%rdi,%rax,4), %xmm0 + LONG $0x0410fac5; BYTE $0x8f // vmovss (%rdi,%rcx,4), %xmm0 # xmm0 = mem[0],zero,zero,zero LONG $0x0659fac5 // vmulss (%rsi), %xmm0, %xmm0 - LONG $0x0411fac5; BYTE $0x82 // vmovss %xmm0, (%rdx,%rax,4) - LONG $0x4410fac5; WORD $0x0487 // vmovss 4(%rdi,%rax,4), %xmm0 + LONG $0x0411fac5; BYTE $0x8a // vmovss %xmm0, (%rdx,%rcx,4) + LONG $0x4410fac5; WORD $0x048f // vmovss 4(%rdi,%rcx,4), %xmm0 # xmm0 = mem[0],zero,zero,zero LONG $0x0659fac5 // vmulss (%rsi), %xmm0, %xmm0 - LONG $0x4411fac5; WORD $0x0482 // vmovss %xmm0, 4(%rdx,%rax,4) - LONG $0x4410fac5; WORD $0x0887 // vmovss 8(%rdi,%rax,4), %xmm0 + LONG $0x4411fac5; WORD $0x048a // vmovss %xmm0, 4(%rdx,%rcx,4) + LONG $0x4410fac5; WORD $0x088f // vmovss 8(%rdi,%rcx,4), %xmm0 # xmm0 = mem[0],zero,zero,zero LONG $0x0659fac5 // vmulss (%rsi), %xmm0, %xmm0 - LONG $0x4411fac5; WORD $0x0882 // vmovss %xmm0, 8(%rdx,%rax,4) - LONG $0x4410fac5; WORD $0x0c87 // vmovss 12(%rdi,%rax,4), %xmm0 + LONG $0x4411fac5; WORD $0x088a // vmovss %xmm0, 8(%rdx,%rcx,4) + LONG $0x4410fac5; WORD $0x0c8f // vmovss 12(%rdi,%rcx,4), %xmm0 # xmm0 = mem[0],zero,zero,zero LONG $0x0659fac5 // vmulss (%rsi), %xmm0, %xmm0 - LONG $0x4411fac5; WORD $0x0c82 // vmovss %xmm0, 12(%rdx,%rax,4) - LONG $0x04c08348 // addq $4, %rax - WORD $0x3948; BYTE $0xc1 // cmpq %rax, %rcx + LONG $0x4411fac5; WORD $0x0c8a // vmovss %xmm0, 12(%rdx,%rcx,4) + LONG $0x04c18348 // addq $4, %rcx + WORD $0x3949; BYTE $0xc8 // cmpq %rcx, %r8 JNE LBB1_16 LBB1_11: - WORD $0x854d; BYTE $0xc0 // testq %r8, %r8 + WORD $0x8548; BYTE $0xc0 // testq %rax, %rax JE LBB1_14 - LONG $0x820c8d48 // leaq (%rdx,%rax,4), %rcx - LONG $0x87048d48 // leaq (%rdi,%rax,4), %rax - WORD $0xd231 // xorl %edx, %edx + LONG $0x8a148d48 // leaq (%rdx,%rcx,4), %rdx + LONG $0x8f0c8d48 // leaq (%rdi,%rcx,4), %rcx + WORD $0xff31 // xorl %edi, %edi LBB1_13: - LONG $0x0410fac5; BYTE $0x90 // vmovss (%rax,%rdx,4), %xmm0 + LONG $0x0410fac5; BYTE $0xb9 // vmovss (%rcx,%rdi,4), %xmm0 # xmm0 = mem[0],zero,zero,zero LONG $0x0659fac5 // vmulss (%rsi), %xmm0, %xmm0 - LONG $0x0411fac5; BYTE $0x91 // vmovss %xmm0, (%rcx,%rdx,4) - LONG $0x01c28348 // addq $1, %rdx - WORD $0x3949; BYTE $0xd0 // cmpq %rdx, %r8 + LONG $0x0411fac5; BYTE $0xba // vmovss %xmm0, (%rdx,%rdi,4) + WORD $0xff48; BYTE $0xc7 // incq %rdi + WORD $0x3948; BYTE $0xf8 // cmpq %rdi, %rax JNE LBB1_13 LBB1_14: @@ -224,25 +220,23 @@ TEXT ·_mm512_mul_const(SB), $0-32 MOVQ a+0(FP), DI MOVQ b+8(FP), SI MOVQ n+16(FP), DX - BYTE $0x55 // pushq %rbp - WORD $0x8948; BYTE $0xe5 // movq %rsp, %rbp - LONG $0xf8e48348 // andq $-8, %rsp - LONG $0x0f4a8d48 // leaq 15(%rdx), %rcx - WORD $0x8548; BYTE $0xd2 // testq %rdx, %rdx - LONG $0xca490f48 // cmovnsq %rdx, %rcx - LONG $0x04e9c148 // shrq $4, %rcx - WORD $0xc889 // movl %ecx, %eax - WORD $0xe0c1; BYTE $0x04 // shll $4, %eax - WORD $0xc229 // subl %eax, %edx - WORD $0xc985 // testl %ecx, %ecx + BYTE $0x55 // pushq %rbp + WORD $0x8948; BYTE $0xe5 // movq %rsp, %rbp + LONG $0xf8e48348 // andq $-8, %rsp + LONG $0x0f4a8d48 // leaq 15(%rdx), %rcx + WORD $0x8548; BYTE $0xd2 // testq %rdx, %rdx + LONG $0xca490f48 // cmovnsq %rdx, %rcx + LONG $0x04e9c148 // shrq $4, %rcx + WORD $0xc889 // movl %ecx, %eax + WORD $0xe0c1; BYTE $0x04 // shll $4, %eax + WORD $0xc229 // subl %eax, %edx + WORD $0xc985 // testl %ecx, %ecx JLE LBB2_6 - LONG $0xff418d44 // leal -1(%rcx), %r8d - WORD $0xc889 // movl %ecx, %eax - WORD $0xe083; BYTE $0x03 // andl $3, %eax - LONG $0x03f88341 // cmpl $3, %r8d + WORD $0xc889 // movl %ecx, %eax + WORD $0xe083; BYTE $0x03 // andl $3, %eax + WORD $0xf983; BYTE $0x04 // cmpl $4, %ecx JB LBB2_4 - WORD $0xe183; BYTE $0xfc // andl $-4, %ecx - WORD $0xd9f7 // negl %ecx + LONG $0xfffce181; WORD $0x7fff // andl $2147483644, %ecx # imm = 0x7FFFFFFC LBB2_3: LONG $0x487cf162; WORD $0x0710 // vmovups (%rdi), %zmm0 @@ -257,8 +251,8 @@ LBB2_3: LONG $0x487cf162; WORD $0x4711; BYTE $0x02 // vmovups %zmm0, 128(%rdi) LONG $0x5864f162; WORD $0x0659 // vmulps (%rsi){1to16}, %zmm3, %zmm0 LONG $0x487cf162; WORD $0x4711; BYTE $0x03 // vmovups %zmm0, 192(%rdi) - LONG $0x00c78148; WORD $0x0001; BYTE $0x00 // addq $256, %rdi - WORD $0xc183; BYTE $0x04 // addl $4, %ecx + LONG $0x00c78148; WORD $0x0001; BYTE $0x00 // addq $256, %rdi # imm = 0x100 + WORD $0xc183; BYTE $0xfc // addl $-4, %ecx JNE LBB2_3 LBB2_4: @@ -270,7 +264,7 @@ LBB2_5: LONG $0x587cf162; WORD $0x0659 // vmulps (%rsi){1to16}, %zmm0, %zmm0 LONG $0x487cf162; WORD $0x0711 // vmovups %zmm0, (%rdi) LONG $0x40c78348 // addq $64, %rdi - WORD $0xc083; BYTE $0xff // addl $-1, %eax + WORD $0xc8ff // decl %eax JNE LBB2_5 LBB2_6: @@ -286,29 +280,28 @@ LBB2_8: WORD $0xd285 // testl %edx, %edx JLE LBB2_14 WORD $0xd189 // movl %edx, %ecx - LONG $0xff518d48 // leaq -1(%rcx), %rdx WORD $0xc889 // movl %ecx, %eax WORD $0xe083; BYTE $0x03 // andl $3, %eax - LONG $0x03fa8348 // cmpq $3, %rdx + WORD $0xfa83; BYTE $0x04 // cmpl $4, %edx JAE LBB2_15 WORD $0xd231 // xorl %edx, %edx JMP LBB2_11 LBB2_15: - WORD $0xe183; BYTE $0xfc // andl $-4, %ecx - WORD $0xd231 // xorl %edx, %edx + LONG $0xfffce181; WORD $0x7fff // andl $2147483644, %ecx # imm = 0x7FFFFFFC + WORD $0xd231 // xorl %edx, %edx LBB2_16: - LONG $0x0610fac5 // vmovss (%rsi), %xmm0 + LONG $0x0610fac5 // vmovss (%rsi), %xmm0 # xmm0 = mem[0],zero,zero,zero LONG $0x0459fac5; BYTE $0x97 // vmulss (%rdi,%rdx,4), %xmm0, %xmm0 LONG $0x0411fac5; BYTE $0x97 // vmovss %xmm0, (%rdi,%rdx,4) - LONG $0x0610fac5 // vmovss (%rsi), %xmm0 + LONG $0x0610fac5 // vmovss (%rsi), %xmm0 # xmm0 = mem[0],zero,zero,zero LONG $0x4459fac5; WORD $0x0497 // vmulss 4(%rdi,%rdx,4), %xmm0, %xmm0 LONG $0x4411fac5; WORD $0x0497 // vmovss %xmm0, 4(%rdi,%rdx,4) - LONG $0x0610fac5 // vmovss (%rsi), %xmm0 + LONG $0x0610fac5 // vmovss (%rsi), %xmm0 # xmm0 = mem[0],zero,zero,zero LONG $0x4459fac5; WORD $0x0897 // vmulss 8(%rdi,%rdx,4), %xmm0, %xmm0 LONG $0x4411fac5; WORD $0x0897 // vmovss %xmm0, 8(%rdi,%rdx,4) - LONG $0x0610fac5 // vmovss (%rsi), %xmm0 + LONG $0x0610fac5 // vmovss (%rsi), %xmm0 # xmm0 = mem[0],zero,zero,zero LONG $0x4459fac5; WORD $0x0c97 // vmulss 12(%rdi,%rdx,4), %xmm0, %xmm0 LONG $0x4411fac5; WORD $0x0c97 // vmovss %xmm0, 12(%rdi,%rdx,4) LONG $0x04c28348 // addq $4, %rdx @@ -322,10 +315,10 @@ LBB2_11: WORD $0xd231 // xorl %edx, %edx LBB2_13: - LONG $0x0610fac5 // vmovss (%rsi), %xmm0 + LONG $0x0610fac5 // vmovss (%rsi), %xmm0 # xmm0 = mem[0],zero,zero,zero LONG $0x0459fac5; BYTE $0x91 // vmulss (%rcx,%rdx,4), %xmm0, %xmm0 LONG $0x0411fac5; BYTE $0x91 // vmovss %xmm0, (%rcx,%rdx,4) - LONG $0x01c28348 // addq $1, %rdx + WORD $0xff48; BYTE $0xc2 // incq %rdx WORD $0x3948; BYTE $0xd0 // cmpq %rdx, %rax JNE LBB2_13 @@ -340,25 +333,23 @@ TEXT ·_mm512_mul_to(SB), $0-32 MOVQ b+8(FP), SI MOVQ c+16(FP), DX MOVQ n+24(FP), CX - BYTE $0x55 // pushq %rbp - WORD $0x8948; BYTE $0xe5 // movq %rsp, %rbp - LONG $0xf8e48348 // andq $-8, %rsp - LONG $0x0f498d4c // leaq 15(%rcx), %r9 - WORD $0x8548; BYTE $0xc9 // testq %rcx, %rcx - LONG $0xc9490f4c // cmovnsq %rcx, %r9 - LONG $0x04e9c149 // shrq $4, %r9 - WORD $0x8944; BYTE $0xc8 // movl %r9d, %eax - WORD $0xe0c1; BYTE $0x04 // shll $4, %eax - WORD $0xc129 // subl %eax, %ecx - WORD $0x8545; BYTE $0xc9 // testl %r9d, %r9d + BYTE $0x55 // pushq %rbp + WORD $0x8948; BYTE $0xe5 // movq %rsp, %rbp + LONG $0xf8e48348 // andq $-8, %rsp + LONG $0x0f418d4c // leaq 15(%rcx), %r8 + WORD $0x8548; BYTE $0xc9 // testq %rcx, %rcx + LONG $0xc1490f4c // cmovnsq %rcx, %r8 + LONG $0x04e8c149 // shrq $4, %r8 + WORD $0x8944; BYTE $0xc0 // movl %r8d, %eax + WORD $0xe0c1; BYTE $0x04 // shll $4, %eax + WORD $0xc129 // subl %eax, %ecx + WORD $0x8545; BYTE $0xc0 // testl %r8d, %r8d JLE LBB3_6 - LONG $0xff418d41 // leal -1(%r9), %eax - WORD $0x8945; BYTE $0xc8 // movl %r9d, %r8d - LONG $0x03e08341 // andl $3, %r8d - WORD $0xf883; BYTE $0x03 // cmpl $3, %eax + WORD $0x8944; BYTE $0xc0 // movl %r8d, %eax + WORD $0xe083; BYTE $0x03 // andl $3, %eax + LONG $0x04f88341 // cmpl $4, %r8d JB LBB3_4 - LONG $0xfce18341 // andl $-4, %r9d - WORD $0xf741; BYTE $0xd9 // negl %r9d + LONG $0xfce08141; WORD $0xffff; BYTE $0x7f // andl $2147483644, %r8d # imm = 0x7FFFFFFC LBB3_3: LONG $0x487cf162; WORD $0x0710 // vmovups (%rdi), %zmm0 @@ -373,14 +364,14 @@ LBB3_3: LONG $0x487cf162; WORD $0x4710; BYTE $0x03 // vmovups 192(%rdi), %zmm0 LONG $0x487cf162; WORD $0x4659; BYTE $0x03 // vmulps 192(%rsi), %zmm0, %zmm0 LONG $0x487cf162; WORD $0x4211; BYTE $0x03 // vmovups %zmm0, 192(%rdx) - LONG $0x00c78148; WORD $0x0001; BYTE $0x00 // addq $256, %rdi - LONG $0x00c68148; WORD $0x0001; BYTE $0x00 // addq $256, %rsi - LONG $0x00c28148; WORD $0x0001; BYTE $0x00 // addq $256, %rdx - LONG $0x04c18341 // addl $4, %r9d + LONG $0x00c78148; WORD $0x0001; BYTE $0x00 // addq $256, %rdi # imm = 0x100 + LONG $0x00c68148; WORD $0x0001; BYTE $0x00 // addq $256, %rsi # imm = 0x100 + LONG $0x00c28148; WORD $0x0001; BYTE $0x00 // addq $256, %rdx # imm = 0x100 + LONG $0xfcc08341 // addl $-4, %r8d JNE LBB3_3 LBB3_4: - WORD $0x8545; BYTE $0xc0 // testl %r8d, %r8d + WORD $0xc085 // testl %eax, %eax JE LBB3_6 LBB3_5: @@ -390,7 +381,7 @@ LBB3_5: LONG $0x40c78348 // addq $64, %rdi LONG $0x40c68348 // addq $64, %rsi LONG $0x40c28348 // addq $64, %rdx - LONG $0xffc08341 // addl $-1, %r8d + WORD $0xc8ff // decl %eax JNE LBB3_5 LBB3_6: @@ -407,50 +398,49 @@ LBB3_6: LBB3_8: WORD $0xc985 // testl %ecx, %ecx JLE LBB3_14 - WORD $0xc989 // movl %ecx, %ecx - LONG $0xff418d48 // leaq -1(%rcx), %rax WORD $0x8941; BYTE $0xc8 // movl %ecx, %r8d - LONG $0x03e08341 // andl $3, %r8d - LONG $0x03f88348 // cmpq $3, %rax + WORD $0x8944; BYTE $0xc0 // movl %r8d, %eax + WORD $0xe083; BYTE $0x03 // andl $3, %eax + WORD $0xf983; BYTE $0x04 // cmpl $4, %ecx JAE LBB3_15 - WORD $0xc031 // xorl %eax, %eax + WORD $0xc931 // xorl %ecx, %ecx JMP LBB3_11 LBB3_15: - WORD $0xe183; BYTE $0xfc // andl $-4, %ecx - WORD $0xc031 // xorl %eax, %eax + LONG $0xfce08141; WORD $0xffff; BYTE $0x7f // andl $2147483644, %r8d # imm = 0x7FFFFFFC + WORD $0xc931 // xorl %ecx, %ecx LBB3_16: - LONG $0x0410fac5; BYTE $0x87 // vmovss (%rdi,%rax,4), %xmm0 - LONG $0x0459fac5; BYTE $0x86 // vmulss (%rsi,%rax,4), %xmm0, %xmm0 - LONG $0x0411fac5; BYTE $0x82 // vmovss %xmm0, (%rdx,%rax,4) - LONG $0x4410fac5; WORD $0x0487 // vmovss 4(%rdi,%rax,4), %xmm0 - LONG $0x4459fac5; WORD $0x0486 // vmulss 4(%rsi,%rax,4), %xmm0, %xmm0 - LONG $0x4411fac5; WORD $0x0482 // vmovss %xmm0, 4(%rdx,%rax,4) - LONG $0x4410fac5; WORD $0x0887 // vmovss 8(%rdi,%rax,4), %xmm0 - LONG $0x4459fac5; WORD $0x0886 // vmulss 8(%rsi,%rax,4), %xmm0, %xmm0 - LONG $0x4411fac5; WORD $0x0882 // vmovss %xmm0, 8(%rdx,%rax,4) - LONG $0x4410fac5; WORD $0x0c87 // vmovss 12(%rdi,%rax,4), %xmm0 - LONG $0x4459fac5; WORD $0x0c86 // vmulss 12(%rsi,%rax,4), %xmm0, %xmm0 - LONG $0x4411fac5; WORD $0x0c82 // vmovss %xmm0, 12(%rdx,%rax,4) - LONG $0x04c08348 // addq $4, %rax - WORD $0x3948; BYTE $0xc1 // cmpq %rax, %rcx + LONG $0x0410fac5; BYTE $0x8f // vmovss (%rdi,%rcx,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + LONG $0x0459fac5; BYTE $0x8e // vmulss (%rsi,%rcx,4), %xmm0, %xmm0 + LONG $0x0411fac5; BYTE $0x8a // vmovss %xmm0, (%rdx,%rcx,4) + LONG $0x4410fac5; WORD $0x048f // vmovss 4(%rdi,%rcx,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + LONG $0x4459fac5; WORD $0x048e // vmulss 4(%rsi,%rcx,4), %xmm0, %xmm0 + LONG $0x4411fac5; WORD $0x048a // vmovss %xmm0, 4(%rdx,%rcx,4) + LONG $0x4410fac5; WORD $0x088f // vmovss 8(%rdi,%rcx,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + LONG $0x4459fac5; WORD $0x088e // vmulss 8(%rsi,%rcx,4), %xmm0, %xmm0 + LONG $0x4411fac5; WORD $0x088a // vmovss %xmm0, 8(%rdx,%rcx,4) + LONG $0x4410fac5; WORD $0x0c8f // vmovss 12(%rdi,%rcx,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + LONG $0x4459fac5; WORD $0x0c8e // vmulss 12(%rsi,%rcx,4), %xmm0, %xmm0 + LONG $0x4411fac5; WORD $0x0c8a // vmovss %xmm0, 12(%rdx,%rcx,4) + LONG $0x04c18348 // addq $4, %rcx + WORD $0x3949; BYTE $0xc8 // cmpq %rcx, %r8 JNE LBB3_16 LBB3_11: - WORD $0x854d; BYTE $0xc0 // testq %r8, %r8 + WORD $0x8548; BYTE $0xc0 // testq %rax, %rax JE LBB3_14 - LONG $0x820c8d48 // leaq (%rdx,%rax,4), %rcx - LONG $0x86148d48 // leaq (%rsi,%rax,4), %rdx - LONG $0x87048d48 // leaq (%rdi,%rax,4), %rax - WORD $0xf631 // xorl %esi, %esi + LONG $0x8a148d48 // leaq (%rdx,%rcx,4), %rdx + LONG $0x8e348d48 // leaq (%rsi,%rcx,4), %rsi + LONG $0x8f0c8d48 // leaq (%rdi,%rcx,4), %rcx + WORD $0xff31 // xorl %edi, %edi LBB3_13: - LONG $0x0410fac5; BYTE $0xb0 // vmovss (%rax,%rsi,4), %xmm0 - LONG $0x0459fac5; BYTE $0xb2 // vmulss (%rdx,%rsi,4), %xmm0, %xmm0 - LONG $0x0411fac5; BYTE $0xb1 // vmovss %xmm0, (%rcx,%rsi,4) - LONG $0x01c68348 // addq $1, %rsi - WORD $0x3949; BYTE $0xf0 // cmpq %rsi, %r8 + LONG $0x0410fac5; BYTE $0xb9 // vmovss (%rcx,%rdi,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + LONG $0x0459fac5; BYTE $0xbe // vmulss (%rsi,%rdi,4), %xmm0, %xmm0 + LONG $0x0411fac5; BYTE $0xba // vmovss %xmm0, (%rdx,%rdi,4) + WORD $0xff48; BYTE $0xc7 // incq %rdi + WORD $0x3948; BYTE $0xf8 // cmpq %rdi, %rax JNE LBB3_13 LBB3_14: @@ -466,36 +456,35 @@ TEXT ·_mm512_dot(SB), $0-32 MOVQ ret+24(FP), CX BYTE $0x55 // pushq %rbp WORD $0x8948; BYTE $0xe5 // movq %rsp, %rbp - WORD $0x5641 // pushq %r14 BYTE $0x53 // pushq %rbx LONG $0xf8e48348 // andq $-8, %rsp - LONG $0x0f4a8d4c // leaq 15(%rdx), %r9 + LONG $0x0f428d4c // leaq 15(%rdx), %r8 WORD $0x8548; BYTE $0xd2 // testq %rdx, %rdx - LONG $0xca490f4c // cmovnsq %rdx, %r9 - LONG $0x04f9c149 // sarq $4, %r9 - WORD $0x8944; BYTE $0xc8 // movl %r9d, %eax + LONG $0xc2490f4c // cmovnsq %rdx, %r8 + LONG $0x04f8c149 // sarq $4, %r8 + WORD $0x8944; BYTE $0xc0 // movl %r8d, %eax WORD $0xe0c1; BYTE $0x04 // shll $4, %eax WORD $0xc229 // subl %eax, %edx - WORD $0x8545; BYTE $0xc9 // testl %r9d, %r9d + WORD $0x8545; BYTE $0xc0 // testl %r8d, %r8d JLE LBB4_1 LONG $0x487cf162; WORD $0x0710 // vmovups (%rdi), %zmm0 LONG $0x487cf162; WORD $0x0659 // vmulps (%rsi), %zmm0, %zmm0 LONG $0x40c78348 // addq $64, %rdi LONG $0x40c68348 // addq $64, %rsi - LONG $0x01f98341 // cmpl $1, %r9d + LONG $0x01f88341 // cmpl $1, %r8d JE LBB4_9 - WORD $0x894c; BYTE $0xc8 // movq %r9, %rax - LONG $0x04e0c148 // shlq $4, %rax - QUAD $0x000fffffffe0b849; WORD $0x0000 // movabsq $68719476704, %r8 - WORD $0x014c; BYTE $0xc0 // addq %r8, %rax - LONG $0x10c88349 // orq $16, %r8 - WORD $0x2149; BYTE $0xc0 // andq %rax, %r8 - LONG $0xff598d45 // leal -1(%r9), %r11d - LONG $0xfe418d41 // leal -2(%r9), %eax - WORD $0xf883; BYTE $0x03 // cmpl $3, %eax + WORD $0x894d; BYTE $0xc1 // movq %r8, %r9 + LONG $0x06e1c149 // shlq $6, %r9 + QUAD $0x003fffffff80b848; WORD $0x0000 // movabsq $274877906816, %rax # imm = 0x3FFFFFFF80 + WORD $0x0149; BYTE $0xc1 // addq %rax, %r9 + LONG $0x40c88348 // orq $64, %rax + WORD $0x214c; BYTE $0xc8 // andq %r9, %rax + LONG $0xff588d45 // leal -1(%r8), %r11d + LONG $0xfe488d45 // leal -2(%r8), %r9d + LONG $0x03f98341 // cmpl $3, %r9d JAE LBB4_18 - WORD $0x8949; BYTE $0xfa // movq %rdi, %r10 - WORD $0x8948; BYTE $0xf0 // movq %rsi, %rax + WORD $0x8949; BYTE $0xf9 // movq %rdi, %r9 + WORD $0x8949; BYTE $0xf2 // movq %rsi, %r10 JMP LBB4_5 LBB4_1: @@ -504,55 +493,54 @@ LBB4_1: LBB4_18: WORD $0x8944; BYTE $0xdb // movl %r11d, %ebx WORD $0xe383; BYTE $0xfc // andl $-4, %ebx - WORD $0xdbf7 // negl %ebx - WORD $0x8949; BYTE $0xfa // movq %rdi, %r10 - WORD $0x8948; BYTE $0xf0 // movq %rsi, %rax + WORD $0x8949; BYTE $0xf9 // movq %rdi, %r9 + WORD $0x8949; BYTE $0xf2 // movq %rsi, %r10 LBB4_19: - LONG $0x487cd162; WORD $0x0a10 // vmovups (%r10), %zmm1 - LONG $0x487cd162; WORD $0x5210; BYTE $0x01 // vmovups 64(%r10), %zmm2 - LONG $0x487cd162; WORD $0x5a10; BYTE $0x02 // vmovups 128(%r10), %zmm3 - LONG $0x487cd162; WORD $0x6210; BYTE $0x03 // vmovups 192(%r10), %zmm4 - LONG $0x487df262; WORD $0x0898 // vfmadd132ps (%rax), %zmm0, %zmm1 - LONG $0x486df262; WORD $0x48b8; BYTE $0x01 // vfmadd231ps 64(%rax), %zmm2, %zmm1 - LONG $0x4865f262; WORD $0x48b8; BYTE $0x02 // vfmadd231ps 128(%rax), %zmm3, %zmm1 + LONG $0x487cd162; WORD $0x0910 // vmovups (%r9), %zmm1 + LONG $0x487cd162; WORD $0x5110; BYTE $0x01 // vmovups 64(%r9), %zmm2 + LONG $0x487cd162; WORD $0x5910; BYTE $0x02 // vmovups 128(%r9), %zmm3 + LONG $0x487dd262; WORD $0x0a98 // vfmadd132ps (%r10), %zmm0, %zmm1 # zmm1 = (zmm1 * mem) + zmm0 + LONG $0x486dd262; WORD $0x4ab8; BYTE $0x01 // vfmadd231ps 64(%r10), %zmm2, %zmm1 # zmm1 = (zmm2 * mem) + zmm1 + LONG $0x4865d262; WORD $0x4ab8; BYTE $0x02 // vfmadd231ps 128(%r10), %zmm3, %zmm1 # zmm1 = (zmm3 * mem) + zmm1 + LONG $0x487cd162; WORD $0x5110; BYTE $0x03 // vmovups 192(%r9), %zmm2 LONG $0x487cf162; WORD $0xc128 // vmovaps %zmm1, %zmm0 - LONG $0x485df262; WORD $0x40b8; BYTE $0x03 // vfmadd231ps 192(%rax), %zmm4, %zmm0 - LONG $0x00c28149; WORD $0x0001; BYTE $0x00 // addq $256, %r10 - LONG $0x01000548; WORD $0x0000 // addq $256, %rax - WORD $0xc383; BYTE $0x04 // addl $4, %ebx + LONG $0x486dd262; WORD $0x42b8; BYTE $0x03 // vfmadd231ps 192(%r10), %zmm2, %zmm0 # zmm0 = (zmm2 * mem) + zmm0 + LONG $0x00c18149; WORD $0x0001; BYTE $0x00 // addq $256, %r9 # imm = 0x100 + LONG $0x00c28149; WORD $0x0001; BYTE $0x00 // addq $256, %r10 # imm = 0x100 + WORD $0xc383; BYTE $0xfc // addl $-4, %ebx JNE LBB4_19 LBB4_5: - LONG $0x10708d4d // leaq 16(%r8), %r14 - LONG $0x03c3f641 // testb $3, %r11b + LONG $0x40588d48 // leaq 64(%rax), %rbx + LONG $0x03c3f641 // testb $3, %r11b JE LBB4_8 - LONG $0xffc18041 // addb $-1, %r9b - LONG $0xc9b60f45 // movzbl %r9b, %r9d - LONG $0x03e18341 // andl $3, %r9d - LONG $0x06e1c149 // shlq $6, %r9 - WORD $0xdb31 // xorl %ebx, %ebx + WORD $0xfe41; BYTE $0xc8 // decb %r8b + LONG $0xc0b60f45 // movzbl %r8b, %r8d + LONG $0x03e08341 // andl $3, %r8d + LONG $0x06e0c141 // shll $6, %r8d + WORD $0x3145; BYTE $0xdb // xorl %r11d, %r11d LBB4_7: - LONG $0x487cd162; WORD $0x0c10; BYTE $0x1a // vmovups (%r10,%rbx), %zmm1 - LONG $0x4875f262; WORD $0x04b8; BYTE $0x18 // vfmadd231ps (%rax,%rbx), %zmm1, %zmm0 - LONG $0x40c38348 // addq $64, %rbx - WORD $0x3941; BYTE $0xd9 // cmpl %ebx, %r9d + LONG $0x487c9162; WORD $0x0c10; BYTE $0x19 // vmovups (%r9,%r11), %zmm1 + LONG $0x48759262; WORD $0x04b8; BYTE $0x1a // vfmadd231ps (%r10,%r11), %zmm1, %zmm0 # zmm0 = (zmm1 * mem) + zmm0 + LONG $0x40c38349 // addq $64, %r11 + WORD $0x3945; BYTE $0xd8 // cmpl %r11d, %r8d JNE LBB4_7 LBB4_8: - LONG $0x873c8d4a // leaq (%rdi,%r8,4), %rdi - LONG $0x40c78348 // addq $64, %rdi - LONG $0xb6348d4a // leaq (%rsi,%r14,4), %rsi + WORD $0x0148; BYTE $0xc7 // addq %rax, %rdi + LONG $0x40c78348 // addq $64, %rdi + WORD $0x0148; BYTE $0xde // addq %rbx, %rsi LBB4_9: LONG $0x48fdf362; WORD $0xc11b; BYTE $0x01 // vextractf64x4 $1, %zmm0, %ymm1 LONG $0xc058f4c5 // vaddps %ymm0, %ymm1, %ymm0 LONG $0x197de3c4; WORD $0x01c1 // vextractf128 $1, %ymm0, %xmm1 LONG $0xc058f0c5 // vaddps %xmm0, %xmm1, %xmm0 - LONG $0x0579e3c4; WORD $0x01c8 // vpermilpd $1, %xmm0, %xmm1 + LONG $0xc8c6f9c5; BYTE $0x01 // vshufpd $1, %xmm0, %xmm0, %xmm1 # xmm1 = xmm0[1,0] LONG $0xc158f8c5 // vaddps %xmm1, %xmm0, %xmm0 - LONG $0xc816fac5 // vmovshdup %xmm0, %xmm1 + LONG $0xc816fac5 // vmovshdup %xmm0, %xmm1 # xmm1 = xmm0[1,1,3,3] LONG $0xc158f8c5 // vaddps %xmm1, %xmm0, %xmm0 LONG $0x0111fac5 // vmovss %xmm0, (%rcx) WORD $0xfa83; BYTE $0x07 // cmpl $7, %edx @@ -563,9 +551,9 @@ LBB4_9: LONG $0x20c68348 // addq $32, %rsi LONG $0x197de3c4; WORD $0x01ca // vextractf128 $1, %ymm1, %xmm2 LONG $0xc958e8c5 // vaddps %xmm1, %xmm2, %xmm1 - LONG $0x0579e3c4; WORD $0x01d1 // vpermilpd $1, %xmm1, %xmm2 + LONG $0xd1c6f1c5; BYTE $0x01 // vshufpd $1, %xmm1, %xmm1, %xmm2 # xmm2 = xmm1[1,0] LONG $0xca58f0c5 // vaddps %xmm2, %xmm1, %xmm1 - LONG $0xd116fac5 // vmovshdup %xmm1, %xmm2 + LONG $0xd116fac5 // vmovshdup %xmm1, %xmm2 # xmm2 = xmm1[1,1,3,3] LONG $0xca58f2c5 // vaddss %xmm2, %xmm1, %xmm1 LONG $0xc158fac5 // vaddss %xmm1, %xmm0, %xmm0 LONG $0x0111fac5 // vmovss %xmm0, (%rcx) @@ -574,60 +562,235 @@ LBB4_9: LBB4_11: WORD $0xd285 // testl %edx, %edx JLE LBB4_17 - WORD $0xd289 // movl %edx, %edx - LONG $0xff428d48 // leaq -1(%rdx), %rax WORD $0x8941; BYTE $0xd0 // movl %edx, %r8d - LONG $0x03e08341 // andl $3, %r8d - LONG $0x03f88348 // cmpq $3, %rax + WORD $0x8944; BYTE $0xc0 // movl %r8d, %eax + WORD $0xe083; BYTE $0x03 // andl $3, %eax + WORD $0xfa83; BYTE $0x04 // cmpl $4, %edx JAE LBB4_20 - WORD $0xc031 // xorl %eax, %eax + WORD $0xd231 // xorl %edx, %edx JMP LBB4_14 LBB4_20: - WORD $0xe283; BYTE $0xfc // andl $-4, %edx - WORD $0xc031 // xorl %eax, %eax + LONG $0xfce08141; WORD $0xffff; BYTE $0x7f // andl $2147483644, %r8d # imm = 0x7FFFFFFC + WORD $0xd231 // xorl %edx, %edx LBB4_21: - LONG $0x0c10fac5; BYTE $0x87 // vmovss (%rdi,%rax,4), %xmm1 - LONG $0x0c59f2c5; BYTE $0x86 // vmulss (%rsi,%rax,4), %xmm1, %xmm1 - LONG $0xc158fac5 // vaddss %xmm1, %xmm0, %xmm0 - LONG $0x0111fac5 // vmovss %xmm0, (%rcx) - LONG $0x4c10fac5; WORD $0x0487 // vmovss 4(%rdi,%rax,4), %xmm1 - LONG $0x4c59f2c5; WORD $0x0486 // vmulss 4(%rsi,%rax,4), %xmm1, %xmm1 - LONG $0xc158fac5 // vaddss %xmm1, %xmm0, %xmm0 - LONG $0x0111fac5 // vmovss %xmm0, (%rcx) - LONG $0x4c10fac5; WORD $0x0887 // vmovss 8(%rdi,%rax,4), %xmm1 - LONG $0x4c59f2c5; WORD $0x0886 // vmulss 8(%rsi,%rax,4), %xmm1, %xmm1 - LONG $0xc158fac5 // vaddss %xmm1, %xmm0, %xmm0 - LONG $0x0111fac5 // vmovss %xmm0, (%rcx) - LONG $0x4c10fac5; WORD $0x0c87 // vmovss 12(%rdi,%rax,4), %xmm1 - LONG $0x4c59f2c5; WORD $0x0c86 // vmulss 12(%rsi,%rax,4), %xmm1, %xmm1 - LONG $0xc158fac5 // vaddss %xmm1, %xmm0, %xmm0 - LONG $0x0111fac5 // vmovss %xmm0, (%rcx) - LONG $0x04c08348 // addq $4, %rax - WORD $0x3948; BYTE $0xc2 // cmpq %rax, %rdx + LONG $0x0c10fac5; BYTE $0x97 // vmovss (%rdi,%rdx,4), %xmm1 # xmm1 = mem[0],zero,zero,zero + LONG $0x9979e2c4; WORD $0x960c // vfmadd132ss (%rsi,%rdx,4), %xmm0, %xmm1 # xmm1 = (xmm1 * mem) + xmm0 + LONG $0x0911fac5 // vmovss %xmm1, (%rcx) + LONG $0x4410fac5; WORD $0x0497 // vmovss 4(%rdi,%rdx,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + LONG $0x9971e2c4; WORD $0x9644; BYTE $0x04 // vfmadd132ss 4(%rsi,%rdx,4), %xmm1, %xmm0 # xmm0 = (xmm0 * mem) + xmm1 + LONG $0x0111fac5 // vmovss %xmm0, (%rcx) + LONG $0x4c10fac5; WORD $0x0897 // vmovss 8(%rdi,%rdx,4), %xmm1 # xmm1 = mem[0],zero,zero,zero + LONG $0x9979e2c4; WORD $0x964c; BYTE $0x08 // vfmadd132ss 8(%rsi,%rdx,4), %xmm0, %xmm1 # xmm1 = (xmm1 * mem) + xmm0 + LONG $0x0911fac5 // vmovss %xmm1, (%rcx) + LONG $0x4410fac5; WORD $0x0c97 // vmovss 12(%rdi,%rdx,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + LONG $0x9971e2c4; WORD $0x9644; BYTE $0x0c // vfmadd132ss 12(%rsi,%rdx,4), %xmm1, %xmm0 # xmm0 = (xmm0 * mem) + xmm1 + LONG $0x0111fac5 // vmovss %xmm0, (%rcx) + LONG $0x04c28348 // addq $4, %rdx + WORD $0x3949; BYTE $0xd0 // cmpq %rdx, %r8 JNE LBB4_21 LBB4_14: - WORD $0x854d; BYTE $0xc0 // testq %r8, %r8 + WORD $0x8548; BYTE $0xc0 // testq %rax, %rax JE LBB4_17 - LONG $0x86148d48 // leaq (%rsi,%rax,4), %rdx - LONG $0x87048d48 // leaq (%rdi,%rax,4), %rax - WORD $0xf631 // xorl %esi, %esi + LONG $0x96348d48 // leaq (%rsi,%rdx,4), %rsi + LONG $0x97148d48 // leaq (%rdi,%rdx,4), %rdx + WORD $0xff31 // xorl %edi, %edi LBB4_16: - LONG $0x0c10fac5; BYTE $0xb0 // vmovss (%rax,%rsi,4), %xmm1 - LONG $0x0c59f2c5; BYTE $0xb2 // vmulss (%rdx,%rsi,4), %xmm1, %xmm1 - LONG $0xc158fac5 // vaddss %xmm1, %xmm0, %xmm0 - LONG $0x0111fac5 // vmovss %xmm0, (%rcx) - LONG $0x01c68348 // addq $1, %rsi - WORD $0x3949; BYTE $0xf0 // cmpq %rsi, %r8 + LONG $0x0c10fac5; BYTE $0xba // vmovss (%rdx,%rdi,4), %xmm1 # xmm1 = mem[0],zero,zero,zero + LONG $0xb971e2c4; WORD $0xbe04 // vfmadd231ss (%rsi,%rdi,4), %xmm1, %xmm0 # xmm0 = (xmm1 * mem) + xmm0 + LONG $0x0111fac5 // vmovss %xmm0, (%rcx) + WORD $0xff48; BYTE $0xc7 // incq %rdi + WORD $0x3948; BYTE $0xf8 // cmpq %rdi, %rax JNE LBB4_16 LBB4_17: - LONG $0xf0658d48 // leaq -16(%rbp), %rsp + LONG $0xf8658d48 // leaq -8(%rbp), %rsp + BYTE $0x5b // popq %rbx + BYTE $0x5d // popq %rbp + WORD $0xf8c5; BYTE $0x77 // vzeroupper + BYTE $0xc3 // retq + +TEXT ·_mm512_euclidean(SB), $0-32 + MOVQ a+0(FP), DI + MOVQ b+8(FP), SI + MOVQ n+16(FP), DX + MOVQ ret+24(FP), CX + BYTE $0x55 // pushq %rbp + WORD $0x8948; BYTE $0xe5 // movq %rsp, %rbp + BYTE $0x53 // pushq %rbx + LONG $0xf8e48348 // andq $-8, %rsp + LONG $0x0f428d4c // leaq 15(%rdx), %r8 + WORD $0x8548; BYTE $0xd2 // testq %rdx, %rdx + LONG $0xc2490f4c // cmovnsq %rdx, %r8 + LONG $0x04f8c149 // sarq $4, %r8 + WORD $0x8944; BYTE $0xc0 // movl %r8d, %eax + WORD $0xe0c1; BYTE $0x04 // shll $4, %eax + WORD $0xc229 // subl %eax, %edx + WORD $0x8545; BYTE $0xc0 // testl %r8d, %r8d + JLE LBB5_1 + LONG $0x487cf162; WORD $0x0710 // vmovups (%rdi), %zmm0 + LONG $0x487cf162; WORD $0x065c // vsubps (%rsi), %zmm0, %zmm0 + LONG $0x487cf162; WORD $0xc059 // vmulps %zmm0, %zmm0, %zmm0 + LONG $0x40c78348 // addq $64, %rdi + LONG $0x40c68348 // addq $64, %rsi + LONG $0x01f88341 // cmpl $1, %r8d + JE LBB5_9 + WORD $0x894d; BYTE $0xc1 // movq %r8, %r9 + LONG $0x06e1c149 // shlq $6, %r9 + QUAD $0x003fffffff80b848; WORD $0x0000 // movabsq $274877906816, %rax # imm = 0x3FFFFFFF80 + WORD $0x0149; BYTE $0xc1 // addq %rax, %r9 + LONG $0x40c88348 // orq $64, %rax + WORD $0x214c; BYTE $0xc8 // andq %r9, %rax + LONG $0xff588d45 // leal -1(%r8), %r11d + LONG $0xfe488d45 // leal -2(%r8), %r9d + LONG $0x03f98341 // cmpl $3, %r9d + JAE LBB5_18 + WORD $0x8949; BYTE $0xf9 // movq %rdi, %r9 + WORD $0x8949; BYTE $0xf2 // movq %rsi, %r10 + JMP LBB5_5 + +LBB5_1: + JMP LBB5_9 + +LBB5_18: + WORD $0x8944; BYTE $0xdb // movl %r11d, %ebx + WORD $0xe383; BYTE $0xfc // andl $-4, %ebx + WORD $0x8949; BYTE $0xf9 // movq %rdi, %r9 + WORD $0x8949; BYTE $0xf2 // movq %rsi, %r10 + +LBB5_19: + LONG $0x487cd162; WORD $0x0910 // vmovups (%r9), %zmm1 + LONG $0x487cd162; WORD $0x5110; BYTE $0x01 // vmovups 64(%r9), %zmm2 + LONG $0x487cd162; WORD $0x5910; BYTE $0x02 // vmovups 128(%r9), %zmm3 + LONG $0x487cd162; WORD $0x6110; BYTE $0x03 // vmovups 192(%r9), %zmm4 + LONG $0x4874d162; WORD $0x0a5c // vsubps (%r10), %zmm1, %zmm1 + LONG $0x4874f162; WORD $0xc959 // vmulps %zmm1, %zmm1, %zmm1 + LONG $0x487cf162; WORD $0xc158 // vaddps %zmm1, %zmm0, %zmm0 + LONG $0x486cd162; WORD $0x4a5c; BYTE $0x01 // vsubps 64(%r10), %zmm2, %zmm1 + LONG $0x4874f162; WORD $0xc959 // vmulps %zmm1, %zmm1, %zmm1 + LONG $0x487cf162; WORD $0xc158 // vaddps %zmm1, %zmm0, %zmm0 + LONG $0x4864d162; WORD $0x4a5c; BYTE $0x02 // vsubps 128(%r10), %zmm3, %zmm1 + LONG $0x4874f162; WORD $0xc959 // vmulps %zmm1, %zmm1, %zmm1 + LONG $0x487cf162; WORD $0xc158 // vaddps %zmm1, %zmm0, %zmm0 + LONG $0x485cd162; WORD $0x4a5c; BYTE $0x03 // vsubps 192(%r10), %zmm4, %zmm1 + LONG $0x4874f162; WORD $0xc959 // vmulps %zmm1, %zmm1, %zmm1 + LONG $0x487cf162; WORD $0xc158 // vaddps %zmm1, %zmm0, %zmm0 + LONG $0x00c18149; WORD $0x0001; BYTE $0x00 // addq $256, %r9 # imm = 0x100 + LONG $0x00c28149; WORD $0x0001; BYTE $0x00 // addq $256, %r10 # imm = 0x100 + WORD $0xc383; BYTE $0xfc // addl $-4, %ebx + JNE LBB5_19 + +LBB5_5: + LONG $0x40588d48 // leaq 64(%rax), %rbx + LONG $0x03c3f641 // testb $3, %r11b + JE LBB5_8 + WORD $0xfe41; BYTE $0xc8 // decb %r8b + LONG $0xc0b60f45 // movzbl %r8b, %r8d + LONG $0x03e08341 // andl $3, %r8d + LONG $0x06e0c141 // shll $6, %r8d + WORD $0x3145; BYTE $0xdb // xorl %r11d, %r11d + +LBB5_7: + LONG $0x487c9162; WORD $0x0c10; BYTE $0x19 // vmovups (%r9,%r11), %zmm1 + LONG $0x48749162; WORD $0x0c5c; BYTE $0x1a // vsubps (%r10,%r11), %zmm1, %zmm1 + LONG $0x4874f162; WORD $0xc959 // vmulps %zmm1, %zmm1, %zmm1 + LONG $0x487cf162; WORD $0xc158 // vaddps %zmm1, %zmm0, %zmm0 + LONG $0x40c38349 // addq $64, %r11 + WORD $0x3945; BYTE $0xd8 // cmpl %r11d, %r8d + JNE LBB5_7 + +LBB5_8: + WORD $0x0148; BYTE $0xc7 // addq %rax, %rdi + LONG $0x40c78348 // addq $64, %rdi + WORD $0x0148; BYTE $0xde // addq %rbx, %rsi + +LBB5_9: + LONG $0x48fdf362; WORD $0xc11b; BYTE $0x01 // vextractf64x4 $1, %zmm0, %ymm1 + LONG $0xc058f4c5 // vaddps %ymm0, %ymm1, %ymm0 + LONG $0x197de3c4; WORD $0x01c1 // vextractf128 $1, %ymm0, %xmm1 + LONG $0xc058f0c5 // vaddps %xmm0, %xmm1, %xmm0 + LONG $0xc8c6f9c5; BYTE $0x03 // vshufpd $3, %xmm0, %xmm0, %xmm1 # xmm1 = xmm0[1,1] + LONG $0xc158f8c5 // vaddps %xmm1, %xmm0, %xmm0 + LONG $0xc816fac5 // vmovshdup %xmm0, %xmm1 # xmm1 = xmm0[1,1,3,3] + LONG $0xc158f8c5 // vaddps %xmm1, %xmm0, %xmm0 + LONG $0x0111fac5 // vmovss %xmm0, (%rcx) + WORD $0xfa83; BYTE $0x07 // cmpl $7, %edx + JLE LBB5_11 + LONG $0x0f10fcc5 // vmovups (%rdi), %ymm1 + LONG $0x0e5cf4c5 // vsubps (%rsi), %ymm1, %ymm1 + LONG $0xc959f4c5 // vmulps %ymm1, %ymm1, %ymm1 + LONG $0x20c78348 // addq $32, %rdi + LONG $0x20c68348 // addq $32, %rsi + LONG $0x197de3c4; WORD $0x01ca // vextractf128 $1, %ymm1, %xmm2 + LONG $0xc958e8c5 // vaddps %xmm1, %xmm2, %xmm1 + LONG $0xd1c6f1c5; BYTE $0x01 // vshufpd $1, %xmm1, %xmm1, %xmm2 # xmm2 = xmm1[1,0] + LONG $0xca58f0c5 // vaddps %xmm2, %xmm1, %xmm1 + LONG $0xd116fac5 // vmovshdup %xmm1, %xmm2 # xmm2 = xmm1[1,1,3,3] + LONG $0xca58f2c5 // vaddss %xmm2, %xmm1, %xmm1 + LONG $0xc158fac5 // vaddss %xmm1, %xmm0, %xmm0 + LONG $0x0111fac5 // vmovss %xmm0, (%rcx) + WORD $0xc283; BYTE $0xf8 // addl $-8, %edx + +LBB5_11: + WORD $0xd285 // testl %edx, %edx + JLE LBB5_17 + WORD $0x8941; BYTE $0xd0 // movl %edx, %r8d + WORD $0x8944; BYTE $0xc0 // movl %r8d, %eax + WORD $0xe083; BYTE $0x03 // andl $3, %eax + WORD $0xfa83; BYTE $0x04 // cmpl $4, %edx + JAE LBB5_20 + WORD $0xd231 // xorl %edx, %edx + JMP LBB5_14 + +LBB5_20: + LONG $0xfce08141; WORD $0xffff; BYTE $0x7f // andl $2147483644, %r8d # imm = 0x7FFFFFFC + WORD $0xd231 // xorl %edx, %edx + +LBB5_21: + LONG $0x0c10fac5; BYTE $0x97 // vmovss (%rdi,%rdx,4), %xmm1 # xmm1 = mem[0],zero,zero,zero + LONG $0x0c5cf2c5; BYTE $0x96 // vsubss (%rsi,%rdx,4), %xmm1, %xmm1 + LONG $0xa971e2c4; BYTE $0xc8 // vfmadd213ss %xmm0, %xmm1, %xmm1 # xmm1 = (xmm1 * xmm1) + xmm0 + LONG $0x0911fac5 // vmovss %xmm1, (%rcx) + LONG $0x4410fac5; WORD $0x0497 // vmovss 4(%rdi,%rdx,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + LONG $0x445cfac5; WORD $0x0496 // vsubss 4(%rsi,%rdx,4), %xmm0, %xmm0 + LONG $0xa979e2c4; BYTE $0xc1 // vfmadd213ss %xmm1, %xmm0, %xmm0 # xmm0 = (xmm0 * xmm0) + xmm1 + LONG $0x0111fac5 // vmovss %xmm0, (%rcx) + LONG $0x4c10fac5; WORD $0x0897 // vmovss 8(%rdi,%rdx,4), %xmm1 # xmm1 = mem[0],zero,zero,zero + LONG $0x4c5cf2c5; WORD $0x0896 // vsubss 8(%rsi,%rdx,4), %xmm1, %xmm1 + LONG $0xa971e2c4; BYTE $0xc8 // vfmadd213ss %xmm0, %xmm1, %xmm1 # xmm1 = (xmm1 * xmm1) + xmm0 + LONG $0x0911fac5 // vmovss %xmm1, (%rcx) + LONG $0x4410fac5; WORD $0x0c97 // vmovss 12(%rdi,%rdx,4), %xmm0 # xmm0 = mem[0],zero,zero,zero + LONG $0x445cfac5; WORD $0x0c96 // vsubss 12(%rsi,%rdx,4), %xmm0, %xmm0 + LONG $0xa979e2c4; BYTE $0xc1 // vfmadd213ss %xmm1, %xmm0, %xmm0 # xmm0 = (xmm0 * xmm0) + xmm1 + LONG $0x0111fac5 // vmovss %xmm0, (%rcx) + LONG $0x04c28348 // addq $4, %rdx + WORD $0x3949; BYTE $0xd0 // cmpq %rdx, %r8 + JNE LBB5_21 + +LBB5_14: + WORD $0x8548; BYTE $0xc0 // testq %rax, %rax + JE LBB5_17 + LONG $0x96348d48 // leaq (%rsi,%rdx,4), %rsi + LONG $0x97148d48 // leaq (%rdi,%rdx,4), %rdx + WORD $0xff31 // xorl %edi, %edi + +LBB5_16: + LONG $0x0c10fac5; BYTE $0xba // vmovss (%rdx,%rdi,4), %xmm1 # xmm1 = mem[0],zero,zero,zero + LONG $0x0c5cf2c5; BYTE $0xbe // vsubss (%rsi,%rdi,4), %xmm1, %xmm1 + LONG $0xb971e2c4; BYTE $0xc1 // vfmadd231ss %xmm1, %xmm1, %xmm0 # xmm0 = (xmm1 * xmm1) + xmm0 + LONG $0x0111fac5 // vmovss %xmm0, (%rcx) + WORD $0xff48; BYTE $0xc7 // incq %rdi + WORD $0x3948; BYTE $0xf8 // cmpq %rdi, %rax + JNE LBB5_16 + +LBB5_17: + LONG $0xc051fac5 // vsqrtss %xmm0, %xmm0, %xmm0 + LONG $0x0111fac5 // vmovss %xmm0, (%rcx) + LONG $0xf8658d48 // leaq -8(%rbp), %rsp BYTE $0x5b // popq %rbx - WORD $0x5e41 // popq %r14 BYTE $0x5d // popq %rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // retq diff --git a/base/floats/floats_neon.go b/base/floats/floats_neon.go index c70b9b71d..4c25724d4 100644 --- a/base/floats/floats_neon.go +++ b/base/floats/floats_neon.go @@ -1,6 +1,5 @@ //go:build !noasm && arm64 - -// AUTO-GENERATED BY GOAT -- DO NOT EDIT +// Code generated by GoAT. DO NOT EDIT. package floats @@ -20,3 +19,6 @@ func vmul_to(a, b, c, n unsafe.Pointer) //go:noescape func vdot(a, b, n, ret unsafe.Pointer) + +//go:noescape +func veuclidean(a, b, n, ret unsafe.Pointer) diff --git a/base/floats/floats_neon.s b/base/floats/floats_neon.s index f54050aeb..2522e4957 100644 --- a/base/floats/floats_neon.s +++ b/base/floats/floats_neon.s @@ -1,5 +1,5 @@ //go:build !noasm && arm64 -// AUTO-GENERATED BY GOAT -- DO NOT EDIT +// Code generated by GoAT. DO NOT EDIT. TEXT ·vmul_const_add_to(SB), $0-32 MOVD a+0(FP), R0 @@ -9,97 +9,80 @@ TEXT ·vmul_const_add_to(SB), $0-32 WORD $0xa9bf7bfd // stp x29, x30, [sp, WORD $0x91000c68 // add x8, x3, WORD $0xf100007f // cmp x3, - WORD $0x9a83b108 // csel x8, x8, x3, lt - WORD $0xd342fd09 // lsr x9, x8, - WORD $0x927ef508 // and x8, x8, - WORD $0x7100053f // cmp w9, - WORD $0xcb08006b // sub x11, x3, x8 WORD $0x910003fd // mov x29, sp - WORD $0x540001cb // b.lt .LBB0_4 - WORD $0xaa0203e8 // mov x8, x2 + WORD $0x9a83b109 // csel x9, x8, x3, lt + WORD $0xd342fd28 // lsr x8, x9, + WORD $0x927ef529 // and x9, x9, + WORD $0xcb090069 // sub x9, x3, x9 + WORD $0x7100051f // cmp w8, + WORD $0x5400010b // b.lt .LBB0_2 -LBB0_2: +LBB0_1: WORD $0x3cc10400 // ldr q0, [x0], - WORD $0x3cc10501 // ldr q1, [x8], + WORD $0x3dc00041 // ldr q1, [x2] WORD $0xbd400022 // ldr s2, [x1] - WORD $0x71000529 // subs w9, w9, - WORD $0x4f829000 // fmul v0.4s, v0.4s, v2.s[0] - WORD $0x4e20d420 // fadd v0.4s, v1.4s, v0.4s - WORD $0x3d800040 // str q0, [x2] - WORD $0xaa0803e2 // mov x2, x8 - WORD $0x54ffff01 // b.ne .LBB0_2 - WORD $0x7100057f // cmp w11, - WORD $0x540000aa // b.ge .LBB0_5 - WORD $0x14000038 // b .LBB0_14 - -LBB0_4: - WORD $0xaa0203e8 // mov x8, x2 - WORD $0x7100057f // cmp w11, - WORD $0x540006ab // b.lt .LBB0_14 + WORD $0x71000508 // subs w8, w8, + WORD $0x4f821001 // fmla v1.4s, v0.4s, v2.s[0] + WORD $0x3c810441 // str q1, [x2], + WORD $0x54ffff41 // b.ne .LBB0_1 -LBB0_5: - WORD $0x92407d69 // and x9, x11, - WORD $0xf1001d3f // cmp x9, - WORD $0x54000068 // b.hi .LBB0_7 +LBB0_2: + WORD $0x7100053f // cmp w9, + WORD $0x540005ab // b.lt .LBB0_12 + WORD $0x92407d28 // and x8, x9, + WORD $0xf100211f // cmp x8, + WORD $0x54000062 // b.hs .LBB0_5 WORD $0xaa1f03ea // mov x10, xzr - WORD $0x14000024 // b .LBB0_12 + WORD $0x1400001d // b .LBB0_10 -LBB0_7: - WORD $0xd37ef52c // lsl x12, x9, - WORD $0x8b0c010e // add x14, x8, x12 - WORD $0x8b0c000c // add x12, x0, x12 - WORD $0xeb0c011f // cmp x8, x12 - WORD $0x9100042d // add x13, x1, - WORD $0x1a9f27ec // cset w12, lo - WORD $0xeb0e001f // cmp x0, x14 - WORD $0x1a9f27ef // cset w15, lo - WORD $0xeb0801bf // cmp x13, x8 - WORD $0x0a0f018f // and w15, w12, w15 - WORD $0x1a9f97ec // cset w12, hi - WORD $0xeb0101df // cmp x14, x1 +LBB0_5: + WORD $0xd37ef50a // lsl x10, x8, + WORD $0x9100102b // add x11, x1, + WORD $0xeb0b005f // cmp x2, x11 + WORD $0x8b0a004c // add x12, x2, x10 + WORD $0x8b0a000a // add x10, x0, x10 + WORD $0xfa413180 // ccmp x12, x1, + WORD $0x1a9f97eb // cset w11, hi + WORD $0xeb0c001f // cmp x0, x12 + WORD $0xfa4a3042 // ccmp x2, x10, WORD $0xaa1f03ea // mov x10, xzr - WORD $0x1a9f97ed // cset w13, hi - WORD $0x370002af // tbnz w15, - WORD $0x0a0d018c // and w12, w12, w13 - WORD $0x3700026c // tbnz w12, + WORD $0x54000243 // b.lo .LBB0_10 + WORD $0x3700022b // tbnz w11, + WORD $0x92400929 // and x9, x9, WORD $0x4d40c820 // ld1r { v0.4s }, [x1] - WORD $0x9240096b // and x11, x11, - WORD $0xcb0b012a // sub x10, x9, x11 - WORD $0x9100400c // add x12, x0, - WORD $0x9100410d // add x13, x8, - WORD $0xaa0a03ee // mov x14, x10 + WORD $0x9100400b // add x11, x0, + WORD $0xcb09010a // sub x10, x8, x9 + WORD $0x9100404c // add x12, x2, + WORD $0xaa0a03ed // mov x13, x10 -LBB0_10: - WORD $0xad7f8981 // ldp q1, q2, [x12, - WORD $0xad7f91a3 // ldp q3, q4, [x13, +LBB0_8: + WORD $0xad7f9181 // ldp q1, q4, [x12, + WORD $0xf10021ad // subs x13, x13, + WORD $0xad7f8d62 // ldp q2, q3, [x11, + WORD $0x9100816b // add x11, x11, + WORD $0x4e22cc01 // fmla v1.4s, v0.4s, v2.4s + WORD $0x4e23cc04 // fmla v4.4s, v0.4s, v3.4s + WORD $0xad3f9181 // stp q1, q4, [x12, WORD $0x9100818c // add x12, x12, - WORD $0xf10021ce // subs x14, x14, - WORD $0x6e20dc21 // fmul v1.4s, v1.4s, v0.4s - WORD $0x6e20dc42 // fmul v2.4s, v2.4s, v0.4s - WORD $0x4e21d461 // fadd v1.4s, v3.4s, v1.4s - WORD $0x4e22d482 // fadd v2.4s, v4.4s, v2.4s - WORD $0xad3f89a1 // stp q1, q2, [x13, - WORD $0x910081ad // add x13, x13, - WORD $0x54fffec1 // b.ne .LBB0_10 - WORD $0xb40001ab // cbz x11, .LBB0_14 + WORD $0x54ffff01 // b.ne .LBB0_8 + WORD $0xb4000189 // cbz x9, .LBB0_12 -LBB0_12: +LBB0_10: WORD $0xd37ef54b // lsl x11, x10, - WORD $0x8b0b0108 // add x8, x8, x11 + WORD $0xcb0a0108 // sub x8, x8, x10 + WORD $0x8b0b0049 // add x9, x2, x11 WORD $0x8b0b000b // add x11, x0, x11 - WORD $0xcb0a0129 // sub x9, x9, x10 -LBB0_13: +LBB0_11: WORD $0xbc404560 // ldr s0, [x11], WORD $0xbd400021 // ldr s1, [x1] - WORD $0xbd400102 // ldr s2, [x8] - WORD $0xf1000529 // subs x9, x9, - WORD $0x1e210800 // fmul s0, s0, s1 - WORD $0x1e202840 // fadd s0, s2, s0 - WORD $0xbc004500 // str s0, [x8], - WORD $0x54ffff21 // b.ne .LBB0_13 + WORD $0xbd400122 // ldr s2, [x9] + WORD $0xf1000508 // subs x8, x8, + WORD $0x1f010800 // fmadd s0, s0, s1, s2 + WORD $0xbc004520 // str s0, [x9], + WORD $0x54ffff41 // b.ne .LBB0_11 -LBB0_14: +LBB0_12: WORD $0xa8c17bfd // ldp x29, x30, [sp], WORD $0xd65f03c0 // ret @@ -111,12 +94,12 @@ TEXT ·vmul_const_to(SB), $0-32 WORD $0xa9bf7bfd // stp x29, x30, [sp, WORD $0x91000c68 // add x8, x3, WORD $0xf100007f // cmp x3, + WORD $0x910003fd // mov x29, sp WORD $0x9a83b109 // csel x9, x8, x3, lt WORD $0xd342fd28 // lsr x8, x9, WORD $0x927ef529 // and x9, x9, - WORD $0x7100051f // cmp w8, WORD $0xcb090069 // sub x9, x3, x9 - WORD $0x910003fd // mov x29, sp + WORD $0x7100051f // cmp w8, WORD $0x540000eb // b.lt .LBB1_2 LBB1_1: @@ -129,42 +112,37 @@ LBB1_1: LBB1_2: WORD $0x7100053f // cmp w9, - WORD $0x5400060b // b.lt .LBB1_12 + WORD $0x5400056b // b.lt .LBB1_12 WORD $0x92407d28 // and x8, x9, - WORD $0xf1001d1f // cmp x8, - WORD $0x54000068 // b.hi .LBB1_5 + WORD $0xf100211f // cmp x8, + WORD $0x54000062 // b.hs .LBB1_5 WORD $0xaa1f03ea // mov x10, xzr - WORD $0x14000021 // b .LBB1_10 + WORD $0x1400001c // b .LBB1_10 LBB1_5: - WORD $0xd37ef50b // lsl x11, x8, - WORD $0x8b0b004d // add x13, x2, x11 - WORD $0x8b0b000b // add x11, x0, x11 + WORD $0xd37ef50a // lsl x10, x8, + WORD $0x9100102b // add x11, x1, WORD $0xeb0b005f // cmp x2, x11 - WORD $0x9100042c // add x12, x1, - WORD $0x1a9f27eb // cset w11, lo - WORD $0xeb0d001f // cmp x0, x13 - WORD $0x1a9f27ee // cset w14, lo - WORD $0xeb02019f // cmp x12, x2 - WORD $0x0a0e016e // and w14, w11, w14 + WORD $0x8b0a004c // add x12, x2, x10 + WORD $0x8b0a000a // add x10, x0, x10 + WORD $0xfa413180 // ccmp x12, x1, WORD $0x1a9f97eb // cset w11, hi - WORD $0xeb0101bf // cmp x13, x1 + WORD $0xeb0c001f // cmp x0, x12 + WORD $0xfa4a3042 // ccmp x2, x10, WORD $0xaa1f03ea // mov x10, xzr - WORD $0x1a9f97ec // cset w12, hi - WORD $0x3700024e // tbnz w14, - WORD $0x0a0c016b // and w11, w11, w12 + WORD $0x54000223 // b.lo .LBB1_10 WORD $0x3700020b // tbnz w11, - WORD $0x4d40c820 // ld1r { v0.4s }, [x1] WORD $0x92400929 // and x9, x9, - WORD $0xcb09010a // sub x10, x8, x9 + WORD $0x4d40c820 // ld1r { v0.4s }, [x1] WORD $0x9100400b // add x11, x0, + WORD $0xcb09010a // sub x10, x8, x9 WORD $0x9100404c // add x12, x2, WORD $0xaa0a03ed // mov x13, x10 LBB1_8: WORD $0xad7f8961 // ldp q1, q2, [x11, - WORD $0x9100816b // add x11, x11, WORD $0xf10021ad // subs x13, x13, + WORD $0x9100816b // add x11, x11, WORD $0x6e20dc21 // fmul v1.4s, v1.4s, v0.4s WORD $0x6e20dc42 // fmul v2.4s, v2.4s, v0.4s WORD $0xad3f8981 // stp q1, q2, [x12, @@ -174,9 +152,9 @@ LBB1_8: LBB1_10: WORD $0xd37ef54b // lsl x11, x10, + WORD $0xcb0a0108 // sub x8, x8, x10 WORD $0x8b0b0049 // add x9, x2, x11 WORD $0x8b0b000b // add x11, x0, x11 - WORD $0xcb0a0108 // sub x8, x8, x10 LBB1_11: WORD $0xbc404560 // ldr s0, [x11], @@ -197,79 +175,71 @@ TEXT ·vmul_const(SB), $0-32 WORD $0xa9bf7bfd // stp x29, x30, [sp, WORD $0x91000c48 // add x8, x2, WORD $0xf100005f // cmp x2, - WORD $0x9a82b108 // csel x8, x8, x2, lt - WORD $0xd342fd09 // lsr x9, x8, - WORD $0x927ef508 // and x8, x8, - WORD $0x7100053f // cmp w9, - WORD $0xcb08004a // sub x10, x2, x8 WORD $0x910003fd // mov x29, sp - WORD $0x5400018b // b.lt .LBB2_4 - WORD $0xaa0003e8 // mov x8, x0 + WORD $0x9a82b109 // csel x9, x8, x2, lt + WORD $0xd342fd28 // lsr x8, x9, + WORD $0x927ef529 // and x9, x9, + WORD $0xcb090049 // sub x9, x2, x9 + WORD $0x7100051f // cmp w8, + WORD $0x540000eb // b.lt .LBB2_2 -LBB2_2: - WORD $0x3cc10500 // ldr q0, [x8], +LBB2_1: + WORD $0x3dc00000 // ldr q0, [x0] WORD $0xbd400021 // ldr s1, [x1] - WORD $0x71000529 // subs w9, w9, + WORD $0x71000508 // subs w8, w8, WORD $0x4f819000 // fmul v0.4s, v0.4s, v1.s[0] - WORD $0x3d800000 // str q0, [x0] - WORD $0xaa0803e0 // mov x0, x8 - WORD $0x54ffff41 // b.ne .LBB2_2 - WORD $0x7100055f // cmp w10, - WORD $0x540000aa // b.ge .LBB2_5 - WORD $0x14000016 // b .LBB2_11 + WORD $0x3c810400 // str q0, [x0], + WORD $0x54ffff61 // b.ne .LBB2_1 -LBB2_4: - WORD $0xaa0003e8 // mov x8, x0 - WORD $0x7100055f // cmp w10, - WORD $0x5400026b // b.lt .LBB2_11 - -LBB2_5: - WORD $0x92407d49 // and x9, x10, - WORD $0xf1001d3f // cmp x9, - WORD $0x540000e9 // b.ls .LBB2_8 - WORD $0x9100042b // add x11, x1, - WORD $0xeb08017f // cmp x11, x8 - WORD $0x540001e9 // b.ls .LBB2_12 - WORD $0x8b09090b // add x11, x8, x9, lsl - WORD $0xeb01017f // cmp x11, x1 - WORD $0x54000189 // b.ls .LBB2_12 - -LBB2_8: - WORD $0xaa1f03ea // mov x10, xzr +LBB2_2: + WORD $0x7100053f // cmp w9, + WORD $0x5400026b // b.lt .LBB2_9 + WORD $0x92407d28 // and x8, x9, + WORD $0xf100211f // cmp x8, + WORD $0x540000e3 // b.lo .LBB2_6 + WORD $0x9100102a // add x10, x1, + WORD $0xeb0a001f // cmp x0, x10 + WORD $0x540001e2 // b.hs .LBB2_10 + WORD $0x8b08080a // add x10, x0, x8, lsl + WORD $0xeb01015f // cmp x10, x1 + WORD $0x54000189 // b.ls .LBB2_10 + +LBB2_6: + WORD $0xaa1f03e9 // mov x9, xzr -LBB2_9: - WORD $0x8b0a0908 // add x8, x8, x10, lsl - WORD $0xcb0a0129 // sub x9, x9, x10 +LBB2_7: + WORD $0x8b09080a // add x10, x0, x9, lsl + WORD $0xcb090108 // sub x8, x8, x9 -LBB2_10: +LBB2_8: WORD $0xbd400020 // ldr s0, [x1] - WORD $0xbd400101 // ldr s1, [x8] - WORD $0xf1000529 // subs x9, x9, + WORD $0xbd400141 // ldr s1, [x10] + WORD $0xf1000508 // subs x8, x8, WORD $0x1e210800 // fmul s0, s0, s1 - WORD $0xbc004500 // str s0, [x8], - WORD $0x54ffff61 // b.ne .LBB2_10 + WORD $0xbc004540 // str s0, [x10], + WORD $0x54ffff61 // b.ne .LBB2_8 -LBB2_11: +LBB2_9: WORD $0xa8c17bfd // ldp x29, x30, [sp], WORD $0xd65f03c0 // ret -LBB2_12: +LBB2_10: + WORD $0x9240092a // and x10, x9, WORD $0x4d40c820 // ld1r { v0.4s }, [x1] - WORD $0x9240094b // and x11, x10, - WORD $0xcb0b012a // sub x10, x9, x11 - WORD $0x9100410c // add x12, x8, - WORD $0xaa0a03ed // mov x13, x10 + WORD $0x9100400b // add x11, x0, + WORD $0xcb0a0109 // sub x9, x8, x10 + WORD $0xaa0903ec // mov x12, x9 -LBB2_13: - WORD $0xad7f8981 // ldp q1, q2, [x12, - WORD $0xf10021ad // subs x13, x13, +LBB2_11: + WORD $0xad7f8961 // ldp q1, q2, [x11, + WORD $0xf100218c // subs x12, x12, WORD $0x6e21dc01 // fmul v1.4s, v0.4s, v1.4s WORD $0x6e22dc02 // fmul v2.4s, v0.4s, v2.4s - WORD $0xad3f8981 // stp q1, q2, [x12, - WORD $0x9100818c // add x12, x12, - WORD $0x54ffff41 // b.ne .LBB2_13 - WORD $0xb5fffd4b // cbnz x11, .LBB2_9 - WORD $0x17fffff1 // b .LBB2_11 + WORD $0xad3f8961 // stp q1, q2, [x11, + WORD $0x9100816b // add x11, x11, + WORD $0x54ffff41 // b.ne .LBB2_11 + WORD $0xb5fffd4a // cbnz x10, .LBB2_7 + WORD $0x17fffff1 // b .LBB2_9 TEXT ·vmul_to(SB), $0-32 MOVD a+0(FP), R0 @@ -279,64 +249,54 @@ TEXT ·vmul_to(SB), $0-32 WORD $0xa9bf7bfd // stp x29, x30, [sp, WORD $0x91000c68 // add x8, x3, WORD $0xf100007f // cmp x3, + WORD $0x910003fd // mov x29, sp WORD $0x9a83b109 // csel x9, x8, x3, lt WORD $0xd342fd28 // lsr x8, x9, WORD $0x927ef529 // and x9, x9, - WORD $0x7100051f // cmp w8, WORD $0xcb09006a // sub x10, x3, x9 - WORD $0x910003fd // mov x29, sp + WORD $0x7100051f // cmp w8, WORD $0x540000eb // b.lt .LBB3_2 LBB3_1: WORD $0x3cc10400 // ldr q0, [x0], - WORD $0x3cc10421 // ldr q1, [x1], WORD $0x71000508 // subs w8, w8, + WORD $0x3cc10421 // ldr q1, [x1], WORD $0x6e21dc00 // fmul v0.4s, v0.4s, v1.4s WORD $0x3c810440 // str q0, [x2], WORD $0x54ffff61 // b.ne .LBB3_1 LBB3_2: WORD $0x7100055f // cmp w10, - WORD $0x5400066b // b.lt .LBB3_12 + WORD $0x5400052b // b.lt .LBB3_12 WORD $0x92407d48 // and x8, x10, - WORD $0xf1001d1f // cmp x8, - WORD $0x54000068 // b.hi .LBB3_5 + WORD $0xf100211f // cmp x8, + WORD $0x54000062 // b.hs .LBB3_5 WORD $0xaa1f03e9 // mov x9, xzr - WORD $0x14000023 // b .LBB3_10 + WORD $0x14000019 // b .LBB3_10 LBB3_5: - WORD $0xd37ef50b // lsl x11, x8, - WORD $0x8b0b000d // add x13, x0, x11 - WORD $0x8b0b004c // add x12, x2, x11 - WORD $0xeb0d005f // cmp x2, x13 - WORD $0x8b0b002b // add x11, x1, x11 - WORD $0x1a9f27ed // cset w13, lo - WORD $0xeb0c001f // cmp x0, x12 - WORD $0x1a9f27ee // cset w14, lo - WORD $0xeb0b005f // cmp x2, x11 - WORD $0x1a9f27eb // cset w11, lo - WORD $0xeb0c003f // cmp x1, x12 + WORD $0xcb000049 // sub x9, x2, x0 + WORD $0xf100813f // cmp x9, WORD $0xaa1f03e9 // mov x9, xzr - WORD $0x0a0e01ad // and w13, w13, w14 - WORD $0x1a9f27ec // cset w12, lo - WORD $0x3700028d // tbnz w13, - WORD $0x0a0c016b // and w11, w11, w12 - WORD $0x3700024b // tbnz w11, + WORD $0x540002a3 // b.lo .LBB3_10 + WORD $0xcb01004b // sub x11, x2, x1 + WORD $0xf100817f // cmp x11, + WORD $0x54000243 // b.lo .LBB3_10 WORD $0x9240094a // and x10, x10, - WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100400b // add x11, x0, WORD $0x9100402c // add x12, x1, + WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100404d // add x13, x2, WORD $0xaa0903ee // mov x14, x9 LBB3_8: - WORD $0xad7f8560 // ldp q0, q1, [x11, - WORD $0xad7f8d82 // ldp q2, q3, [x12, + WORD $0xad7f8d80 // ldp q0, q3, [x12, + WORD $0xf10021ce // subs x14, x14, + WORD $0xad7f8961 // ldp q1, q2, [x11, WORD $0x9100816b // add x11, x11, WORD $0x9100818c // add x12, x12, - WORD $0xf10021ce // subs x14, x14, - WORD $0x6e22dc00 // fmul v0.4s, v0.4s, v2.4s - WORD $0x6e23dc21 // fmul v1.4s, v1.4s, v3.4s + WORD $0x6e20dc20 // fmul v0.4s, v1.4s, v0.4s + WORD $0x6e23dc41 // fmul v1.4s, v2.4s, v3.4s WORD $0xad3f85a0 // stp q0, q1, [x13, WORD $0x910081ad // add x13, x13, WORD $0x54fffee1 // b.ne .LBB3_8 @@ -344,15 +304,15 @@ LBB3_8: LBB3_10: WORD $0xd37ef52c // lsl x12, x9, + WORD $0xcb090108 // sub x8, x8, x9 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 - WORD $0xcb090108 // sub x8, x8, x9 LBB3_11: WORD $0xbc404580 // ldr s0, [x12], - WORD $0xbc404561 // ldr s1, [x11], WORD $0xf1000508 // subs x8, x8, + WORD $0xbc404561 // ldr s1, [x11], WORD $0x1e210800 // fmul s0, s0, s1 WORD $0xbc004540 // str s0, [x10], WORD $0x54ffff61 // b.ne .LBB3_11 @@ -369,60 +329,237 @@ TEXT ·vdot(SB), $0-32 WORD $0xa9bf7bfd // stp x29, x30, [sp, WORD $0x91000c48 // add x8, x2, WORD $0xf100005f // cmp x2, + WORD $0x910003fd // mov x29, sp WORD $0x9a82b108 // csel x8, x8, x2, lt WORD $0x9342fd0a // asr x10, x8, WORD $0x927ef508 // and x8, x8, + WORD $0xcb080049 // sub x9, x2, x8 WORD $0x7100055f // cmp w10, - WORD $0xcb080048 // sub x8, x2, x8 - WORD $0x910003fd // mov x29, sp - WORD $0x540002ab // b.lt .LBB4_5 + WORD $0x5400028b // b.lt .LBB4_5 WORD $0x3cc10400 // ldr q0, [x0], + WORD $0x71000548 // subs w8, w10, WORD $0x3cc10421 // ldr q1, [x1], - WORD $0x71000549 // subs w9, w10, WORD $0x6e21dc00 // fmul v0.4s, v0.4s, v1.4s - WORD $0x54000200 // b.eq .LBB4_6 - WORD $0xb27d7beb // mov x11, - WORD $0x8b0a096a // add x10, x11, x10, lsl - WORD $0x927e7d4a // and x10, x10, - WORD $0x9100114b // add x11, x10, - WORD $0x8b0b080a // add x10, x0, x11, lsl + WORD $0x540001e0 // b.eq .LBB4_6 + WORD $0xb27b7beb // mov x11, WORD $0xaa0103ec // mov x12, x1 + WORD $0x8b0a116a // add x10, x11, x10, lsl + WORD $0x927c7d4a // and x10, x10, + WORD $0x9100414b // add x11, x10, + WORD $0x8b0b000a // add x10, x0, x11 LBB4_3: WORD $0x3cc10401 // ldr q1, [x0], + WORD $0x71000508 // subs w8, w8, WORD $0x3cc10582 // ldr q2, [x12], - WORD $0x71000529 // subs w9, w9, - WORD $0x6e22dc21 // fmul v1.4s, v1.4s, v2.4s - WORD $0x4e21d400 // fadd v0.4s, v0.4s, v1.4s - WORD $0x54ffff61 // b.ne .LBB4_3 - WORD $0x8b0b0821 // add x1, x1, x11, lsl + WORD $0x4e21cc40 // fmla v0.4s, v2.4s, v1.4s + WORD $0x54ffff81 // b.ne .LBB4_3 + WORD $0x8b0b0021 // add x1, x1, x11 WORD $0xaa0a03e0 // mov x0, x10 WORD $0x14000001 // b .LBB4_6 LBB4_5: LBB4_6: - WORD $0x1e2703e1 // fmov s1, wzr + WORD $0x2f00e401 // movi d1, WORD $0x5e0c0402 // mov s2, v0.s[1] - WORD $0x5e140403 // mov s3, v0.s[2] - WORD $0x5e1c0404 // mov s4, v0.s[3] + WORD $0x7100013f // cmp w9, + WORD $0x1e212801 // fadd s1, s0, s1 + WORD $0x1e212841 // fadd s1, s2, s1 + WORD $0x5e140402 // mov s2, v0.s[2] + WORD $0x5e1c0400 // mov s0, v0.s[3] + WORD $0x1e212841 // fadd s1, s2, s1 + WORD $0x1e212800 // fadd s0, s0, s1 + WORD $0xbd000060 // str s0, [x3] + WORD $0x5400072d // b.le .LBB4_16 + WORD $0x92407d28 // and x8, x9, + WORD $0xf100311f // cmp x8, + WORD $0x54000062 // b.hs .LBB4_9 + WORD $0xaa1f03ea // mov x10, xzr + WORD $0x1400002a // b .LBB4_14 + +LBB4_9: + WORD $0xd37ef50a // lsl x10, x8, + WORD $0x9100106c // add x12, x3, + WORD $0x8b0a002b // add x11, x1, x10 + WORD $0x8b0a000a // add x10, x0, x10 + WORD $0xeb03017f // cmp x11, x3 + WORD $0xfa4c8022 // ccmp x1, x12, + WORD $0x1a9f27eb // cset w11, lo + WORD $0xeb0c001f // cmp x0, x12 + WORD $0xfa433140 // ccmp x10, x3, + WORD $0xaa1f03ea // mov x10, xzr + WORD $0x540003e8 // b.hi .LBB4_14 + WORD $0x370003cb // tbnz w11, + WORD $0x92400929 // and x9, x9, + WORD $0x9100400b // add x11, x0, + WORD $0x9100402c // add x12, x1, + WORD $0xcb09010a // sub x10, x8, x9 + WORD $0xaa0a03ed // mov x13, x10 + +LBB4_12: + WORD $0xad7f9181 // ldp q1, q4, [x12, + WORD $0xf10021ad // subs x13, x13, + WORD $0xad7f8d62 // ldp q2, q3, [x11, + WORD $0x9100816b // add x11, x11, + WORD $0x9100818c // add x12, x12, + WORD $0x6e21dc41 // fmul v1.4s, v2.4s, v1.4s + WORD $0x5e0c0422 // mov s2, v1.s[1] + WORD $0x1e212800 // fadd s0, s0, s1 + WORD $0x5e140425 // mov s5, v1.s[2] + WORD $0x5e1c0421 // mov s1, v1.s[3] + WORD $0x1e222800 // fadd s0, s0, s2 + WORD $0x6e24dc62 // fmul v2.4s, v3.4s, v4.4s + WORD $0x1e252800 // fadd s0, s0, s5 + WORD $0x5e140443 // mov s3, v2.s[2] + WORD $0x1e212800 // fadd s0, s0, s1 + WORD $0x5e0c0441 // mov s1, v2.s[1] + WORD $0x1e222800 // fadd s0, s0, s2 WORD $0x1e212800 // fadd s0, s0, s1 - WORD $0x1e202840 // fadd s0, s2, s0 - WORD $0x1e202860 // fadd s0, s3, s0 - WORD $0x1e202880 // fadd s0, s4, s0 - WORD $0x7100011f // cmp w8, + WORD $0x5e1c0441 // mov s1, v2.s[3] + WORD $0x1e232800 // fadd s0, s0, s3 + WORD $0x1e212800 // fadd s0, s0, s1 + WORD $0x54fffd61 // b.ne .LBB4_12 WORD $0xbd000060 // str s0, [x3] - WORD $0x5400012d // b.le .LBB4_9 - WORD $0x92407d08 // and x8, x8, + WORD $0xb4000169 // cbz x9, .LBB4_16 -LBB4_8: - WORD $0xbc404401 // ldr s1, [x0], - WORD $0xbc404422 // ldr s2, [x1], +LBB4_14: + WORD $0xd37ef54b // lsl x11, x10, + WORD $0xcb0a0108 // sub x8, x8, x10 + WORD $0x8b0b0029 // add x9, x1, x11 + WORD $0x8b0b000b // add x11, x0, x11 + +LBB4_15: + WORD $0xbc404561 // ldr s1, [x11], WORD $0xf1000508 // subs x8, x8, - WORD $0x1e220821 // fmul s1, s1, s2 + WORD $0xbc404522 // ldr s2, [x9], + WORD $0x1f020020 // fmadd s0, s1, s2, s0 + WORD $0xbd000060 // str s0, [x3] + WORD $0x54ffff61 // b.ne .LBB4_15 + +LBB4_16: + WORD $0xa8c17bfd // ldp x29, x30, [sp], + WORD $0xd65f03c0 // ret + +TEXT ·veuclidean(SB), $0-32 + MOVD a+0(FP), R0 + MOVD b+8(FP), R1 + MOVD n+16(FP), R2 + MOVD ret+24(FP), R3 + WORD $0xa9bf7bfd // stp x29, x30, [sp, + WORD $0x91000c48 // add x8, x2, + WORD $0xf100005f // cmp x2, + WORD $0x910003fd // mov x29, sp + WORD $0x9a82b108 // csel x8, x8, x2, lt + WORD $0x9342fd0a // asr x10, x8, + WORD $0x927ef508 // and x8, x8, + WORD $0xcb080049 // sub x9, x2, x8 + WORD $0x7100055f // cmp w10, + WORD $0x540002cb // b.lt .LBB5_5 + WORD $0x3cc10400 // ldr q0, [x0], + WORD $0x71000548 // subs w8, w10, + WORD $0x3cc10421 // ldr q1, [x1], + WORD $0x4ea1d400 // fsub v0.4s, v0.4s, v1.4s + WORD $0x6e20dc00 // fmul v0.4s, v0.4s, v0.4s + WORD $0x54000200 // b.eq .LBB5_6 + WORD $0xb27b7beb // mov x11, + WORD $0xaa0103ec // mov x12, x1 + WORD $0x8b0a116a // add x10, x11, x10, lsl + WORD $0x927c7d4a // and x10, x10, + WORD $0x9100414b // add x11, x10, + WORD $0x8b0b000a // add x10, x0, x11 + +LBB5_3: + WORD $0x3cc10401 // ldr q1, [x0], + WORD $0x71000508 // subs w8, w8, + WORD $0x3cc10582 // ldr q2, [x12], + WORD $0x4ea2d421 // fsub v1.4s, v1.4s, v2.4s + WORD $0x4e21cc20 // fmla v0.4s, v1.4s, v1.4s + WORD $0x54ffff61 // b.ne .LBB5_3 + WORD $0x8b0b0021 // add x1, x1, x11 + WORD $0xaa0a03e0 // mov x0, x10 + WORD $0x14000001 // b .LBB5_6 + +LBB5_5: +LBB5_6: + WORD $0x7e30d801 // faddp s1, v0.2s + WORD $0x5e140402 // mov s2, v0.s[2] + WORD $0x7100013f // cmp w9, + WORD $0x5e1c0400 // mov s0, v0.s[3] + WORD $0x1e212841 // fadd s1, s2, s1 + WORD $0x1e212800 // fadd s0, s0, s1 + WORD $0xbd000060 // str s0, [x3] + WORD $0x5400078d // b.le .LBB5_16 + WORD $0x92407d28 // and x8, x9, + WORD $0xf100311f // cmp x8, + WORD $0x54000062 // b.hs .LBB5_9 + WORD $0xaa1f03ea // mov x10, xzr + WORD $0x1400002c // b .LBB5_14 + +LBB5_9: + WORD $0xd37ef50a // lsl x10, x8, + WORD $0x9100106c // add x12, x3, + WORD $0x8b0a002b // add x11, x1, x10 + WORD $0x8b0a000a // add x10, x0, x10 + WORD $0xeb03017f // cmp x11, x3 + WORD $0xfa4c8022 // ccmp x1, x12, + WORD $0x1a9f27eb // cset w11, lo + WORD $0xeb0c001f // cmp x0, x12 + WORD $0xfa433140 // ccmp x10, x3, + WORD $0xaa1f03ea // mov x10, xzr + WORD $0x54000428 // b.hi .LBB5_14 + WORD $0x3700040b // tbnz w11, + WORD $0x92400929 // and x9, x9, + WORD $0x9100400b // add x11, x0, + WORD $0x9100402c // add x12, x1, + WORD $0xcb09010a // sub x10, x8, x9 + WORD $0xaa0a03ed // mov x13, x10 + +LBB5_12: + WORD $0xad7f9181 // ldp q1, q4, [x12, + WORD $0xf10021ad // subs x13, x13, + WORD $0xad7f8d62 // ldp q2, q3, [x11, + WORD $0x9100816b // add x11, x11, + WORD $0x9100818c // add x12, x12, + WORD $0x4ea1d441 // fsub v1.4s, v2.4s, v1.4s + WORD $0x6e21dc21 // fmul v1.4s, v1.4s, v1.4s + WORD $0x5e0c0422 // mov s2, v1.s[1] + WORD $0x1e212800 // fadd s0, s0, s1 + WORD $0x5e140425 // mov s5, v1.s[2] + WORD $0x5e1c0421 // mov s1, v1.s[3] + WORD $0x1e222800 // fadd s0, s0, s2 + WORD $0x4ea4d462 // fsub v2.4s, v3.4s, v4.4s + WORD $0x1e252800 // fadd s0, s0, s5 + WORD $0x6e22dc42 // fmul v2.4s, v2.4s, v2.4s + WORD $0x1e212800 // fadd s0, s0, s1 + WORD $0x5e0c0441 // mov s1, v2.s[1] + WORD $0x5e140443 // mov s3, v2.s[2] + WORD $0x1e222800 // fadd s0, s0, s2 WORD $0x1e212800 // fadd s0, s0, s1 + WORD $0x5e1c0441 // mov s1, v2.s[3] + WORD $0x1e232800 // fadd s0, s0, s3 + WORD $0x1e212800 // fadd s0, s0, s1 + WORD $0x54fffd21 // b.ne .LBB5_12 WORD $0xbd000060 // str s0, [x3] - WORD $0x54ffff41 // b.ne .LBB4_8 + WORD $0xb4000189 // cbz x9, .LBB5_16 -LBB4_9: +LBB5_14: + WORD $0xd37ef54b // lsl x11, x10, + WORD $0xcb0a0108 // sub x8, x8, x10 + WORD $0x8b0b0029 // add x9, x1, x11 + WORD $0x8b0b000b // add x11, x0, x11 + +LBB5_15: + WORD $0xbc404561 // ldr s1, [x11], + WORD $0xf1000508 // subs x8, x8, + WORD $0xbc404522 // ldr s2, [x9], + WORD $0x1e223821 // fsub s1, s1, s2 + WORD $0x1f010020 // fmadd s0, s1, s1, s0 + WORD $0xbd000060 // str s0, [x3] + WORD $0x54ffff41 // b.ne .LBB5_15 + +LBB5_16: + WORD $0xfd400060 // ldr d0, [x3] + WORD $0x2ea1f800 // fsqrt v0.2s, v0.2s + WORD $0xbd000060 // str s0, [x3] WORD $0xa8c17bfd // ldp x29, x30, [sp], WORD $0xd65f03c0 // ret diff --git a/base/floats/floats_noasm.go b/base/floats/floats_noasm.go index b60c85894..150fb3dc6 100644 --- a/base/floats/floats_noasm.go +++ b/base/floats/floats_noasm.go @@ -36,3 +36,7 @@ func (implementation) mulConst(a []float32, b float32) { func (implementation) dot(a, b []float32) float32 { return dot(a, b) } + +func (implementation) euclidean(a, b []float32) float32 { + return euclidean(a, b) +} diff --git a/base/floats/floats_test.go b/base/floats/floats_test.go index f093a90f5..18c7ae75f 100644 --- a/base/floats/floats_test.go +++ b/base/floats/floats_test.go @@ -142,12 +142,25 @@ func TestDot(t *testing.T) { assert.Panics(t, func() { Dot([]float32{1}, nil) }) } +func TestEuclidean(t *testing.T) { + a := []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10} + b := []float32{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20} + assert.Equal(t, float32(19.621416), Euclidean(a, b)) + assert.Panics(t, func() { Euclidean([]float32{1}, nil) }) +} + func TestNative_Dot(t *testing.T) { a := []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10} b := []float32{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20} assert.Equal(t, float32(770), dot(a, b)) } +func TestNative_Euclidean(t *testing.T) { + a := []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10} + b := []float32{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20} + assert.Equal(t, float32(19.621416), euclidean(a, b)) +} + func TestNative_MulConstAddTo(t *testing.T) { a := []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10} dst := []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10} diff --git a/base/floats/src/Makefile b/base/floats/src/Makefile index 7a134dacb..ac8beb156 100644 --- a/base/floats/src/Makefile +++ b/base/floats/src/Makefile @@ -4,8 +4,8 @@ ifeq ($(shell uname -m),x86_64) SOURCES += floats_avx.c floats_avx512.c CFLAGS = -O3 -mavx -mavx512f -mavx512dq else ifeq ($(shell uname -m),aarch64) - SOURCES += floats_neon.c - CFLAGS = -O3 + SOURCES += floats_neon.c floats_sve2.c + CFLAGS = -O3 -march=armv8-a+sve endif OBJECTS = $(SOURCES:.c=.o) @@ -13,7 +13,7 @@ DEPENDENCES = $(SOURCES:.c=.d) EXECUTE = floats_test $(EXECUTE): $(OBJECTS) - $(CC) $(OBJECTS) -o $(EXECUTE) + $(CC) $(OBJECTS) -lm -o $(EXECUTE) test: $(EXECUTE) ./${EXECUTE} diff --git a/base/floats/src/floats_avx.c b/base/floats/src/floats_avx.c index d0d7644d9..b97077ee0 100644 --- a/base/floats/src/floats_avx.c +++ b/base/floats/src/floats_avx.c @@ -128,3 +128,46 @@ void _mm256_dot(float *a, float *b, int64_t n, float *ret) *ret += a[i] * b[i]; } } + +void _mm256_euclidean(float *a, float *b, int64_t n, float *ret) +{ + int epoch = n / 8; + int remain = n % 8; + __m256 sum; + if (epoch > 0) + { + __m256 v1 = _mm256_loadu_ps(a); + __m256 v2 = _mm256_loadu_ps(b); + __m256 v = _mm256_sub_ps(v1, v2); + sum = _mm256_mul_ps(v, v); + a += 8; + b += 8; + } + for (int i = 1; i < epoch; i++) + { + __m256 v1 = _mm256_loadu_ps(a); + __m256 v2 = _mm256_loadu_ps(b); + __m256 v = _mm256_sub_ps(v1, v2); + v = _mm256_mul_ps(v, v); + sum = _mm256_add_ps(v, sum); + a += 8; + b += 8; + } + __m128 s7_6_5_4 = _mm256_extractf128_ps(sum, 1); + __m128 s3_2_1_0 = _mm256_castps256_ps128(sum); + __m128 s37_26_15_04 = _mm_add_ps(s7_6_5_4, s3_2_1_0); + __m128 sxx_15_04 = s37_26_15_04; + __m128 sxx_37_26 = _mm_movehl_ps(s37_26_15_04, s37_26_15_04); + const __m128 sxx_1357_0246 = _mm_add_ps(sxx_15_04, sxx_37_26); + const __m128 sxxx_0246 = sxx_1357_0246; + const __m128 sxxx_1357 = _mm_shuffle_ps(sxx_1357_0246, sxx_1357_0246, 0x1); + __m128 sxxx_01234567 = _mm_add_ss(sxxx_0246, sxxx_1357); + *ret = _mm_cvtss_f32(sxxx_01234567); + for (int i = 0; i < remain; i++) + { + *ret += (a[i] - b[i]) * (a[i] - b[i]); + } + __m128 v = _mm_set1_ps(*ret); + __m128 r = _mm_sqrt_ss(v); + *ret = _mm_cvtss_f32(r); +} diff --git a/base/floats/src/floats_avx512.c b/base/floats/src/floats_avx512.c index 2db1a97f1..7b6aa12c1 100644 --- a/base/floats/src/floats_avx512.c +++ b/base/floats/src/floats_avx512.c @@ -194,3 +194,73 @@ void _mm512_dot(float *a, float *b, int64_t n, float *ret) *ret += a[i] * b[i]; } } + +void _mm512_euclidean(float *a, float *b, int64_t n, float *ret) +{ + int epoch = n / 16; + int remain = n % 16; + __m512 s; + if (epoch > 0) + { + __m512 v1 = _mm512_loadu_ps(a); + __m512 v2 = _mm512_loadu_ps(b); + __m512 v = _mm512_sub_ps(v1, v2); + s = _mm512_mul_ps(v, v); + a += 16; + b += 16; + } + for (int i = 1; i < epoch; i++) + { + __m512 v1 = _mm512_loadu_ps(a); + __m512 v2 = _mm512_loadu_ps(b); + __m512 v = _mm512_sub_ps(v1, v2); + v = _mm512_mul_ps(v, v); + s = _mm512_add_ps(v, s); + a += 16; + b += 16; + } + __m256 sf_e_d_c_b_a_9_8 = _mm512_extractf32x8_ps(s, 1); + __m256 s7_6_5_4_3_2_1_0 = _mm512_castps512_ps256(s); + __m256 s7f_6e_5d_4c_3b_2a_19_08 = _mm256_add_ps(sf_e_d_c_b_a_9_8, s7_6_5_4_3_2_1_0); + __m128 s7f_6e_5d_4c = _mm256_extractf128_ps(s7f_6e_5d_4c_3b_2a_19_08, 1); + __m128 s3b_2a_19_08 = _mm256_castps256_ps128(s7f_6e_5d_4c_3b_2a_19_08); + __m128 s37bf_26ae_159d_048c = _mm_add_ps(s7f_6e_5d_4c, s3b_2a_19_08); + __m128 sxx_159d_048c = s37bf_26ae_159d_048c; + __m128 sxx_37bf_26ae = _mm_movehl_ps(sxx_159d_048c, s37bf_26ae_159d_048c); + const __m128 sxx_13579bdf_02468ace = _mm_add_ps(sxx_159d_048c, sxx_37bf_26ae); + const __m128 sxxx_02468ace = sxx_13579bdf_02468ace; + const __m128 sxxx_13579bdf = _mm_shuffle_ps(sxx_13579bdf_02468ace, sxx_13579bdf_02468ace, 0x1); + __m128 sxxx_0123456789abcdef = _mm_add_ps(sxxx_02468ace, sxxx_13579bdf); + *ret = _mm_cvtss_f32(sxxx_0123456789abcdef); + + if (remain >= 8) + { + __m256 s; + __m256 v1 = _mm256_loadu_ps(a); + __m256 v2 = _mm256_loadu_ps(b); + __m256 v = _mm256_sub_ps(v1, v2); + v = _mm256_mul_ps(v, v); + a += 8; + b += 8; + __m128 s7_6_5_4 = _mm256_extractf128_ps(v, 1); + __m128 s3_2_1_0 = _mm256_castps256_ps128(v); + __m128 s37_26_15_04 = _mm_add_ps(s7_6_5_4, s3_2_1_0); + __m128 sxx_15_04 = s37_26_15_04; + __m128 sxx_37_26 = _mm_movehl_ps(s37_26_15_04, s37_26_15_04); + const __m128 sxx_1357_0246 = _mm_add_ps(sxx_15_04, sxx_37_26); + const __m128 sxxx_0246 = sxx_1357_0246; + const __m128 sxxx_1357 = _mm_shuffle_ps(sxx_1357_0246, sxx_1357_0246, 0x1); + __m128 sxxx_01234567 = _mm_add_ss(sxxx_0246, sxxx_1357); + *ret += _mm_cvtss_f32(sxxx_01234567); + remain -= 8; + } + + for (int i = 0; i < remain; i++) + { + *ret += (a[i] - b[i]) * (a[i] - b[i]); + } + + __m128 v = _mm_set1_ps(*ret); + __m128 r = _mm_sqrt_ss(v); + *ret = _mm_cvtss_f32(r); +} diff --git a/base/floats/src/floats_neon.c b/base/floats/src/floats_neon.c index 56159da94..d31f269ed 100644 --- a/base/floats/src/floats_neon.c +++ b/base/floats/src/floats_neon.c @@ -105,3 +105,37 @@ void vdot(float *a, float *b, long n, float* ret) { *ret += a[i] * b[i]; } } + +void veuclidean(float *a, float *b, long n, float *ret) { + int epoch = n / 4; + int remain = n % 4; + float32x4_t s; + if (epoch > 0) { + float32x4_t v1 = vld1q_f32(a); + float32x4_t v2 = vld1q_f32(b); + float32x4_t v = vsubq_f32(v1, v2); + s = vmulq_f32(v, v); + a += 4; + b += 4; + } + for (int i = 1; i < epoch; i++) { + float32x4_t v1 = vld1q_f32(a); + float32x4_t v2 = vld1q_f32(b); + float32x4_t v = vsubq_f32(v1, v2); + s = vmlaq_f32(s, v, v); + a += 4; + b += 4; + } + float partial[4]; + vst1q_f32(partial, s); + *ret = 0; + for (int i = 0; i < 4; i++) { + *ret += partial[i]; + } + for (int i = 0; i < remain; i++) { + *ret += (a[i] - b[i]) * (a[i] - b[i]); + } + float32x2_t v = vld1_f32(ret); + float32x2_t r = vsqrt_f32(v); + *ret = vget_lane_f32(r, 0); +} diff --git a/base/floats/src/floats_test.c b/base/floats/src/floats_test.c index 59721d6bd..083ba8eca 100644 --- a/base/floats/src/floats_test.c +++ b/base/floats/src/floats_test.c @@ -47,6 +47,16 @@ void dot(float *a, float *b, int64_t n, float *ret) } } +void euclidean(float *a, float *b, int64_t n, float *ret) +{ + *ret = 0; + for (int64_t i = 0; i < n; i++) + { + *ret += powf(a[i] - b[i], 2); + } + *ret = sqrtf(*ret); +} + int rand_float(float *a, int64_t n) { for (int i = 0; i < n; i++) @@ -62,12 +72,14 @@ void _mm256_mul_const_to(float *a, float *b, float *c, int64_t n); void _mm256_mul_const(float *a, float *b, int64_t n); void _mm256_mul_to(float *a, float *b, float *c, int64_t n); void _mm256_dot(float *a, float *b, int64_t n, float *ret); +void _mm256_euclidean(float *a, float *b, int64_t n, float *ret); void _mm512_mul_const_add_to(float *a, float *b, float *c, int64_t n); void _mm512_mul_const_to(float *a, float *b, float *c, int64_t n); void _mm512_mul_const(float *a, float *b, int64_t n); void _mm512_mul_to(float *a, float *b, float *c, int64_t n); void _mm512_dot(float *a, float *b, int64_t n, float *ret); +void _mm512_euclidean(float *a, float *b, int64_t n, float *ret); MunitResult mm256_mul_const_add_to_test(const MunitParameter params[], void *user_data_or_fixture) { @@ -132,12 +144,25 @@ MunitResult mm256_dot_test(const MunitParameter params[], void *user_data_or_fix return MUNIT_OK; } +MunitResult mm256_euclidean_test(const MunitParameter params[], void *user_data_or_fixture) +{ + float a[kVectorLength], b[kVectorLength], expect, actual; + rand_float(a, kVectorLength); + rand_float(b, kVectorLength); + + euclidean(a, b, kVectorLength, &expect); + _mm256_euclidean(a, b, kVectorLength, &actual); + munit_assert_float_equal(expect, actual, 5); + return MUNIT_OK; +} + MunitTest mm256_tests[] = { {"mul_const_add_to", mm256_mul_const_add_to_test, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, {"mul_const_to", mm256_mul_const_to_test, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, {"mul_const", mm256_mul_const_test, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, {"mul_to", mm256_mul_to_test, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, {"dot", mm256_dot_test, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, + {"euclidean", mm256_euclidean_test, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, {NULL, NULL, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}}; static const MunitSuite mm256_suite = { @@ -206,12 +231,25 @@ MunitResult mm512_dot_test(const MunitParameter params[], void *user_data_or_fix return MUNIT_OK; } +MunitResult mm512_euclidean_test(const MunitParameter params[], void *user_data_or_fixture) +{ + float a[kVectorLength], b[kVectorLength], expect, actual; + rand_float(a, kVectorLength); + rand_float(b, kVectorLength); + + euclidean(a, b, kVectorLength, &expect); + _mm512_euclidean(a, b, kVectorLength, &actual); + munit_assert_float_equal(expect, actual, 5); + return MUNIT_OK; +} + MunitTest mm512_tests[] = { {"mul_const_add_to", mm512_mul_const_add_to_test, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, {"mul_const_to", mm512_mul_const_to_test, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, {"mul_const", mm512_mul_const_test, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, {"mul_to", mm512_mul_to_test, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, {"dot", mm512_dot_test, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, + {"euclidean", mm512_euclidean_test, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, {NULL, NULL, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}}; static const MunitSuite mm512_suite = { @@ -231,6 +269,7 @@ void vmul_const_to(float *a, float *b, float *c, int64_t n); void vmul_const(float *a, float *b, int64_t n); void vmul_to(float *a, float *b, float *c, int64_t n); void vdot(float *a, float *b, int64_t n, float *ret); +void veuclidean(float *a, float *b, int64_t n, float *ret); MunitResult vmul_const_add_to_test(const MunitParameter params[], void *user_data_or_fixture) { @@ -295,12 +334,25 @@ MunitResult vdot_test(const MunitParameter params[], void *user_data_or_fixture) return MUNIT_OK; } +MunitResult veuclidean_test(const MunitParameter params[], void *user_data_or_fixture) +{ + float a[kVectorLength], b[kVectorLength], expect, actual; + rand_float(a, kVectorLength); + rand_float(b, kVectorLength); + + euclidean(a, b, kVectorLength, &expect); + veuclidean(a, b, kVectorLength, &actual); + munit_assert_float_equal(expect, actual, 5); + return MUNIT_OK; +} + MunitTest vtests[] = { {"mul_const_add_to", vmul_const_add_to_test, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, {"mul_const_to", vmul_const_to_test, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, {"mul_const", vmul_const_test, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, {"mul_to", vmul_to_test, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, {"dot", vdot_test, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, + {"euclidean", veuclidean_test, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, {NULL, NULL, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}}; static const MunitSuite vsuite = { @@ -370,7 +422,7 @@ MunitTest svtests[] = { {NULL, NULL, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}}; static const MunitSuite svsuite = { - "v", svtests, NULL, kIteration, MUNIT_SUITE_OPTION_NONE}; + "sv", svtests, NULL, kIteration, MUNIT_SUITE_OPTION_NONE}; int main(int argc, char *const argv[MUNIT_ARRAY_PARAM(argc + 1)]) { diff --git a/cmd/goat/main.go b/cmd/goat/main.go deleted file mode 100644 index 186e54c24..000000000 --- a/cmd/goat/main.go +++ /dev/null @@ -1,325 +0,0 @@ -// Copyright 2022 gorse Project Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -package main - -import ( - "errors" - "fmt" - "os" - "os/exec" - "path/filepath" - "runtime" - "sort" - "strings" - - mapset "github.com/deckarep/golang-set/v2" - "github.com/spf13/cobra" - "modernc.org/cc/v3" -) - -var supportedTypes = mapset.NewSet("int64_t", "long") - -type TranslateUnit struct { - Source string - Assembly string - Object string - GoAssembly string - Go string - Package string - Options []string - Offset int -} - -func NewTranslateUnit(source string, outputDir string, options ...string) TranslateUnit { - sourceExt := filepath.Ext(source) - noExtSourcePath := source[:len(source)-len(sourceExt)] - noExtSourceBase := filepath.Base(noExtSourcePath) - return TranslateUnit{ - Source: source, - Assembly: noExtSourcePath + ".s", - Object: noExtSourcePath + ".o", - GoAssembly: filepath.Join(outputDir, noExtSourceBase+".s"), - Go: filepath.Join(outputDir, noExtSourceBase+".go"), - Package: filepath.Base(outputDir), - Options: options, - } -} - -// parseSource parse C source file and extract functions declarations. -func (t *TranslateUnit) parseSource() ([]Function, error) { - // List include paths. - includePaths, err := listIncludePaths() - if err != nil { - return nil, err - } - source, err := t.fixSource(t.Source) - if err != nil { - return nil, err - } - ast, err := cc.Parse(&cc.Config{}, nil, includePaths, - []cc.Source{{Name: t.Source, Value: source}}) - if err != nil { - return nil, err - } - var functions []Function - for _, nodes := range ast.Scope { - if len(nodes) != 1 || nodes[0].Position().Filename != t.Source { - continue - } - node := nodes[0] - if declarator, ok := node.(*cc.Declarator); ok { - funcIdent := declarator.DirectDeclarator - if funcIdent.Case != cc.DirectDeclaratorFuncParam { - continue - } - if function, err := t.convertFunction(funcIdent); err != nil { - return nil, err - } else { - functions = append(functions, function) - } - } - } - sort.Slice(functions, func(i, j int) bool { - return functions[i].Position < functions[j].Position - }) - return functions, nil -} - -func (t *TranslateUnit) generateGoStubs(functions []Function) error { - // generate code - var builder strings.Builder - builder.WriteString(buildTags) - builder.WriteString("// AUTO-GENERATED BY GOAT -- DO NOT EDIT\n\n") - builder.WriteString(fmt.Sprintf("package %v\n\n", t.Package)) - builder.WriteString("import \"unsafe\"\n") - for _, function := range functions { - builder.WriteString("\n//go:noescape\n") - builder.WriteString(fmt.Sprintf("func %v(%s unsafe.Pointer)\n", - function.Name, strings.Join(function.Parameters, ", "))) - } - - // write file - f, err := os.Create(t.Go) - if err != nil { - return err - } - defer func(f *os.File) { - if err = f.Close(); err != nil { - _, _ = fmt.Fprintln(os.Stderr, err) - os.Exit(1) - } - }(f) - _, err = f.WriteString(builder.String()) - return err -} - -func (t *TranslateUnit) compile(args ...string) error { - args = append(args, "-mno-red-zone", "-mstackrealign", "-mllvm", "-inline-threshold=1000", - "-fno-asynchronous-unwind-tables", "-fno-exceptions", "-fno-rtti") - _, err := runCommand("clang", append([]string{"-S", "-c", t.Source, "-o", t.Assembly}, args...)...) - if err != nil { - return err - } - _, err = runCommand("clang", append([]string{"-c", t.Assembly, "-o", t.Object}, args...)...) - return err -} - -func (t *TranslateUnit) Translate() error { - functions, err := t.parseSource() - if err != nil { - return err - } - if err = t.generateGoStubs(functions); err != nil { - return err - } - if err = t.compile(t.Options...); err != nil { - return err - } - assembly, err := parseAssembly(t.Assembly) - if err != nil { - return err - } - dump, err := runCommand("objdump", "-d", t.Object, "--insn-width", "16") - if err != nil { - return err - } - err = parseObjectDump(dump, assembly) - if err != nil { - return err - } - for i, name := range functions { - functions[i].Lines = assembly[name.Name] - } - return generateGoAssembly(t.GoAssembly, functions) -} - -// fixSource fixes compile errors in source. -func (t *TranslateUnit) fixSource(path string) (string, error) { - bytes, err := os.ReadFile(path) - if err != nil { - return "", err - } - if runtime.GOARCH == "amd64" { - t.Offset = -1 - var builder strings.Builder - builder.WriteString("#define __STDC_HOSTED__ 1\n") - builder.Write(bytes) - return builder.String(), nil - } else if runtime.GOARCH == "arm64" { - var ( - builder strings.Builder - clauseCount int - ) - for _, line := range strings.Split(string(bytes), "\n") { - if strings.HasPrefix(line, "#include") { - // Do nothing - } else if strings.Contains(line, "{") { - if clauseCount == 0 { - builder.WriteString(line[:strings.Index(line, "{")+1]) - } - clauseCount++ - } else if strings.Contains(line, "}") { - clauseCount-- - if clauseCount == 0 { - builder.WriteString(line[strings.Index(line, "}"):]) - } - } else if clauseCount == 0 { - builder.WriteString(line) - } - builder.WriteRune('\n') - } - return builder.String(), nil - } - return "", fmt.Errorf("unsupported arch: %s", runtime.GOARCH) -} - -// listIncludePaths lists include paths used by clang. -func listIncludePaths() ([]string, error) { - out, err := runCommand("bash", "-c", "echo | gcc -xc -E -v -") - if err != nil { - return nil, err - } - var start bool - var paths []string - for _, line := range strings.Split(out, "\n") { - if strings.HasPrefix(line, "#include <...> search starts here:") { - start = true - } else if strings.HasPrefix(line, "End of search list.") { - start = false - } else if start { - path := strings.TrimSpace(line) - paths = append(paths, path) - } - } - return paths, nil -} - -type Function struct { - Name string - Position int - Parameters []string - Lines []Line -} - -// convertFunction extracts the function definition from cc.DirectDeclarator. -func (t *TranslateUnit) convertFunction(declarator *cc.DirectDeclarator) (Function, error) { - params, err := t.convertFunctionParameters(declarator.ParameterTypeList.ParameterList) - if err != nil { - return Function{}, err - } - return Function{ - Name: declarator.DirectDeclarator.Token.Value.String(), - Position: declarator.Position().Line, - Parameters: params, - }, nil -} - -// convertFunctionParameters extracts function parameters from cc.ParameterList. -func (t *TranslateUnit) convertFunctionParameters(params *cc.ParameterList) ([]string, error) { - declaration := params.ParameterDeclaration - paramName := declaration.Declarator.DirectDeclarator.Token.Value - paramType := declaration.DeclarationSpecifiers.TypeSpecifier.Token.Value - isPointer := declaration.Declarator.Pointer != nil - if !isPointer && !supportedTypes.Contains(paramType.String()) { - position := declaration.Position() - return nil, fmt.Errorf("%v:%v:%v: error: unsupported type: %v\n", - position.Filename, position.Line+t.Offset, position.Column, paramType) - } - paramNames := []string{paramName.String()} - if params.ParameterList != nil { - if nextParamNames, err := t.convertFunctionParameters(params.ParameterList); err != nil { - return nil, err - } else { - paramNames = append(paramNames, nextParamNames...) - } - } - return paramNames, nil -} - -// runCommand runs a command and extract its output. -func runCommand(name string, arg ...string) (string, error) { - cmd := exec.Command(name, arg...) - output, err := cmd.CombinedOutput() - if err != nil { - if output != nil { - return "", errors.New(string(output)) - } else { - return "", err - } - } - return string(output), nil -} - -var command = &cobra.Command{ - Use: "goat source [-o output_directory]", - Args: cobra.ExactArgs(1), - Run: func(cmd *cobra.Command, args []string) { - output, _ := cmd.PersistentFlags().GetString("output") - if output == "" { - var err error - if output, err = os.Getwd(); err != nil { - _, _ = fmt.Fprintln(os.Stderr, err) - os.Exit(1) - } - } - var options []string - machineOptions, _ := cmd.PersistentFlags().GetStringSlice("machine-option") - for _, m := range machineOptions { - options = append(options, "-m"+m) - } - extraOptions, _ := cmd.PersistentFlags().GetStringSlice("extra-option") - options = append(options, extraOptions...) - optimizeLevel, _ := cmd.PersistentFlags().GetInt("optimize-level") - options = append(options, fmt.Sprintf("-O%d", optimizeLevel)) - file := NewTranslateUnit(args[0], output, options...) - if err := file.Translate(); err != nil { - _, _ = fmt.Fprintln(os.Stderr, err) - os.Exit(1) - } - }, -} - -func init() { - command.PersistentFlags().StringP("output", "o", "", "output directory of generated files") - command.PersistentFlags().StringSliceP("machine-option", "m", nil, "machine option for clang") - command.PersistentFlags().StringSliceP("extra-option", "e", nil, "extra option for clang") - command.PersistentFlags().IntP("optimize-level", "O", 0, "optimization level for clang") -} - -func main() { - if err := command.Execute(); err != nil { - _, _ = fmt.Fprintln(os.Stderr, err) - os.Exit(1) - } -} diff --git a/cmd/goat/parser_amd64.go b/cmd/goat/parser_amd64.go deleted file mode 100644 index d10d17aed..000000000 --- a/cmd/goat/parser_amd64.go +++ /dev/null @@ -1,211 +0,0 @@ -// Copyright 2022 gorse Project Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -package main - -import ( - "bufio" - "fmt" - "github.com/klauspost/asmfmt" - "os" - "regexp" - "strings" - "unicode" -) - -const buildTags = "//go:build !noasm && amd64\n" - -var ( - attributeLine = regexp.MustCompile(`^\s+\..+$`) - nameLine = regexp.MustCompile(`^\w+:.+$`) - labelLine = regexp.MustCompile(`^\.\w+_\d+:.*$`) - codeLine = regexp.MustCompile(`^\s+\w+.+$`) - - symbolLine = regexp.MustCompile(`^\w+\s+<\w+>:$`) - dataLine = regexp.MustCompile(`^\w+:\s+\w+\s+.+$`) - - registers = []string{"DI", "SI", "DX", "CX"} -) - -type Line struct { - Label string - Assembly string - Binary []string -} - -func (line *Line) String() string { - var builder strings.Builder - if len(line.Label) > 0 { - builder.WriteString(line.Label) - builder.WriteString(":\n") - } - builder.WriteString("\t") - if strings.HasPrefix(line.Assembly, "j") { - splits := strings.Split(line.Assembly, ".") - op := strings.TrimSpace(splits[0]) - operand := splits[1] - builder.WriteString(fmt.Sprintf("%s %s", strings.ToUpper(op), operand)) - } else { - pos := 0 - for pos < len(line.Binary) { - if pos > 0 { - builder.WriteString("; ") - } - if len(line.Binary)-pos >= 8 { - builder.WriteString(fmt.Sprintf("QUAD $0x%v%v%v%v%v%v%v%v", - line.Binary[pos+7], line.Binary[pos+6], line.Binary[pos+5], line.Binary[pos+4], - line.Binary[pos+3], line.Binary[pos+2], line.Binary[pos+1], line.Binary[pos])) - pos += 8 - } else if len(line.Binary)-pos >= 4 { - builder.WriteString(fmt.Sprintf("LONG $0x%v%v%v%v", - line.Binary[pos+3], line.Binary[pos+2], line.Binary[pos+1], line.Binary[pos])) - pos += 4 - } else if len(line.Binary)-pos >= 2 { - builder.WriteString(fmt.Sprintf("WORD $0x%v%v", line.Binary[pos+1], line.Binary[pos])) - pos += 2 - } else { - builder.WriteString(fmt.Sprintf("BYTE $0x%v", line.Binary[pos])) - pos += 1 - } - } - builder.WriteString("\t// ") - builder.WriteString(line.Assembly) - } - builder.WriteString("\n") - return builder.String() -} - -func parseAssembly(path string) (map[string][]Line, error) { - file, err := os.Open(path) - if err != nil { - return nil, err - } - defer func(file *os.File) { - if err = file.Close(); err != nil { - _, _ = fmt.Fprintln(os.Stderr, err) - os.Exit(1) - } - }(file) - - var ( - functions = make(map[string][]Line) - functionName string - labelName string - ) - scanner := bufio.NewScanner(file) - for scanner.Scan() { - line := scanner.Text() - if attributeLine.MatchString(line) { - continue - } else if nameLine.MatchString(line) { - functionName = strings.Split(line, ":")[0] - functions[functionName] = make([]Line, 0) - } else if labelLine.MatchString(line) { - labelName = strings.Split(line, ":")[0] - labelName = labelName[1:] - functions[functionName] = append(functions[functionName], Line{Label: labelName}) - } else if codeLine.MatchString(line) { - asm := strings.Split(line, "#")[0] - asm = strings.TrimSpace(asm) - if labelName == "" { - functions[functionName] = append(functions[functionName], Line{Assembly: asm}) - } else { - lines := functions[functionName] - lines[len(lines)-1].Assembly = asm - labelName = "" - } - } - } - - if err = scanner.Err(); err != nil { - return nil, err - } - return functions, nil -} - -func parseObjectDump(dump string, functions map[string][]Line) error { - var ( - functionName string - lineNumber int - ) - for i, line := range strings.Split(dump, "\n") { - line = strings.TrimSpace(line) - if symbolLine.MatchString(line) { - functionName = strings.Split(line, "<")[1] - functionName = strings.Split(functionName, ">")[0] - lineNumber = 0 - } else if dataLine.MatchString(line) { - data := strings.Split(line, ":")[1] - data = strings.TrimSpace(data) - splits := strings.Split(data, " ") - var ( - binary []string - assembly string - ) - for i, s := range splits { - if s == "" || unicode.IsSpace(rune(s[0])) { - assembly = strings.Join(splits[i:], " ") - assembly = strings.TrimSpace(assembly) - break - } - binary = append(binary, s) - } - if assembly == "" { - return fmt.Errorf("try to increase --insn-width of objdump") - } else if strings.HasPrefix(assembly, "nop") || - assembly == "xchg %ax,%ax" { - continue - } - if lineNumber >= len(functions[functionName]) { - return fmt.Errorf("%d: unexpected objectdump line: %s", i, line) - } - functions[functionName][lineNumber].Binary = binary - lineNumber++ - } - } - return nil -} - -func generateGoAssembly(path string, functions []Function) error { - // generate code - var builder strings.Builder - builder.WriteString(buildTags) - builder.WriteString("// AUTO-GENERATED BY GOAT -- DO NOT EDIT\n") - for _, function := range functions { - builder.WriteString(fmt.Sprintf("\nTEXT ·%v(SB), $0-32\n", function.Name)) - for i, param := range function.Parameters { - builder.WriteString(fmt.Sprintf("\tMOVQ %s+%d(FP), %s\n", param, i*8, registers[i])) - } - for _, line := range function.Lines { - builder.WriteString(line.String()) - } - } - - // write file - f, err := os.Create(path) - if err != nil { - return err - } - defer func(f *os.File) { - if err = f.Close(); err != nil { - _, _ = fmt.Fprintln(os.Stderr, err) - os.Exit(1) - } - }(f) - bytes, err := asmfmt.Format(strings.NewReader(builder.String())) - if err != nil { - return err - } - _, err = f.Write(bytes) - return err -} diff --git a/cmd/goat/parser_arm64.go b/cmd/goat/parser_arm64.go deleted file mode 100644 index d41f67d6a..000000000 --- a/cmd/goat/parser_arm64.go +++ /dev/null @@ -1,183 +0,0 @@ -// Copyright 2022 gorse Project Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -package main - -import ( - "bufio" - "fmt" - "os" - "regexp" - "strings" - "unicode" - - "github.com/klauspost/asmfmt" -) - -const buildTags = "//go:build !noasm && arm64\n" - -var ( - attributeLine = regexp.MustCompile(`^\s+\..+$`) - nameLine = regexp.MustCompile(`^\w+:.+$`) - labelLine = regexp.MustCompile(`^\.\w+_\d+:.*$`) - codeLine = regexp.MustCompile(`^\s+\w+.+$`) - - symbolLine = regexp.MustCompile(`^\w+\s+<\w+>:$`) - dataLine = regexp.MustCompile(`^\w+:\s+\w+\s+.+$`) - - registers = []string{"R0", "R1", "R2", "R3"} -) - -type Line struct { - Labels []string - Assembly string - Binary string -} - -func (line *Line) String() string { - var builder strings.Builder - for _, label := range line.Labels { - builder.WriteString(label) - builder.WriteString(":\n") - } - builder.WriteString("\t") - builder.WriteString(fmt.Sprintf("WORD $0x%v", line.Binary)) - builder.WriteString("\t// ") - builder.WriteString(line.Assembly) - builder.WriteString("\n") - return builder.String() -} - -func parseAssembly(path string) (map[string][]Line, error) { - file, err := os.Open(path) - if err != nil { - return nil, err - } - defer func(file *os.File) { - if err = file.Close(); err != nil { - _, _ = fmt.Fprintln(os.Stderr, err) - os.Exit(1) - } - }(file) - - var ( - functions = make(map[string][]Line) - functionName string - labelName string - ) - scanner := bufio.NewScanner(file) - for scanner.Scan() { - line := scanner.Text() - if attributeLine.MatchString(line) { - continue - } else if nameLine.MatchString(line) { - functionName = strings.Split(line, ":")[0] - functions[functionName] = make([]Line, 0) - } else if labelLine.MatchString(line) { - labelName = strings.Split(line, ":")[0] - labelName = labelName[1:] - lines := functions[functionName] - if len(lines) == 1 || lines[len(lines)-1].Assembly != "" { - functions[functionName] = append(functions[functionName], Line{Labels: []string{labelName}}) - } else { - lines[len(lines)-1].Labels = append(lines[len(lines)-1].Labels, labelName) - } - } else if codeLine.MatchString(line) { - asm := strings.Split(line, "#")[0] - asm = strings.TrimSpace(asm) - if labelName == "" { - functions[functionName] = append(functions[functionName], Line{Assembly: asm}) - } else { - lines := functions[functionName] - lines[len(lines)-1].Assembly = asm - labelName = "" - } - } - } - - if err = scanner.Err(); err != nil { - return nil, err - } - return functions, nil -} - -func parseObjectDump(dump string, functions map[string][]Line) error { - var ( - functionName string - lineNumber int - ) - for i, line := range strings.Split(dump, "\n") { - line = strings.TrimSpace(line) - if symbolLine.MatchString(line) { - functionName = strings.Split(line, "<")[1] - functionName = strings.Split(functionName, ">")[0] - lineNumber = 0 - } else if dataLine.MatchString(line) { - data := strings.Split(line, ":")[1] - data = strings.TrimSpace(data) - splits := strings.Split(data, " ") - var ( - binary string - assembly string - ) - for i, s := range splits { - if s == "" || unicode.IsSpace(rune(s[0])) { - assembly = strings.Join(splits[i:], " ") - assembly = strings.TrimSpace(assembly) - break - } - binary = s - } - if lineNumber >= len(functions[functionName]) { - return fmt.Errorf("%d: unexpected objectdump line: %s", i, line) - } - functions[functionName][lineNumber].Binary = binary - lineNumber++ - } - } - return nil -} - -func generateGoAssembly(path string, functions []Function) error { - // generate code - var builder strings.Builder - builder.WriteString(buildTags) - builder.WriteString("// AUTO-GENERATED BY GOAT -- DO NOT EDIT\n") - for _, function := range functions { - builder.WriteString(fmt.Sprintf("\nTEXT ·%v(SB), $0-32\n", function.Name)) - for i, param := range function.Parameters { - builder.WriteString(fmt.Sprintf("\tMOVD %s+%d(FP), %s\n", param, i*8, registers[i])) - } - for _, line := range function.Lines { - builder.WriteString(line.String()) - } - } - - // write file - f, err := os.Create(path) - if err != nil { - return err - } - defer func(f *os.File) { - if err = f.Close(); err != nil { - _, _ = fmt.Fprintln(os.Stderr, err) - os.Exit(1) - } - }(f) - bytes, err := asmfmt.Format(strings.NewReader(builder.String())) - if err != nil { - return err - } - _, err = f.Write(bytes) - return err -}