Skip to content

Commit

Permalink
blake2b: port blake2b_amd64.s to Avo
Browse files Browse the repository at this point in the history
This implementation utilizes the same registers found in the reference
implementation, aiming to produce a minimal semantic diff between the
Avo-generated output and the original hand-written assembly.

To verify the Avo implementation, the reference and Avo-generated
assembly files are fed to `go tool asm`, capturing the debug output into
corresponding temp files. The debug output contains supplementary
metadata (line numbers, instruction offsets, and source file references)
that must be removed in order to obtain a semantic diff of the two
files. This is accomplished via a small utility script written in awk.

Commands used to verify Avo output:

GOROOT=$(go env GOROOT)
ASM_PATH="blake2b/blake2b_amd64.s"
REFERENCE="b2d3a6a4b4d36521cd7f653879cf6981e7c5c340"

go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \
  <(git cat-file -p "$REFERENCE:$ASM_PATH") \
  > /tmp/reference.s

go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \
  "$ASM_PATH" \
  > /tmp/avo.s

normalize(){
  awk '{
    $1=$2=$3="";
    print substr($0,4)
  }'
}

diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s)

Change-Id: I6dd59fb0b0365674aa5e43b69a57ea60fbcc4ba1
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/600456
Reviewed-by: Dmitri Shuralyov <[email protected]>
Reviewed-by: Roland Shoemaker <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
Reviewed-by: Filippo Valsorda <[email protected]>
  • Loading branch information
Garrett-Bodley authored and rolandshoemaker committed Sep 4, 2024
1 parent 0484c26 commit 82942cf
Show file tree
Hide file tree
Showing 4 changed files with 1,810 additions and 259 deletions.
361 changes: 361 additions & 0 deletions blake2b/_asm/standard/blake2b_amd64_asm.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,361 @@
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package main

import (
. "github.com/mmcloughlin/avo/build"
. "github.com/mmcloughlin/avo/operand"
. "github.com/mmcloughlin/avo/reg"
_ "golang.org/x/crypto/blake2b"
)

//go:generate go run . -out ../../blake2b_amd64.s -pkg blake2b

const ThatPeskyUnicodeDot = "\u00b7"

var iv0_DATA_ptr, iv1_DATA_ptr, iv2_DATA_ptr, iv3_DATA_ptr, c40_DATA_ptr, c48_DATA_ptr *Mem

func main() {
Package("golang.org/x/crypto/blake2b")
ConstraintExpr("amd64,gc,!purego")
hashBlocksSSE4()
Generate()
}

func SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2 VecPhysical) {
MOVO(v4, t1)
MOVO(v5, v4)
MOVO(t1, v5)
MOVO(v6, t1)
PUNPCKLQDQ(v6, t2)
PUNPCKHQDQ(v7, v6)
PUNPCKHQDQ(t2, v6)
PUNPCKLQDQ(v7, t2)
MOVO(t1, v7)
MOVO(v2, t1)
PUNPCKHQDQ(t2, v7)
PUNPCKLQDQ(v3, t2)
PUNPCKHQDQ(t2, v2)
PUNPCKLQDQ(t1, t2)
PUNPCKHQDQ(t2, v3)
}

func SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2 VecPhysical) {
MOVO(v4, t1)
MOVO(v5, v4)
MOVO(t1, v5)
MOVO(v2, t1)
PUNPCKLQDQ(v2, t2)
PUNPCKHQDQ(v3, v2)
PUNPCKHQDQ(t2, v2)
PUNPCKLQDQ(v3, t2)
MOVO(t1, v3)
MOVO(v6, t1)
PUNPCKHQDQ(t2, v3)
PUNPCKLQDQ(v7, t2)
PUNPCKHQDQ(t2, v6)
PUNPCKLQDQ(t1, t2)
PUNPCKHQDQ(t2, v7)
}

func HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7 VecPhysical, m0, m1, m2, m3 Op, t0, c40, c48 VecPhysical) {
PADDQ(m0, v0)
PADDQ(m1, v1)
PADDQ(v2, v0)
PADDQ(v3, v1)
PXOR(v0, v6)
PXOR(v1, v7)
PSHUFD(Imm(0xB1), v6, v6)
PSHUFD(Imm(0xB1), v7, v7)
PADDQ(v6, v4)
PADDQ(v7, v5)
PXOR(v4, v2)
PXOR(v5, v3)
PSHUFB(c40, v2)
PSHUFB(c40, v3)
PADDQ(m2, v0)
PADDQ(m3, v1)
PADDQ(v2, v0)
PADDQ(v3, v1)
PXOR(v0, v6)
PXOR(v1, v7)
PSHUFB(c48, v6)
PSHUFB(c48, v7)
PADDQ(v6, v4)
PADDQ(v7, v5)
PXOR(v4, v2)
PXOR(v5, v3)
MOVOU(v2, t0)
PADDQ(v2, t0)
PSRLQ(Imm(63), v2)
PXOR(t0, v2)
MOVOU(v3, t0)
PADDQ(v3, t0)
PSRLQ(Imm(63), v3)
PXOR(t0, v3)
}

func LOAD_MSG(m0, m1, m2, m3 VecPhysical, src GPPhysical, i0, i1, i2, i3, i4, i5, i6, i7 int) {
MOVQ(Mem{Base: src}.Offset(i0*8), m0)
PINSRQ(Imm(1), Mem{Base: src}.Offset(i1*8), m0)
MOVQ(Mem{Base: src}.Offset(i2*8), m1)
PINSRQ(Imm(1), Mem{Base: src}.Offset(i3*8), m1)
MOVQ(Mem{Base: src}.Offset(i4*8), m2)
PINSRQ(Imm(1), Mem{Base: src}.Offset(i5*8), m2)
MOVQ(Mem{Base: src}.Offset(i6*8), m3)
PINSRQ(Imm(1), Mem{Base: src}.Offset(i7*8), m3)
}

func hashBlocksSSE4() {
Implement("hashBlocksSSE4")
Attributes(4)
AllocLocal(288) // frame size = 272 + 16 byte alignment

Load(Param("h"), RAX)
Load(Param("c"), RBX)
Load(Param("flag"), RCX)
Load(Param("blocks").Base(), RSI)
Load(Param("blocks").Len(), RDI)

MOVQ(RSP, R10)
ADDQ(Imm(15), R10)
ANDQ(I32(-16), R10)

iv3 := iv3_DATA()
MOVOU(iv3, X0)
MOVO(X0, Mem{Base: R10}.Offset(0))
XORQ(RCX, Mem{Base: R10}.Offset(0)) // 0(R10) = ·iv3 ^ (CX || 0)

c40 := c40_DATA()
c48 := c48_DATA()
MOVOU(c40, X13)
MOVOU(c48, X14)

MOVOU(Mem{Base: AX}.Offset(0), X12)
MOVOU(Mem{Base: AX}.Offset(16), X15)

MOVQ(Mem{Base: BX}.Offset(0), R8)
MOVQ(Mem{Base: BX}.Offset(8), R9)

Label("loop")
ADDQ(Imm(128), R8)
CMPQ(R8, Imm(128))
JGE(LabelRef("noinc"))
INCQ(R9)

Label("noinc")
MOVQ(R8, X8)
PINSRQ(Imm(1), R9, X8)

iv0 := iv0_DATA()
iv1 := iv1_DATA()
iv2 := iv2_DATA()

MOVO(X12, X0)
MOVO(X15, X1)
MOVOU(Mem{Base: AX}.Offset(32), X2)
MOVOU(Mem{Base: AX}.Offset(48), X3)
MOVOU(iv0, X4)
MOVOU(iv1, X5)
MOVOU(iv2, X6)

PXOR(X8, X6)
MOVO(Mem{Base: R10}.Offset(0), X7)

LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7)
MOVO(X8, Mem{Base: R10}.Offset(16))
MOVO(X9, Mem{Base: R10}.Offset(32))
MOVO(X10, Mem{Base: R10}.Offset(48))
MOVO(X11, Mem{Base: R10}.Offset(64))
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15)
MOVO(X8, Mem{Base: R10}.Offset(80))
MOVO(X9, Mem{Base: R10}.Offset(96))
MOVO(X10, Mem{Base: R10}.Offset(112))
MOVO(X11, Mem{Base: R10}.Offset(128))
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6)
MOVO(X8, Mem{Base: R10}.Offset(144))
MOVO(X9, Mem{Base: R10}.Offset(160))
MOVO(X10, Mem{Base: R10}.Offset(176))
MOVO(X11, Mem{Base: R10}.Offset(192))
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3)
MOVO(X8, Mem{Base: R10}.Offset(208))
MOVO(X9, Mem{Base: R10}.Offset(224))
MOVO(X10, Mem{Base: R10}.Offset(240))
MOVO(X11, Mem{Base: R10}.Offset(256))
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

LOAD_MSG(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 10, 3, 7, 9, 14, 6, 1, 4)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

LOAD_MSG(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 2, 5, 4, 15, 6, 10, 0, 8)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

LOAD_MSG(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 14, 11, 6, 3, 1, 12, 8, 13)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

LOAD_MSG(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 4, 7, 15, 1, 13, 5, 14, 9)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

LOAD_MSG(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 0, 6, 9, 8, 7, 3, 2, 11)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

LOAD_MSG(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 5, 15, 8, 2, 0, 4, 6, 10)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

LOAD_MSG(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 12, 13, 1, 10, 2, 7, 4, 5)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

LOAD_MSG(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 15, 9, 3, 13, 11, 14, 12, 0)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, Mem{Base: R10}.Offset(16), Mem{Base: R10}.Offset(32), Mem{Base: R10}.Offset(48), Mem{Base: R10}.Offset(64), X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, Mem{Base: R10}.Offset(80), Mem{Base: R10}.Offset(96), Mem{Base: R10}.Offset(112), Mem{Base: R10}.Offset(128), X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, Mem{Base: R10}.Offset(144), Mem{Base: R10}.Offset(160), Mem{Base: R10}.Offset(176), Mem{Base: R10}.Offset(192), X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, Mem{Base: R10}.Offset(208), Mem{Base: R10}.Offset(224), Mem{Base: R10}.Offset(240), Mem{Base: R10}.Offset(256), X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

MOVOU(Mem{Base: AX}.Offset(32), X10)
MOVOU(Mem{Base: AX}.Offset(48), X11)
PXOR(X0, X12)
PXOR(X1, X15)
PXOR(X2, X10)
PXOR(X3, X11)
PXOR(X4, X12)
PXOR(X5, X15)
PXOR(X6, X10)
PXOR(X7, X11)
MOVOU(X10, Mem{Base: AX}.Offset(32))
MOVOU(X11, Mem{Base: AX}.Offset(48))

LEAQ(Mem{Base: SI}.Offset(128), RSI)
SUBQ(Imm(128), RDI)
JNE(LabelRef("loop"))

MOVOU(X12, Mem{Base: AX}.Offset(0))
MOVOU(X15, Mem{Base: AX}.Offset(16))

MOVQ(R8, Mem{Base: BX}.Offset(0))
MOVQ(R9, Mem{Base: BX}.Offset(8))

RET()
}

// #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~DATA SECTION~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~##

func iv0_DATA() Mem {
if iv0_DATA_ptr != nil {
return *iv0_DATA_ptr
}

iv0 := GLOBL(ThatPeskyUnicodeDot+"iv0", NOPTR|RODATA)
iv0_DATA_ptr = &iv0
DATA(0x00, U64(0x6a09e667f3bcc908))
DATA(0x08, U64(0xbb67ae8584caa73b))
return iv0
}

func iv1_DATA() Mem {
if iv1_DATA_ptr != nil {
return *iv1_DATA_ptr
}

iv1 := GLOBL(ThatPeskyUnicodeDot+"iv1", NOPTR|RODATA)
iv1_DATA_ptr = &iv1
DATA(0x00, U64(0x3c6ef372fe94f82b))
DATA(0x08, U64(0xa54ff53a5f1d36f1))
return iv1
}

func iv2_DATA() Mem {
if iv2_DATA_ptr != nil {
return *iv2_DATA_ptr
}

iv2 := GLOBL(ThatPeskyUnicodeDot+"iv2", NOPTR|RODATA)
iv2_DATA_ptr = &iv2
DATA(0x00, U64(0x510e527fade682d1))
DATA(0x08, U64(0x9b05688c2b3e6c1f))
return iv2
}

func iv3_DATA() Mem {
if iv3_DATA_ptr != nil {
return *iv3_DATA_ptr
}

iv3 := GLOBL(ThatPeskyUnicodeDot+"iv3", NOPTR|RODATA)
iv3_DATA_ptr = &iv3
DATA(0x00, U64(0x1f83d9abfb41bd6b))
DATA(0x08, U64(0x5be0cd19137e2179))
return iv3
}

func c40_DATA() Mem {
if c40_DATA_ptr != nil {
return *c40_DATA_ptr
}

c40 := GLOBL(ThatPeskyUnicodeDot+"c40", NOPTR|RODATA)
c40_DATA_ptr = &c40
DATA(0x00, U64(0x0201000706050403))
DATA(0x08, U64(0x0a09080f0e0d0c0b))
return c40
}

func c48_DATA() Mem {
if c48_DATA_ptr != nil {
return *c48_DATA_ptr
}

c48 := GLOBL(ThatPeskyUnicodeDot+"c48", NOPTR|RODATA)
c48_DATA_ptr = &c48
DATA(0x00, U64(0x0100070605040302))
DATA(0x08, U64(0x09080f0e0d0c0b0a))
return c48
}
15 changes: 15 additions & 0 deletions blake2b/_asm/standard/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
module blake2b/_asm

go 1.23

require (
github.com/mmcloughlin/avo v0.6.0
golang.org/x/crypto v0.26.0
)

require (
golang.org/x/mod v0.20.0 // indirect
golang.org/x/sync v0.8.0 // indirect
golang.org/x/sys v0.24.0 // indirect
golang.org/x/tools v0.24.0 // indirect
)
12 changes: 12 additions & 0 deletions blake2b/_asm/standard/go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY=
github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8=
golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw=
golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54=
golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0=
golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ=
golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg=
golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24=
golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ=
Loading

0 comments on commit 82942cf

Please sign in to comment.