all: add '#pragma target' directive (#10)

Here I add support for a new directive #pragma target for configuring the target instruction set. Opcode names are resolved against the instruction set, and choosing a target also configures use of PUSH0.
fjl · Nov 23, 2024 · 681ba22 · 681ba22
1 parent e5be52b
commit 681ba22
Show file tree

Hide file tree

Showing 22 changed files with 1,135 additions and 649 deletions.
diff --git a/README.md b/README.md
@@ -242,10 +242,50 @@ main.eas:
         push 2
         %StoreSum  ;; calling global macro defined in lib.evm
 
+### Configuring the target instruction set
+
+The EVM is a changing environment. Opcodes may be added (and sometimes removed) as new
+versions of the EVM are released in protocol forks. Geas is aware of EVM forks and their
+respective instruction sets.
+
+Geas always operates on a specific EVM instruction set. By default, it targets the latest
+known Ethereum mainnet fork: all opcodes available in that fork can be used, and opcodes
+that were removed in that fork or any earlier one cannot.
+
+Use the `#pragma target` directive to change the target instruction set. The basic syntax is
+
+    #pragma target "name"
+
+where `name` is a lower-case execution-layer fork name like `homestead`, `berlin`, or `prague`.
+
+Here is an example. This contract uses the CHAINID instruction to check if it is running
+on mainnet, and destroys itself otherwise. CHAINID became available in the "istanbul"
+fork, and SELFDESTRUCT was removed in a later revision of the EVM, so this program is only
+applicable to a certain range of past EVM versions.
+
+    #pragma target "berlin"
+
+        chainid                ; [id]
+        push 1                 ; [1, id]
+        eq                     ; [id = 1]
+        jumpi @mainnet         ; []
+        push 0x0               ; [zeroaddr]
+        selfdestruct           ; []
+    mainnet:
+
+Note that declaring the target instruction set using `#pragma target` will not prevent the
+output bytecode from running on a different EVM version, since it is just a compiler
+setting. The example program above will no longer behave as intended on EVM version
+"cancun", because SELFDESTRUCT was turned into SENDALL in that fork. It may even stop
+working entirely in a later fork.
+
+`#pragma target` can only appear in the program once. It cannot be placed in an include
+file. You have to put the directive in the main program file.
+
 ### #assemble
 
 When writing contract constructors and advanced CALL scenarios, it can be necessary to
-include subprogram bytecode as-is. The `#assemble` directive can do this for you.
+include subprogram bytecode as-is. The `#assemble` directive does this for you.
 
 Using `#assemble` runs the assembler on the specified file, and includes the resulting
 bytecode into the current program. Labels of the subprogram will start at offset zero.
@@ -261,5 +301,8 @@ Unlike with `#include`, global definitions of the subprogram are not imported.
     #assemble "subprogram.eas"
     .end
 
+If a target instruction set is configured with `#pragma target`, it will also be used for
+assembling the subprogram. However, the subprogram file can override the instruction set
+using its own `#pragma target` directive.
 
 [^1]: Under no circumstances must it be called the geth assembler.
diff --git a/asm/compiler.go b/asm/compiler.go
@@ -27,6 +27,7 @@ import (
 	"strings"
 
 	"github.com/fjl/geas/internal/ast"
+	"github.com/fjl/geas/internal/evm"
 )
 
 // Compiler performs the assembling.
@@ -35,7 +36,7 @@ type Compiler struct {
 	lexDebug    bool
 	maxIncDepth int
 	maxErrors   int
-	usePush0    bool
+	defaultFork string
 
 	globals    *globalScope
 	errors     []error
@@ -51,7 +52,7 @@ func NewCompiler(fsys fs.FS) *Compiler {
 		includes:    make(map[*ast.IncludeSt]*ast.Document),
 		maxIncDepth: 128,
 		maxErrors:   10,
-		usePush0:    true,
+		defaultFork: evm.LatestFork,
 	}
 }
 
@@ -60,10 +61,9 @@ func (c *Compiler) SetDebugLexer(on bool) {
 	c.lexDebug = on
 }
 
-// SetUsePush0 enables/disables use of the PUSH0 instruction.
-// It's on by default.
-func (c *Compiler) SetUsePush0(on bool) {
-	c.usePush0 = on
+// SetDefaultFork sets the EVM instruction set used by default.
+func (c *Compiler) SetDefaultFork(f string) {
+	c.defaultFork = f
 }
 
 // SetDebugLexer enables/disables printing of the token stream to stdout.
@@ -135,22 +135,26 @@ func (c *Compiler) addErrors(errs []error) {
 
 // compile is the toplevel entry point into the compiler.
 func (c *Compiler) compile(doc *ast.Document) (output []byte) {
-	prevGlobals := c.globals
-	c.globals = newGlobalScope()
-	defer func() { c.globals = prevGlobals }()
-
 	defer func() {
 		panicking := recover()
 		if panicking != nil && panicking != errCancelCompilation {
 			panic(panicking)
 		}
 	}()
 
+	c.globals = newGlobalScope()
+	prog := newCompilerProg(doc)
+
 	// First, load all #include files and register their definitions.
-	c.processIncludes(doc, nil)
+	// This also configures the instruction set if specified by a #pragma.
+	c.processIncludes(doc, prog, nil)
+
+	// Fall back to the compiler's default fork if no #pragma target configured one.
+	if prog.evm == nil {
+		prog.evm = evm.FindInstructionSet(c.defaultFork)
+	}
 
 	// Next, the AST document tree is expanded into a flat list of instructions.
-	prog := newCompilerProg(doc)
 	c.expand(doc, prog)
 	if prog.cur != prog.toplevel {
 		panic("section stack was not unwound by expansion")
@@ -184,38 +188,53 @@ func (c *Compiler) compile(doc *ast.Document) (output []byte) {
 }
 
 // processIncludes reads all #included documents.
-func (c *Compiler) processIncludes(doc *ast.Document, stack []ast.Statement) {
+func (c *Compiler) processIncludes(doc *ast.Document, prog *compilerProg, stack []ast.Statement) {
 	errs := c.globals.registerDefinitions(doc)
 	c.addErrors(errs)
 
 	var list []*ast.IncludeSt
-	for _, inst := range doc.Statements {
-		inc, ok := inst.(*ast.IncludeSt)
-		if !ok {
-			continue
-		}
-		file, err := resolveRelative(doc.File, inc.Filename)
-		if err != nil {
-			c.addError(inst, err)
-			continue
-		}
-		incdoc := c.parseIncludeFile(file, inc, len(stack)+1)
-		if incdoc == nil {
-			continue // there were parse errors
+	for _, st := range doc.Statements {
+		switch st := st.(type) {
+		case *ast.IncludeSt:
+			file, err := resolveRelative(doc.File, st.Filename)
+			if err != nil {
+				c.addError(st, err)
+				continue
+			}
+			incdoc := c.parseIncludeFile(file, st, len(stack)+1)
+			if incdoc != nil {
+				c.includes[st] = incdoc
+				list = append(list, st)
+			}
+
+		case *ast.PragmaSt:
+			switch st.Option {
+			case "target":
+				if len(stack) != 0 {
+					c.addError(st, ecPragmaTargetInIncludeFile)
+				}
+				if prog.evm != nil {
+					c.addError(st, ecPragmaTargetConflict)
+				}
+				prog.evm = evm.FindInstructionSet(st.Value)
+				if prog.evm == nil {
+					c.addError(st, fmt.Errorf("%w %q", ecPragmaTargetUnknown, st.Value))
+				}
+			default:
+				c.addError(st, fmt.Errorf("%w %s", ecUnknownPragma, st.Option))
+			}
 		}
-		c.includes[inc] = incdoc
-		list = append(list, inc)
 	}
 
 	// Process includes in macros.
 	for _, m := range doc.InstrMacros() {
-		c.processIncludes(m.Body, append(stack, m))
+		c.processIncludes(m.Body, prog, append(stack, m))
 	}
 
 	// Recurse.
 	for _, inst := range list {
 		incdoc := c.includes[inst]
-		c.processIncludes(incdoc, append(stack, inst))
+		c.processIncludes(incdoc, prog, append(stack, inst))
 	}
 }
 
@@ -264,19 +283,48 @@ func (c *Compiler) generateOutput(prog *compilerProg) []byte {
 	if len(c.errors) > 0 {
 		return nil
 	}
+
 	var output []byte
 	for _, inst := range prog.iterInstructions() {
 		if len(output) != inst.pc {
 			panic(fmt.Sprintf("BUG: instruction pc=%d, but output has size %d", inst.pc, len(output)))
 		}
-		if inst.op != "" {
-			opcode, ok := inst.opcode()
-			if !ok {
+
+		switch {
+		case isPush(inst.op):
+			if inst.pushSize > 32 {
+				panic("BUG: pushSize > 32")
+			}
+			if len(inst.data) > inst.pushSize {
+				panic(fmt.Sprintf("BUG: push inst.data %d > inst.pushSize %d", len(inst.data), inst.pushSize))
+			}
+
+			// resolve the op
+			var op *evm.Op
+			if inst.op == "PUSH" {
+				op = prog.evm.PushBySize(inst.pushSize)
+			} else {
+				op = prog.evm.OpByName(inst.op)
+			}
+			if op == nil {
+				panic(fmt.Sprintf("BUG: opcode for %q (size %d) not found", inst.op, inst.pushSize))
+			}
+
+			// Add opcode and data padding to output.
+			output = append(output, op.Code)
+			if len(inst.data) < inst.pushSize {
+				output = append(output, make([]byte, inst.pushSize-len(inst.data))...)
+			}
+
+		case inst.op != "":
+			op := prog.evm.OpByName(inst.op)
+			if op == nil {
 				c.addError(inst.ast, fmt.Errorf("%w %s", ecUnknownOpcode, inst.op))
-				continue
 			}
-			output = append(output, byte(opcode))
+			output = append(output, op.Code)
 		}
+
+		// Instruction data is always added to output.
 		output = append(output, inst.data...)
 	}
 	return output

diff --git a/asm/compiler_eval.go b/asm/compiler_eval.go
@@ -48,7 +48,7 @@ func (c *Compiler) assignInitialPushSizes(e *evaluator, prog *compilerProg) {
 			c.addError(inst.ast, err)
 			continue
 		}
-		if err := c.assignPushArg(inst, v, true); err != nil {
+		if err := prog.assignPushArg(inst, v, true); err != nil {
 			c.addError(inst.ast, err)
 			continue
 		}
@@ -91,7 +91,7 @@ func (c *Compiler) assignArgs(e *evaluator, prog *compilerProg) (inst *instructi
 		if err != nil {
 			return inst, err
 		}
-		if err := c.assignPushArg(inst, v, false); err != nil {
+		if err := prog.assignPushArg(inst, v, false); err != nil {
 			return inst, err
 		}
 	}
@@ -103,7 +103,7 @@ func (c *Compiler) assignArgs(e *evaluator, prog *compilerProg) (inst *instructi
 //
 // If setSize is true, the pushSize of variable-size "PUSH" instructions will be assigned
 // based on the value.
-func (c *Compiler) assignPushArg(inst *instruction, v *big.Int, setSize bool) error {
+func (prog *compilerProg) assignPushArg(inst *instruction, v *big.Int, setSize bool) error {
 	if v.Sign() < 0 {
 		return ecNegativeResult
 	}
@@ -115,7 +115,7 @@ func (c *Compiler) assignPushArg(inst *instruction, v *big.Int, setSize bool) er
 
 	_, hasExplicitSize := inst.explicitPushSize()
 	if setSize && !hasExplicitSize {
-		inst.pushSize = c.autoPushSize(b)
+		inst.pushSize = prog.autoPushSize(b)
 	}
 	if len(b) > inst.pushSize {
 		if !hasExplicitSize {
@@ -124,22 +124,18 @@ func (c *Compiler) assignPushArg(inst *instruction, v *big.Int, setSize bool) er
 		return ecFixedSizePushOverflow
 	}
 
-	// Store data padded.
-	inst.data = make([]byte, inst.pushSize)
-	copy(inst.data[len(inst.data)-len(b):], b)
+	// Store data. Note there is no padding applied here.
+	// Padding will be added at the bytecode output stage.
+	inst.data = b
 	return nil
 }
 
-func (c *Compiler) autoPushSize(value []byte) int {
+func (prog *compilerProg) autoPushSize(value []byte) int {
 	if len(value) > 32 {
 		panic("value too big")
 	}
-	if len(value) == 0 {
-		if c.usePush0 {
-			return 0
-		} else {
-			return 1
-		}
+	if len(value) == 0 && !prog.evm.SupportsPush0() {
+		return 1
 	}
 	return len(value)
 }