Mirror of https://github.com/superseriousbusiness/gotosocial.git, synced 2025-10-31 21:22:26 -05:00
[experiment] add alternative wasm sqlite3 implementation available via build-tag (#2863)
This allows for building GoToSocial with [SQLite transpiled to WASM](https://github.com/ncruces/go-sqlite3) and accessed through [Wazero](https://wazero.io/).
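For context, build-tag-gated driver selection in Go usually looks like the sketch below. This is a minimal illustration only: the "wasmsqlite3" tag, the "db" package, and the openDB helper are hypothetical and not taken from this commit, and the import paths follow the go-sqlite3 README as assumptions.

//go:build wasmsqlite3

// Package db sketches how a build tag can select the WASM-based SQLite driver.
// The tag name "wasmsqlite3", the package name, and openDB are placeholders.
package db

import (
    "database/sql"

    // Blank imports register the driver under "sqlite3" and embed the WASM
    // binary that wazero executes at runtime (paths per the go-sqlite3 README).
    _ "github.com/ncruces/go-sqlite3/driver"
    _ "github.com/ncruces/go-sqlite3/embed"
)

// openDB opens a SQLite database through the WASM-backed driver.
func openDB(path string) (*sql.DB, error) {
    return sql.Open("sqlite3", path)
}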
Parent commit: cce21c11cb. This commit: 1e7b32490d.
398 changed files with 86174 additions and 684 deletions
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/abi.go: 170 additions (generated, vendored, normal file)
@@ -0,0 +1,170 @@
package backend

import (
    "fmt"

    "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
    "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)

type (
    // FunctionABI represents the ABI information for a function which corresponds to a ssa.Signature.
    FunctionABI struct {
        Initialized bool

        Args, Rets                 []ABIArg
        ArgStackSize, RetStackSize int64

        ArgIntRealRegs   byte
        ArgFloatRealRegs byte
        RetIntRealRegs   byte
        RetFloatRealRegs byte
    }

    // ABIArg represents either argument or return value's location.
    ABIArg struct {
        // Index is the index of the argument.
        Index int
        // Kind is the kind of the argument.
        Kind ABIArgKind
        // Reg is valid if Kind == ABIArgKindReg.
        // This VReg must be based on RealReg.
        Reg regalloc.VReg
        // Offset is valid if Kind == ABIArgKindStack.
        // This is the offset from the beginning of either arg or ret stack slot.
        Offset int64
        // Type is the type of the argument.
        Type ssa.Type
    }

    // ABIArgKind is the kind of ABI argument.
    ABIArgKind byte
)

const (
    // ABIArgKindReg represents an argument passed in a register.
    ABIArgKindReg = iota
    // ABIArgKindStack represents an argument passed in the stack.
    ABIArgKindStack
)

// String implements fmt.Stringer.
func (a *ABIArg) String() string {
    return fmt.Sprintf("args[%d]: %s", a.Index, a.Kind)
}

// String implements fmt.Stringer.
func (a ABIArgKind) String() string {
    switch a {
    case ABIArgKindReg:
        return "reg"
    case ABIArgKindStack:
        return "stack"
    default:
        panic("BUG")
    }
}

// Init initializes the abiImpl for the given signature.
func (a *FunctionABI) Init(sig *ssa.Signature, argResultInts, argResultFloats []regalloc.RealReg) {
    if len(a.Rets) < len(sig.Results) {
        a.Rets = make([]ABIArg, len(sig.Results))
    }
    a.Rets = a.Rets[:len(sig.Results)]
    a.RetStackSize = a.setABIArgs(a.Rets, sig.Results, argResultInts, argResultFloats)
    if argsNum := len(sig.Params); len(a.Args) < argsNum {
        a.Args = make([]ABIArg, argsNum)
    }
    a.Args = a.Args[:len(sig.Params)]
    a.ArgStackSize = a.setABIArgs(a.Args, sig.Params, argResultInts, argResultFloats)

    // Gather the real registers usages in arg/return.
    a.ArgIntRealRegs, a.ArgFloatRealRegs = 0, 0
    a.RetIntRealRegs, a.RetFloatRealRegs = 0, 0
    for i := range a.Rets {
        r := &a.Rets[i]
        if r.Kind == ABIArgKindReg {
            if r.Type.IsInt() {
                a.RetIntRealRegs++
            } else {
                a.RetFloatRealRegs++
            }
        }
    }
    for i := range a.Args {
        arg := &a.Args[i]
        if arg.Kind == ABIArgKindReg {
            if arg.Type.IsInt() {
                a.ArgIntRealRegs++
            } else {
                a.ArgFloatRealRegs++
            }
        }
    }

    a.Initialized = true
}

// setABIArgs sets the ABI arguments in the given slice. This assumes that len(s) >= len(types)
// where if len(s) > len(types), the last elements of s is for the multi-return slot.
func (a *FunctionABI) setABIArgs(s []ABIArg, types []ssa.Type, ints, floats []regalloc.RealReg) (stackSize int64) {
    il, fl := len(ints), len(floats)

    var stackOffset int64
    intParamIndex, floatParamIndex := 0, 0
    for i, typ := range types {
        arg := &s[i]
        arg.Index = i
        arg.Type = typ
        if typ.IsInt() {
            if intParamIndex >= il {
                arg.Kind = ABIArgKindStack
                const slotSize = 8 // Align 8 bytes.
                arg.Offset = stackOffset
                stackOffset += slotSize
            } else {
                arg.Kind = ABIArgKindReg
                arg.Reg = regalloc.FromRealReg(ints[intParamIndex], regalloc.RegTypeInt)
                intParamIndex++
            }
        } else {
            if floatParamIndex >= fl {
                arg.Kind = ABIArgKindStack
                slotSize := int64(8)    // Align at least 8 bytes.
                if typ.Bits() == 128 { // Vector.
                    slotSize = 16
                }
                arg.Offset = stackOffset
                stackOffset += slotSize
            } else {
                arg.Kind = ABIArgKindReg
                arg.Reg = regalloc.FromRealReg(floats[floatParamIndex], regalloc.RegTypeFloat)
                floatParamIndex++
            }
        }
    }
    return stackOffset
}

func (a *FunctionABI) AlignedArgResultStackSlotSize() uint32 {
    stackSlotSize := a.RetStackSize + a.ArgStackSize
    // Align stackSlotSize to 16 bytes.
    stackSlotSize = (stackSlotSize + 15) &^ 15
    // Check overflow 32-bit.
    if stackSlotSize > 0xFFFFFFFF {
        panic("ABI stack slot size overflow")
    }
    return uint32(stackSlotSize)
}

func (a *FunctionABI) ABIInfoAsUint64() uint64 {
    return uint64(a.ArgIntRealRegs)<<56 |
        uint64(a.ArgFloatRealRegs)<<48 |
        uint64(a.RetIntRealRegs)<<40 |
        uint64(a.RetFloatRealRegs)<<32 |
        uint64(a.AlignedArgResultStackSlotSize())
}

func ABIInfoFromUint64(info uint64) (argIntRealRegs, argFloatRealRegs, retIntRealRegs, retFloatRealRegs byte, stackSlotSize uint32) {
    return byte(info >> 56), byte(info >> 48), byte(info >> 40), byte(info >> 32), uint32(info)
}
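Side note, not part of the diff: ABIInfoAsUint64 and ABIInfoFromUint64 above pack the four register counts into the top four bytes of a uint64 and the 16-byte-aligned stack-slot size into the low 32 bits. A small standalone sketch of that round trip, with made-up counts:

package main

import "fmt"

func main() {
    // Made-up values: 2 int arg regs, 1 float arg reg, 1 int ret reg, 0 float ret regs, 32-byte slot.
    const argInts, argFloats, retInts, retFloats, slotSize = 2, 1, 1, 0, 32

    // Pack: counts in the top four bytes, aligned stack slot size in the low 32 bits.
    info := uint64(argInts)<<56 | uint64(argFloats)<<48 |
        uint64(retInts)<<40 | uint64(retFloats)<<32 | uint64(slotSize)

    // Unpack the same way ABIInfoFromUint64 does.
    fmt.Println(byte(info>>56), byte(info>>48), byte(info>>40), byte(info>>32), uint32(info))
    // Output: 2 1 1 0 32
}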
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/backend.go: 3 additions (generated, vendored, normal file)
@@ -0,0 +1,3 @@
// Package backend must be free of Wasm-specific concept. In other words,
// this package must not import internal/wasm package.
package backend
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler.go: 417 additions (generated, vendored, normal file)
@@ -0,0 +1,417 @@
package backend

import (
    "context"
    "fmt"

    "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
    "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
    "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)

// NewCompiler returns a new Compiler that can generate a machine code.
func NewCompiler(ctx context.Context, mach Machine, builder ssa.Builder) Compiler {
    return newCompiler(ctx, mach, builder)
}

func newCompiler(_ context.Context, mach Machine, builder ssa.Builder) *compiler {
    argResultInts, argResultFloats := mach.ArgsResultsRegs()
    c := &compiler{
        mach: mach, ssaBuilder: builder,
        nextVRegID:      regalloc.VRegIDNonReservedBegin,
        argResultInts:   argResultInts,
        argResultFloats: argResultFloats,
    }
    mach.SetCompiler(c)
    return c
}

// Compiler is the backend of wazevo which takes ssa.Builder and Machine,
// use the information there to emit the final machine code.
type Compiler interface {
    // SSABuilder returns the ssa.Builder used by this compiler.
    SSABuilder() ssa.Builder

    // Compile executes the following steps:
    // 1. Lower()
    // 2. RegAlloc()
    // 3. Finalize()
    // 4. Encode()
    //
    // Each step can be called individually for testing purpose, therefore they are exposed in this interface too.
    //
    // The returned byte slices are the machine code and the relocation information for the machine code.
    // The caller is responsible for copying them immediately since the compiler may reuse the buffer.
    Compile(ctx context.Context) (_ []byte, _ []RelocationInfo, _ error)

    // Lower lowers the given ssa.Instruction to the machine-specific instructions.
    Lower()

    // RegAlloc performs the register allocation after Lower is called.
    RegAlloc()

    // Finalize performs the finalization of the compilation, including machine code emission.
    // This must be called after RegAlloc.
    Finalize(ctx context.Context) error

    // Buf returns the buffer of the encoded machine code. This is only used for testing purpose.
    Buf() []byte

    BufPtr() *[]byte

    // Format returns the debug string of the current state of the compiler.
    Format() string

    // Init initializes the internal state of the compiler for the next compilation.
    Init()

    // AllocateVReg allocates a new virtual register of the given type.
    AllocateVReg(typ ssa.Type) regalloc.VReg

    // ValueDefinition returns the definition of the given value.
    ValueDefinition(ssa.Value) *SSAValueDefinition

    // VRegOf returns the virtual register of the given ssa.Value.
    VRegOf(value ssa.Value) regalloc.VReg

    // TypeOf returns the ssa.Type of the given virtual register.
    TypeOf(regalloc.VReg) ssa.Type

    // MatchInstr returns true if the given definition is from an instruction with the given opcode, the current group ID,
    // and a refcount of 1. That means, the instruction can be merged/swapped within the current instruction group.
    MatchInstr(def *SSAValueDefinition, opcode ssa.Opcode) bool

    // MatchInstrOneOf is the same as MatchInstr but for multiple opcodes. If it matches one of ssa.Opcode,
    // this returns the opcode. Otherwise, this returns ssa.OpcodeInvalid.
    //
    // Note: caller should be careful to avoid excessive allocation on opcodes slice.
    MatchInstrOneOf(def *SSAValueDefinition, opcodes []ssa.Opcode) ssa.Opcode

    // AddRelocationInfo appends the relocation information for the function reference at the current buffer offset.
    AddRelocationInfo(funcRef ssa.FuncRef)

    // AddSourceOffsetInfo appends the source offset information for the given offset.
    AddSourceOffsetInfo(executableOffset int64, sourceOffset ssa.SourceOffset)

    // SourceOffsetInfo returns the source offset information for the current buffer offset.
    SourceOffsetInfo() []SourceOffsetInfo

    // EmitByte appends a byte to the buffer. Used during the code emission.
    EmitByte(b byte)

    // Emit4Bytes appends 4 bytes to the buffer. Used during the code emission.
    Emit4Bytes(b uint32)

    // Emit8Bytes appends 8 bytes to the buffer. Used during the code emission.
    Emit8Bytes(b uint64)

    // GetFunctionABI returns the ABI information for the given signature.
    GetFunctionABI(sig *ssa.Signature) *FunctionABI
}

// RelocationInfo represents the relocation information for a call instruction.
type RelocationInfo struct {
    // Offset represents the offset from the beginning of the machine code of either a function or the entire module.
    Offset int64
    // FuncRef is the target function of the call instruction.
    FuncRef ssa.FuncRef
}

// compiler implements Compiler.
type compiler struct {
    mach       Machine
    currentGID ssa.InstructionGroupID
    ssaBuilder ssa.Builder
    // nextVRegID is the next virtual register ID to be allocated.
    nextVRegID regalloc.VRegID
    // ssaValueToVRegs maps ssa.ValueID to regalloc.VReg.
    ssaValueToVRegs [] /* VRegID to */ regalloc.VReg
    // ssaValueDefinitions maps ssa.ValueID to its definition.
    ssaValueDefinitions []SSAValueDefinition
    // ssaValueRefCounts is a cached list obtained by ssa.Builder.ValueRefCounts().
    ssaValueRefCounts []int
    // returnVRegs is the list of virtual registers that store the return values.
    returnVRegs  []regalloc.VReg
    varEdges     [][2]regalloc.VReg
    varEdgeTypes []ssa.Type
    constEdges   []struct {
        cInst *ssa.Instruction
        dst   regalloc.VReg
    }
    vRegSet         []bool
    vRegIDs         []regalloc.VRegID
    tempRegs        []regalloc.VReg
    tmpVals         []ssa.Value
    ssaTypeOfVRegID [] /* VRegID to */ ssa.Type
    buf             []byte
    relocations     []RelocationInfo
    sourceOffsets   []SourceOffsetInfo
    // abis maps ssa.SignatureID to the ABI implementation.
    abis                           []FunctionABI
    argResultInts, argResultFloats []regalloc.RealReg
}

// SourceOffsetInfo is a data to associate the source offset with the executable offset.
type SourceOffsetInfo struct {
    // SourceOffset is the source offset in the original source code.
    SourceOffset ssa.SourceOffset
    // ExecutableOffset is the offset in the compiled executable.
    ExecutableOffset int64
}

// Compile implements Compiler.Compile.
func (c *compiler) Compile(ctx context.Context) ([]byte, []RelocationInfo, error) {
    c.Lower()
    if wazevoapi.PrintSSAToBackendIRLowering && wazevoapi.PrintEnabledIndex(ctx) {
        fmt.Printf("[[[after lowering for %s ]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), c.Format())
    }
    if wazevoapi.DeterministicCompilationVerifierEnabled {
        wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "After lowering to ISA specific IR", c.Format())
    }
    c.RegAlloc()
    if wazevoapi.PrintRegisterAllocated && wazevoapi.PrintEnabledIndex(ctx) {
        fmt.Printf("[[[after regalloc for %s]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), c.Format())
    }
    if wazevoapi.DeterministicCompilationVerifierEnabled {
        wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "After Register Allocation", c.Format())
    }
    if err := c.Finalize(ctx); err != nil {
        return nil, nil, err
    }
    if wazevoapi.PrintFinalizedMachineCode && wazevoapi.PrintEnabledIndex(ctx) {
        fmt.Printf("[[[after finalize for %s]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), c.Format())
    }
    if wazevoapi.DeterministicCompilationVerifierEnabled {
        wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "After Finalization", c.Format())
    }
    return c.buf, c.relocations, nil
}

// RegAlloc implements Compiler.RegAlloc.
func (c *compiler) RegAlloc() {
    c.mach.RegAlloc()
}

// Finalize implements Compiler.Finalize.
func (c *compiler) Finalize(ctx context.Context) error {
    c.mach.PostRegAlloc()
    return c.mach.Encode(ctx)
}

// setCurrentGroupID sets the current instruction group ID.
func (c *compiler) setCurrentGroupID(gid ssa.InstructionGroupID) {
    c.currentGID = gid
}

// assignVirtualRegisters assigns a virtual register to each ssa.ValueID Valid in the ssa.Builder.
func (c *compiler) assignVirtualRegisters() {
    builder := c.ssaBuilder
    refCounts := builder.ValueRefCounts()
    c.ssaValueRefCounts = refCounts

    need := len(refCounts)
    if need >= len(c.ssaValueToVRegs) {
        c.ssaValueToVRegs = append(c.ssaValueToVRegs, make([]regalloc.VReg, need+1)...)
    }
    if need >= len(c.ssaValueDefinitions) {
        c.ssaValueDefinitions = append(c.ssaValueDefinitions, make([]SSAValueDefinition, need+1)...)
    }

    for blk := builder.BlockIteratorReversePostOrderBegin(); blk != nil; blk = builder.BlockIteratorReversePostOrderNext() {
        // First we assign a virtual register to each parameter.
        for i := 0; i < blk.Params(); i++ {
            p := blk.Param(i)
            pid := p.ID()
            typ := p.Type()
            vreg := c.AllocateVReg(typ)
            c.ssaValueToVRegs[pid] = vreg
            c.ssaValueDefinitions[pid] = SSAValueDefinition{BlockParamValue: p, BlkParamVReg: vreg}
            c.ssaTypeOfVRegID[vreg.ID()] = p.Type()
        }

        // Assigns each value to a virtual register produced by instructions.
        for cur := blk.Root(); cur != nil; cur = cur.Next() {
            r, rs := cur.Returns()
            var N int
            if r.Valid() {
                id := r.ID()
                ssaTyp := r.Type()
                typ := r.Type()
                vReg := c.AllocateVReg(typ)
                c.ssaValueToVRegs[id] = vReg
                c.ssaValueDefinitions[id] = SSAValueDefinition{
                    Instr:    cur,
                    N:        0,
                    RefCount: refCounts[id],
                }
                c.ssaTypeOfVRegID[vReg.ID()] = ssaTyp
                N++
            }
            for _, r := range rs {
                id := r.ID()
                ssaTyp := r.Type()
                vReg := c.AllocateVReg(ssaTyp)
                c.ssaValueToVRegs[id] = vReg
                c.ssaValueDefinitions[id] = SSAValueDefinition{
                    Instr:    cur,
                    N:        N,
                    RefCount: refCounts[id],
                }
                c.ssaTypeOfVRegID[vReg.ID()] = ssaTyp
                N++
            }
        }
    }

    for i, retBlk := 0, builder.ReturnBlock(); i < retBlk.Params(); i++ {
        typ := retBlk.Param(i).Type()
        vReg := c.AllocateVReg(typ)
        c.returnVRegs = append(c.returnVRegs, vReg)
        c.ssaTypeOfVRegID[vReg.ID()] = typ
    }
}

// AllocateVReg implements Compiler.AllocateVReg.
func (c *compiler) AllocateVReg(typ ssa.Type) regalloc.VReg {
    regType := regalloc.RegTypeOf(typ)
    r := regalloc.VReg(c.nextVRegID).SetRegType(regType)

    id := r.ID()
    if int(id) >= len(c.ssaTypeOfVRegID) {
        c.ssaTypeOfVRegID = append(c.ssaTypeOfVRegID, make([]ssa.Type, id+1)...)
    }
    c.ssaTypeOfVRegID[id] = typ
    c.nextVRegID++
    return r
}

// Init implements Compiler.Init.
func (c *compiler) Init() {
    c.currentGID = 0
    c.nextVRegID = regalloc.VRegIDNonReservedBegin
    c.returnVRegs = c.returnVRegs[:0]
    c.mach.Reset()
    c.varEdges = c.varEdges[:0]
    c.constEdges = c.constEdges[:0]
    c.buf = c.buf[:0]
    c.sourceOffsets = c.sourceOffsets[:0]
    c.relocations = c.relocations[:0]
}

// ValueDefinition implements Compiler.ValueDefinition.
func (c *compiler) ValueDefinition(value ssa.Value) *SSAValueDefinition {
    return &c.ssaValueDefinitions[value.ID()]
}

// VRegOf implements Compiler.VRegOf.
func (c *compiler) VRegOf(value ssa.Value) regalloc.VReg {
    return c.ssaValueToVRegs[value.ID()]
}

// Format implements Compiler.Format.
func (c *compiler) Format() string {
    return c.mach.Format()
}

// TypeOf implements Compiler.TypeOf.
func (c *compiler) TypeOf(v regalloc.VReg) ssa.Type {
    return c.ssaTypeOfVRegID[v.ID()]
}

// MatchInstr implements Compiler.MatchInstr.
func (c *compiler) MatchInstr(def *SSAValueDefinition, opcode ssa.Opcode) bool {
    instr := def.Instr
    return def.IsFromInstr() &&
        instr.Opcode() == opcode &&
        instr.GroupID() == c.currentGID &&
        def.RefCount < 2
}

// MatchInstrOneOf implements Compiler.MatchInstrOneOf.
func (c *compiler) MatchInstrOneOf(def *SSAValueDefinition, opcodes []ssa.Opcode) ssa.Opcode {
    instr := def.Instr
    if !def.IsFromInstr() {
        return ssa.OpcodeInvalid
    }

    if instr.GroupID() != c.currentGID {
        return ssa.OpcodeInvalid
    }

    if def.RefCount >= 2 {
        return ssa.OpcodeInvalid
    }

    opcode := instr.Opcode()
    for _, op := range opcodes {
        if opcode == op {
            return opcode
        }
    }
    return ssa.OpcodeInvalid
}

// SSABuilder implements Compiler.SSABuilder.
func (c *compiler) SSABuilder() ssa.Builder {
    return c.ssaBuilder
}

// AddSourceOffsetInfo implements Compiler.AddSourceOffsetInfo.
func (c *compiler) AddSourceOffsetInfo(executableOffset int64, sourceOffset ssa.SourceOffset) {
    c.sourceOffsets = append(c.sourceOffsets, SourceOffsetInfo{
        SourceOffset:     sourceOffset,
        ExecutableOffset: executableOffset,
    })
}

// SourceOffsetInfo implements Compiler.SourceOffsetInfo.
func (c *compiler) SourceOffsetInfo() []SourceOffsetInfo {
    return c.sourceOffsets
}

// AddRelocationInfo implements Compiler.AddRelocationInfo.
func (c *compiler) AddRelocationInfo(funcRef ssa.FuncRef) {
    c.relocations = append(c.relocations, RelocationInfo{
        Offset:  int64(len(c.buf)),
        FuncRef: funcRef,
    })
}

// Emit8Bytes implements Compiler.Emit8Bytes.
func (c *compiler) Emit8Bytes(b uint64) {
    c.buf = append(c.buf, byte(b), byte(b>>8), byte(b>>16), byte(b>>24), byte(b>>32), byte(b>>40), byte(b>>48), byte(b>>56))
}

// Emit4Bytes implements Compiler.Emit4Bytes.
func (c *compiler) Emit4Bytes(b uint32) {
    c.buf = append(c.buf, byte(b), byte(b>>8), byte(b>>16), byte(b>>24))
}

// EmitByte implements Compiler.EmitByte.
func (c *compiler) EmitByte(b byte) {
    c.buf = append(c.buf, b)
}

// Buf implements Compiler.Buf.
func (c *compiler) Buf() []byte {
    return c.buf
}

// BufPtr implements Compiler.BufPtr.
func (c *compiler) BufPtr() *[]byte {
    return &c.buf
}

func (c *compiler) GetFunctionABI(sig *ssa.Signature) *FunctionABI {
    if int(sig.ID) >= len(c.abis) {
        c.abis = append(c.abis, make([]FunctionABI, int(sig.ID)+1)...)
    }

    abi := &c.abis[sig.ID]
    if abi.Initialized {
        return abi
    }

    abi.Init(sig, c.argResultInts, c.argResultFloats)
    return abi
}
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler_lower.go: 226 additions (generated, vendored, normal file)
@@ -0,0 +1,226 @@
package backend

import (
    "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
    "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)

// Lower implements Compiler.Lower.
func (c *compiler) Lower() {
    c.assignVirtualRegisters()
    c.mach.SetCurrentABI(c.GetFunctionABI(c.ssaBuilder.Signature()))
    c.mach.ExecutableContext().StartLoweringFunction(c.ssaBuilder.BlockIDMax())
    c.lowerBlocks()
}

// lowerBlocks lowers each block in the ssa.Builder.
func (c *compiler) lowerBlocks() {
    builder := c.ssaBuilder
    for blk := builder.BlockIteratorReversePostOrderBegin(); blk != nil; blk = builder.BlockIteratorReversePostOrderNext() {
        c.lowerBlock(blk)
    }

    ectx := c.mach.ExecutableContext()
    // After lowering all blocks, we need to link adjacent blocks to layout one single instruction list.
    var prev ssa.BasicBlock
    for next := builder.BlockIteratorReversePostOrderBegin(); next != nil; next = builder.BlockIteratorReversePostOrderNext() {
        if prev != nil {
            ectx.LinkAdjacentBlocks(prev, next)
        }
        prev = next
    }
}

func (c *compiler) lowerBlock(blk ssa.BasicBlock) {
    mach := c.mach
    ectx := mach.ExecutableContext()
    ectx.StartBlock(blk)

    // We traverse the instructions in reverse order because we might want to lower multiple
    // instructions together.
    cur := blk.Tail()

    // First gather the branching instructions at the end of the blocks.
    var br0, br1 *ssa.Instruction
    if cur.IsBranching() {
        br0 = cur
        cur = cur.Prev()
        if cur != nil && cur.IsBranching() {
            br1 = cur
            cur = cur.Prev()
        }
    }

    if br0 != nil {
        c.lowerBranches(br0, br1)
    }

    if br1 != nil && br0 == nil {
        panic("BUG? when a block has conditional branch but doesn't end with an unconditional branch?")
    }

    // Now start lowering the non-branching instructions.
    for ; cur != nil; cur = cur.Prev() {
        c.setCurrentGroupID(cur.GroupID())
        if cur.Lowered() {
            continue
        }

        switch cur.Opcode() {
        case ssa.OpcodeReturn:
            rets := cur.ReturnVals()
            if len(rets) > 0 {
                c.mach.LowerReturns(rets)
            }
            c.mach.InsertReturn()
        default:
            mach.LowerInstr(cur)
        }
        ectx.FlushPendingInstructions()
    }

    // Finally, if this is the entry block, we have to insert copies of arguments from the real location to the VReg.
    if blk.EntryBlock() {
        c.lowerFunctionArguments(blk)
    }

    ectx.EndBlock()
}

// lowerBranches is called right after StartBlock and before any LowerInstr call if
// there are branches to the given block. br0 is the very end of the block and b1 is the before the br0 if it exists.
// At least br0 is not nil, but br1 can be nil if there's no branching before br0.
//
// See ssa.Instruction IsBranching, and the comment on ssa.BasicBlock.
func (c *compiler) lowerBranches(br0, br1 *ssa.Instruction) {
    ectx := c.mach.ExecutableContext()

    c.setCurrentGroupID(br0.GroupID())
    c.mach.LowerSingleBranch(br0)
    ectx.FlushPendingInstructions()
    if br1 != nil {
        c.setCurrentGroupID(br1.GroupID())
        c.mach.LowerConditionalBranch(br1)
        ectx.FlushPendingInstructions()
    }

    if br0.Opcode() == ssa.OpcodeJump {
        _, args, target := br0.BranchData()
        argExists := len(args) != 0
        if argExists && br1 != nil {
            panic("BUG: critical edge split failed")
        }
        if argExists && target.ReturnBlock() {
            if len(args) > 0 {
                c.mach.LowerReturns(args)
            }
        } else if argExists {
            c.lowerBlockArguments(args, target)
        }
    }
    ectx.FlushPendingInstructions()
}

func (c *compiler) lowerFunctionArguments(entry ssa.BasicBlock) {
    ectx := c.mach.ExecutableContext()

    c.tmpVals = c.tmpVals[:0]
    for i := 0; i < entry.Params(); i++ {
        p := entry.Param(i)
        if c.ssaValueRefCounts[p.ID()] > 0 {
            c.tmpVals = append(c.tmpVals, p)
        } else {
            // If the argument is not used, we can just pass an invalid value.
            c.tmpVals = append(c.tmpVals, ssa.ValueInvalid)
        }
    }
    c.mach.LowerParams(c.tmpVals)
    ectx.FlushPendingInstructions()
}

// lowerBlockArguments lowers how to pass arguments to the given successor block.
func (c *compiler) lowerBlockArguments(args []ssa.Value, succ ssa.BasicBlock) {
    if len(args) != succ.Params() {
        panic("BUG: mismatched number of arguments")
    }

    c.varEdges = c.varEdges[:0]
    c.varEdgeTypes = c.varEdgeTypes[:0]
    c.constEdges = c.constEdges[:0]
    for i := 0; i < len(args); i++ {
        dst := succ.Param(i)
        src := args[i]

        dstReg := c.VRegOf(dst)
        srcDef := c.ssaValueDefinitions[src.ID()]
        if srcDef.IsFromInstr() && srcDef.Instr.Constant() {
            c.constEdges = append(c.constEdges, struct {
                cInst *ssa.Instruction
                dst   regalloc.VReg
            }{cInst: srcDef.Instr, dst: dstReg})
        } else {
            srcReg := c.VRegOf(src)
            // Even when the src=dst, insert the move so that we can keep such registers keep-alive.
            c.varEdges = append(c.varEdges, [2]regalloc.VReg{srcReg, dstReg})
            c.varEdgeTypes = append(c.varEdgeTypes, src.Type())
        }
    }

    // Check if there's an overlap among the dsts and srcs in varEdges.
    c.vRegIDs = c.vRegIDs[:0]
    for _, edge := range c.varEdges {
        src := edge[0].ID()
        if int(src) >= len(c.vRegSet) {
            c.vRegSet = append(c.vRegSet, make([]bool, src+1)...)
        }
        c.vRegSet[src] = true
        c.vRegIDs = append(c.vRegIDs, src)
    }
    separated := true
    for _, edge := range c.varEdges {
        dst := edge[1].ID()
        if int(dst) >= len(c.vRegSet) {
            c.vRegSet = append(c.vRegSet, make([]bool, dst+1)...)
        } else {
            if c.vRegSet[dst] {
                separated = false
                break
            }
        }
    }
    for _, id := range c.vRegIDs {
        c.vRegSet[id] = false // reset for the next use.
    }

    if separated {
        // If there's no overlap, we can simply move the source to destination.
        for i, edge := range c.varEdges {
            src, dst := edge[0], edge[1]
            c.mach.InsertMove(dst, src, c.varEdgeTypes[i])
        }
    } else {
        // Otherwise, we allocate a temporary registers and move the source to the temporary register,
        //
        // First move all of them to temporary registers.
        c.tempRegs = c.tempRegs[:0]
        for i, edge := range c.varEdges {
            src := edge[0]
            typ := c.varEdgeTypes[i]
            temp := c.AllocateVReg(typ)
            c.tempRegs = append(c.tempRegs, temp)
            c.mach.InsertMove(temp, src, typ)
        }
        // Then move the temporary registers to the destination.
        for i, edge := range c.varEdges {
            temp := c.tempRegs[i]
            dst := edge[1]
            c.mach.InsertMove(dst, temp, c.varEdgeTypes[i])
        }
    }

    // Finally, move the constants.
    for _, edge := range c.constEdges {
        cInst, dst := edge.cInst, edge.dst
        c.mach.InsertLoadConstantBlockArg(cInst, dst)
    }
}
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/executable_context.go: 219 additions (generated, vendored, normal file)
@@ -0,0 +1,219 @@
package backend

import (
    "fmt"
    "math"

    "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
    "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)

type ExecutableContext interface {
    // StartLoweringFunction is called when the lowering of the given function is started.
    // maximumBlockID is the maximum value of ssa.BasicBlockID existing in the function.
    StartLoweringFunction(maximumBlockID ssa.BasicBlockID)

    // LinkAdjacentBlocks is called after finished lowering all blocks in order to create one single instruction list.
    LinkAdjacentBlocks(prev, next ssa.BasicBlock)

    // StartBlock is called when the compilation of the given block is started.
    // The order of this being called is the reverse post order of the ssa.BasicBlock(s) as we iterate with
    // ssa.Builder BlockIteratorReversePostOrderBegin and BlockIteratorReversePostOrderEnd.
    StartBlock(ssa.BasicBlock)

    // EndBlock is called when the compilation of the current block is finished.
    EndBlock()

    // FlushPendingInstructions flushes the pending instructions to the buffer.
    // This will be called after the lowering of each SSA Instruction.
    FlushPendingInstructions()
}

type ExecutableContextT[Instr any] struct {
    CurrentSSABlk ssa.BasicBlock

    // InstructionPool is the InstructionPool of instructions.
    InstructionPool wazevoapi.Pool[Instr]
    asNop           func(*Instr)
    setNext         func(*Instr, *Instr)
    setPrev         func(*Instr, *Instr)

    // RootInstr is the root instruction of the executable.
    RootInstr         *Instr
    labelPositionPool wazevoapi.Pool[LabelPosition[Instr]]
    NextLabel         Label
    // LabelPositions maps a label to the instructions of the region which the label represents.
    LabelPositions     map[Label]*LabelPosition[Instr]
    OrderedBlockLabels []*LabelPosition[Instr]

    // PerBlockHead and PerBlockEnd are the head and tail of the instruction list per currently-compiled ssa.BasicBlock.
    PerBlockHead, PerBlockEnd *Instr
    // PendingInstructions are the instructions which are not yet emitted into the instruction list.
    PendingInstructions []*Instr

    // SsaBlockIDToLabels maps an SSA block ID to the label.
    SsaBlockIDToLabels []Label
}

func NewExecutableContextT[Instr any](
    resetInstruction func(*Instr),
    setNext func(*Instr, *Instr),
    setPrev func(*Instr, *Instr),
    asNop func(*Instr),
) *ExecutableContextT[Instr] {
    return &ExecutableContextT[Instr]{
        InstructionPool:   wazevoapi.NewPool[Instr](resetInstruction),
        asNop:             asNop,
        setNext:           setNext,
        setPrev:           setPrev,
        labelPositionPool: wazevoapi.NewPool[LabelPosition[Instr]](resetLabelPosition[Instr]),
        LabelPositions:    make(map[Label]*LabelPosition[Instr]),
        NextLabel:         LabelInvalid,
    }
}

func resetLabelPosition[T any](l *LabelPosition[T]) {
    *l = LabelPosition[T]{}
}

// StartLoweringFunction implements ExecutableContext.
func (e *ExecutableContextT[Instr]) StartLoweringFunction(max ssa.BasicBlockID) {
    imax := int(max)
    if len(e.SsaBlockIDToLabels) <= imax {
        // Eagerly allocate labels for the blocks since the underlying slice will be used for the next iteration.
        e.SsaBlockIDToLabels = append(e.SsaBlockIDToLabels, make([]Label, imax+1)...)
    }
}

func (e *ExecutableContextT[Instr]) StartBlock(blk ssa.BasicBlock) {
    e.CurrentSSABlk = blk

    l := e.SsaBlockIDToLabels[e.CurrentSSABlk.ID()]
    if l == LabelInvalid {
        l = e.AllocateLabel()
        e.SsaBlockIDToLabels[blk.ID()] = l
    }

    end := e.allocateNop0()
    e.PerBlockHead, e.PerBlockEnd = end, end

    labelPos, ok := e.LabelPositions[l]
    if !ok {
        labelPos = e.AllocateLabelPosition(l)
        e.LabelPositions[l] = labelPos
    }
    e.OrderedBlockLabels = append(e.OrderedBlockLabels, labelPos)
    labelPos.Begin, labelPos.End = end, end
    labelPos.SB = blk
}

// EndBlock implements ExecutableContext.
func (e *ExecutableContextT[T]) EndBlock() {
    // Insert nop0 as the head of the block for convenience to simplify the logic of inserting instructions.
    e.insertAtPerBlockHead(e.allocateNop0())

    l := e.SsaBlockIDToLabels[e.CurrentSSABlk.ID()]
    e.LabelPositions[l].Begin = e.PerBlockHead

    if e.CurrentSSABlk.EntryBlock() {
        e.RootInstr = e.PerBlockHead
    }
}

func (e *ExecutableContextT[T]) insertAtPerBlockHead(i *T) {
    if e.PerBlockHead == nil {
        e.PerBlockHead = i
        e.PerBlockEnd = i
        return
    }
    e.setNext(i, e.PerBlockHead)
    e.setPrev(e.PerBlockHead, i)
    e.PerBlockHead = i
}

// FlushPendingInstructions implements ExecutableContext.
func (e *ExecutableContextT[T]) FlushPendingInstructions() {
    l := len(e.PendingInstructions)
    if l == 0 {
        return
    }
    for i := l - 1; i >= 0; i-- { // reverse because we lower instructions in reverse order.
        e.insertAtPerBlockHead(e.PendingInstructions[i])
    }
    e.PendingInstructions = e.PendingInstructions[:0]
}

func (e *ExecutableContextT[T]) Reset() {
    e.labelPositionPool.Reset()
    e.InstructionPool.Reset()
    for l := Label(0); l <= e.NextLabel; l++ {
        delete(e.LabelPositions, l)
    }
    e.PendingInstructions = e.PendingInstructions[:0]
    e.OrderedBlockLabels = e.OrderedBlockLabels[:0]
    e.RootInstr = nil
    e.SsaBlockIDToLabels = e.SsaBlockIDToLabels[:0]
    e.PerBlockHead, e.PerBlockEnd = nil, nil
    e.NextLabel = LabelInvalid
}

// AllocateLabel allocates an unused label.
func (e *ExecutableContextT[T]) AllocateLabel() Label {
    e.NextLabel++
    return e.NextLabel
}

func (e *ExecutableContextT[T]) AllocateLabelPosition(la Label) *LabelPosition[T] {
    l := e.labelPositionPool.Allocate()
    l.L = la
    return l
}

func (e *ExecutableContextT[T]) GetOrAllocateSSABlockLabel(blk ssa.BasicBlock) Label {
    if blk.ReturnBlock() {
        return LabelReturn
    }
    l := e.SsaBlockIDToLabels[blk.ID()]
    if l == LabelInvalid {
        l = e.AllocateLabel()
        e.SsaBlockIDToLabels[blk.ID()] = l
    }
    return l
}

func (e *ExecutableContextT[T]) allocateNop0() *T {
    i := e.InstructionPool.Allocate()
    e.asNop(i)
    return i
}

// LinkAdjacentBlocks implements backend.Machine.
func (e *ExecutableContextT[T]) LinkAdjacentBlocks(prev, next ssa.BasicBlock) {
    prevLabelPos := e.LabelPositions[e.GetOrAllocateSSABlockLabel(prev)]
    nextLabelPos := e.LabelPositions[e.GetOrAllocateSSABlockLabel(next)]
    e.setNext(prevLabelPos.End, nextLabelPos.Begin)
}

// LabelPosition represents the regions of the generated code which the label represents.
type LabelPosition[Instr any] struct {
    SB           ssa.BasicBlock
    L            Label
    Begin, End   *Instr
    BinaryOffset int64
}

// Label represents a position in the generated code which is either
// a real instruction or the constant InstructionPool (e.g. jump tables).
//
// This is exactly the same as the traditional "label" in assembly code.
type Label uint32

const (
    LabelInvalid Label = 0
    LabelReturn  Label = math.MaxUint32
)

// String implements backend.Machine.
func (l Label) String() string {
    return fmt.Sprintf("L%d", l)
}
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/go_call.go: 33 additions (generated, vendored, normal file)
@@ -0,0 +1,33 @@
package backend

import "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"

// GoFunctionCallRequiredStackSize returns the size of the stack required for the Go function call.
// argBegin is the index of the first argument in the signature which is not either execution context or module context.
func GoFunctionCallRequiredStackSize(sig *ssa.Signature, argBegin int) (ret, retUnaligned int64) {
    var paramNeededInBytes, resultNeededInBytes int64
    for _, p := range sig.Params[argBegin:] {
        s := int64(p.Size())
        if s < 8 {
            s = 8 // We use uint64 for all basic types, except SIMD v128.
        }
        paramNeededInBytes += s
    }
    for _, r := range sig.Results {
        s := int64(r.Size())
        if s < 8 {
            s = 8 // We use uint64 for all basic types, except SIMD v128.
        }
        resultNeededInBytes += s
    }

    if paramNeededInBytes > resultNeededInBytes {
        ret = paramNeededInBytes
    } else {
        ret = resultNeededInBytes
    }
    retUnaligned = ret
    // Align to 16 bytes.
    ret = (ret + 15) &^ 15
    return
}
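To make the sizing rule in GoFunctionCallRequiredStackSize concrete (again, not part of the diff): every non-v128 value is widened to 8 bytes, parameters and results are summed separately, the larger total wins, and the aligned figure rounds up to a 16-byte boundary. A worked example with a hypothetical signature:

package main

import "fmt"

func main() {
    // Hypothetical signature: params (i32, i64, f64), results (i64, v128).
    paramBytes := int64(8 + 8 + 8) // each scalar is widened to 8 bytes
    resultBytes := int64(8 + 16)   // v128 keeps its full 16 bytes

    need := paramBytes
    if resultBytes > need {
        need = resultBytes
    }
    aligned := (need + 15) &^ 15
    fmt.Println(need, aligned) // 24 32
}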
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi.go: 186 additions (generated, vendored, normal file)
@@ -0,0 +1,186 @@
package amd64

import (
    "github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
    "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
    "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)

// For the details of the ABI, see:
// https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/cmd/compile/abi-internal.md#amd64-architecture

var (
    intArgResultRegs   = []regalloc.RealReg{rax, rbx, rcx, rdi, rsi, r8, r9, r10, r11}
    floatArgResultRegs = []regalloc.RealReg{xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7}
)

var regInfo = &regalloc.RegisterInfo{
    AllocatableRegisters: [regalloc.NumRegType][]regalloc.RealReg{
        regalloc.RegTypeInt: {
            rax, rcx, rdx, rbx, rsi, rdi, r8, r9, r10, r11, r12, r13, r14, r15,
        },
        regalloc.RegTypeFloat: {
            xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
        },
    },
    CalleeSavedRegisters: regalloc.NewRegSet(
        rdx, r12, r13, r14, r15,
        xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
    ),
    CallerSavedRegisters: regalloc.NewRegSet(
        rax, rcx, rbx, rsi, rdi, r8, r9, r10, r11,
        xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
    ),
    RealRegToVReg: []regalloc.VReg{
        rax: raxVReg, rcx: rcxVReg, rdx: rdxVReg, rbx: rbxVReg, rsp: rspVReg, rbp: rbpVReg, rsi: rsiVReg, rdi: rdiVReg,
        r8: r8VReg, r9: r9VReg, r10: r10VReg, r11: r11VReg, r12: r12VReg, r13: r13VReg, r14: r14VReg, r15: r15VReg,
        xmm0: xmm0VReg, xmm1: xmm1VReg, xmm2: xmm2VReg, xmm3: xmm3VReg, xmm4: xmm4VReg, xmm5: xmm5VReg, xmm6: xmm6VReg,
        xmm7: xmm7VReg, xmm8: xmm8VReg, xmm9: xmm9VReg, xmm10: xmm10VReg, xmm11: xmm11VReg, xmm12: xmm12VReg,
        xmm13: xmm13VReg, xmm14: xmm14VReg, xmm15: xmm15VReg,
    },
    RealRegName: func(r regalloc.RealReg) string { return regNames[r] },
    RealRegType: func(r regalloc.RealReg) regalloc.RegType {
        if r < xmm0 {
            return regalloc.RegTypeInt
        }
        return regalloc.RegTypeFloat
    },
}

// ArgsResultsRegs implements backend.Machine.
func (m *machine) ArgsResultsRegs() (argResultInts, argResultFloats []regalloc.RealReg) {
    return intArgResultRegs, floatArgResultRegs
}

// LowerParams implements backend.Machine.
func (m *machine) LowerParams(args []ssa.Value) {
    a := m.currentABI

    for i, ssaArg := range args {
        if !ssaArg.Valid() {
            continue
        }
        reg := m.c.VRegOf(ssaArg)
        arg := &a.Args[i]
        if arg.Kind == backend.ABIArgKindReg {
            m.InsertMove(reg, arg.Reg, arg.Type)
        } else {
            //
            //            (high address)
            //         +-----------------+
            //         |     .......     |
            //         |      ret Y      |
            //         |     .......     |
            //         |      ret 0      |
            //         |      arg X      |
            //         |     .......     |
            //         |      arg 1      |
            //         |      arg 0      |
            //         |  ReturnAddress  |
            //         |   Caller_RBP    |
            //         +-----------------+ <-- RBP
            //         |   ...........   |
            //         |   clobbered M   |
            //         |   ............  |
            //         |   clobbered 0   |
            //         |   spill slot N  |
            //         |   ...........   |
            //         |   spill slot 0  |
            // RSP --> +-----------------+
            //            (low address)

            // Load the value from the arg stack slot above the current RBP.
            load := m.allocateInstr()
            mem := newOperandMem(m.newAmodeImmRBPReg(uint32(arg.Offset + 16)))
            switch arg.Type {
            case ssa.TypeI32:
                load.asMovzxRmR(extModeLQ, mem, reg)
            case ssa.TypeI64:
                load.asMov64MR(mem, reg)
            case ssa.TypeF32:
                load.asXmmUnaryRmR(sseOpcodeMovss, mem, reg)
            case ssa.TypeF64:
                load.asXmmUnaryRmR(sseOpcodeMovsd, mem, reg)
            case ssa.TypeV128:
                load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, reg)
            default:
                panic("BUG")
            }
            m.insert(load)
        }
    }
}

// LowerReturns implements backend.Machine.
func (m *machine) LowerReturns(rets []ssa.Value) {
    // Load the XMM registers first as it might need a temporary register to inline
    // constant return.
    a := m.currentABI
    for i, ret := range rets {
        r := &a.Rets[i]
        if !r.Type.IsInt() {
            m.LowerReturn(ret, r)
        }
    }
    // Then load the GPR registers.
    for i, ret := range rets {
        r := &a.Rets[i]
        if r.Type.IsInt() {
            m.LowerReturn(ret, r)
        }
    }
}

func (m *machine) LowerReturn(ret ssa.Value, r *backend.ABIArg) {
    reg := m.c.VRegOf(ret)
    if def := m.c.ValueDefinition(ret); def.IsFromInstr() {
        // Constant instructions are inlined.
        if inst := def.Instr; inst.Constant() {
            m.insertLoadConstant(inst, reg)
        }
    }
    if r.Kind == backend.ABIArgKindReg {
        m.InsertMove(r.Reg, reg, ret.Type())
    } else {
        //
        //            (high address)
        //         +-----------------+
        //         |     .......     |
        //         |      ret Y      |
        //         |     .......     |
        //         |      ret 0      |
        //         |      arg X      |
        //         |     .......     |
        //         |      arg 1      |
        //         |      arg 0      |
        //         |  ReturnAddress  |
        //         |   Caller_RBP    |
        //         +-----------------+ <-- RBP
        //         |   ...........   |
        //         |   clobbered M   |
        //         |   ............  |
        //         |   clobbered 0   |
        //         |   spill slot N  |
        //         |   ...........   |
        //         |   spill slot 0  |
        // RSP --> +-----------------+
        //            (low address)

        // Store the value to the return stack slot above the current RBP.
        store := m.allocateInstr()
        mem := newOperandMem(m.newAmodeImmRBPReg(uint32(m.currentABI.ArgStackSize + 16 + r.Offset)))
        switch r.Type {
        case ssa.TypeI32:
            store.asMovRM(reg, mem, 4)
        case ssa.TypeI64:
            store.asMovRM(reg, mem, 8)
        case ssa.TypeF32:
            store.asXmmMovRM(sseOpcodeMovss, reg, mem)
        case ssa.TypeF64:
            store.asXmmMovRM(sseOpcodeMovsd, reg, mem)
        case ssa.TypeV128:
            store.asXmmMovRM(sseOpcodeMovdqu, reg, mem)
        }
        m.insert(store)
    }
}
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.go: 9 additions (generated, vendored, normal file)
@@ -0,0 +1,9 @@
package amd64

// entrypoint enters the machine code generated by this backend which begins with the preamble generated by functionABI.EmitGoEntryPreamble below.
// This implements wazevo.entrypoint, and see the comments there for detail.
func entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultPtr *uint64, goAllocatedStackSlicePtr uintptr)

// afterGoFunctionCallEntrypoint enters the machine code after growing the stack.
// This implements wazevo.afterGoFunctionCallEntrypoint, and see the comments there for detail.
func afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr)
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.s: 29 additions (generated, vendored, normal file)
@@ -0,0 +1,29 @@
#include "funcdata.h"
#include "textflag.h"

// entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultPtr *uint64, goAllocatedStackSlicePtr uintptr)
TEXT ·entrypoint(SB), NOSPLIT|NOFRAME, $0-48
	MOVQ preambleExecutable+0(FP), R11
	MOVQ functionExectuable+8(FP), R14
	MOVQ executionContextPtr+16(FP), AX // First argument is passed in AX.
	MOVQ moduleContextPtr+24(FP), BX    // Second argument is passed in BX.
	MOVQ paramResultSlicePtr+32(FP), R12
	MOVQ goAllocatedStackSlicePtr+40(FP), R13
	JMP  R11

// afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr)
TEXT ·afterGoFunctionCallEntrypoint(SB), NOSPLIT|NOFRAME, $0-32
	MOVQ executable+0(FP), CX
	MOVQ executionContextPtr+8(FP), AX // First argument is passed in AX.

	// Save the stack pointer and frame pointer.
	MOVQ BP, 16(AX) // 16 == ExecutionContextOffsetOriginalFramePointer
	MOVQ SP, 24(AX) // 24 == ExecutionContextOffsetOriginalStackPointer

	// Then set the stack pointer and frame pointer to the values we got from the Go runtime.
	MOVQ framePointer+24(FP), BP

	// WARNING: do not update SP before BP, because the Go translates (FP) as (SP) + 8.
	MOVQ stackPointer+16(FP), SP

	JMP CX
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_preamble.go: 248 additions (generated, vendored, normal file)
|
|
@ -0,0 +1,248 @@
|
|||
package amd64
|
||||
|
||||
import (
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
|
||||
)
|
||||
|
||||
var (
|
||||
executionContextPtrReg = raxVReg
|
||||
|
||||
// Followings are callee saved registers. They can be used freely in the entry preamble
|
||||
// since the preamble is called via Go assembly function which has stack-based ABI.
|
||||
|
||||
// savedExecutionContextPtr also must be a callee-saved reg so that they can be used in the prologue and epilogue.
|
||||
savedExecutionContextPtr = rdxVReg
|
||||
// paramResultSlicePtr must match with entrypoint function in abi_entry_amd64.s.
|
||||
paramResultSlicePtr = r12VReg
|
||||
// goAllocatedStackPtr must match with entrypoint function in abi_entry_amd64.s.
|
||||
goAllocatedStackPtr = r13VReg
|
||||
// functionExecutable must match with entrypoint function in abi_entry_amd64.s.
|
||||
functionExecutable = r14VReg
|
||||
tmpIntReg = r15VReg
|
||||
tmpXmmReg = xmm15VReg
|
||||
)
|
||||
|
||||
// CompileEntryPreamble implements backend.Machine.
|
||||
func (m *machine) CompileEntryPreamble(sig *ssa.Signature) []byte {
|
||||
root := m.compileEntryPreamble(sig)
|
||||
m.encodeWithoutSSA(root)
|
||||
buf := m.c.Buf()
|
||||
return buf
|
||||
}
|
||||
|
||||
func (m *machine) compileEntryPreamble(sig *ssa.Signature) *instruction {
|
||||
abi := backend.FunctionABI{}
|
||||
abi.Init(sig, intArgResultRegs, floatArgResultRegs)
|
||||
|
||||
root := m.allocateNop()
|
||||
|
||||
//// ----------------------------------- prologue ----------------------------------- ////
|
||||
|
||||
// First, we save executionContextPtrReg into a callee-saved register so that it can be used in epilogue as well.
|
||||
// mov %executionContextPtrReg, %savedExecutionContextPtr
|
||||
cur := m.move64(executionContextPtrReg, savedExecutionContextPtr, root)
|
||||
|
||||
// Next is to save the original RBP and RSP into the execution context.
|
||||
cur = m.saveOriginalRSPRBP(cur)
|
||||
|
||||
// Now set the RSP to the Go-allocated stack pointer.
|
||||
// mov %goAllocatedStackPtr, %rsp
|
||||
cur = m.move64(goAllocatedStackPtr, rspVReg, cur)
|
||||
|
||||
if stackSlotSize := abi.AlignedArgResultStackSlotSize(); stackSlotSize > 0 {
|
||||
// Allocate stack slots for the arguments and return values.
|
||||
// sub $stackSlotSize, %rsp
|
||||
spDec := m.allocateInstr().asAluRmiR(aluRmiROpcodeSub, newOperandImm32(uint32(stackSlotSize)), rspVReg, true)
|
||||
cur = linkInstr(cur, spDec)
|
||||
}
|
||||
|
||||
var offset uint32
|
||||
for i := range abi.Args {
|
||||
if i < 2 {
|
||||
// module context ptr and execution context ptr are passed in rax and rbx by the Go assembly function.
|
||||
continue
|
||||
}
|
||||
arg := &abi.Args[i]
|
||||
cur = m.goEntryPreamblePassArg(cur, paramResultSlicePtr, offset, arg)
|
||||
if arg.Type == ssa.TypeV128 {
|
||||
offset += 16
|
||||
} else {
|
||||
offset += 8
|
||||
}
|
||||
}
|
||||
|
||||
// Zero out RBP so that the unwind/stack growth code can correctly detect the end of the stack.
|
||||
zerosRbp := m.allocateInstr().asAluRmiR(aluRmiROpcodeXor, newOperandReg(rbpVReg), rbpVReg, true)
|
||||
    cur = linkInstr(cur, zerosRbp)

    // Now ready to call the real function. Note that at this point the stack pointer is already set to the Go-allocated stack,
    // which is aligned to 16 bytes.
    call := m.allocateInstr().asCallIndirect(newOperandReg(functionExecutable), &abi)
    cur = linkInstr(cur, call)

    //// ----------------------------------- epilogue ----------------------------------- ////

    // Read the results from regs and the stack, and set them correctly into the paramResultSlicePtr.
    offset = 0
    for i := range abi.Rets {
        r := &abi.Rets[i]
        cur = m.goEntryPreamblePassResult(cur, paramResultSlicePtr, offset, r, uint32(abi.ArgStackSize))
        if r.Type == ssa.TypeV128 {
            offset += 16
        } else {
            offset += 8
        }
    }

    // Finally, restore the original RBP and RSP.
    cur = m.restoreOriginalRSPRBP(cur)

    ret := m.allocateInstr().asRet()
    linkInstr(cur, ret)
    return root
}

// saveOriginalRSPRBP saves the original RSP and RBP into the execution context.
func (m *machine) saveOriginalRSPRBP(cur *instruction) *instruction {
    // mov %rbp, wazevoapi.ExecutionContextOffsetOriginalFramePointer(%executionContextPtrReg)
    // mov %rsp, wazevoapi.ExecutionContextOffsetOriginalStackPointer(%executionContextPtrReg)
    cur = m.loadOrStore64AtExecutionCtx(executionContextPtrReg, wazevoapi.ExecutionContextOffsetOriginalFramePointer, rbpVReg, true, cur)
    cur = m.loadOrStore64AtExecutionCtx(executionContextPtrReg, wazevoapi.ExecutionContextOffsetOriginalStackPointer, rspVReg, true, cur)
    return cur
}

// restoreOriginalRSPRBP restores the original RSP and RBP from the execution context.
func (m *machine) restoreOriginalRSPRBP(cur *instruction) *instruction {
    // mov wazevoapi.ExecutionContextOffsetOriginalFramePointer(%executionContextPtrReg), %rbp
    // mov wazevoapi.ExecutionContextOffsetOriginalStackPointer(%executionContextPtrReg), %rsp
    cur = m.loadOrStore64AtExecutionCtx(savedExecutionContextPtr, wazevoapi.ExecutionContextOffsetOriginalFramePointer, rbpVReg, false, cur)
    cur = m.loadOrStore64AtExecutionCtx(savedExecutionContextPtr, wazevoapi.ExecutionContextOffsetOriginalStackPointer, rspVReg, false, cur)
    return cur
}

func (m *machine) move64(src, dst regalloc.VReg, prev *instruction) *instruction {
    mov := m.allocateInstr().asMovRR(src, dst, true)
    return linkInstr(prev, mov)
}

func (m *machine) loadOrStore64AtExecutionCtx(execCtx regalloc.VReg, offset wazevoapi.Offset, r regalloc.VReg, store bool, prev *instruction) *instruction {
    mem := newOperandMem(m.newAmodeImmReg(offset.U32(), execCtx))
    instr := m.allocateInstr()
    if store {
        instr.asMovRM(r, mem, 8)
    } else {
        instr.asMov64MR(mem, r)
    }
    return linkInstr(prev, instr)
}

// This is for debugging.
func (m *machine) linkUD2(cur *instruction) *instruction { //nolint
    return linkInstr(cur, m.allocateInstr().asUD2())
}

func (m *machine) goEntryPreamblePassArg(cur *instruction, paramSlicePtr regalloc.VReg, offsetInParamSlice uint32, arg *backend.ABIArg) *instruction {
    var dst regalloc.VReg
    argTyp := arg.Type
    if arg.Kind == backend.ABIArgKindStack {
        // Caller saved registers ca
        switch argTyp {
        case ssa.TypeI32, ssa.TypeI64:
            dst = tmpIntReg
        case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
            dst = tmpXmmReg
        default:
            panic("BUG")
        }
    } else {
        dst = arg.Reg
    }

    load := m.allocateInstr()
    a := newOperandMem(m.newAmodeImmReg(offsetInParamSlice, paramSlicePtr))
    switch arg.Type {
    case ssa.TypeI32:
        load.asMovzxRmR(extModeLQ, a, dst)
    case ssa.TypeI64:
        load.asMov64MR(a, dst)
    case ssa.TypeF32:
        load.asXmmUnaryRmR(sseOpcodeMovss, a, dst)
    case ssa.TypeF64:
        load.asXmmUnaryRmR(sseOpcodeMovsd, a, dst)
    case ssa.TypeV128:
        load.asXmmUnaryRmR(sseOpcodeMovdqu, a, dst)
    }

    cur = linkInstr(cur, load)
    if arg.Kind == backend.ABIArgKindStack {
        // Store back to the stack.
        store := m.allocateInstr()
        a := newOperandMem(m.newAmodeImmReg(uint32(arg.Offset), rspVReg))
        switch arg.Type {
        case ssa.TypeI32:
            store.asMovRM(dst, a, 4)
        case ssa.TypeI64:
            store.asMovRM(dst, a, 8)
        case ssa.TypeF32:
            store.asXmmMovRM(sseOpcodeMovss, dst, a)
        case ssa.TypeF64:
            store.asXmmMovRM(sseOpcodeMovsd, dst, a)
        case ssa.TypeV128:
            store.asXmmMovRM(sseOpcodeMovdqu, dst, a)
        }
        cur = linkInstr(cur, store)
    }
    return cur
}

func (m *machine) goEntryPreamblePassResult(cur *instruction, resultSlicePtr regalloc.VReg, offsetInResultSlice uint32, result *backend.ABIArg, resultStackSlotBeginOffset uint32) *instruction {
    var r regalloc.VReg
    if result.Kind == backend.ABIArgKindStack {
        // Load the value to the temporary.
        load := m.allocateInstr()
        offset := resultStackSlotBeginOffset + uint32(result.Offset)
        a := newOperandMem(m.newAmodeImmReg(offset, rspVReg))
        switch result.Type {
        case ssa.TypeI32:
            r = tmpIntReg
            load.asMovzxRmR(extModeLQ, a, r)
        case ssa.TypeI64:
            r = tmpIntReg
            load.asMov64MR(a, r)
        case ssa.TypeF32:
            r = tmpXmmReg
            load.asXmmUnaryRmR(sseOpcodeMovss, a, r)
        case ssa.TypeF64:
            r = tmpXmmReg
            load.asXmmUnaryRmR(sseOpcodeMovsd, a, r)
        case ssa.TypeV128:
            r = tmpXmmReg
            load.asXmmUnaryRmR(sseOpcodeMovdqu, a, r)
        default:
            panic("BUG")
        }
        cur = linkInstr(cur, load)
    } else {
        r = result.Reg
    }

    store := m.allocateInstr()
    a := newOperandMem(m.newAmodeImmReg(offsetInResultSlice, resultSlicePtr))
    switch result.Type {
    case ssa.TypeI32:
        store.asMovRM(r, a, 4)
    case ssa.TypeI64:
        store.asMovRM(r, a, 8)
    case ssa.TypeF32:
        store.asXmmMovRM(sseOpcodeMovss, r, a)
    case ssa.TypeF64:
        store.asXmmMovRM(sseOpcodeMovsd, r, a)
    case ssa.TypeV128:
        store.asXmmMovRM(sseOpcodeMovdqu, r, a)
    }

    return linkInstr(cur, store)
}
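The offset bookkeeping above fixes the layout of the buffer behind paramResultSlicePtr: every scalar parameter or result occupies one 8-byte slot (i32 values are zero-extended into theirs), while a v128 value occupies 16 bytes. A minimal stand-alone sketch of that packing, assuming little-endian layout; the helper below is illustrative only and not part of wazero's API:

package main

import (
    "encoding/binary"
    "fmt"
    "math"
)

// packParams lays values out the way the entry preamble reads them:
// 8 bytes per scalar (i32/i64/f32/f64), 16 bytes per v128.
func packParams(i32 uint32, f64 float64, v128 [16]byte) []byte {
    buf := make([]byte, 8+8+16)
    binary.LittleEndian.PutUint64(buf[0:], uint64(i32))           // i32 zero-extended to a full slot
    binary.LittleEndian.PutUint64(buf[8:], math.Float64bits(f64)) // f64 stored as raw bits
    copy(buf[16:], v128[:])                                       // v128 takes a 16-byte slot
    return buf
}

func main() {
    fmt.Println(len(packParams(1, 2.5, [16]byte{}))) // 32
}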
443 vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_go_call.go generated vendored Normal file
@@ -0,0 +1,443 @@
package amd64
|
||||
|
||||
import (
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
|
||||
)
|
||||
|
||||
var calleeSavedVRegs = []regalloc.VReg{
|
||||
rdxVReg, r12VReg, r13VReg, r14VReg, r15VReg,
|
||||
xmm8VReg, xmm9VReg, xmm10VReg, xmm11VReg, xmm12VReg, xmm13VReg, xmm14VReg, xmm15VReg,
|
||||
}
|
||||
|
||||
// CompileGoFunctionTrampoline implements backend.Machine.
|
||||
func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *ssa.Signature, needModuleContextPtr bool) []byte {
|
||||
ectx := m.ectx
|
||||
argBegin := 1 // Skips exec context by default.
|
||||
if needModuleContextPtr {
|
||||
argBegin++
|
||||
}
|
||||
|
||||
abi := &backend.FunctionABI{}
|
||||
abi.Init(sig, intArgResultRegs, floatArgResultRegs)
|
||||
m.currentABI = abi
|
||||
|
||||
cur := m.allocateNop()
|
||||
ectx.RootInstr = cur
|
||||
|
||||
// Execution context is always the first argument.
|
||||
execCtrPtr := raxVReg
|
||||
|
||||
// First we update RBP and RSP just like the normal prologue.
|
||||
//
|
||||
// (high address) (high address)
|
||||
// RBP ----> +-----------------+ +-----------------+
|
||||
// | ....... | | ....... |
|
||||
// | ret Y | | ret Y |
|
||||
// | ....... | | ....... |
|
||||
// | ret 0 | | ret 0 |
|
||||
// | arg X | | arg X |
|
||||
// | ....... | ====> | ....... |
|
||||
// | arg 1 | | arg 1 |
|
||||
// | arg 0 | | arg 0 |
|
||||
// | Return Addr | | Return Addr |
|
||||
// RSP ----> +-----------------+ | Caller_RBP |
|
||||
// (low address) +-----------------+ <----- RSP, RBP
|
||||
//
|
||||
cur = m.setupRBPRSP(cur)
|
||||
|
||||
goSliceSizeAligned, goSliceSizeAlignedUnaligned := backend.GoFunctionCallRequiredStackSize(sig, argBegin)
|
||||
cur = m.insertStackBoundsCheck(goSliceSizeAligned+8 /* size of the Go slice */, cur)
|
||||
|
||||
// Save the callee saved registers.
|
||||
cur = m.saveRegistersInExecutionContext(cur, execCtrPtr, calleeSavedVRegs)
|
||||
|
||||
if needModuleContextPtr {
|
||||
moduleCtrPtr := rbxVReg // Module context is always the second argument.
|
||||
mem := m.newAmodeImmReg(
|
||||
wazevoapi.ExecutionContextOffsetGoFunctionCallCalleeModuleContextOpaque.U32(),
|
||||
execCtrPtr)
|
||||
store := m.allocateInstr().asMovRM(moduleCtrPtr, newOperandMem(mem), 8)
|
||||
cur = linkInstr(cur, store)
|
||||
}
|
||||
|
||||
// Now let's advance the RSP to the stack slot for the arguments.
|
||||
//
|
||||
// (high address) (high address)
|
||||
// +-----------------+ +-----------------+
|
||||
// | ....... | | ....... |
|
||||
// | ret Y | | ret Y |
|
||||
// | ....... | | ....... |
|
||||
// | ret 0 | | ret 0 |
|
||||
// | arg X | | arg X |
|
||||
// | ....... | =======> | ....... |
|
||||
// | arg 1 | | arg 1 |
|
||||
// | arg 0 | | arg 0 |
|
||||
// | Return Addr | | Return Addr |
|
||||
// | Caller_RBP | | Caller_RBP |
|
||||
// RBP,RSP --> +-----------------+ +-----------------+ <----- RBP
|
||||
// (low address) | arg[N]/ret[M] |
|
||||
// | .......... |
|
||||
// | arg[1]/ret[1] |
|
||||
// | arg[0]/ret[0] |
|
||||
// +-----------------+ <----- RSP
|
||||
// (low address)
|
||||
//
|
||||
// where the region of "arg[0]/ret[0] ... arg[N]/ret[M]" is the stack used by the Go functions,
|
||||
// therefore will be accessed as the usual []uint64. So that's where we need to pass/receive
|
||||
// the arguments/return values to/from Go function.
|
||||
cur = m.addRSP(-int32(goSliceSizeAligned), cur)
|
||||
|
||||
// Next, we need to store all the arguments to the stack in the typical Wasm stack style.
|
||||
var offsetInGoSlice int32
|
||||
for i := range abi.Args[argBegin:] {
|
||||
arg := &abi.Args[argBegin+i]
|
||||
var v regalloc.VReg
|
||||
if arg.Kind == backend.ABIArgKindReg {
|
||||
v = arg.Reg
|
||||
} else {
|
||||
// We have saved callee saved registers, so we can use them.
|
||||
if arg.Type.IsInt() {
|
||||
v = r15VReg
|
||||
} else {
|
||||
v = xmm15VReg
|
||||
}
|
||||
mem := newOperandMem(m.newAmodeImmReg(uint32(arg.Offset+16 /* to skip caller_rbp and ret_addr */), rbpVReg))
|
||||
load := m.allocateInstr()
|
||||
switch arg.Type {
|
||||
case ssa.TypeI32:
|
||||
load.asMovzxRmR(extModeLQ, mem, v)
|
||||
case ssa.TypeI64:
|
||||
load.asMov64MR(mem, v)
|
||||
case ssa.TypeF32:
|
||||
load.asXmmUnaryRmR(sseOpcodeMovss, mem, v)
|
||||
case ssa.TypeF64:
|
||||
load.asXmmUnaryRmR(sseOpcodeMovsd, mem, v)
|
||||
case ssa.TypeV128:
|
||||
load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, v)
|
||||
default:
|
||||
panic("BUG")
|
||||
}
|
||||
cur = linkInstr(cur, load)
|
||||
}
|
||||
|
||||
store := m.allocateInstr()
|
||||
mem := newOperandMem(m.newAmodeImmReg(uint32(offsetInGoSlice), rspVReg))
|
||||
switch arg.Type {
|
||||
case ssa.TypeI32:
|
||||
store.asMovRM(v, mem, 4)
|
||||
offsetInGoSlice += 8 // always uint64 rep.
|
||||
case ssa.TypeI64:
|
||||
store.asMovRM(v, mem, 8)
|
||||
offsetInGoSlice += 8
|
||||
case ssa.TypeF32:
|
||||
store.asXmmMovRM(sseOpcodeMovss, v, mem)
|
||||
offsetInGoSlice += 8 // always uint64 rep.
|
||||
case ssa.TypeF64:
|
||||
store.asXmmMovRM(sseOpcodeMovsd, v, mem)
|
||||
offsetInGoSlice += 8
|
||||
case ssa.TypeV128:
|
||||
store.asXmmMovRM(sseOpcodeMovdqu, v, mem)
|
||||
offsetInGoSlice += 16
|
||||
default:
|
||||
panic("BUG")
|
||||
}
|
||||
cur = linkInstr(cur, store)
|
||||
}
|
||||
|
||||
// Finally we push the size of the slice to the stack so the stack looks like:
|
||||
//
|
||||
// (high address)
|
||||
// +-----------------+
|
||||
// | ....... |
|
||||
// | ret Y |
|
||||
// | ....... |
|
||||
// | ret 0 |
|
||||
// | arg X |
|
||||
// | ....... |
|
||||
// | arg 1 |
|
||||
// | arg 0 |
|
||||
// | Return Addr |
|
||||
// | Caller_RBP |
|
||||
// +-----------------+ <----- RBP
|
||||
// | arg[N]/ret[M] |
|
||||
// | .......... |
|
||||
// | arg[1]/ret[1] |
|
||||
// | arg[0]/ret[0] |
|
||||
// | slice size |
|
||||
// +-----------------+ <----- RSP
|
||||
// (low address)
|
||||
//
|
||||
// push $sliceSize
|
||||
cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandImm32(uint32(goSliceSizeAlignedUnaligned))))
|
||||
|
||||
// Load the exitCode to the register.
|
||||
exitCodeReg := r12VReg // Callee saved which is already saved.
|
||||
cur = linkInstr(cur, m.allocateInstr().asImm(exitCodeReg, uint64(exitCode), false))
|
||||
|
||||
saveRsp, saveRbp, setExitCode := m.allocateExitInstructions(execCtrPtr, exitCodeReg)
|
||||
cur = linkInstr(cur, setExitCode)
|
||||
cur = linkInstr(cur, saveRsp)
|
||||
cur = linkInstr(cur, saveRbp)
|
||||
|
||||
// Ready to exit the execution.
|
||||
cur = m.storeReturnAddressAndExit(cur, execCtrPtr)
|
||||
|
||||
// We don't need the slice size anymore, so pop it.
|
||||
cur = m.addRSP(8, cur)
|
||||
|
||||
// Ready to set up the results.
|
||||
offsetInGoSlice = 0
|
||||
// To avoid overwriting the execution context pointer with a result, we need to track the offset,
|
||||
// and defer the restoration of the result to the end of this function.
|
||||
var argOverlapWithExecCtxOffset int32 = -1
|
||||
for i := range abi.Rets {
|
||||
r := &abi.Rets[i]
|
||||
var v regalloc.VReg
|
||||
isRegResult := r.Kind == backend.ABIArgKindReg
|
||||
if isRegResult {
|
||||
v = r.Reg
|
||||
if v.RealReg() == execCtrPtr.RealReg() {
|
||||
argOverlapWithExecCtxOffset = offsetInGoSlice
|
||||
offsetInGoSlice += 8 // always uint64 rep.
|
||||
continue
|
||||
}
|
||||
} else {
|
||||
if r.Type.IsInt() {
|
||||
v = r15VReg
|
||||
} else {
|
||||
v = xmm15VReg
|
||||
}
|
||||
}
|
||||
|
||||
load := m.allocateInstr()
|
||||
mem := newOperandMem(m.newAmodeImmReg(uint32(offsetInGoSlice), rspVReg))
|
||||
switch r.Type {
|
||||
case ssa.TypeI32:
|
||||
load.asMovzxRmR(extModeLQ, mem, v)
|
||||
offsetInGoSlice += 8 // always uint64 rep.
|
||||
case ssa.TypeI64:
|
||||
load.asMov64MR(mem, v)
|
||||
offsetInGoSlice += 8
|
||||
case ssa.TypeF32:
|
||||
load.asXmmUnaryRmR(sseOpcodeMovss, mem, v)
|
||||
offsetInGoSlice += 8 // always uint64 rep.
|
||||
case ssa.TypeF64:
|
||||
load.asXmmUnaryRmR(sseOpcodeMovsd, mem, v)
|
||||
offsetInGoSlice += 8
|
||||
case ssa.TypeV128:
|
||||
load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, v)
|
||||
offsetInGoSlice += 16
|
||||
default:
|
||||
panic("BUG")
|
||||
}
|
||||
cur = linkInstr(cur, load)
|
||||
|
||||
if !isRegResult {
|
||||
// We need to store it back to the result slot above rbp.
|
||||
store := m.allocateInstr()
|
||||
mem := newOperandMem(m.newAmodeImmReg(uint32(abi.ArgStackSize+r.Offset+16 /* to skip caller_rbp and ret_addr */), rbpVReg))
|
||||
switch r.Type {
|
||||
case ssa.TypeI32:
|
||||
store.asMovRM(v, mem, 4)
|
||||
case ssa.TypeI64:
|
||||
store.asMovRM(v, mem, 8)
|
||||
case ssa.TypeF32:
|
||||
store.asXmmMovRM(sseOpcodeMovss, v, mem)
|
||||
case ssa.TypeF64:
|
||||
store.asXmmMovRM(sseOpcodeMovsd, v, mem)
|
||||
case ssa.TypeV128:
|
||||
store.asXmmMovRM(sseOpcodeMovdqu, v, mem)
|
||||
default:
|
||||
panic("BUG")
|
||||
}
|
||||
cur = linkInstr(cur, store)
|
||||
}
|
||||
}
|
||||
|
||||
// Before return, we need to restore the callee saved registers.
|
||||
cur = m.restoreRegistersInExecutionContext(cur, execCtrPtr, calleeSavedVRegs)
|
||||
|
||||
if argOverlapWithExecCtxOffset >= 0 {
|
||||
// At this point execCtx is not used anymore, so we can finally store the
|
||||
// result to the register which overlaps with the execution context pointer.
|
||||
mem := newOperandMem(m.newAmodeImmReg(uint32(argOverlapWithExecCtxOffset), rspVReg))
|
||||
load := m.allocateInstr().asMov64MR(mem, execCtrPtr)
|
||||
cur = linkInstr(cur, load)
|
||||
}
|
||||
|
||||
// Finally ready to return.
|
||||
cur = m.revertRBPRSP(cur)
|
||||
linkInstr(cur, m.allocateInstr().asRet())
|
||||
|
||||
m.encodeWithoutSSA(ectx.RootInstr)
|
||||
return m.c.Buf()
|
||||
}
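Since the trampoline pushes the slot count immediately below arg[0]/ret[0], the Go side can view that region as an ordinary []uint64 of exactly that length. A hedged sketch of that reinterpretation; the helper name is made up and the real wazero code may construct the slice differently:

package sketch

import "unsafe"

// stackSliceView reinterprets the pushed region as a []uint64, mirroring how the
// Go half of the call reads arguments and writes results back. Illustrative only.
func stackSliceView(base *uint64, slots uint64) []uint64 {
    return unsafe.Slice(base, slots)
}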
|
||||
|
||||
func (m *machine) saveRegistersInExecutionContext(cur *instruction, execCtx regalloc.VReg, regs []regalloc.VReg) *instruction {
|
||||
offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64()
|
||||
for _, v := range regs {
|
||||
store := m.allocateInstr()
|
||||
mem := newOperandMem(m.newAmodeImmReg(uint32(offset), execCtx))
|
||||
switch v.RegType() {
|
||||
case regalloc.RegTypeInt:
|
||||
store.asMovRM(v, mem, 8)
|
||||
case regalloc.RegTypeFloat:
|
||||
store.asXmmMovRM(sseOpcodeMovdqu, v, mem)
|
||||
default:
|
||||
panic("BUG")
|
||||
}
|
||||
cur = linkInstr(cur, store)
|
||||
offset += 16 // See execution context struct. Each register is 16 bytes-aligned unconditionally.
|
||||
}
|
||||
return cur
|
||||
}
|
||||
|
||||
func (m *machine) restoreRegistersInExecutionContext(cur *instruction, execCtx regalloc.VReg, regs []regalloc.VReg) *instruction {
|
||||
offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64()
|
||||
for _, v := range regs {
|
||||
load := m.allocateInstr()
|
||||
mem := newOperandMem(m.newAmodeImmReg(uint32(offset), execCtx))
|
||||
switch v.RegType() {
|
||||
case regalloc.RegTypeInt:
|
||||
load.asMov64MR(mem, v)
|
||||
case regalloc.RegTypeFloat:
|
||||
load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, v)
|
||||
default:
|
||||
panic("BUG")
|
||||
}
|
||||
cur = linkInstr(cur, load)
|
||||
offset += 16 // See execution context struct. Each register is 16 bytes-aligned unconditionally.
|
||||
}
|
||||
return cur
|
||||
}
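Because every slot in the saved-registers area is 16-byte aligned regardless of register class, the location of the i-th entry of regs is a simple linear function of the area's starting offset. A tiny sketch of that arithmetic (illustrative only, not part of the vendored file):

// savedRegOffset returns the byte offset of the i-th saved register slot,
// given the offset at which the saved-registers area begins.
func savedRegOffset(begin int64, i int) int64 {
    return begin + int64(i)*16
}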
|
||||
|
||||
func (m *machine) storeReturnAddressAndExit(cur *instruction, execCtx regalloc.VReg) *instruction {
|
||||
readRip := m.allocateInstr()
|
||||
cur = linkInstr(cur, readRip)
|
||||
|
||||
ripReg := r12VReg // Callee saved which is already saved.
|
||||
saveRip := m.allocateInstr().asMovRM(
|
||||
ripReg,
|
||||
newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetGoCallReturnAddress.U32(), execCtx)),
|
||||
8,
|
||||
)
|
||||
cur = linkInstr(cur, saveRip)
|
||||
|
||||
exit := m.allocateExitSeq(execCtx)
|
||||
cur = linkInstr(cur, exit)
|
||||
|
||||
nop, l := m.allocateBrTarget()
|
||||
cur = linkInstr(cur, nop)
|
||||
readRip.asLEA(newOperandLabel(l), ripReg)
|
||||
return cur
|
||||
}
|
||||
|
||||
// saveRequiredRegs is the set of registers that must be saved/restored during growing stack when there's insufficient
|
||||
// stack space left. Basically this is all the allocatable registers except for RSP, RBP, and RAX, which contains the
|
||||
// execution context pointer. ExecCtx pointer is always the first argument so we don't need to save it.
|
||||
var stackGrowSaveVRegs = []regalloc.VReg{
|
||||
rdxVReg, r12VReg, r13VReg, r14VReg, r15VReg,
|
||||
rcxVReg, rbxVReg, rsiVReg, rdiVReg, r8VReg, r9VReg, r10VReg, r11VReg,
|
||||
xmm8VReg, xmm9VReg, xmm10VReg, xmm11VReg, xmm12VReg, xmm13VReg, xmm14VReg, xmm15VReg,
|
||||
xmm0VReg, xmm1VReg, xmm2VReg, xmm3VReg, xmm4VReg, xmm5VReg, xmm6VReg, xmm7VReg,
|
||||
}
|
||||
|
||||
// CompileStackGrowCallSequence implements backend.Machine.
|
||||
func (m *machine) CompileStackGrowCallSequence() []byte {
|
||||
ectx := m.ectx
|
||||
|
||||
cur := m.allocateNop()
|
||||
ectx.RootInstr = cur
|
||||
|
||||
cur = m.setupRBPRSP(cur)
|
||||
|
||||
// Execution context is always the first argument.
|
||||
execCtrPtr := raxVReg
|
||||
|
||||
// Save the callee saved and argument registers.
|
||||
cur = m.saveRegistersInExecutionContext(cur, execCtrPtr, stackGrowSaveVRegs)
|
||||
|
||||
// Load the exitCode to the register.
|
||||
exitCodeReg := r12VReg // Already saved.
|
||||
cur = linkInstr(cur, m.allocateInstr().asImm(exitCodeReg, uint64(wazevoapi.ExitCodeGrowStack), false))
|
||||
|
||||
saveRsp, saveRbp, setExitCode := m.allocateExitInstructions(execCtrPtr, exitCodeReg)
|
||||
cur = linkInstr(cur, setExitCode)
|
||||
cur = linkInstr(cur, saveRsp)
|
||||
cur = linkInstr(cur, saveRbp)
|
||||
|
||||
// Ready to exit the execution.
|
||||
cur = m.storeReturnAddressAndExit(cur, execCtrPtr)
|
||||
|
||||
// After the exit, restore the saved registers.
|
||||
cur = m.restoreRegistersInExecutionContext(cur, execCtrPtr, stackGrowSaveVRegs)
|
||||
|
||||
// Finally ready to return.
|
||||
cur = m.revertRBPRSP(cur)
|
||||
linkInstr(cur, m.allocateInstr().asRet())
|
||||
|
||||
m.encodeWithoutSSA(ectx.RootInstr)
|
||||
return m.c.Buf()
|
||||
}
|
||||
|
||||
// insertStackBoundsCheck will insert the instructions after `cur` to check the
|
||||
// stack bounds, and if there's no sufficient spaces required for the function,
|
||||
// exit the execution and try growing it in Go world.
|
||||
func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instruction) *instruction {
|
||||
// add $requiredStackSize, %rsp ;; Temporarily update the sp.
|
||||
// cmp ExecutionContextOffsetStackBottomPtr(%rax), %rsp ;; Compare the stack bottom and the sp.
|
||||
// ja .ok
|
||||
// sub $requiredStackSize, %rsp ;; Reverse the temporary update.
|
||||
// pushq r15 ;; save the temporary.
|
||||
// mov $requiredStackSize, %r15
|
||||
// mov %r15, ExecutionContextOffsetStackGrowRequiredSize(%rax) ;; Set the required size in the execution context.
|
||||
// popq r15 ;; restore the temporary.
|
||||
// callq *ExecutionContextOffsetStackGrowCallTrampolineAddress(%rax) ;; Call the Go function to grow the stack.
|
||||
// jmp .cont
|
||||
// .ok:
|
||||
// sub $requiredStackSize, %rsp ;; Reverse the temporary update.
|
||||
// .cont:
|
||||
cur = m.addRSP(-int32(requiredStackSize), cur)
|
||||
cur = linkInstr(cur, m.allocateInstr().asCmpRmiR(true,
|
||||
newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetStackBottomPtr.U32(), raxVReg)),
|
||||
rspVReg, true))
|
||||
|
||||
ja := m.allocateInstr()
|
||||
cur = linkInstr(cur, ja)
|
||||
|
||||
cur = m.addRSP(int32(requiredStackSize), cur)
|
||||
|
||||
// Save the temporary.
|
||||
|
||||
cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandReg(r15VReg)))
|
||||
// Load the required size to the temporary.
|
||||
cur = linkInstr(cur, m.allocateInstr().asImm(r15VReg, uint64(requiredStackSize), true))
|
||||
// Set the required size in the execution context.
|
||||
cur = linkInstr(cur, m.allocateInstr().asMovRM(r15VReg,
|
||||
newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.U32(), raxVReg)), 8))
|
||||
// Restore the temporary.
|
||||
cur = linkInstr(cur, m.allocateInstr().asPop64(r15VReg))
|
||||
// Call the Go function to grow the stack.
|
||||
cur = linkInstr(cur, m.allocateInstr().asCallIndirect(newOperandMem(m.newAmodeImmReg(
|
||||
wazevoapi.ExecutionContextOffsetStackGrowCallTrampolineAddress.U32(), raxVReg)), nil))
|
||||
// Jump to the continuation.
|
||||
jmpToCont := m.allocateInstr()
|
||||
cur = linkInstr(cur, jmpToCont)
|
||||
|
||||
// .ok:
|
||||
okInstr, ok := m.allocateBrTarget()
|
||||
cur = linkInstr(cur, okInstr)
|
||||
ja.asJmpIf(condNBE, newOperandLabel(ok))
|
||||
// On the ok path, we only need to reverse the temporary update.
|
||||
cur = m.addRSP(int32(requiredStackSize), cur)
|
||||
|
||||
// .cont:
|
||||
contInstr, cont := m.allocateBrTarget()
|
||||
cur = linkInstr(cur, contInstr)
|
||||
jmpToCont.asJmp(newOperandLabel(cont))
|
||||
|
||||
return cur
|
||||
}
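Rendered as ordinary Go, the emitted sequence amounts to the following check. The struct and field names below are illustrative stand-ins for the execution-context offsets referenced above, not wazero's actual types:

type execCtxSketch struct {
    stackBottomPtr        uintptr
    stackGrowRequiredSize uint64
    growStack             func() // stands in for the indirect call through the trampoline address
}

func stackBoundsCheck(rsp, required uintptr, ctx *execCtxSketch) {
    if rsp-required > ctx.stackBottomPtr { // the "ja .ok" path
        return // enough space; continue on the current stack
    }
    ctx.stackGrowRequiredSize = uint64(required)
    ctx.growStack() // exit to Go, grow the stack, then resume
}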
|
||||
168 vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/cond.go generated vendored Normal file
@@ -0,0 +1,168 @@
package amd64
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
|
||||
)
|
||||
|
||||
type cond byte
|
||||
|
||||
const (
|
||||
// condO represents (overflow) condition.
|
||||
condO cond = iota
|
||||
// condNO represents (no overflow) condition.
|
||||
condNO
|
||||
// condB represents (< unsigned) condition.
|
||||
condB
|
||||
// condNB represents (>= unsigned) condition.
|
||||
condNB
|
||||
// condZ represents (zero) condition.
|
||||
condZ
|
||||
// condNZ represents (not-zero) condition.
|
||||
condNZ
|
||||
// condBE represents (<= unsigned) condition.
|
||||
condBE
|
||||
// condNBE represents (> unsigned) condition.
|
||||
condNBE
|
||||
// condS represents (negative) condition.
|
||||
condS
|
||||
// condNS represents (not-negative) condition.
|
||||
condNS
|
||||
// condP represents (parity) condition.
|
||||
condP
|
||||
// condNP represents (not parity) condition.
|
||||
condNP
|
||||
// condL represents (< signed) condition.
|
||||
condL
|
||||
// condNL represents (>= signed) condition.
|
||||
condNL
|
||||
// condLE represents (<= signed) condition.
|
||||
condLE
|
||||
// condNLE represents (> signed) condition.
|
||||
condNLE
|
||||
|
||||
condInvalid
|
||||
)
|
||||
|
||||
func (c cond) String() string {
|
||||
switch c {
|
||||
case condO:
|
||||
return "o"
|
||||
case condNO:
|
||||
return "no"
|
||||
case condB:
|
||||
return "b"
|
||||
case condNB:
|
||||
return "nb"
|
||||
case condZ:
|
||||
return "z"
|
||||
case condNZ:
|
||||
return "nz"
|
||||
case condBE:
|
||||
return "be"
|
||||
case condNBE:
|
||||
return "nbe"
|
||||
case condS:
|
||||
return "s"
|
||||
case condNS:
|
||||
return "ns"
|
||||
case condL:
|
||||
return "l"
|
||||
case condNL:
|
||||
return "nl"
|
||||
case condLE:
|
||||
return "le"
|
||||
case condNLE:
|
||||
return "nle"
|
||||
case condP:
|
||||
return "p"
|
||||
case condNP:
|
||||
return "np"
|
||||
default:
|
||||
panic("unreachable")
|
||||
}
|
||||
}
|
||||
|
||||
func condFromSSAIntCmpCond(origin ssa.IntegerCmpCond) cond {
|
||||
switch origin {
|
||||
case ssa.IntegerCmpCondEqual:
|
||||
return condZ
|
||||
case ssa.IntegerCmpCondNotEqual:
|
||||
return condNZ
|
||||
case ssa.IntegerCmpCondSignedLessThan:
|
||||
return condL
|
||||
case ssa.IntegerCmpCondSignedGreaterThanOrEqual:
|
||||
return condNL
|
||||
case ssa.IntegerCmpCondSignedGreaterThan:
|
||||
return condNLE
|
||||
case ssa.IntegerCmpCondSignedLessThanOrEqual:
|
||||
return condLE
|
||||
case ssa.IntegerCmpCondUnsignedLessThan:
|
||||
return condB
|
||||
case ssa.IntegerCmpCondUnsignedGreaterThanOrEqual:
|
||||
return condNB
|
||||
case ssa.IntegerCmpCondUnsignedGreaterThan:
|
||||
return condNBE
|
||||
case ssa.IntegerCmpCondUnsignedLessThanOrEqual:
|
||||
return condBE
|
||||
default:
|
||||
panic("unreachable")
|
||||
}
|
||||
}
|
||||
|
||||
func condFromSSAFloatCmpCond(origin ssa.FloatCmpCond) cond {
|
||||
switch origin {
|
||||
case ssa.FloatCmpCondGreaterThanOrEqual:
|
||||
return condNB
|
||||
case ssa.FloatCmpCondGreaterThan:
|
||||
return condNBE
|
||||
case ssa.FloatCmpCondEqual, ssa.FloatCmpCondNotEqual, ssa.FloatCmpCondLessThan, ssa.FloatCmpCondLessThanOrEqual:
|
||||
panic(fmt.Sprintf("cond %s must be treated as a special case", origin))
|
||||
default:
|
||||
panic("unreachable")
|
||||
}
|
||||
}
|
||||
|
||||
func (c cond) encoding() byte {
|
||||
return byte(c)
|
||||
}
|
||||
|
||||
func (c cond) invert() cond {
|
||||
switch c {
|
||||
case condO:
|
||||
return condNO
|
||||
case condNO:
|
||||
return condO
|
||||
case condB:
|
||||
return condNB
|
||||
case condNB:
|
||||
return condB
|
||||
case condZ:
|
||||
return condNZ
|
||||
case condNZ:
|
||||
return condZ
|
||||
case condBE:
|
||||
return condNBE
|
||||
case condNBE:
|
||||
return condBE
|
||||
case condS:
|
||||
return condNS
|
||||
case condNS:
|
||||
return condS
|
||||
case condP:
|
||||
return condNP
|
||||
case condNP:
|
||||
return condP
|
||||
case condL:
|
||||
return condNL
|
||||
case condNL:
|
||||
return condL
|
||||
case condLE:
|
||||
return condNLE
|
||||
case condNLE:
|
||||
return condLE
|
||||
default:
|
||||
panic("unreachable")
|
||||
}
|
||||
}
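As a quick in-package usage sketch (not part of the vendored file), this is how a signed less-than comparison maps to a branch condition and how its negation is obtained:

func exampleSignedLessThan() (taken, notTaken cond) {
    taken = condFromSSAIntCmpCond(ssa.IntegerCmpCondSignedLessThan) // condL
    notTaken = taken.invert()                                       // condNL: branch when x >= y (signed)
    return
}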
|
||||
35 vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/ext.go generated vendored Normal file
@@ -0,0 +1,35 @@
package amd64

// extMode represents the mode of extension in movzx/movsx.
type extMode byte

const (
    // extModeBL represents Byte -> Longword.
    extModeBL extMode = iota
    // extModeBQ represents Byte -> Quadword.
    extModeBQ
    // extModeWL represents Word -> Longword.
    extModeWL
    // extModeWQ represents Word -> Quadword.
    extModeWQ
    // extModeLQ represents Longword -> Quadword.
    extModeLQ
)

// String implements fmt.Stringer.
func (e extMode) String() string {
    switch e {
    case extModeBL:
        return "bl"
    case extModeBQ:
        return "bq"
    case extModeWL:
        return "wl"
    case extModeWQ:
        return "wq"
    case extModeLQ:
        return "lq"
    default:
        panic("BUG: invalid ext mode")
    }
}
2472 vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr.go generated vendored Normal file
File diff suppressed because it is too large
1683 vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr_encoding.go generated vendored Normal file
File diff suppressed because it is too large
71 vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_constant.go generated vendored Normal file
@@ -0,0 +1,71 @@
package amd64

import (
    "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
    "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)

// lowerConstant allocates a new VReg and inserts the instruction to load the constant value.
func (m *machine) lowerConstant(instr *ssa.Instruction) (vr regalloc.VReg) {
    val := instr.Return()
    valType := val.Type()

    vr = m.c.AllocateVReg(valType)
    m.insertLoadConstant(instr, vr)
    return
}

// InsertLoadConstantBlockArg implements backend.Machine.
func (m *machine) InsertLoadConstantBlockArg(instr *ssa.Instruction, vr regalloc.VReg) {
    m.insertLoadConstant(instr, vr)
}

func (m *machine) insertLoadConstant(instr *ssa.Instruction, vr regalloc.VReg) {
    val := instr.Return()
    valType := val.Type()
    v := instr.ConstantVal()

    bits := valType.Bits()
    if bits < 64 { // Clear the redundant bits just in case it's unexpectedly sign-extended, etc.
        v = v & ((1 << valType.Bits()) - 1)
    }

    switch valType {
    case ssa.TypeF32, ssa.TypeF64:
        m.lowerFconst(vr, v, bits == 64)
    case ssa.TypeI32, ssa.TypeI64:
        m.lowerIconst(vr, v, bits == 64)
    default:
        panic("BUG")
    }
}

func (m *machine) lowerFconst(dst regalloc.VReg, c uint64, _64 bool) {
    if c == 0 {
        xor := m.allocateInstr().asZeros(dst)
        m.insert(xor)
    } else {
        var tmpType ssa.Type
        if _64 {
            tmpType = ssa.TypeI64
        } else {
            tmpType = ssa.TypeI32
        }
        tmpInt := m.c.AllocateVReg(tmpType)
        loadToGP := m.allocateInstr().asImm(tmpInt, c, _64)
        m.insert(loadToGP)

        movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpInt), dst, _64)
        m.insert(movToXmm)
    }
}

func (m *machine) lowerIconst(dst regalloc.VReg, c uint64, _64 bool) {
    i := m.allocateInstr()
    if c == 0 {
        i.asZeros(dst)
    } else {
        i.asImm(dst, c, _64)
    }
    m.insert(i)
}
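The masking in insertLoadConstant can be checked in isolation: a 32-bit constant that arrives sign-extended to 64 bits is truncated back to its low 32 bits before lowering. A stand-alone illustration (not part of the vendored file):

// maskToWidth clears everything above the given bit width, e.g.
// maskToWidth(0xffffffff80000000, 32) == 0x80000000.
func maskToWidth(v uint64, bits uint) uint64 {
    if bits >= 64 {
        return v
    }
    return v & ((1 << bits) - 1)
}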
187 vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_mem.go generated vendored Normal file
@@ -0,0 +1,187 @@
package amd64
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
|
||||
)
|
||||
|
||||
var addendsMatchOpcodes = [...]ssa.Opcode{ssa.OpcodeUExtend, ssa.OpcodeSExtend, ssa.OpcodeIadd, ssa.OpcodeIconst, ssa.OpcodeIshl}
|
||||
|
||||
type addend struct {
|
||||
r regalloc.VReg
|
||||
off int64
|
||||
shift byte
|
||||
}
|
||||
|
||||
func (a addend) String() string {
|
||||
return fmt.Sprintf("addend{r=%s, off=%d, shift=%d}", a.r, a.off, a.shift)
|
||||
}
|
||||
|
||||
// lowerToAddressMode converts a pointer to an addressMode that can be used as an operand for load/store instructions.
|
||||
func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32) (am *amode) {
|
||||
def := m.c.ValueDefinition(ptr)
|
||||
|
||||
if offsetBase&0x80000000 != 0 {
|
||||
// Special casing the huge base offset whose MSB is set. In x64, the immediate is always
|
||||
// sign-extended, but our IR semantics requires the offset base is always unsigned.
|
||||
// Note that this should be extremely rare, or may never be hit in a real application,
|
||||
// therefore we don't need to optimize this case in my opinion.
|
||||
|
||||
a := m.lowerAddend(def)
|
||||
off64 := a.off + int64(offsetBase)
|
||||
offsetBaseReg := m.c.AllocateVReg(ssa.TypeI64)
|
||||
m.lowerIconst(offsetBaseReg, uint64(off64), true)
|
||||
if a.r != regalloc.VRegInvalid {
|
||||
return m.newAmodeRegRegShift(0, offsetBaseReg, a.r, a.shift)
|
||||
} else {
|
||||
return m.newAmodeImmReg(0, offsetBaseReg)
|
||||
}
|
||||
}
|
||||
|
||||
if op := m.c.MatchInstrOneOf(def, addendsMatchOpcodes[:]); op == ssa.OpcodeIadd {
|
||||
add := def.Instr
|
||||
x, y := add.Arg2()
|
||||
xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y)
|
||||
ax := m.lowerAddend(xDef)
|
||||
ay := m.lowerAddend(yDef)
|
||||
add.MarkLowered()
|
||||
return m.lowerAddendsToAmode(ax, ay, offsetBase)
|
||||
} else {
|
||||
// If it is not an Iadd, then we lower the one addend.
|
||||
a := m.lowerAddend(def)
|
||||
// off is always 0 if r is valid.
|
||||
if a.r != regalloc.VRegInvalid {
|
||||
if a.shift != 0 {
|
||||
tmpReg := m.c.AllocateVReg(ssa.TypeI64)
|
||||
m.lowerIconst(tmpReg, 0, true)
|
||||
return m.newAmodeRegRegShift(offsetBase, tmpReg, a.r, a.shift)
|
||||
}
|
||||
return m.newAmodeImmReg(offsetBase, a.r)
|
||||
} else {
|
||||
off64 := a.off + int64(offsetBase)
|
||||
tmpReg := m.c.AllocateVReg(ssa.TypeI64)
|
||||
m.lowerIconst(tmpReg, uint64(off64), true)
|
||||
return m.newAmodeImmReg(0, tmpReg)
|
||||
}
|
||||
}
|
||||
}
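The address modes produced here correspond to the standard x86-64 form base + (index << shift) + disp32, with the displacement sign-extended. A small sketch spelling out the address such a mode denotes (illustrative only, not part of the vendored file):

// effectiveAddr computes base + (index << shift) + sign-extended 32-bit displacement,
// i.e. the address an amode built above ultimately resolves to on the CPU.
func effectiveAddr(base, index uint64, shift byte, disp32 uint32) uint64 {
    return base + (index << shift) + uint64(int64(int32(disp32)))
}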
|
||||
|
||||
func (m *machine) lowerAddendsToAmode(x, y addend, offBase uint32) *amode {
|
||||
if x.r != regalloc.VRegInvalid && x.off != 0 || y.r != regalloc.VRegInvalid && y.off != 0 {
|
||||
panic("invalid input")
|
||||
}
|
||||
|
||||
u64 := uint64(x.off+y.off) + uint64(offBase)
|
||||
if u64 != 0 {
|
||||
if _, ok := asImm32(u64, false); !ok {
|
||||
tmpReg := m.c.AllocateVReg(ssa.TypeI64)
|
||||
m.lowerIconst(tmpReg, u64, true)
|
||||
// Blank u64 as it has been already lowered.
|
||||
u64 = 0
|
||||
|
||||
if x.r == regalloc.VRegInvalid {
|
||||
x.r = tmpReg
|
||||
} else if y.r == regalloc.VRegInvalid {
|
||||
y.r = tmpReg
|
||||
} else {
|
||||
// We already know that either rx or ry is invalid,
|
||||
// so we overwrite it with the temporary register.
|
||||
panic("BUG")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
u32 := uint32(u64)
|
||||
switch {
|
||||
// We assume rx, ry are valid iff offx, offy are 0.
|
||||
case x.r != regalloc.VRegInvalid && y.r != regalloc.VRegInvalid:
|
||||
switch {
|
||||
case x.shift != 0 && y.shift != 0:
|
||||
// Cannot absorb two shifted registers, must lower one to a shift instruction.
|
||||
shifted := m.allocateInstr()
|
||||
shifted.asShiftR(shiftROpShiftLeft, newOperandImm32(uint32(x.shift)), x.r, true)
|
||||
m.insert(shifted)
|
||||
|
||||
return m.newAmodeRegRegShift(u32, x.r, y.r, y.shift)
|
||||
case x.shift != 0 && y.shift == 0:
|
||||
// Swap base and index.
|
||||
x, y = y, x
|
||||
fallthrough
|
||||
default:
|
||||
return m.newAmodeRegRegShift(u32, x.r, y.r, y.shift)
|
||||
}
|
||||
case x.r == regalloc.VRegInvalid && y.r != regalloc.VRegInvalid:
|
||||
x, y = y, x
|
||||
fallthrough
|
||||
case x.r != regalloc.VRegInvalid && y.r == regalloc.VRegInvalid:
|
||||
if x.shift != 0 {
|
||||
zero := m.c.AllocateVReg(ssa.TypeI64)
|
||||
m.lowerIconst(zero, 0, true)
|
||||
return m.newAmodeRegRegShift(u32, zero, x.r, x.shift)
|
||||
}
|
||||
return m.newAmodeImmReg(u32, x.r)
|
||||
default: // Both are invalid: use the offset.
|
||||
tmpReg := m.c.AllocateVReg(ssa.TypeI64)
|
||||
m.lowerIconst(tmpReg, u64, true)
|
||||
return m.newAmodeImmReg(0, tmpReg)
|
||||
}
|
||||
}
|
||||
|
||||
func (m *machine) lowerAddend(x *backend.SSAValueDefinition) addend {
|
||||
if x.IsFromBlockParam() {
|
||||
return addend{x.BlkParamVReg, 0, 0}
|
||||
}
|
||||
// Ensure the addend is not referenced in multiple places; we will discard nested Iadds.
|
||||
op := m.c.MatchInstrOneOf(x, addendsMatchOpcodes[:])
|
||||
if op != ssa.OpcodeInvalid && op != ssa.OpcodeIadd {
|
||||
return m.lowerAddendFromInstr(x.Instr)
|
||||
}
|
||||
p := m.getOperand_Reg(x)
|
||||
return addend{p.reg(), 0, 0}
|
||||
}
|
||||
|
||||
// lowerAddendFromInstr takes an instruction and returns a VReg and an offset that can be used in an address mode.
|
||||
// The Vreg is regalloc.VRegInvalid if the addend cannot be lowered to a register.
|
||||
// The offset is 0 if the addend can be lowered to a register.
|
||||
func (m *machine) lowerAddendFromInstr(instr *ssa.Instruction) addend {
|
||||
instr.MarkLowered()
|
||||
switch op := instr.Opcode(); op {
|
||||
case ssa.OpcodeIconst:
|
||||
u64 := instr.ConstantVal()
|
||||
if instr.Return().Type().Bits() == 32 {
|
||||
return addend{regalloc.VRegInvalid, int64(int32(u64)), 0} // sign-extend.
|
||||
} else {
|
||||
return addend{regalloc.VRegInvalid, int64(u64), 0}
|
||||
}
|
||||
case ssa.OpcodeUExtend, ssa.OpcodeSExtend:
|
||||
input := instr.Arg()
|
||||
inputDef := m.c.ValueDefinition(input)
|
||||
if input.Type().Bits() != 32 {
|
||||
panic("BUG: invalid input type " + input.Type().String())
|
||||
}
|
||||
constInst := inputDef.IsFromInstr() && inputDef.Instr.Constant()
|
||||
switch {
|
||||
case constInst && op == ssa.OpcodeSExtend:
|
||||
return addend{regalloc.VRegInvalid, int64(uint32(inputDef.Instr.ConstantVal())), 0}
|
||||
case constInst && op == ssa.OpcodeUExtend:
|
||||
return addend{regalloc.VRegInvalid, int64(int32(inputDef.Instr.ConstantVal())), 0} // sign-extend!
|
||||
default:
|
||||
r := m.getOperand_Reg(inputDef)
|
||||
return addend{r.reg(), 0, 0}
|
||||
}
|
||||
case ssa.OpcodeIshl:
|
||||
// If the addend is a shift, we can only handle it if the shift amount is a constant.
|
||||
x, amount := instr.Arg2()
|
||||
amountDef := m.c.ValueDefinition(amount)
|
||||
if amountDef.IsFromInstr() && amountDef.Instr.Constant() && amountDef.Instr.ConstantVal() <= 3 {
|
||||
r := m.getOperand_Reg(m.c.ValueDefinition(x))
|
||||
return addend{r.reg(), 0, uint8(amountDef.Instr.ConstantVal())}
|
||||
}
|
||||
r := m.getOperand_Reg(m.c.ValueDefinition(x))
|
||||
return addend{r.reg(), 0, 0}
|
||||
}
|
||||
panic("BUG: invalid opcode")
|
||||
}
|
||||
3611 vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go generated vendored Normal file
File diff suppressed because it is too large
304 vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_pro_epi_logue.go generated vendored Normal file
@@ -0,0 +1,304 @@
package amd64
|
||||
|
||||
import (
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
|
||||
)
|
||||
|
||||
// PostRegAlloc implements backend.Machine.
|
||||
func (m *machine) PostRegAlloc() {
|
||||
m.setupPrologue()
|
||||
m.postRegAlloc()
|
||||
}
|
||||
|
||||
func (m *machine) setupPrologue() {
|
||||
cur := m.ectx.RootInstr
|
||||
prevInitInst := cur.next
|
||||
|
||||
// At this point, we have the stack layout as follows:
|
||||
//
|
||||
// (high address)
|
||||
// +-----------------+ <----- RBP (somewhere in the middle of the stack)
|
||||
// | ....... |
|
||||
// | ret Y |
|
||||
// | ....... |
|
||||
// | ret 0 |
|
||||
// | arg X |
|
||||
// | ....... |
|
||||
// | arg 1 |
|
||||
// | arg 0 |
|
||||
// | Return Addr |
|
||||
// RSP ----> +-----------------+
|
||||
// (low address)
|
||||
|
||||
// First, we push the RBP, and update the RBP to the current RSP.
|
||||
//
|
||||
// (high address) (high address)
|
||||
// RBP ----> +-----------------+ +-----------------+
|
||||
// | ....... | | ....... |
|
||||
// | ret Y | | ret Y |
|
||||
// | ....... | | ....... |
|
||||
// | ret 0 | | ret 0 |
|
||||
// | arg X | | arg X |
|
||||
// | ....... | ====> | ....... |
|
||||
// | arg 1 | | arg 1 |
|
||||
// | arg 0 | | arg 0 |
|
||||
// | Return Addr | | Return Addr |
|
||||
// RSP ----> +-----------------+ | Caller_RBP |
|
||||
// (low address) +-----------------+ <----- RSP, RBP
|
||||
//
|
||||
cur = m.setupRBPRSP(cur)
|
||||
|
||||
if !m.stackBoundsCheckDisabled {
|
||||
cur = m.insertStackBoundsCheck(m.requiredStackSize(), cur)
|
||||
}
|
||||
|
||||
//
|
||||
// (high address)
|
||||
// +-----------------+ +-----------------+
|
||||
// | ....... | | ....... |
|
||||
// | ret Y | | ret Y |
|
||||
// | ....... | | ....... |
|
||||
// | ret 0 | | ret 0 |
|
||||
// | arg X | | arg X |
|
||||
// | ....... | | ....... |
|
||||
// | arg 1 | | arg 1 |
|
||||
// | arg 0 | | arg 0 |
|
||||
// | xxxxx | | xxxxx |
|
||||
// | Return Addr | | Return Addr |
|
||||
// | Caller_RBP | ====> | Caller_RBP |
|
||||
// RBP,RSP->+-----------------+ +-----------------+ <----- RBP
|
||||
// (low address) | clobbered M |
|
||||
// | clobbered 1 |
|
||||
// | ........... |
|
||||
// | clobbered 0 |
|
||||
// +-----------------+ <----- RSP
|
||||
//
|
||||
if regs := m.clobberedRegs; len(regs) > 0 {
|
||||
for i := range regs {
|
||||
r := regs[len(regs)-1-i] // Reverse order.
|
||||
if r.RegType() == regalloc.RegTypeInt {
|
||||
cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandReg(r)))
|
||||
} else {
|
||||
// Pushing an XMM register is not supported by the PUSH instruction.
|
||||
cur = m.addRSP(-16, cur)
|
||||
push := m.allocateInstr().asXmmMovRM(
|
||||
sseOpcodeMovdqu, r, newOperandMem(m.newAmodeImmReg(0, rspVReg)),
|
||||
)
|
||||
cur = linkInstr(cur, push)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if size := m.spillSlotSize; size > 0 {
|
||||
// Simply decrease the RSP to allocate the spill slots.
|
||||
// sub $size, %rsp
|
||||
cur = linkInstr(cur, m.allocateInstr().asAluRmiR(aluRmiROpcodeSub, newOperandImm32(uint32(size)), rspVReg, true))
|
||||
|
||||
// At this point, we have the stack layout as follows:
|
||||
//
|
||||
// (high address)
|
||||
// +-----------------+
|
||||
// | ....... |
|
||||
// | ret Y |
|
||||
// | ....... |
|
||||
// | ret 0 |
|
||||
// | arg X |
|
||||
// | ....... |
|
||||
// | arg 1 |
|
||||
// | arg 0 |
|
||||
// | ReturnAddress |
|
||||
// | Caller_RBP |
|
||||
// +-----------------+ <--- RBP
|
||||
// | clobbered M |
|
||||
// | ............ |
|
||||
// | clobbered 1 |
|
||||
// | clobbered 0 |
|
||||
// | spill slot N |
|
||||
// | ............ |
|
||||
// | spill slot 0 |
|
||||
// +-----------------+ <--- RSP
|
||||
// (low address)
|
||||
}
|
||||
|
||||
linkInstr(cur, prevInitInst)
|
||||
}
|
||||
|
||||
// postRegAlloc does multiple things while walking through the instructions:
|
||||
// 1. Inserts the epilogue code.
|
||||
// 2. Removes the redundant copy instruction.
|
||||
// 3. Inserts the dec/inc RSP instruction right before/after the call instruction.
|
||||
// 4. Lowering that is supposed to be done after regalloc.
|
||||
func (m *machine) postRegAlloc() {
|
||||
ectx := m.ectx
|
||||
for cur := ectx.RootInstr; cur != nil; cur = cur.next {
|
||||
switch k := cur.kind; k {
|
||||
case ret:
|
||||
m.setupEpilogueAfter(cur.prev)
|
||||
continue
|
||||
case fcvtToSintSequence, fcvtToUintSequence:
|
||||
m.ectx.PendingInstructions = m.ectx.PendingInstructions[:0]
|
||||
if k == fcvtToSintSequence {
|
||||
m.lowerFcvtToSintSequenceAfterRegalloc(cur)
|
||||
} else {
|
||||
m.lowerFcvtToUintSequenceAfterRegalloc(cur)
|
||||
}
|
||||
prev := cur.prev
|
||||
next := cur.next
|
||||
cur := prev
|
||||
for _, instr := range m.ectx.PendingInstructions {
|
||||
cur = linkInstr(cur, instr)
|
||||
}
|
||||
linkInstr(cur, next)
|
||||
continue
|
||||
case xmmCMov:
|
||||
m.ectx.PendingInstructions = m.ectx.PendingInstructions[:0]
|
||||
m.lowerXmmCmovAfterRegAlloc(cur)
|
||||
prev := cur.prev
|
||||
next := cur.next
|
||||
cur := prev
|
||||
for _, instr := range m.ectx.PendingInstructions {
|
||||
cur = linkInstr(cur, instr)
|
||||
}
|
||||
linkInstr(cur, next)
|
||||
continue
|
||||
case idivRemSequence:
|
||||
m.ectx.PendingInstructions = m.ectx.PendingInstructions[:0]
|
||||
m.lowerIDivRemSequenceAfterRegAlloc(cur)
|
||||
prev := cur.prev
|
||||
next := cur.next
|
||||
cur := prev
|
||||
for _, instr := range m.ectx.PendingInstructions {
|
||||
cur = linkInstr(cur, instr)
|
||||
}
|
||||
linkInstr(cur, next)
|
||||
continue
|
||||
case call, callIndirect:
|
||||
// At this point, reg alloc is done, therefore we can safely insert the dec/inc RSP instructions
|
||||
// right before/after the call instruction. If this is done before reg alloc, the stack slot
|
||||
// can point to the wrong location and therefore results in a wrong value.
|
||||
call := cur
|
||||
next := call.next
|
||||
_, _, _, _, size := backend.ABIInfoFromUint64(call.u2)
|
||||
if size > 0 {
|
||||
dec := m.allocateInstr().asAluRmiR(aluRmiROpcodeSub, newOperandImm32(size), rspVReg, true)
|
||||
linkInstr(call.prev, dec)
|
||||
linkInstr(dec, call)
|
||||
inc := m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(size), rspVReg, true)
|
||||
linkInstr(call, inc)
|
||||
linkInstr(inc, next)
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// Removes the redundant copy instruction.
|
||||
if cur.IsCopy() && cur.op1.reg().RealReg() == cur.op2.reg().RealReg() {
|
||||
prev, next := cur.prev, cur.next
|
||||
// Remove the copy instruction.
|
||||
prev.next = next
|
||||
if next != nil {
|
||||
next.prev = prev
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (m *machine) setupEpilogueAfter(cur *instruction) {
|
||||
prevNext := cur.next
|
||||
|
||||
// At this point, we have the stack layout as follows:
|
||||
//
|
||||
// (high address)
|
||||
// +-----------------+
|
||||
// | ....... |
|
||||
// | ret Y |
|
||||
// | ....... |
|
||||
// | ret 0 |
|
||||
// | arg X |
|
||||
// | ....... |
|
||||
// | arg 1 |
|
||||
// | arg 0 |
|
||||
// | ReturnAddress |
|
||||
// | Caller_RBP |
|
||||
// +-----------------+ <--- RBP
|
||||
// | clobbered M |
|
||||
// | ............ |
|
||||
// | clobbered 1 |
|
||||
// | clobbered 0 |
|
||||
// | spill slot N |
|
||||
// | ............ |
|
||||
// | spill slot 0 |
|
||||
// +-----------------+ <--- RSP
|
||||
// (low address)
|
||||
|
||||
if size := m.spillSlotSize; size > 0 {
|
||||
// Simply increase the RSP to free the spill slots.
|
||||
// add $size, %rsp
|
||||
cur = linkInstr(cur, m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(uint32(size)), rspVReg, true))
|
||||
}
|
||||
|
||||
//
|
||||
// (high address)
|
||||
// +-----------------+ +-----------------+
|
||||
// | ....... | | ....... |
|
||||
// | ret Y | | ret Y |
|
||||
// | ....... | | ....... |
|
||||
// | ret 0 | | ret 0 |
|
||||
// | arg X | | arg X |
|
||||
// | ....... | | ....... |
|
||||
// | arg 1 | | arg 1 |
|
||||
// | arg 0 | | arg 0 |
|
||||
// | ReturnAddress | | ReturnAddress |
|
||||
// | Caller_RBP | | Caller_RBP |
|
||||
// RBP ---> +-----------------+ ========> +-----------------+ <---- RSP, RBP
|
||||
// | clobbered M |
|
||||
// | ............ |
|
||||
// | clobbered 1 |
|
||||
// | clobbered 0 |
|
||||
// RSP ---> +-----------------+
|
||||
// (low address)
|
||||
//
|
||||
if regs := m.clobberedRegs; len(regs) > 0 {
|
||||
for _, r := range regs {
|
||||
if r.RegType() == regalloc.RegTypeInt {
|
||||
cur = linkInstr(cur, m.allocateInstr().asPop64(r))
|
||||
} else {
|
||||
// Popping an XMM register is not supported by the POP instruction.
|
||||
pop := m.allocateInstr().asXmmUnaryRmR(
|
||||
sseOpcodeMovdqu, newOperandMem(m.newAmodeImmReg(0, rspVReg)), r,
|
||||
)
|
||||
cur = linkInstr(cur, pop)
|
||||
cur = m.addRSP(16, cur)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Now roll back the RSP to RBP, and pop the caller's RBP.
|
||||
cur = m.revertRBPRSP(cur)
|
||||
|
||||
linkInstr(cur, prevNext)
|
||||
}
|
||||
|
||||
func (m *machine) addRSP(offset int32, cur *instruction) *instruction {
|
||||
if offset == 0 {
|
||||
return cur
|
||||
}
|
||||
opcode := aluRmiROpcodeAdd
|
||||
if offset < 0 {
|
||||
opcode = aluRmiROpcodeSub
|
||||
offset = -offset
|
||||
}
|
||||
return linkInstr(cur, m.allocateInstr().asAluRmiR(opcode, newOperandImm32(uint32(offset)), rspVReg, true))
|
||||
}
|
||||
|
||||
func (m *machine) setupRBPRSP(cur *instruction) *instruction {
|
||||
cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandReg(rbpVReg)))
|
||||
cur = linkInstr(cur, m.allocateInstr().asMovRR(rspVReg, rbpVReg, true))
|
||||
return cur
|
||||
}
|
||||
|
||||
func (m *machine) revertRBPRSP(cur *instruction) *instruction {
|
||||
cur = linkInstr(cur, m.allocateInstr().asMovRR(rbpVReg, rspVReg, true))
|
||||
cur = linkInstr(cur, m.allocateInstr().asPop64(rbpVReg))
|
||||
return cur
|
||||
}
|
||||
153 vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_regalloc.go generated vendored Normal file
@@ -0,0 +1,153 @@
package amd64
|
||||
|
||||
import (
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
|
||||
)
|
||||
|
||||
// InsertMoveBefore implements backend.RegAllocFunctionMachine.
|
||||
func (m *machine) InsertMoveBefore(dst, src regalloc.VReg, instr *instruction) {
|
||||
typ := src.RegType()
|
||||
if typ != dst.RegType() {
|
||||
panic("BUG: src and dst must have the same type")
|
||||
}
|
||||
|
||||
mov := m.allocateInstr()
|
||||
if typ == regalloc.RegTypeInt {
|
||||
mov.asMovRR(src, dst, true)
|
||||
} else {
|
||||
mov.asXmmUnaryRmR(sseOpcodeMovdqu, newOperandReg(src), dst)
|
||||
}
|
||||
|
||||
cur := instr.prev
|
||||
prevNext := cur.next
|
||||
cur = linkInstr(cur, mov)
|
||||
linkInstr(cur, prevNext)
|
||||
}
|
||||
|
||||
// InsertStoreRegisterAt implements backend.RegAllocFunctionMachine.
|
||||
func (m *machine) InsertStoreRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction {
|
||||
if !v.IsRealReg() {
|
||||
panic("BUG: VReg must be backed by real reg to be stored")
|
||||
}
|
||||
|
||||
typ := m.c.TypeOf(v)
|
||||
|
||||
var prevNext, cur *instruction
|
||||
if after {
|
||||
cur, prevNext = instr, instr.next
|
||||
} else {
|
||||
cur, prevNext = instr.prev, instr
|
||||
}
|
||||
|
||||
offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size())
|
||||
store := m.allocateInstr()
|
||||
mem := newOperandMem(m.newAmodeImmReg(uint32(offsetFromSP), rspVReg))
|
||||
switch typ {
|
||||
case ssa.TypeI32:
|
||||
store.asMovRM(v, mem, 4)
|
||||
case ssa.TypeI64:
|
||||
store.asMovRM(v, mem, 8)
|
||||
case ssa.TypeF32:
|
||||
store.asXmmMovRM(sseOpcodeMovss, v, mem)
|
||||
case ssa.TypeF64:
|
||||
store.asXmmMovRM(sseOpcodeMovsd, v, mem)
|
||||
case ssa.TypeV128:
|
||||
store.asXmmMovRM(sseOpcodeMovdqu, v, mem)
|
||||
}
|
||||
|
||||
cur = linkInstr(cur, store)
|
||||
return linkInstr(cur, prevNext)
|
||||
}
|
||||
|
||||
// InsertReloadRegisterAt implements backend.RegAllocFunctionMachine.
|
||||
func (m *machine) InsertReloadRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction {
|
||||
if !v.IsRealReg() {
|
||||
panic("BUG: VReg must be backed by real reg to be stored")
|
||||
}
|
||||
|
||||
typ := m.c.TypeOf(v)
|
||||
var prevNext, cur *instruction
|
||||
if after {
|
||||
cur, prevNext = instr, instr.next
|
||||
} else {
|
||||
cur, prevNext = instr.prev, instr
|
||||
}
|
||||
|
||||
// Load the value to the temporary.
|
||||
load := m.allocateInstr()
|
||||
offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size())
|
||||
a := newOperandMem(m.newAmodeImmReg(uint32(offsetFromSP), rspVReg))
|
||||
switch typ {
|
||||
case ssa.TypeI32:
|
||||
load.asMovzxRmR(extModeLQ, a, v)
|
||||
case ssa.TypeI64:
|
||||
load.asMov64MR(a, v)
|
||||
case ssa.TypeF32:
|
||||
load.asXmmUnaryRmR(sseOpcodeMovss, a, v)
|
||||
case ssa.TypeF64:
|
||||
load.asXmmUnaryRmR(sseOpcodeMovsd, a, v)
|
||||
case ssa.TypeV128:
|
||||
load.asXmmUnaryRmR(sseOpcodeMovdqu, a, v)
|
||||
default:
|
||||
panic("BUG")
|
||||
}
|
||||
|
||||
cur = linkInstr(cur, load)
|
||||
return linkInstr(cur, prevNext)
|
||||
}
|
||||
|
||||
// ClobberedRegisters implements backend.RegAllocFunctionMachine.
|
||||
func (m *machine) ClobberedRegisters(regs []regalloc.VReg) {
|
||||
m.clobberedRegs = append(m.clobberedRegs[:0], regs...)
|
||||
}
|
||||
|
||||
// Swap implements backend.RegAllocFunctionMachine.
|
||||
func (m *machine) Swap(cur *instruction, x1, x2, tmp regalloc.VReg) {
|
||||
if x1.RegType() == regalloc.RegTypeInt {
|
||||
prevNext := cur.next
|
||||
xc := m.allocateInstr().asXCHG(x1, newOperandReg(x2), 8)
|
||||
cur = linkInstr(cur, xc)
|
||||
linkInstr(cur, prevNext)
|
||||
} else {
|
||||
if tmp.Valid() {
|
||||
prevNext := cur.next
|
||||
m.InsertMoveBefore(tmp, x1, prevNext)
|
||||
m.InsertMoveBefore(x1, x2, prevNext)
|
||||
m.InsertMoveBefore(x2, tmp, prevNext)
|
||||
} else {
|
||||
prevNext := cur.next
|
||||
r2 := x2.RealReg()
|
||||
// Temporarily spill x1 to stack.
|
||||
cur = m.InsertStoreRegisterAt(x1, cur, true).prev
|
||||
// Then move x2 to x1.
|
||||
cur = linkInstr(cur, m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqa, newOperandReg(x2), x1))
|
||||
linkInstr(cur, prevNext)
|
||||
// Then reload the original value on x1 from stack to r2.
|
||||
m.InsertReloadRegisterAt(x1.SetRealReg(r2), cur, true)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// LastInstrForInsertion implements backend.RegAllocFunctionMachine.
|
||||
func (m *machine) LastInstrForInsertion(begin, end *instruction) *instruction {
|
||||
cur := end
|
||||
for cur.kind == nop0 {
|
||||
cur = cur.prev
|
||||
if cur == begin {
|
||||
return end
|
||||
}
|
||||
}
|
||||
switch cur.kind {
|
||||
case jmp:
|
||||
return cur
|
||||
default:
|
||||
return end
|
||||
}
|
||||
}
|
||||
|
||||
// SSABlockLabel implements backend.RegAllocFunctionMachine.
|
||||
func (m *machine) SSABlockLabel(id ssa.BasicBlockID) backend.Label {
|
||||
return m.ectx.SsaBlockIDToLabels[id]
|
||||
}
|
||||
992 vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_vec.go generated vendored Normal file
@@ -0,0 +1,992 @@
package amd64
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
|
||||
)
|
||||
|
||||
var swizzleMask = [16]byte{
|
||||
0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
|
||||
0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
|
||||
}
|
||||
|
||||
func (m *machine) lowerSwizzle(x, y ssa.Value, ret ssa.Value) {
|
||||
masklabel := m.getOrAllocateConstLabel(&m.constSwizzleMaskConstIndex, swizzleMask[:])
|
||||
|
||||
// Load mask to maskReg.
|
||||
maskReg := m.c.AllocateVReg(ssa.TypeV128)
|
||||
loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(masklabel)), maskReg)
|
||||
m.insert(loadMask)
|
||||
|
||||
// Copy x and y to tmp registers.
|
||||
xx := m.getOperand_Reg(m.c.ValueDefinition(x))
|
||||
tmpDst := m.copyToTmp(xx.reg())
|
||||
yy := m.getOperand_Reg(m.c.ValueDefinition(y))
|
||||
tmpX := m.copyToTmp(yy.reg())
|
||||
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddusb, newOperandReg(maskReg), tmpX))
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmpX), tmpDst))
|
||||
|
||||
// Copy the result to the destination register.
|
||||
m.copyTo(tmpDst, m.c.VRegOf(ret))
|
||||
}
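The 0x70 bias works because PADDUSB saturates and PSHUFB zeroes any lane whose index byte has bit 7 set, so every out-of-range Wasm swizzle index (>= 16) ends up selecting zero. A stand-alone sketch of that per-byte behaviour (illustrative only, not part of the vendored file):

// swizzleLane models one byte of PADDUSB with 0x70 followed by PSHUFB's selection rule.
func swizzleLane(idx byte) (selects byte, zeroed bool) {
    biased := idx + 0x70
    if biased < idx { // unsigned saturation: PADDUSB clamps instead of wrapping
        biased = 0xff
    }
    return biased & 0x0f, biased&0x80 != 0
}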
|
||||
|
||||
func (m *machine) lowerInsertLane(x, y ssa.Value, index byte, ret ssa.Value, lane ssa.VecLane) {
|
||||
// Copy x to tmp.
|
||||
tmpDst := m.c.AllocateVReg(ssa.TypeV128)
|
||||
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, m.getOperand_Mem_Reg(m.c.ValueDefinition(x)), tmpDst))
|
||||
|
||||
yy := m.getOperand_Reg(m.c.ValueDefinition(y))
|
||||
switch lane {
|
||||
case ssa.VecLaneI8x16:
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, index, yy, tmpDst))
|
||||
case ssa.VecLaneI16x8:
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, index, yy, tmpDst))
|
||||
case ssa.VecLaneI32x4:
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, index, yy, tmpDst))
|
||||
case ssa.VecLaneI64x2:
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, index, yy, tmpDst))
|
||||
case ssa.VecLaneF32x4:
|
||||
// In the INSERTPS instruction, the destination index is encoded in bits 4 and 5 of the immediate.
|
||||
// See https://www.felixcloutier.com/x86/insertps
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeInsertps, index<<4, yy, tmpDst))
|
||||
case ssa.VecLaneF64x2:
|
||||
if index == 0 {
|
||||
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, yy, tmpDst))
|
||||
} else {
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMovlhps, yy, tmpDst))
|
||||
}
|
||||
default:
|
||||
panic(fmt.Sprintf("invalid lane type: %s", lane))
|
||||
}
|
||||
|
||||
m.copyTo(tmpDst, m.c.VRegOf(ret))
|
||||
}
|
||||
|
||||
func (m *machine) lowerExtractLane(x ssa.Value, index byte, signed bool, ret ssa.Value, lane ssa.VecLane) {
|
||||
// Pextr variants are used to extract a lane from a vector register.
|
||||
xx := m.getOperand_Reg(m.c.ValueDefinition(x))
|
||||
|
||||
tmpDst := m.c.AllocateVReg(ret.Type())
|
||||
m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst))
|
||||
switch lane {
|
||||
case ssa.VecLaneI8x16:
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrb, index, xx, tmpDst))
|
||||
if signed {
|
||||
m.insert(m.allocateInstr().asMovsxRmR(extModeBL, newOperandReg(tmpDst), tmpDst))
|
||||
} else {
|
||||
m.insert(m.allocateInstr().asMovzxRmR(extModeBL, newOperandReg(tmpDst), tmpDst))
|
||||
}
|
||||
case ssa.VecLaneI16x8:
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrw, index, xx, tmpDst))
|
||||
if signed {
|
||||
m.insert(m.allocateInstr().asMovsxRmR(extModeWL, newOperandReg(tmpDst), tmpDst))
|
||||
} else {
|
||||
m.insert(m.allocateInstr().asMovzxRmR(extModeWL, newOperandReg(tmpDst), tmpDst))
|
||||
}
|
||||
case ssa.VecLaneI32x4:
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrd, index, xx, tmpDst))
|
||||
case ssa.VecLaneI64x2:
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, index, xx, tmpDst))
|
||||
case ssa.VecLaneF32x4:
|
||||
if index == 0 {
|
||||
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovss, xx, tmpDst))
|
||||
} else {
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, index, xx, tmpDst))
|
||||
}
|
||||
case ssa.VecLaneF64x2:
|
||||
if index == 0 {
|
||||
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, xx, tmpDst))
|
||||
} else {
|
||||
m.copyTo(xx.reg(), tmpDst)
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0b00_00_11_10, newOperandReg(tmpDst), tmpDst))
|
||||
}
|
||||
default:
|
||||
panic(fmt.Sprintf("invalid lane type: %s", lane))
|
||||
}
|
||||
|
||||
m.copyTo(tmpDst, m.c.VRegOf(ret))
|
||||
}
|
||||
|
||||
var sqmulRoundSat = [16]byte{
|
||||
0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
|
||||
0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
|
||||
}
|
||||
|
||||
func (m *machine) lowerSqmulRoundSat(x, y, ret ssa.Value) {
|
||||
// See https://github.com/WebAssembly/simd/pull/365 for the following logic.
|
||||
maskLabel := m.getOrAllocateConstLabel(&m.constSqmulRoundSatIndex, sqmulRoundSat[:])
|
||||
|
||||
tmp := m.c.AllocateVReg(ssa.TypeV128)
|
||||
loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), tmp)
|
||||
m.insert(loadMask)
|
||||
|
||||
xx, yy := m.getOperand_Reg(m.c.ValueDefinition(x)), m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
|
||||
tmpX := m.copyToTmp(xx.reg())
|
||||
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmulhrsw, yy, tmpX))
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmpX), tmp))
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmpX))
|
||||
|
||||
m.copyTo(tmpX, m.c.VRegOf(ret))
|
||||
}
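
// PMULHRSW performs a rounding Q15 multiply per signed 16-bit lane, which matches Wasm's
// q15mulr_sat_s except for the single overflow case 0x8000*0x8000 (i.e. -1.0*-1.0), where it
// yields 0x8000 instead of the saturated 0x7fff; the compare-and-xor above flips exactly those
// lanes. A scalar model of one lane (an illustrative sketch, not used by the backend):
func q15MulRSatScalarModel(a, b int16) int16 {
	p := (int32(a)*int32(b) + 0x4000) >> 15 // the rounding Q15 multiply computed by PMULHRSW
	if p > 0x7fff {                         // only reachable for a == b == -0x8000
		p = 0x7fff
	}
	return int16(p)
}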
|
||||
|
||||
func (m *machine) lowerVUshr(x, y, ret ssa.Value, lane ssa.VecLane) {
|
||||
switch lane {
|
||||
case ssa.VecLaneI8x16:
|
||||
m.lowerVUshri8x16(x, y, ret)
|
||||
case ssa.VecLaneI16x8, ssa.VecLaneI32x4, ssa.VecLaneI64x2:
|
||||
m.lowerShr(x, y, ret, lane, false)
|
||||
default:
|
||||
panic(fmt.Sprintf("invalid lane type: %s", lane))
|
||||
}
|
||||
}
|
||||
|
||||
// i8x16LogicalSHRMaskTable is necessary for emulating non-existent packed bytes logical right shifts on amd64.
|
||||
// The mask is applied after performing packed word shifts on the value to clear out the unnecessary bits.
|
||||
var i8x16LogicalSHRMaskTable = [8 * 16]byte{ // (the number of possible shift amount 0, 1, ..., 7.) * 16 bytes.
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift
|
||||
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // for 1 shift
|
||||
0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, // for 2 shift
|
||||
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, // for 3 shift
|
||||
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, // for 4 shift
|
||||
0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, // for 5 shift
|
||||
0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, // for 6 shift
|
||||
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // for 7 shift
|
||||
}
|
||||
|
||||
func (m *machine) lowerVUshri8x16(x, y, ret ssa.Value) {
|
||||
tmpGpReg := m.c.AllocateVReg(ssa.TypeI32)
|
||||
// Load the modulo 8 mask to tmpReg.
|
||||
m.lowerIconst(tmpGpReg, 0x7, false)
|
||||
// Take the modulo 8 of the shift amount.
|
||||
shiftAmt := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y))
|
||||
m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, shiftAmt, tmpGpReg, false))
|
||||
|
||||
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
|
||||
xx := m.copyToTmp(_xx.reg())
|
||||
|
||||
vecTmp := m.c.AllocateVReg(ssa.TypeV128)
|
||||
m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), vecTmp, false))
|
||||
m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrlw, newOperandReg(vecTmp), xx))
|
||||
|
||||
maskTableLabel := m.getOrAllocateConstLabel(&m.constI8x16LogicalSHRMaskTableIndex, i8x16LogicalSHRMaskTable[:])
|
||||
base := m.c.AllocateVReg(ssa.TypeI64)
|
||||
lea := m.allocateInstr().asLEA(newOperandLabel(maskTableLabel), base)
|
||||
m.insert(lea)
|
||||
|
||||
// Shift tmpGpReg by 4 to multiply the shift amount by 16.
|
||||
m.insert(m.allocateInstr().asShiftR(shiftROpShiftLeft, newOperandImm32(4), tmpGpReg, false))
|
||||
|
||||
mem := m.newAmodeRegRegShift(0, base, tmpGpReg, 0)
|
||||
loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(mem), vecTmp)
|
||||
m.insert(loadMask)
|
||||
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePand, newOperandReg(vecTmp), xx))
|
||||
m.copyTo(xx, m.c.VRegOf(ret))
|
||||
}
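
// Worked example for the lowering above (illustrative): with a shift amount of 3, PSRLW
// shifts each 16-bit word right by 3, so the top 3 bits of every low byte receive bits from
// its neighboring high byte. Row 3 of i8x16LogicalSHRMaskTable (0x1f repeated) is then loaded
// via base + shiftAmt*16 and ANDed in, clearing those bits: the word 0xabcd becomes 0x1579
// after PSRLW, and masking each byte with 0x1f yields 0x1519, i.e. 0xab>>3 = 0x15 and
// 0xcd>>3 = 0x19.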
|
||||
|
||||
func (m *machine) lowerVSshr(x, y, ret ssa.Value, lane ssa.VecLane) {
|
||||
switch lane {
|
||||
case ssa.VecLaneI8x16:
|
||||
m.lowerVSshri8x16(x, y, ret)
|
||||
case ssa.VecLaneI16x8, ssa.VecLaneI32x4:
|
||||
m.lowerShr(x, y, ret, lane, true)
|
||||
case ssa.VecLaneI64x2:
|
||||
m.lowerVSshri64x2(x, y, ret)
|
||||
default:
|
||||
panic(fmt.Sprintf("invalid lane type: %s", lane))
|
||||
}
|
||||
}
|
||||
|
||||
func (m *machine) lowerVSshri8x16(x, y, ret ssa.Value) {
|
||||
shiftAmtReg := m.c.AllocateVReg(ssa.TypeI32)
|
||||
// Load the modulo 8 mask to tmpReg.
|
||||
m.lowerIconst(shiftAmtReg, 0x7, false)
|
||||
// Take the modulo 8 of the shift amount.
|
||||
shiftAmt := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y))
|
||||
m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, shiftAmt, shiftAmtReg, false))
|
||||
|
||||
// Copy the x value to two temporary registers.
|
||||
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
|
||||
xx := m.copyToTmp(_xx.reg())
|
||||
vecTmp := m.c.AllocateVReg(ssa.TypeV128)
|
||||
m.copyTo(xx, vecTmp)
|
||||
|
||||
// Assuming that we have
|
||||
// xx = [b1, ..., b16]
|
||||
// vecTmp = [b1, ..., b16]
|
||||
// at this point, then we use PUNPCKLBW and PUNPCKHBW to produce:
|
||||
// xx = [b1, b1, b2, b2, ..., b8, b8]
|
||||
// vecTmp = [b9, b9, b10, b10, ..., b16, b16]
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePunpcklbw, newOperandReg(xx), xx))
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePunpckhbw, newOperandReg(vecTmp), vecTmp))
|
||||
|
||||
// Add 8 to the shift amount, and then move the amount to vecTmp2.
|
||||
vecTmp2 := m.c.AllocateVReg(ssa.TypeV128)
|
||||
m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(8), shiftAmtReg, false))
|
||||
m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(shiftAmtReg), vecTmp2, false))
|
||||
|
||||
// Perform the word packed arithmetic right shifts on vreg and vecTmp.
|
||||
// This changes these two registers as:
|
||||
// xx = [xxx, b1 >> s, xxx, b2 >> s, ..., xxx, b8 >> s]
|
||||
// vecTmp = [xxx, b9 >> s, xxx, b10 >> s, ..., xxx, b16 >> s]
|
||||
// where xxx is 1 or 0 depending on each byte's sign, and ">>" is the arithmetic shift on a byte.
|
||||
m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsraw, newOperandReg(vecTmp2), xx))
|
||||
m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsraw, newOperandReg(vecTmp2), vecTmp))
|
||||
|
||||
// Finally, we can get the result by packing these two word vectors.
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePacksswb, newOperandReg(vecTmp), xx))
|
||||
|
||||
m.copyTo(xx, m.c.VRegOf(ret))
|
||||
}
|
||||
|
||||
func (m *machine) lowerVSshri64x2(x, y, ret ssa.Value) {
|
||||
// Load the shift amount to RCX.
|
||||
shiftAmt := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
|
||||
m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, shiftAmt, rcxVReg))
|
||||
|
||||
tmpGp := m.c.AllocateVReg(ssa.TypeI64)
|
||||
|
||||
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
|
||||
xxReg := m.copyToTmp(_xx.reg())
|
||||
|
||||
m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp))
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, 0, newOperandReg(xxReg), tmpGp))
|
||||
m.insert(m.allocateInstr().asShiftR(shiftROpShiftRightArithmetic, newOperandReg(rcxVReg), tmpGp, true))
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, newOperandReg(tmpGp), xxReg))
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, 1, newOperandReg(xxReg), tmpGp))
|
||||
m.insert(m.allocateInstr().asShiftR(shiftROpShiftRightArithmetic, newOperandReg(rcxVReg), tmpGp, true))
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, newOperandReg(tmpGp), xxReg))
|
||||
|
||||
m.copyTo(xxReg, m.c.VRegOf(ret))
|
||||
}
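
// Note on the lowering above: a packed 64-bit arithmetic right shift is not available in
// SSE/AVX2 (VPSRAQ only arrived with AVX-512), so each lane is extracted with PEXTRQ, shifted
// with a scalar SAR (whose variable count must live in CL, hence RCX), and written back with
// PINSRQ. A scalar model (an illustrative sketch, not used by the backend):
func sshr64x2ScalarModel(v [2]int64, amt uint64) [2]int64 {
	s := amt & 63 // SAR masks the count to 6 bits, matching Wasm's modulo semantics
	return [2]int64{v[0] >> s, v[1] >> s}
}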
|
||||
|
||||
func (m *machine) lowerShr(x, y, ret ssa.Value, lane ssa.VecLane, signed bool) {
|
||||
var modulo uint64
|
||||
var shiftOp sseOpcode
|
||||
switch lane {
|
||||
case ssa.VecLaneI16x8:
|
||||
modulo = 0xf
|
||||
if signed {
|
||||
shiftOp = sseOpcodePsraw
|
||||
} else {
|
||||
shiftOp = sseOpcodePsrlw
|
||||
}
|
||||
case ssa.VecLaneI32x4:
|
||||
modulo = 0x1f
|
||||
if signed {
|
||||
shiftOp = sseOpcodePsrad
|
||||
} else {
|
||||
shiftOp = sseOpcodePsrld
|
||||
}
|
||||
case ssa.VecLaneI64x2:
|
||||
modulo = 0x3f
|
||||
if signed {
|
||||
panic("BUG")
|
||||
}
|
||||
shiftOp = sseOpcodePsrlq
|
||||
default:
|
||||
panic(fmt.Sprintf("invalid lane type: %s", lane))
|
||||
}
|
||||
|
||||
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
|
||||
xx := m.copyToTmp(_xx.reg())
|
||||
|
||||
tmpGpReg := m.c.AllocateVReg(ssa.TypeI32)
|
||||
// Load the modulo 8 mask to tmpReg.
|
||||
m.lowerIconst(tmpGpReg, modulo, false)
|
||||
// Take the modulo 8 of the shift amount.
|
||||
m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd,
|
||||
m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)), tmpGpReg, false))
|
||||
// And move it to a xmm register.
|
||||
tmpVec := m.c.AllocateVReg(ssa.TypeV128)
|
||||
m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), tmpVec, false))
|
||||
|
||||
// Then do the actual shift.
|
||||
m.insert(m.allocateInstr().asXmmRmiReg(shiftOp, newOperandReg(tmpVec), xx))
|
||||
|
||||
m.copyTo(xx, m.c.VRegOf(ret))
|
||||
}
|
||||
|
||||
func (m *machine) lowerVIshl(x, y, ret ssa.Value, lane ssa.VecLane) {
|
||||
var modulo uint64
|
||||
var shiftOp sseOpcode
|
||||
var isI8x16 bool
|
||||
switch lane {
|
||||
case ssa.VecLaneI8x16:
|
||||
isI8x16 = true
|
||||
modulo = 0x7
|
||||
shiftOp = sseOpcodePsllw
|
||||
case ssa.VecLaneI16x8:
|
||||
modulo = 0xf
|
||||
shiftOp = sseOpcodePsllw
|
||||
case ssa.VecLaneI32x4:
|
||||
modulo = 0x1f
|
||||
shiftOp = sseOpcodePslld
|
||||
case ssa.VecLaneI64x2:
|
||||
modulo = 0x3f
|
||||
shiftOp = sseOpcodePsllq
|
||||
default:
|
||||
panic(fmt.Sprintf("invalid lane type: %s", lane))
|
||||
}
|
||||
|
||||
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
|
||||
xx := m.copyToTmp(_xx.reg())
|
||||
|
||||
tmpGpReg := m.c.AllocateVReg(ssa.TypeI32)
|
||||
// Load the modulo 8 mask to tmpReg.
|
||||
m.lowerIconst(tmpGpReg, modulo, false)
|
||||
// Take the modulo 8 of the shift amount.
|
||||
m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd,
|
||||
m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)), tmpGpReg, false))
|
||||
// And move it to a xmm register.
|
||||
tmpVec := m.c.AllocateVReg(ssa.TypeV128)
|
||||
m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), tmpVec, false))
|
||||
|
||||
// Then do the actual shift.
|
||||
m.insert(m.allocateInstr().asXmmRmiReg(shiftOp, newOperandReg(tmpVec), xx))
|
||||
|
||||
if isI8x16 {
|
||||
maskTableLabel := m.getOrAllocateConstLabel(&m.constI8x16SHLMaskTableIndex, i8x16SHLMaskTable[:])
|
||||
base := m.c.AllocateVReg(ssa.TypeI64)
|
||||
lea := m.allocateInstr().asLEA(newOperandLabel(maskTableLabel), base)
|
||||
m.insert(lea)
|
||||
|
||||
// Shift tmpGpReg by 4 to multiply the shift amount by 16.
|
||||
m.insert(m.allocateInstr().asShiftR(shiftROpShiftLeft, newOperandImm32(4), tmpGpReg, false))
|
||||
|
||||
mem := m.newAmodeRegRegShift(0, base, tmpGpReg, 0)
|
||||
loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(mem), tmpVec)
|
||||
m.insert(loadMask)
|
||||
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePand, newOperandReg(tmpVec), xx))
|
||||
}
|
||||
|
||||
m.copyTo(xx, m.c.VRegOf(ret))
|
||||
}
|
||||
|
||||
// i8x16SHLMaskTable is necessary for emulating non-existent packed bytes left shifts on amd64.
|
||||
// The mask is applied after performing packed word shifts on the value to clear out the unnecessary bits.
|
||||
var i8x16SHLMaskTable = [8 * 16]byte{ // (the number of possible shift amount 0, 1, ..., 7.) * 16 bytes.
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift
|
||||
0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, // for 1 shift
|
||||
0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, // for 2 shift
|
||||
0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, // for 3 shift
|
||||
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // for 4 shift
|
||||
0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, // for 5 shift
|
||||
0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, // for 6 shift
|
||||
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, // for 7 shift
|
||||
}
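
// Worked example (illustrative): to shift bytes left by 3, lowerVIshl above performs PSLLW
// by 3, so the bottom 3 bits of every high byte receive bits from its neighboring low byte.
// Row 3 of this table (0xf8 repeated) is then loaded via base + shiftAmt*16 and ANDed in to
// clear them: the word 0xabcd becomes 0x5e68 after PSLLW, and masking each byte with 0xf8
// yields 0x5868, i.e. (0xab<<3)&0xff = 0x58 and (0xcd<<3)&0xff = 0x68.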
|
||||
|
||||
func (m *machine) lowerVRound(x, ret ssa.Value, imm byte, _64 bool) {
|
||||
xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
|
||||
var round sseOpcode
|
||||
if _64 {
|
||||
round = sseOpcodeRoundpd
|
||||
} else {
|
||||
round = sseOpcodeRoundps
|
||||
}
|
||||
m.insert(m.allocateInstr().asXmmUnaryRmRImm(round, imm, xx, m.c.VRegOf(ret)))
|
||||
}
|
||||
|
||||
var (
|
||||
allOnesI8x16 = [16]byte{0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1}
|
||||
allOnesI16x8 = [16]byte{0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0}
|
||||
extAddPairwiseI16x8uMask1 = [16]byte{0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80}
|
||||
extAddPairwiseI16x8uMask2 = [16]byte{0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00}
|
||||
)
|
||||
|
||||
func (m *machine) lowerExtIaddPairwise(x, ret ssa.Value, srcLane ssa.VecLane, signed bool) {
|
||||
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
|
||||
xx := m.copyToTmp(_xx.reg())
|
||||
switch srcLane {
|
||||
case ssa.VecLaneI8x16:
|
||||
allOneReg := m.c.AllocateVReg(ssa.TypeV128)
|
||||
mask := m.getOrAllocateConstLabel(&m.constAllOnesI8x16Index, allOnesI8x16[:])
|
||||
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), allOneReg))
|
||||
|
||||
var resultReg regalloc.VReg
|
||||
if signed {
|
||||
resultReg = allOneReg
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddubsw, newOperandReg(xx), resultReg))
|
||||
} else {
|
||||
// Interpret tmp (all ones) as signed bytes, meaning that all the multiply-adds are effectively unsigned.
|
||||
resultReg = xx
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddubsw, newOperandReg(allOneReg), resultReg))
|
||||
}
|
||||
m.copyTo(resultReg, m.c.VRegOf(ret))
|
||||
|
||||
case ssa.VecLaneI16x8:
|
||||
if signed {
|
||||
allOnesReg := m.c.AllocateVReg(ssa.TypeV128)
|
||||
mask := m.getOrAllocateConstLabel(&m.constAllOnesI16x8Index, allOnesI16x8[:])
|
||||
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), allOnesReg))
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, newOperandReg(allOnesReg), xx))
|
||||
m.copyTo(xx, m.c.VRegOf(ret))
|
||||
} else {
|
||||
maskReg := m.c.AllocateVReg(ssa.TypeV128)
|
||||
mask := m.getOrAllocateConstLabel(&m.constExtAddPairwiseI16x8uMask1Index, extAddPairwiseI16x8uMask1[:])
|
||||
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg))
|
||||
|
||||
// Flip the sign bits on xx.
|
||||
//
|
||||
// Assuming that xx = [w1, ..., w8] (unsigned 16-bit lanes), we now have
// xx[i] = int16(wi - 0x8000) for i = 0...7, i.e. each lane is biased down by 0x8000.
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(maskReg), xx))
|
||||
|
||||
mask = m.getOrAllocateConstLabel(&m.constAllOnesI16x8Index, allOnesI16x8[:])
|
||||
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg))
|
||||
|
||||
// For i = 0,...,3 (as this results in i32x4 lanes), PMADDWD against the all-ones i16 mask gives
// xx[i] = int32((w2i - 0x8000) + (w2i+1 - 0x8000)) = int32(w2i + w2i+1) - 0x10000
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, newOperandReg(maskReg), xx))
|
||||
|
||||
mask = m.getOrAllocateConstLabel(&m.constExtAddPairwiseI16x8uMask2Index, extAddPairwiseI16x8uMask2[:])
|
||||
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg))
|
||||
|
||||
// Adding 0x10000 to each 32-bit lane restores the unsigned sum:
// xx[i] = (int32(w2i + w2i+1) - 0x10000) + 0x10000 = uint32(w2i + w2i+1).
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddd, newOperandReg(maskReg), xx))
|
||||
|
||||
m.copyTo(xx, m.c.VRegOf(ret))
|
||||
}
|
||||
default:
|
||||
panic(fmt.Sprintf("invalid lane type: %s", srcLane))
|
||||
}
|
||||
}
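
// Worked example for the unsigned i16x8 path above (illustrative): for a pair of lanes
// w0 = w1 = 0xffff, the expected extended pairwise sum is 0x1fffe. After the PXOR bias each
// lane reads as 0x7fff (= 0xffff - 0x8000) when interpreted as signed, PMADDWD gives
// 0x7fff + 0x7fff = 0xfffe, and PADDD with 0x10000 restores 0x1fffe, the correct uint32 result.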
|
||||
|
||||
func (m *machine) lowerWidenLow(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
|
||||
var sseOp sseOpcode
|
||||
switch lane {
|
||||
case ssa.VecLaneI8x16:
|
||||
if signed {
|
||||
sseOp = sseOpcodePmovsxbw
|
||||
} else {
|
||||
sseOp = sseOpcodePmovzxbw
|
||||
}
|
||||
case ssa.VecLaneI16x8:
|
||||
if signed {
|
||||
sseOp = sseOpcodePmovsxwd
|
||||
} else {
|
||||
sseOp = sseOpcodePmovzxwd
|
||||
}
|
||||
case ssa.VecLaneI32x4:
|
||||
if signed {
|
||||
sseOp = sseOpcodePmovsxdq
|
||||
} else {
|
||||
sseOp = sseOpcodePmovzxdq
|
||||
}
|
||||
default:
|
||||
panic(fmt.Sprintf("invalid lane type: %s", lane))
|
||||
}
|
||||
|
||||
xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
|
||||
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, xx, m.c.VRegOf(ret)))
|
||||
}
|
||||
|
||||
func (m *machine) lowerWidenHigh(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
|
||||
tmp := m.c.AllocateVReg(ssa.TypeV128)
|
||||
xx := m.getOperand_Reg(m.c.ValueDefinition(x))
|
||||
m.copyTo(xx.reg(), tmp)
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePalignr, 8, newOperandReg(tmp), tmp))
|
||||
|
||||
var sseOp sseOpcode
|
||||
switch lane {
|
||||
case ssa.VecLaneI8x16:
|
||||
if signed {
|
||||
sseOp = sseOpcodePmovsxbw
|
||||
} else {
|
||||
sseOp = sseOpcodePmovzxbw
|
||||
}
|
||||
case ssa.VecLaneI16x8:
|
||||
if signed {
|
||||
sseOp = sseOpcodePmovsxwd
|
||||
} else {
|
||||
sseOp = sseOpcodePmovzxwd
|
||||
}
|
||||
case ssa.VecLaneI32x4:
|
||||
if signed {
|
||||
sseOp = sseOpcodePmovsxdq
|
||||
} else {
|
||||
sseOp = sseOpcodePmovzxdq
|
||||
}
|
||||
default:
|
||||
panic(fmt.Sprintf("invalid lane type: %s", lane))
|
||||
}
|
||||
|
||||
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, newOperandReg(tmp), m.c.VRegOf(ret)))
|
||||
}
|
||||
|
||||
func (m *machine) lowerLoadSplat(ptr ssa.Value, offset uint32, ret ssa.Value, lane ssa.VecLane) {
|
||||
tmpDst, tmpGp := m.c.AllocateVReg(ssa.TypeV128), m.c.AllocateVReg(ssa.TypeI64)
|
||||
am := newOperandMem(m.lowerToAddressMode(ptr, offset))
|
||||
|
||||
m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst))
|
||||
switch lane {
|
||||
case ssa.VecLaneI8x16:
|
||||
m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, am, tmpGp))
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, 0, newOperandReg(tmpGp), tmpDst))
|
||||
tmpZeroVec := m.c.AllocateVReg(ssa.TypeV128)
|
||||
m.insert(m.allocateInstr().asZeros(tmpZeroVec))
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmpZeroVec), tmpDst))
|
||||
case ssa.VecLaneI16x8:
|
||||
m.insert(m.allocateInstr().asMovzxRmR(extModeWQ, am, tmpGp))
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 0, newOperandReg(tmpGp), tmpDst))
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 1, newOperandReg(tmpGp), tmpDst))
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst))
|
||||
case ssa.VecLaneI32x4:
|
||||
m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, am, tmpGp))
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, 0, newOperandReg(tmpGp), tmpDst))
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst))
|
||||
case ssa.VecLaneI64x2:
|
||||
m.insert(m.allocateInstr().asMov64MR(am, tmpGp))
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, newOperandReg(tmpGp), tmpDst))
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, newOperandReg(tmpGp), tmpDst))
|
||||
default:
|
||||
panic(fmt.Sprintf("invalid lane type: %s", lane))
|
||||
}
|
||||
|
||||
m.copyTo(tmpDst, m.c.VRegOf(ret))
|
||||
}
|
||||
|
||||
var f64x2CvtFromIMask = [16]byte{
|
||||
0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
}
|
||||
|
||||
func (m *machine) lowerVFcvtFromInt(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
|
||||
switch lane {
|
||||
case ssa.VecLaneF32x4:
|
||||
if signed {
|
||||
xx := m.getOperand_Reg(m.c.ValueDefinition(x))
|
||||
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, xx, m.c.VRegOf(ret)))
|
||||
} else {
|
||||
xx := m.getOperand_Reg(m.c.ValueDefinition(x))
|
||||
// Copy the value to two temporary registers.
|
||||
tmp := m.copyToTmp(xx.reg())
|
||||
tmp2 := m.copyToTmp(xx.reg())
|
||||
|
||||
// Clear the higher 10 bits of each 32-bit element, keeping the lower 22 bits in tmp.
|
||||
m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePslld, newOperandImm32(0xa), tmp))
|
||||
m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(0xa), tmp))
|
||||
|
||||
// Subtract tmp (the lower 22 bits) from tmp2, leaving only the higher 10 bits in tmp2.
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePsubd, newOperandReg(tmp), tmp2))
|
||||
|
||||
// Convert the lower 22 bits in tmp; this conversion is exact.
|
||||
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp), tmp))
|
||||
|
||||
// Logical right shift by one and convert tmp2, so that tmp2 holds the halved conversion result of the higher bits.
|
||||
m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(1), tmp2))
|
||||
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp2), tmp2))
|
||||
|
||||
// Double the converted halved higher bits.
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddps, newOperandReg(tmp2), tmp2))
|
||||
|
||||
// Get the conversion result by adding tmp (holding the lower-bits conversion) into tmp2.
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddps, newOperandReg(tmp), tmp2))
|
||||
|
||||
m.copyTo(tmp2, m.c.VRegOf(ret))
|
||||
}
|
||||
case ssa.VecLaneF64x2:
|
||||
if signed {
|
||||
xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
|
||||
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2pd, xx, m.c.VRegOf(ret)))
|
||||
} else {
|
||||
maskReg := m.c.AllocateVReg(ssa.TypeV128)
|
||||
maskLabel := m.getOrAllocateConstLabel(&m.constF64x2CvtFromIMaskIndex, f64x2CvtFromIMask[:])
|
||||
// maskReg = [0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]
|
||||
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), maskReg))
|
||||
|
||||
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
|
||||
xx := m.copyToTmp(_xx.reg())
|
||||
|
||||
// Given that we have xx = [d1, d2, d3, d4], this results in
|
||||
// xx = [d1, [0x00, 0x00, 0x30, 0x43], d2, [0x00, 0x00, 0x30, 0x43]]
|
||||
// = [float64(uint32(d1)) + 0x1.0p52, float64(uint32(d2)) + 0x1.0p52]
|
||||
// ^See https://stackoverflow.com/questions/13269523/can-all-32-bit-ints-be-exactly-represented-as-a-double
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeUnpcklps, newOperandReg(maskReg), xx))
|
||||
|
||||
// maskReg = [float64(0x1.0p52), float64(0x1.0p52)]
|
||||
maskLabel = m.getOrAllocateConstLabel(&m.constTwop52Index, twop52[:])
|
||||
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), maskReg))
|
||||
|
||||
// Now, we get the result as
|
||||
// xx = [float64(uint32(d1)), float64(uint32(d2))]
|
||||
// because the following equality always satisfies:
|
||||
// float64(0x1.0p52 + float64(uint32(x))) - float64(0x1.0p52 + float64(uint32(y))) = float64(uint32(x)) - float64(uint32(y))
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeSubpd, newOperandReg(maskReg), xx))
|
||||
|
||||
m.copyTo(xx, m.c.VRegOf(ret))
|
||||
}
|
||||
default:
|
||||
panic(fmt.Sprintf("invalid lane type: %s", lane))
|
||||
}
|
||||
}
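
// The f64x2 unsigned path above uses the usual 0x1.0p52 trick: placing a uint32 value in the
// low mantissa bits of a float64 whose exponent encodes 2^52 yields exactly 2^52 + float64(u),
// so subtracting 2^52 recovers float64(u) with no rounding. A scalar model of one lane
// (an illustrative sketch, not used by the backend):
//
//	bits := uint64(0x4330000000000000) | uint64(u) // 0x1.0p52 with u in the low 32 bits
//	f := math.Float64frombits(bits) - 0x1.0p52     // == float64(u), exactly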
|
||||
|
||||
var (
|
||||
// i32sMaxOnF64x2 holds math.MaxInt32(=2147483647.0) on two f64 lanes.
|
||||
i32sMaxOnF64x2 = [16]byte{
|
||||
0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0)
|
||||
0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0)
|
||||
}
|
||||
|
||||
// i32uMaxOnF64x2 holds math.MaxUint32(=4294967295.0) on two f64 lanes.
|
||||
i32uMaxOnF64x2 = [16]byte{
|
||||
0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0)
|
||||
0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0)
|
||||
}
|
||||
|
||||
// twop52 holds two float64(0x1.0p52) values on two f64 lanes. 0x1.0p52 is special in that,
// with this exponent, the low bits of the mantissa directly encode a uint32 value, so after
// arithmetic such as addition or subtraction the resulting float still holds exactly the
// bit representation of the corresponding 32-bit integer in its mantissa.
//
// Note: the name twop52 is common across various compiler ecosystems.
|
||||
// E.g. https://github.com/llvm/llvm-project/blob/92ab024f81e5b64e258b7c3baaf213c7c26fcf40/compiler-rt/lib/builtins/floatdidf.c#L28
|
||||
// E.g. https://opensource.apple.com/source/clang/clang-425.0.24/src/projects/compiler-rt/lib/floatdidf.c.auto.html
|
||||
twop52 = [16]byte{
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52)
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52)
|
||||
}
|
||||
)
|
||||
|
||||
func (m *machine) lowerVFcvtToIntSat(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
|
||||
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
|
||||
xx := m.copyToTmp(_xx.reg())
|
||||
|
||||
switch lane {
|
||||
case ssa.VecLaneF32x4:
|
||||
if signed {
|
||||
tmp := m.copyToTmp(xx)
|
||||
|
||||
// Assuming we have xx = [v1, v2, v3, v4].
|
||||
//
|
||||
// Set all bits if lane is not NaN on tmp.
|
||||
// tmp[i] = 0xffffffff if vi != NaN
|
||||
// = 0 if vi == NaN
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmpps, uint8(cmpPredEQ_OQ), newOperandReg(tmp), tmp))
|
||||
|
||||
// Clear NaN lanes on xx, meaning that
|
||||
// xx[i] = vi if vi != NaN
|
||||
// 0 if vi == NaN
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(tmp), xx))
|
||||
|
||||
// tmp[i] = ^vi if vi != NaN
|
||||
// = 0xffffffff if vi == NaN
|
||||
// which means that tmp[i] & 0x80000000 != 0 if and only if vi is negative.
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeXorps, newOperandReg(xx), tmp))
|
||||
|
||||
// xx[i] = int32(vi) if vi != NaN and xx is not overflowing.
|
||||
// = 0x80000000 if vi != NaN and xx is overflowing (See https://www.felixcloutier.com/x86/cvttps2dq)
|
||||
// = 0 if vi == NaN
|
||||
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(xx), xx))
|
||||
|
||||
// Below, we have to convert 0x80000000 into 0x7FFFFFFF for positive overflowing lane.
|
||||
//
|
||||
// tmp[i] = 0x80000000 if vi is positive
|
||||
// = some value whose sign bit is clear (any & 0x80000000 == 0) if vi is negative or zero.
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(xx), tmp))
|
||||
|
||||
// Arithmetic right shifting tmp by 31, meaning that we have
|
||||
// tmp[i] = 0xffffffff if vi is positive, 0 otherwise.
|
||||
m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrad, newOperandImm32(0x1f), tmp))
|
||||
|
||||
// Flipping 0x80000000 if vi is positive, otherwise keep intact.
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), xx))
|
||||
} else {
|
||||
tmp := m.c.AllocateVReg(ssa.TypeV128)
|
||||
m.insert(m.allocateInstr().asZeros(tmp))
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMaxps, newOperandReg(tmp), xx))
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmp), tmp))
|
||||
m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(0x1), tmp))
|
||||
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp), tmp))
|
||||
tmp2 := m.copyToTmp(xx)
|
||||
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(xx), xx))
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeSubps, newOperandReg(tmp), tmp2))
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmpps, uint8(cmpPredLE_OS), newOperandReg(tmp2), tmp))
|
||||
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(tmp2), tmp2))
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp2))
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp))
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaxsd, newOperandReg(tmp), tmp2))
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddd, newOperandReg(tmp2), xx))
|
||||
}
|
||||
|
||||
case ssa.VecLaneF64x2:
|
||||
tmp2 := m.c.AllocateVReg(ssa.TypeV128)
|
||||
if signed {
|
||||
tmp := m.copyToTmp(xx)
|
||||
|
||||
// Set all bits for non-NaN lanes, zeros otherwise.
|
||||
// I.e. tmp[i] = 0xffffffff_ffffffff if vi != NaN, 0 otherwise.
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmppd, uint8(cmpPredEQ_OQ), newOperandReg(tmp), tmp))
|
||||
|
||||
maskLabel := m.getOrAllocateConstLabel(&m.constI32sMaxOnF64x2Index, i32sMaxOnF64x2[:])
|
||||
// Load the 2147483647 into tmp2's each lane.
|
||||
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), tmp2))
|
||||
|
||||
// tmp[i] = 2147483647 if vi != NaN, 0 otherwise.
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(tmp2), tmp))
|
||||
|
||||
// MINPD returns the source register's value as-is, so we have
|
||||
// xx[i] = vi if vi != NaN
|
||||
// = 0 if vi == NaN
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMinpd, newOperandReg(tmp), xx))
|
||||
|
||||
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttpd2dq, newOperandReg(xx), xx))
|
||||
} else {
|
||||
tmp := m.c.AllocateVReg(ssa.TypeV128)
|
||||
m.insert(m.allocateInstr().asZeros(tmp))
|
||||
|
||||
// xx[i] = vi if vi != NaN && vi > 0
|
||||
// = 0 if vi == NaN || vi <= 0
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMaxpd, newOperandReg(tmp), xx))
|
||||
|
||||
// tmp2[i] = float64(math.MaxUint32) = 4294967295.0 (exactly representable in float64)
|
||||
maskIndex := m.getOrAllocateConstLabel(&m.constI32uMaxOnF64x2Index, i32uMaxOnF64x2[:])
|
||||
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskIndex)), tmp2))
|
||||
|
||||
// xx[i] = vi if vi != NaN && vi > 0 && vi <= math.MaxUint32
|
||||
// = 0 otherwise
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMinpd, newOperandReg(tmp2), xx))
|
||||
|
||||
// Round the floating points into integer.
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeRoundpd, 0x3, newOperandReg(xx), xx))
|
||||
|
||||
// tmp2[i] = float64(0x1.0p52)
|
||||
maskIndex = m.getOrAllocateConstLabel(&m.constTwop52Index, twop52[:])
|
||||
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskIndex)), tmp2))
|
||||
|
||||
// xx[i] = float64(0x1.0p52) + float64(uint32(vi)) if vi != NaN && vi > 0 && vi <= math.MaxUint32
|
||||
// = 0 otherwise
|
||||
//
|
||||
// This means that xx[i] holds exactly the same bit of uint32(vi) in its lower 32-bits.
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddpd, newOperandReg(tmp2), xx))
|
||||
|
||||
// At this point, we have
|
||||
// xx = [uint32(v0), float64(0x1.0p52), uint32(v1), float64(0x1.0p52)]
|
||||
// tmp = [0, 0, 0, 0]
|
||||
// as 32x4 lanes. Therefore, SHUFPS with 0b00_00_10_00 results in
|
||||
// xx = [xx[00], xx[10], tmp[00], tmp[00]] = [xx[00], xx[10], 0, 0]
|
||||
// meaning that for i = 0 and 1, we have
|
||||
// xx[i] = uint32(vi) if vi != NaN && vi > 0 && vi <= math.MaxUint32
|
||||
// = 0 otherwise.
|
||||
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeShufps, 0b00_00_10_00, newOperandReg(tmp), xx))
|
||||
}
|
||||
default:
|
||||
panic(fmt.Sprintf("invalid lane type: %s", lane))
|
||||
}
|
||||
|
||||
m.copyTo(xx, m.c.VRegOf(ret))
|
||||
}
|
||||
|
||||
func (m *machine) lowerNarrow(x, y, ret ssa.Value, lane ssa.VecLane, signed bool) {
|
||||
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
|
||||
xx := m.copyToTmp(_xx.reg())
|
||||
yy := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
|
||||
|
||||
var sseOp sseOpcode
|
||||
switch lane {
|
||||
case ssa.VecLaneI16x8:
|
||||
if signed {
|
||||
sseOp = sseOpcodePacksswb
|
||||
} else {
|
||||
sseOp = sseOpcodePackuswb
|
||||
}
|
||||
case ssa.VecLaneI32x4:
|
||||
if signed {
|
||||
sseOp = sseOpcodePackssdw
|
||||
} else {
|
||||
sseOp = sseOpcodePackusdw
|
||||
}
|
||||
default:
|
||||
panic(fmt.Sprintf("invalid lane type: %s", lane))
|
||||
}
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOp, yy, xx))
|
||||
m.copyTo(xx, m.c.VRegOf(ret))
|
||||
}
|
||||
|
||||
func (m *machine) lowerWideningPairwiseDotProductS(x, y, ret ssa.Value) {
|
||||
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
|
||||
xx := m.copyToTmp(_xx.reg())
|
||||
yy := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, yy, xx))
|
||||
m.copyTo(xx, m.c.VRegOf(ret))
|
||||
}
|
||||
|
||||
func (m *machine) lowerVIabs(instr *ssa.Instruction) {
|
||||
x, lane := instr.ArgWithLane()
|
||||
rd := m.c.VRegOf(instr.Return())
|
||||
|
||||
if lane == ssa.VecLaneI64x2 {
|
||||
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
|
||||
|
||||
blendReg := xmm0VReg
|
||||
m.insert(m.allocateInstr().asDefineUninitializedReg(blendReg))
|
||||
|
||||
tmp := m.copyToTmp(_xx.reg())
|
||||
xx := m.copyToTmp(_xx.reg())
|
||||
|
||||
// Clear all bits on blendReg.
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(blendReg), blendReg))
|
||||
// Subtract xx from blendMaskReg.
|
||||
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePsubq, newOperandReg(xx), blendReg))
|
||||
// Copy the subtracted value ^^ back into tmp.
|
||||
m.copyTo(blendReg, xx)
|
||||
|
||||
m.insert(m.allocateInstr().asBlendvpd(newOperandReg(tmp), xx))
|
||||
|
||||
m.copyTo(xx, rd)
|
||||
} else {
|
||||
var vecOp sseOpcode
|
||||
switch lane {
|
||||
case ssa.VecLaneI8x16:
|
||||
vecOp = sseOpcodePabsb
|
||||
case ssa.VecLaneI16x8:
|
||||
vecOp = sseOpcodePabsw
|
||||
case ssa.VecLaneI32x4:
|
||||
vecOp = sseOpcodePabsd
|
||||
}
|
||||
rn := m.getOperand_Reg(m.c.ValueDefinition(x))
|
||||
|
||||
i := m.allocateInstr()
|
||||
i.asXmmUnaryRmR(vecOp, rn, rd)
|
||||
m.insert(i)
|
||||
}
|
||||
}
|
||||
|
||||
func (m *machine) lowerVIpopcnt(instr *ssa.Instruction) {
|
||||
x := instr.Arg()
|
||||
rn := m.getOperand_Reg(m.c.ValueDefinition(x))
|
||||
rd := m.c.VRegOf(instr.Return())
|
||||
|
||||
tmp1 := m.c.AllocateVReg(ssa.TypeV128)
|
||||
m.lowerVconst(tmp1, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f)
|
||||
|
||||
// Copy input into tmp2.
|
||||
tmp2 := m.copyToTmp(rn.reg())
|
||||
|
||||
// Given that we have:
|
||||
// rm = [b1, ..., b16] where bn = hn:ln and hn and ln are higher and lower 4-bits of bn.
|
||||
//
|
||||
// Take PAND on tmp1 and tmp2, so that we mask out all the higher bits.
|
||||
// tmp2 = [l1, ..., l16].
|
||||
pand := m.allocateInstr()
|
||||
pand.asXmmRmR(sseOpcodePand, newOperandReg(tmp1), tmp2)
|
||||
m.insert(pand)
|
||||
|
||||
// Do logical (packed word) right shift by 4 on rm and PAND against the mask (tmp1); meaning that we have
|
||||
// tmp3 = [h1, ...., h16].
|
||||
tmp3 := m.copyToTmp(rn.reg())
|
||||
psrlw := m.allocateInstr()
|
||||
psrlw.asXmmRmiReg(sseOpcodePsrlw, newOperandImm32(4), tmp3)
|
||||
m.insert(psrlw)
|
||||
|
||||
pand2 := m.allocateInstr()
|
||||
pand2.asXmmRmR(sseOpcodePand, newOperandReg(tmp1), tmp3)
|
||||
m.insert(pand2)
|
||||
|
||||
// Read the popcntTable into tmp4, and we have
|
||||
// tmp4 = [0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04]
|
||||
tmp4 := m.c.AllocateVReg(ssa.TypeV128)
|
||||
m.lowerVconst(tmp4, 0x03_02_02_01_02_01_01_00, 0x04_03_03_02_03_02_02_01)
|
||||
|
||||
// Make a copy for later.
|
||||
tmp5 := m.copyToTmp(tmp4)
|
||||
|
||||
// tmp4 = [popcnt(l1), ..., popcnt(l16)].
|
||||
pshufb := m.allocateInstr()
|
||||
pshufb.asXmmRmR(sseOpcodePshufb, newOperandReg(tmp2), tmp4)
|
||||
m.insert(pshufb)
|
||||
|
||||
pshufb2 := m.allocateInstr()
|
||||
pshufb2.asXmmRmR(sseOpcodePshufb, newOperandReg(tmp3), tmp5)
|
||||
m.insert(pshufb2)
|
||||
|
||||
// tmp4 + tmp5 is the result.
|
||||
paddb := m.allocateInstr()
|
||||
paddb.asXmmRmR(sseOpcodePaddb, newOperandReg(tmp4), tmp5)
|
||||
m.insert(paddb)
|
||||
|
||||
m.copyTo(tmp5, rd)
|
||||
}
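
// Worked example (illustrative): for the input byte 0xb5 = 0b1011_0101, the low nibble is 0x5
// and the high nibble is 0xb, so the two PSHUFB lookups into the table
// [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] return 2 and 3 respectively, and PADDB yields
// popcount(0xb5) = 5. A scalar model of one byte (a sketch, not used by the backend):
func popcnt8NibbleModel(b byte) byte {
	lut := [16]byte{0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}
	return lut[b&0x0f] + lut[b>>4]
}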
|
||||
|
||||
func (m *machine) lowerVImul(instr *ssa.Instruction) {
|
||||
x, y, lane := instr.Arg2WithLane()
|
||||
rd := m.c.VRegOf(instr.Return())
|
||||
if lane == ssa.VecLaneI64x2 {
|
||||
rn := m.getOperand_Reg(m.c.ValueDefinition(x))
|
||||
rm := m.getOperand_Reg(m.c.ValueDefinition(y))
|
||||
// Assuming that we have
|
||||
// rm = [p1, p2] = [p1_lo, p1_high, p2_lo, p2_high]
// rn = [q1, q2] = [q1_lo, q1_high, q2_lo, q2_high]
// where pN and qN are 64-bit (quad word) lanes, and pN_lo, pN_high, qN_lo and qN_high are 32-bit (double word) lanes.
|
||||
|
||||
// Copy rn into tmp1.
|
||||
tmp1 := m.copyToTmp(rn.reg())
|
||||
|
||||
// And do the logical right shift by 32-bit on tmp1, which makes tmp1 = [0, p1_high, 0, p2_high]
|
||||
shift := m.allocateInstr()
|
||||
shift.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(32), tmp1)
|
||||
m.insert(shift)
|
||||
|
||||
// Execute "pmuludq rm,tmp1", which makes tmp1 = [p1_high*q1_lo, p2_high*q2_lo] where each lane is 64-bit.
|
||||
mul := m.allocateInstr()
|
||||
mul.asXmmRmR(sseOpcodePmuludq, rm, tmp1)
|
||||
m.insert(mul)
|
||||
|
||||
// Copy rm value into tmp2.
|
||||
tmp2 := m.copyToTmp(rm.reg())
|
||||
|
||||
// And do the logical right shift by 32-bit on tmp2, which makes tmp2 = [0, q1_high, 0, q2_high]
|
||||
shift2 := m.allocateInstr()
|
||||
shift2.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(32), tmp2)
|
||||
m.insert(shift2)
|
||||
|
||||
// Execute "pmuludq rm,tmp2", which makes tmp2 = [p1_lo*q1_high, p2_lo*q2_high] where each lane is 64-bit.
|
||||
mul2 := m.allocateInstr()
|
||||
mul2.asXmmRmR(sseOpcodePmuludq, rn, tmp2)
|
||||
m.insert(mul2)
|
||||
|
||||
// Adds tmp1 and tmp2 and do the logical left shift by 32-bit,
|
||||
// which makes tmp1 = [(p1_lo*q1_high+p1_high*q1_lo)<<32, (p2_lo*q2_high+p2_high*q2_lo)<<32]
|
||||
add := m.allocateInstr()
|
||||
add.asXmmRmR(sseOpcodePaddq, newOperandReg(tmp2), tmp1)
|
||||
m.insert(add)
|
||||
|
||||
shift3 := m.allocateInstr()
|
||||
shift3.asXmmRmiReg(sseOpcodePsllq, newOperandImm32(32), tmp1)
|
||||
m.insert(shift3)
|
||||
|
||||
// Copy rm value into tmp3.
|
||||
tmp3 := m.copyToTmp(rm.reg())
|
||||
|
||||
// "pmuludq rm,tmp3" makes tmp3 = [p1_lo*q1_lo, p2_lo*q2_lo] where each lane is 64-bit.
|
||||
mul3 := m.allocateInstr()
|
||||
mul3.asXmmRmR(sseOpcodePmuludq, rn, tmp3)
|
||||
m.insert(mul3)
|
||||
|
||||
// Finally, we get the result by computing tmp1 + tmp3,
|
||||
// which makes tmp1 = [(p1_lo*q1_high+p1_high*q1_lo)<<32+p1_lo*q1_lo, (p2_lo*q2_high+p2_high*q2_lo)<<32+p2_lo*q2_lo]
|
||||
add2 := m.allocateInstr()
|
||||
add2.asXmmRmR(sseOpcodePaddq, newOperandReg(tmp3), tmp1)
|
||||
m.insert(add2)
|
||||
|
||||
m.copyTo(tmp1, rd)
|
||||
|
||||
} else {
|
||||
var vecOp sseOpcode
|
||||
switch lane {
|
||||
case ssa.VecLaneI16x8:
|
||||
vecOp = sseOpcodePmullw
|
||||
case ssa.VecLaneI32x4:
|
||||
vecOp = sseOpcodePmulld
|
||||
default:
|
||||
panic("unsupported: " + lane.String())
|
||||
}
|
||||
m.lowerVbBinOp(vecOp, x, y, instr.Return())
|
||||
}
|
||||
}
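
// The i64x2 path above is the standard 32x32 decomposition: with p = p_lo + 2^32*p_high and
// q = q_lo + 2^32*q_high, the low 64 bits of p*q are
//
//	p*q mod 2^64 = p_lo*q_lo + 2^32*(p_lo*q_high + p_high*q_lo)  (mod 2^64)
//
// since the 2^64*p_high*q_high term vanishes, and PMULUDQ supplies the 32x32->64 partial
// products. A scalar model (an illustrative sketch, not used by the backend):
func mul64ScalarModel(p, q uint64) uint64 {
	pLo, pHi := p&0xffffffff, p>>32
	qLo, qHi := q&0xffffffff, q>>32
	return pLo*qLo + ((pLo*qHi + pHi*qLo) << 32) // all arithmetic is naturally mod 2^64
}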
|
||||
346
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/operands.go
generated
vendored
Normal file
@@ -0,0 +1,346 @@
|
|||
package amd64
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"unsafe"
|
||||
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
|
||||
)
|
||||
|
||||
type operand struct {
|
||||
kind operandKind
|
||||
data uint64
|
||||
}
|
||||
|
||||
type operandKind byte
|
||||
|
||||
const (
|
||||
// operandKindReg is an operand which is an integer Register.
|
||||
operandKindReg operandKind = iota + 1
|
||||
|
||||
// operandKindMem is a value in Memory.
|
||||
// 32, 64, or 128 bit value.
|
||||
operandKindMem
|
||||
|
||||
// operandKindImm32 is a signed-32-bit integer immediate value.
|
||||
operandKindImm32
|
||||
|
||||
// operandKindLabel is a label.
|
||||
operandKindLabel
|
||||
)
|
||||
|
||||
// String implements fmt.Stringer.
|
||||
func (o operandKind) String() string {
|
||||
switch o {
|
||||
case operandKindReg:
|
||||
return "reg"
|
||||
case operandKindMem:
|
||||
return "mem"
|
||||
case operandKindImm32:
|
||||
return "imm32"
|
||||
case operandKindLabel:
|
||||
return "label"
|
||||
default:
|
||||
panic("BUG: invalid operand kind")
|
||||
}
|
||||
}
|
||||
|
||||
// format returns the string representation of the operand.
|
||||
// _64 is only for the case where the operand is a register, and it's integer.
|
||||
func (o *operand) format(_64 bool) string {
|
||||
switch o.kind {
|
||||
case operandKindReg:
|
||||
return formatVRegSized(o.reg(), _64)
|
||||
case operandKindMem:
|
||||
return o.addressMode().String()
|
||||
case operandKindImm32:
|
||||
return fmt.Sprintf("$%d", int32(o.imm32()))
|
||||
case operandKindLabel:
|
||||
return backend.Label(o.imm32()).String()
|
||||
default:
|
||||
panic(fmt.Sprintf("BUG: invalid operand: %s", o.kind))
|
||||
}
|
||||
}
|
||||
|
||||
//go:inline
|
||||
func (o *operand) reg() regalloc.VReg {
|
||||
return regalloc.VReg(o.data)
|
||||
}
|
||||
|
||||
//go:inline
|
||||
func (o *operand) setReg(r regalloc.VReg) {
|
||||
o.data = uint64(r)
|
||||
}
|
||||
|
||||
//go:inline
|
||||
func (o *operand) addressMode() *amode {
|
||||
return wazevoapi.PtrFromUintptr[amode](uintptr(o.data))
|
||||
}
|
||||
|
||||
//go:inline
|
||||
func (o *operand) imm32() uint32 {
|
||||
return uint32(o.data)
|
||||
}
|
||||
|
||||
func (o *operand) label() backend.Label {
|
||||
switch o.kind {
|
||||
case operandKindLabel:
|
||||
return backend.Label(o.data)
|
||||
case operandKindMem:
|
||||
mem := o.addressMode()
|
||||
if mem.kind() != amodeRipRel {
|
||||
panic("BUG: invalid label")
|
||||
}
|
||||
return backend.Label(mem.imm32)
|
||||
default:
|
||||
panic("BUG: invalid operand kind")
|
||||
}
|
||||
}
|
||||
|
||||
func newOperandLabel(label backend.Label) operand {
|
||||
return operand{kind: operandKindLabel, data: uint64(label)}
|
||||
}
|
||||
|
||||
func newOperandReg(r regalloc.VReg) operand {
|
||||
return operand{kind: operandKindReg, data: uint64(r)}
|
||||
}
|
||||
|
||||
func newOperandImm32(imm32 uint32) operand {
|
||||
return operand{kind: operandKindImm32, data: uint64(imm32)}
|
||||
}
|
||||
|
||||
func newOperandMem(amode *amode) operand {
|
||||
return operand{kind: operandKindMem, data: uint64(uintptr(unsafe.Pointer(amode)))}
|
||||
}
|
||||
|
||||
// amode is a memory operand (addressing mode).
|
||||
type amode struct {
|
||||
kindWithShift uint32
|
||||
imm32 uint32
|
||||
base regalloc.VReg
|
||||
|
||||
// For amodeRegRegShift:
|
||||
index regalloc.VReg
|
||||
}
|
||||
|
||||
type amodeKind byte
|
||||
|
||||
const (
|
||||
// amodeImmReg calculates sign-extend-32-to-64(Immediate) + base
|
||||
amodeImmReg amodeKind = iota + 1
|
||||
|
||||
// amodeImmRBP is the same as amodeImmReg, but the base register is fixed to RBP.
|
||||
// The only difference is that it doesn't report RBP as a use to the register allocator, since
// that would only distract the allocator.
|
||||
amodeImmRBP
|
||||
|
||||
// amodeRegRegShift calculates sign-extend-32-to-64(Immediate) + base + (Register2 << Shift)
|
||||
amodeRegRegShift
|
||||
|
||||
// amodeRipRel is a RIP-relative addressing mode specified by the label.
|
||||
amodeRipRel
|
||||
|
||||
// TODO: there are other addressing modes such as the one without base register.
|
||||
)
|
||||
|
||||
func (a *amode) kind() amodeKind {
|
||||
return amodeKind(a.kindWithShift & 0xff)
|
||||
}
|
||||
|
||||
func (a *amode) shift() byte {
|
||||
return byte(a.kindWithShift >> 8)
|
||||
}
|
||||
|
||||
func (a *amode) uses(rs *[]regalloc.VReg) {
|
||||
switch a.kind() {
|
||||
case amodeImmReg:
|
||||
*rs = append(*rs, a.base)
|
||||
case amodeRegRegShift:
|
||||
*rs = append(*rs, a.base, a.index)
|
||||
case amodeImmRBP, amodeRipRel:
|
||||
default:
|
||||
panic("BUG: invalid amode kind")
|
||||
}
|
||||
}
|
||||
|
||||
func (a *amode) nregs() int {
|
||||
switch a.kind() {
|
||||
case amodeImmReg:
|
||||
return 1
|
||||
case amodeRegRegShift:
|
||||
return 2
|
||||
case amodeImmRBP, amodeRipRel:
|
||||
return 0
|
||||
default:
|
||||
panic("BUG: invalid amode kind")
|
||||
}
|
||||
}
|
||||
|
||||
func (a *amode) assignUses(i int, reg regalloc.VReg) {
|
||||
switch a.kind() {
|
||||
case amodeImmReg:
|
||||
if i == 0 {
|
||||
a.base = reg
|
||||
} else {
|
||||
panic("BUG: invalid amode assignment")
|
||||
}
|
||||
case amodeRegRegShift:
|
||||
if i == 0 {
|
||||
a.base = reg
|
||||
} else if i == 1 {
|
||||
a.index = reg
|
||||
} else {
|
||||
panic("BUG: invalid amode assignment")
|
||||
}
|
||||
default:
|
||||
panic("BUG: invalid amode assignment")
|
||||
}
|
||||
}
|
||||
|
||||
func (m *machine) newAmodeImmReg(imm32 uint32, base regalloc.VReg) *amode {
|
||||
ret := m.amodePool.Allocate()
|
||||
*ret = amode{kindWithShift: uint32(amodeImmReg), imm32: imm32, base: base}
|
||||
return ret
|
||||
}
|
||||
|
||||
func (m *machine) newAmodeImmRBPReg(imm32 uint32) *amode {
|
||||
ret := m.amodePool.Allocate()
|
||||
*ret = amode{kindWithShift: uint32(amodeImmRBP), imm32: imm32, base: rbpVReg}
|
||||
return ret
|
||||
}
|
||||
|
||||
func (m *machine) newAmodeRegRegShift(imm32 uint32, base, index regalloc.VReg, shift byte) *amode {
|
||||
if shift > 3 {
|
||||
panic(fmt.Sprintf("BUG: invalid shift (must be 3>=): %d", shift))
|
||||
}
|
||||
ret := m.amodePool.Allocate()
|
||||
*ret = amode{kindWithShift: uint32(amodeRegRegShift) | uint32(shift)<<8, imm32: imm32, base: base, index: index}
|
||||
return ret
|
||||
}
|
||||
|
||||
func (m *machine) newAmodeRipRel(label backend.Label) *amode {
|
||||
ret := m.amodePool.Allocate()
|
||||
*ret = amode{kindWithShift: uint32(amodeRipRel), imm32: uint32(label)}
|
||||
return ret
|
||||
}
|
||||
|
||||
// String implements fmt.Stringer.
|
||||
func (a *amode) String() string {
|
||||
switch a.kind() {
|
||||
case amodeImmReg, amodeImmRBP:
|
||||
if a.imm32 == 0 {
|
||||
return fmt.Sprintf("(%s)", formatVRegSized(a.base, true))
|
||||
}
|
||||
return fmt.Sprintf("%d(%s)", int32(a.imm32), formatVRegSized(a.base, true))
|
||||
case amodeRegRegShift:
|
||||
shift := 1 << a.shift()
|
||||
if a.imm32 == 0 {
|
||||
return fmt.Sprintf(
|
||||
"(%s,%s,%d)",
|
||||
formatVRegSized(a.base, true), formatVRegSized(a.index, true), shift)
|
||||
}
|
||||
return fmt.Sprintf(
|
||||
"%d(%s,%s,%d)",
|
||||
int32(a.imm32), formatVRegSized(a.base, true), formatVRegSized(a.index, true), shift)
|
||||
case amodeRipRel:
|
||||
return fmt.Sprintf("%s(%%rip)", backend.Label(a.imm32))
|
||||
default:
|
||||
panic("BUG: invalid amode kind")
|
||||
}
|
||||
}
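
// For example (illustrative), an amodeRegRegShift with imm32=8, base=%rax, index=%rcx and
// shift=2 formats as "8(%rax,%rcx,4)", i.e. AT&T-style displacement(base,index,scale) with
// scale = 1 << shift.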
|
||||
|
||||
func (m *machine) getOperand_Mem_Reg(def *backend.SSAValueDefinition) (op operand) {
|
||||
if def.IsFromBlockParam() {
|
||||
return newOperandReg(def.BlkParamVReg)
|
||||
}
|
||||
|
||||
if def.SSAValue().Type() == ssa.TypeV128 {
|
||||
// SIMD instructions require strict memory alignment, so we don't support the memory operand for V128 at the moment.
|
||||
return m.getOperand_Reg(def)
|
||||
}
|
||||
|
||||
if m.c.MatchInstr(def, ssa.OpcodeLoad) {
|
||||
instr := def.Instr
|
||||
ptr, offset, _ := instr.LoadData()
|
||||
op = newOperandMem(m.lowerToAddressMode(ptr, offset))
|
||||
instr.MarkLowered()
|
||||
return op
|
||||
}
|
||||
return m.getOperand_Reg(def)
|
||||
}
|
||||
|
||||
func (m *machine) getOperand_Mem_Imm32_Reg(def *backend.SSAValueDefinition) (op operand) {
|
||||
if def.IsFromBlockParam() {
|
||||
return newOperandReg(def.BlkParamVReg)
|
||||
}
|
||||
|
||||
if m.c.MatchInstr(def, ssa.OpcodeLoad) {
|
||||
instr := def.Instr
|
||||
ptr, offset, _ := instr.LoadData()
|
||||
op = newOperandMem(m.lowerToAddressMode(ptr, offset))
|
||||
instr.MarkLowered()
|
||||
return op
|
||||
}
|
||||
return m.getOperand_Imm32_Reg(def)
|
||||
}
|
||||
|
||||
func (m *machine) getOperand_Imm32_Reg(def *backend.SSAValueDefinition) (op operand) {
|
||||
if def.IsFromBlockParam() {
|
||||
return newOperandReg(def.BlkParamVReg)
|
||||
}
|
||||
|
||||
instr := def.Instr
|
||||
if instr.Constant() {
|
||||
// If the operation is 64-bit, x64 sign-extends the 32-bit immediate value.
|
||||
// Therefore, we need to check if the immediate value is within the 32-bit range and if the sign bit is set,
|
||||
// we should not use the immediate value.
|
||||
if op, ok := asImm32Operand(instr.ConstantVal(), instr.Return().Type() == ssa.TypeI32); ok {
|
||||
instr.MarkLowered()
|
||||
return op
|
||||
}
|
||||
}
|
||||
return m.getOperand_Reg(def)
|
||||
}
|
||||
|
||||
func asImm32Operand(val uint64, allowSignExt bool) (operand, bool) {
|
||||
if imm32, ok := asImm32(val, allowSignExt); ok {
|
||||
return newOperandImm32(imm32), true
|
||||
}
|
||||
return operand{}, false
|
||||
}
|
||||
|
||||
func asImm32(val uint64, allowSignExt bool) (uint32, bool) {
|
||||
u32val := uint32(val)
|
||||
if uint64(u32val) != val {
|
||||
return 0, false
|
||||
}
|
||||
if !allowSignExt && u32val&0x80000000 != 0 {
|
||||
return 0, false
|
||||
}
|
||||
return u32val, true
|
||||
}
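
// For example (illustrative): the constant 0x80000000 used by a 64-bit instruction is rejected
// here (allowSignExt is false and bit 31 is set), because x64 would sign-extend the 32-bit
// immediate to 0xffffffff80000000; the constant is materialized into a register instead. The
// same constant on an i32 operation passes with allowSignExt=true, since the sign extension is
// then harmless.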
|
||||
|
||||
func (m *machine) getOperand_Reg(def *backend.SSAValueDefinition) (op operand) {
|
||||
var v regalloc.VReg
|
||||
if def.IsFromBlockParam() {
|
||||
v = def.BlkParamVReg
|
||||
} else {
|
||||
instr := def.Instr
|
||||
if instr.Constant() {
|
||||
// We inline all the constant instructions so that we could reduce the register usage.
|
||||
v = m.lowerConstant(instr)
|
||||
instr.MarkLowered()
|
||||
} else {
|
||||
if n := def.N; n == 0 {
|
||||
v = m.c.VRegOf(instr.Return())
|
||||
} else {
|
||||
_, rs := instr.Returns()
|
||||
v = m.c.VRegOf(rs[n-1])
|
||||
}
|
||||
}
|
||||
}
|
||||
return newOperandReg(v)
|
||||
}
|
||||
11
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect.go
generated
vendored
Normal file
@@ -0,0 +1,11 @@
|
|||
//go:build !tinygo
|
||||
|
||||
package amd64
|
||||
|
||||
import "reflect"
|
||||
|
||||
// setSliceLimits sets both Cap and Len for the given reflected slice.
|
||||
func setSliceLimits(s *reflect.SliceHeader, limit uintptr) {
|
||||
s.Len = int(limit)
|
||||
s.Cap = int(limit)
|
||||
}
|
||||
11
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect_tinygo.go
generated
vendored
Normal file
@@ -0,0 +1,11 @@
|
|||
//go:build tinygo
|
||||
|
||||
package amd64
|
||||
|
||||
import "reflect"
|
||||
|
||||
// setSliceLimits sets both Cap and Len for the given reflected slice.
|
||||
func setSliceLimits(s *reflect.SliceHeader, limit uintptr) {
|
||||
s.Len = limit
|
||||
s.Cap = limit
|
||||
}
|
||||
181
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reg.go
generated
vendored
Normal file
@@ -0,0 +1,181 @@
|
|||
package amd64
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
|
||||
)
|
||||
|
||||
// Amd64-specific registers.
|
||||
const (
|
||||
// rax is a gp register.
|
||||
rax = regalloc.RealRegInvalid + 1 + iota
|
||||
// rcx is a gp register.
|
||||
rcx
|
||||
// rdx is a gp register.
|
||||
rdx
|
||||
// rbx is a gp register.
|
||||
rbx
|
||||
// rsp is a gp register.
|
||||
rsp
|
||||
// rbp is a gp register.
|
||||
rbp
|
||||
// rsi is a gp register.
|
||||
rsi
|
||||
// rdi is a gp register.
|
||||
rdi
|
||||
// r8 is a gp register.
|
||||
r8
|
||||
// r9 is a gp register.
|
||||
r9
|
||||
// r10 is a gp register.
|
||||
r10
|
||||
// r11 is a gp register.
|
||||
r11
|
||||
// r12 is a gp register.
|
||||
r12
|
||||
// r13 is a gp register.
|
||||
r13
|
||||
// r14 is a gp register.
|
||||
r14
|
||||
// r15 is a gp register.
|
||||
r15
|
||||
|
||||
// xmm0 is a vector register.
|
||||
xmm0
|
||||
// xmm1 is a vector register.
|
||||
xmm1
|
||||
// xmm2 is a vector register.
|
||||
xmm2
|
||||
// xmm3 is a vector register.
|
||||
xmm3
|
||||
// xmm4 is a vector register.
|
||||
xmm4
|
||||
// xmm5 is a vector register.
|
||||
xmm5
|
||||
// xmm6 is a vector register.
|
||||
xmm6
|
||||
// xmm7 is a vector register.
|
||||
xmm7
|
||||
// xmm8 is a vector register.
|
||||
xmm8
|
||||
// xmm9 is a vector register.
|
||||
xmm9
|
||||
// xmm10 is a vector register.
|
||||
xmm10
|
||||
// xmm11 is a vector register.
|
||||
xmm11
|
||||
// xmm12 is a vector register.
|
||||
xmm12
|
||||
// xmm13 is a vector register.
|
||||
xmm13
|
||||
// xmm14 is a vector register.
|
||||
xmm14
|
||||
// xmm15 is a vector register.
|
||||
xmm15
|
||||
)
|
||||
|
||||
var (
|
||||
raxVReg = regalloc.FromRealReg(rax, regalloc.RegTypeInt)
|
||||
rcxVReg = regalloc.FromRealReg(rcx, regalloc.RegTypeInt)
|
||||
rdxVReg = regalloc.FromRealReg(rdx, regalloc.RegTypeInt)
|
||||
rbxVReg = regalloc.FromRealReg(rbx, regalloc.RegTypeInt)
|
||||
rspVReg = regalloc.FromRealReg(rsp, regalloc.RegTypeInt)
|
||||
rbpVReg = regalloc.FromRealReg(rbp, regalloc.RegTypeInt)
|
||||
rsiVReg = regalloc.FromRealReg(rsi, regalloc.RegTypeInt)
|
||||
rdiVReg = regalloc.FromRealReg(rdi, regalloc.RegTypeInt)
|
||||
r8VReg = regalloc.FromRealReg(r8, regalloc.RegTypeInt)
|
||||
r9VReg = regalloc.FromRealReg(r9, regalloc.RegTypeInt)
|
||||
r10VReg = regalloc.FromRealReg(r10, regalloc.RegTypeInt)
|
||||
r11VReg = regalloc.FromRealReg(r11, regalloc.RegTypeInt)
|
||||
r12VReg = regalloc.FromRealReg(r12, regalloc.RegTypeInt)
|
||||
r13VReg = regalloc.FromRealReg(r13, regalloc.RegTypeInt)
|
||||
r14VReg = regalloc.FromRealReg(r14, regalloc.RegTypeInt)
|
||||
r15VReg = regalloc.FromRealReg(r15, regalloc.RegTypeInt)
|
||||
|
||||
xmm0VReg = regalloc.FromRealReg(xmm0, regalloc.RegTypeFloat)
|
||||
xmm1VReg = regalloc.FromRealReg(xmm1, regalloc.RegTypeFloat)
|
||||
xmm2VReg = regalloc.FromRealReg(xmm2, regalloc.RegTypeFloat)
|
||||
xmm3VReg = regalloc.FromRealReg(xmm3, regalloc.RegTypeFloat)
|
||||
xmm4VReg = regalloc.FromRealReg(xmm4, regalloc.RegTypeFloat)
|
||||
xmm5VReg = regalloc.FromRealReg(xmm5, regalloc.RegTypeFloat)
|
||||
xmm6VReg = regalloc.FromRealReg(xmm6, regalloc.RegTypeFloat)
|
||||
xmm7VReg = regalloc.FromRealReg(xmm7, regalloc.RegTypeFloat)
|
||||
xmm8VReg = regalloc.FromRealReg(xmm8, regalloc.RegTypeFloat)
|
||||
xmm9VReg = regalloc.FromRealReg(xmm9, regalloc.RegTypeFloat)
|
||||
xmm10VReg = regalloc.FromRealReg(xmm10, regalloc.RegTypeFloat)
|
||||
xmm11VReg = regalloc.FromRealReg(xmm11, regalloc.RegTypeFloat)
|
||||
xmm12VReg = regalloc.FromRealReg(xmm12, regalloc.RegTypeFloat)
|
||||
xmm13VReg = regalloc.FromRealReg(xmm13, regalloc.RegTypeFloat)
|
||||
xmm14VReg = regalloc.FromRealReg(xmm14, regalloc.RegTypeFloat)
|
||||
xmm15VReg = regalloc.FromRealReg(xmm15, regalloc.RegTypeFloat)
|
||||
)
|
||||
|
||||
var regNames = [...]string{
|
||||
rax: "rax",
|
||||
rcx: "rcx",
|
||||
rdx: "rdx",
|
||||
rbx: "rbx",
|
||||
rsp: "rsp",
|
||||
rbp: "rbp",
|
||||
rsi: "rsi",
|
||||
rdi: "rdi",
|
||||
r8: "r8",
|
||||
r9: "r9",
|
||||
r10: "r10",
|
||||
r11: "r11",
|
||||
r12: "r12",
|
||||
r13: "r13",
|
||||
r14: "r14",
|
||||
r15: "r15",
|
||||
xmm0: "xmm0",
|
||||
xmm1: "xmm1",
|
||||
xmm2: "xmm2",
|
||||
xmm3: "xmm3",
|
||||
xmm4: "xmm4",
|
||||
xmm5: "xmm5",
|
||||
xmm6: "xmm6",
|
||||
xmm7: "xmm7",
|
||||
xmm8: "xmm8",
|
||||
xmm9: "xmm9",
|
||||
xmm10: "xmm10",
|
||||
xmm11: "xmm11",
|
||||
xmm12: "xmm12",
|
||||
xmm13: "xmm13",
|
||||
xmm14: "xmm14",
|
||||
xmm15: "xmm15",
|
||||
}
|
||||
|
||||
func formatVRegSized(r regalloc.VReg, _64 bool) string {
|
||||
if r.IsRealReg() {
|
||||
if r.RegType() == regalloc.RegTypeInt {
|
||||
rr := r.RealReg()
|
||||
orig := regNames[rr]
|
||||
if rr <= rdi {
|
||||
if _64 {
|
||||
return "%" + orig
|
||||
} else {
|
||||
return "%e" + orig[1:]
|
||||
}
|
||||
} else {
|
||||
if _64 {
|
||||
return "%" + orig
|
||||
} else {
|
||||
return "%" + orig + "d"
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return "%" + regNames[r.RealReg()]
|
||||
}
|
||||
} else {
|
||||
if r.RegType() == regalloc.RegTypeInt {
|
||||
if _64 {
|
||||
return fmt.Sprintf("%%r%d?", r.ID())
|
||||
} else {
|
||||
return fmt.Sprintf("%%r%dd?", r.ID())
|
||||
}
|
||||
} else {
|
||||
return fmt.Sprintf("%%xmm%d?", r.ID())
|
||||
}
|
||||
}
|
||||
}
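For reference, formatVRegSized above maps 64-bit GP names onto their 32-bit aliases in two ways: the eight legacy registers swap the leading "r" for "e" (rax becomes eax), while r8..r15 simply gain a "d" suffix (r8 becomes r8d). A minimal standalone sketch of that naming rule, with an invented helper name and no dependency on the wazero types:

package main

import "fmt"

// name32 returns the 32-bit alias of a 64-bit GP register name: the eight
// legacy registers swap their leading "r" for "e", while r8..r15 gain "d".
func name32(name64 string) string {
	switch name64 {
	case "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi":
		return "e" + name64[1:]
	default: // r8..r15
		return name64 + "d"
	}
}

func main() {
	for _, r := range []string{"rax", "rdi", "r8", "r15"} {
		fmt.Printf("%%%s -> %%%s\n", r, name32(r))
	}
}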
128
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/stack.go
generated
vendored
Normal file
@@ -0,0 +1,128 @@
package amd64
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"reflect"
|
||||
"unsafe"
|
||||
|
||||
"github.com/tetratelabs/wazero/internal/wasmdebug"
|
||||
)
|
||||
|
||||
func stackView(rbp, top uintptr) []byte {
|
||||
var stackBuf []byte
|
||||
{
|
||||
// TODO: use unsafe.Slice after floor version is set to Go 1.20.
|
||||
hdr := (*reflect.SliceHeader)(unsafe.Pointer(&stackBuf))
|
||||
hdr.Data = rbp
|
||||
setSliceLimits(hdr, top-rbp)
|
||||
}
|
||||
return stackBuf
|
||||
}
|
||||
|
||||
// UnwindStack implements wazevo.unwindStack.
|
||||
func UnwindStack(_, rbp, top uintptr, returnAddresses []uintptr) []uintptr {
|
||||
stackBuf := stackView(rbp, top)
|
||||
|
||||
for i := uint64(0); i < uint64(len(stackBuf)); {
|
||||
// (high address)
|
||||
// +-----------------+
|
||||
// | ....... |
|
||||
// | ret Y |
|
||||
// | ....... |
|
||||
// | ret 0 |
|
||||
// | arg X |
|
||||
// | ....... |
|
||||
// | arg 1 |
|
||||
// | arg 0 |
|
||||
// | ReturnAddress |
|
||||
// | Caller_RBP |
|
||||
// +-----------------+ <---- Caller_RBP
|
||||
// | ........... |
|
||||
// | clobbered M |
|
||||
// | ............ |
|
||||
// | clobbered 0 |
|
||||
// | spill slot N |
|
||||
// | ............ |
|
||||
// | spill slot 0 |
|
||||
// | ReturnAddress |
|
||||
// | Caller_RBP |
|
||||
// +-----------------+ <---- RBP
|
||||
// (low address)
|
||||
|
||||
callerRBP := binary.LittleEndian.Uint64(stackBuf[i:])
|
||||
retAddr := binary.LittleEndian.Uint64(stackBuf[i+8:])
|
||||
returnAddresses = append(returnAddresses, uintptr(retAddr))
|
||||
i = callerRBP - uint64(rbp)
|
||||
if len(returnAddresses) == wasmdebug.MaxFrames {
|
||||
break
|
||||
}
|
||||
}
|
||||
return returnAddresses
|
||||
}
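UnwindStack above walks the saved-RBP chain: within the viewed region, the first 8 bytes of each frame hold the caller's RBP and the next 8 the return address, and subtracting the base address from the caller's RBP yields the offset of the next frame. A toy, self-contained walk over a fabricated buffer; the addresses, frame sizes, and the zero terminator here are invented for illustration and are not the real layout:

package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	// Fabricated stack image: three 32-byte frames, pretend base "address" 0x1000.
	const base = uint64(0x1000)
	buf := make([]byte, 96)
	writeFrame := func(off, callerRBP, retAddr uint64) {
		binary.LittleEndian.PutUint64(buf[off:], callerRBP)
		binary.LittleEndian.PutUint64(buf[off+8:], retAddr)
	}
	writeFrame(0, base+32, 0xaaaa)  // frame 0 -> caller's frame starts at offset 32
	writeFrame(32, base+64, 0xbbbb) // frame 1 -> caller's frame starts at offset 64
	writeFrame(64, 0, 0xcccc)       // frame 2 -> chain ends (zero sentinel)

	var rets []uint64
	for i := uint64(0); i < uint64(len(buf)); {
		callerRBP := binary.LittleEndian.Uint64(buf[i:])
		rets = append(rets, binary.LittleEndian.Uint64(buf[i+8:]))
		if callerRBP == 0 {
			break
		}
		i = callerRBP - base // hop to the caller's frame
	}
	fmt.Printf("%#x\n", rets) // [0xaaaa 0xbbbb 0xcccc]
}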
|
||||
|
||||
// GoCallStackView implements wazevo.goCallStackView.
|
||||
func GoCallStackView(stackPointerBeforeGoCall *uint64) []uint64 {
|
||||
// (high address)
|
||||
// +-----------------+ <----+
|
||||
// | xxxxxxxxxxx | | ;; optional unused space to make it 16-byte aligned.
|
||||
// ^ | arg[N]/ret[M] | |
|
||||
// sliceSize | | ............ | | SizeInBytes/8
|
||||
// | | arg[1]/ret[1] | |
|
||||
// v | arg[0]/ret[0] | <----+
|
||||
// | SizeInBytes |
|
||||
// +-----------------+ <---- stackPointerBeforeGoCall
|
||||
// (low address)
|
||||
data := unsafe.Pointer(uintptr(unsafe.Pointer(stackPointerBeforeGoCall)) + 8)
|
||||
size := *stackPointerBeforeGoCall / 8
|
||||
return unsafe.Slice((*uint64)(data), int(size))
|
||||
}
|
||||
|
||||
func AdjustClonedStack(oldRsp, oldTop, rsp, rbp, top uintptr) {
|
||||
diff := uint64(rsp - oldRsp)
|
||||
|
||||
newBuf := stackView(rbp, top)
|
||||
for i := uint64(0); i < uint64(len(newBuf)); {
|
||||
// (high address)
|
||||
// +-----------------+
|
||||
// | ....... |
|
||||
// | ret Y |
|
||||
// | ....... |
|
||||
// | ret 0 |
|
||||
// | arg X |
|
||||
// | ....... |
|
||||
// | arg 1 |
|
||||
// | arg 0 |
|
||||
// | ReturnAddress |
|
||||
// | Caller_RBP |
|
||||
// +-----------------+ <---- Caller_RBP
|
||||
// | ........... |
|
||||
// | clobbered M |
|
||||
// | ............ |
|
||||
// | clobbered 0 |
|
||||
// | spill slot N |
|
||||
// | ............ |
|
||||
// | spill slot 0 |
|
||||
// | ReturnAddress |
|
||||
// | Caller_RBP |
|
||||
// +-----------------+ <---- RBP
|
||||
// (low address)
|
||||
|
||||
callerRBP := binary.LittleEndian.Uint64(newBuf[i:])
|
||||
if callerRBP == 0 {
|
||||
// End of stack.
|
||||
break
|
||||
}
|
||||
if i64 := int64(callerRBP); i64 < int64(oldRsp) || i64 >= int64(oldTop) {
|
||||
panic("BUG: callerRBP is out of range")
|
||||
}
|
||||
if int(callerRBP) < 0 {
|
||||
panic("BUG: callerRBP is negative")
|
||||
}
|
||||
adjustedCallerRBP := callerRBP + diff
|
||||
if int(adjustedCallerRBP) < 0 {
|
||||
panic("BUG: adjustedCallerRBP is negative")
|
||||
}
|
||||
binary.LittleEndian.PutUint64(newBuf[i:], adjustedCallerRBP)
|
||||
i = adjustedCallerRBP - uint64(rbp)
|
||||
}
|
||||
}
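AdjustClonedStack above exists because the Go runtime may relocate the goroutine stack: every saved RBP in the cloned region still points into the old stack and must be shifted by the relocation delta. A condensed sketch of that rebasing step, using invented names and plain integers rather than real stack addresses:

package main

import "fmt"

// rebase sketches AdjustClonedStack's core step: a saved frame pointer that
// pointed into the old stack region is shifted by the distance the stack moved.
func rebase(savedRBP, oldBase, newBase uint64) uint64 {
	return savedRBP + (newBase - oldBase)
}

func main() {
	oldBase, newBase := uint64(0x1000), uint64(0x8000)
	fmt.Printf("%#x\n", rebase(0x1040, oldBase, newBase)) // 0x8040
}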
332
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go
generated
vendored
Normal file
@@ -0,0 +1,332 @@
package arm64
|
||||
|
||||
import (
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
|
||||
)
|
||||
|
||||
// References:
|
||||
// * https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/cmd/compile/abi-internal.md#arm64-architecture
|
||||
// * https://developer.arm.com/documentation/102374/0101/Procedure-Call-Standard
|
||||
|
||||
var (
|
||||
intParamResultRegs = []regalloc.RealReg{x0, x1, x2, x3, x4, x5, x6, x7}
|
||||
floatParamResultRegs = []regalloc.RealReg{v0, v1, v2, v3, v4, v5, v6, v7}
|
||||
)
|
||||
|
||||
var regInfo = ®alloc.RegisterInfo{
|
||||
AllocatableRegisters: [regalloc.NumRegType][]regalloc.RealReg{
|
||||
// We don't allocate:
|
||||
// - x18: Reserved by the macOS: https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms#Respect-the-purpose-of-specific-CPU-registers
|
||||
// - x28: Reserved by Go runtime.
|
||||
// - x27(=tmpReg): because of the reason described on tmpReg.
|
||||
regalloc.RegTypeInt: {
|
||||
x8, x9, x10, x11, x12, x13, x14, x15,
|
||||
x16, x17, x19, x20, x21, x22, x23, x24, x25,
|
||||
x26, x29, x30,
|
||||
// These are the argument/return registers. Less preferred in the allocation.
|
||||
x7, x6, x5, x4, x3, x2, x1, x0,
|
||||
},
|
||||
regalloc.RegTypeFloat: {
|
||||
v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
|
||||
v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
|
||||
// These are the argument/return registers. Less preferred in the allocation.
|
||||
v7, v6, v5, v4, v3, v2, v1, v0,
|
||||
},
|
||||
},
|
||||
CalleeSavedRegisters: regalloc.NewRegSet(
|
||||
x19, x20, x21, x22, x23, x24, x25, x26, x28,
|
||||
v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,
|
||||
),
|
||||
CallerSavedRegisters: regalloc.NewRegSet(
|
||||
x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x29, x30,
|
||||
v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17,
|
||||
),
|
||||
RealRegToVReg: []regalloc.VReg{
|
||||
x0: x0VReg, x1: x1VReg, x2: x2VReg, x3: x3VReg, x4: x4VReg, x5: x5VReg, x6: x6VReg, x7: x7VReg, x8: x8VReg, x9: x9VReg, x10: x10VReg, x11: x11VReg, x12: x12VReg, x13: x13VReg, x14: x14VReg, x15: x15VReg, x16: x16VReg, x17: x17VReg, x18: x18VReg, x19: x19VReg, x20: x20VReg, x21: x21VReg, x22: x22VReg, x23: x23VReg, x24: x24VReg, x25: x25VReg, x26: x26VReg, x27: x27VReg, x28: x28VReg, x29: x29VReg, x30: x30VReg,
|
||||
v0: v0VReg, v1: v1VReg, v2: v2VReg, v3: v3VReg, v4: v4VReg, v5: v5VReg, v6: v6VReg, v7: v7VReg, v8: v8VReg, v9: v9VReg, v10: v10VReg, v11: v11VReg, v12: v12VReg, v13: v13VReg, v14: v14VReg, v15: v15VReg, v16: v16VReg, v17: v17VReg, v18: v18VReg, v19: v19VReg, v20: v20VReg, v21: v21VReg, v22: v22VReg, v23: v23VReg, v24: v24VReg, v25: v25VReg, v26: v26VReg, v27: v27VReg, v28: v28VReg, v29: v29VReg, v30: v30VReg, v31: v31VReg,
|
||||
},
|
||||
RealRegName: func(r regalloc.RealReg) string { return regNames[r] },
|
||||
RealRegType: func(r regalloc.RealReg) regalloc.RegType {
|
||||
if r < v0 {
|
||||
return regalloc.RegTypeInt
|
||||
}
|
||||
return regalloc.RegTypeFloat
|
||||
},
|
||||
}
|
||||
|
||||
// ArgsResultsRegs implements backend.Machine.
|
||||
func (m *machine) ArgsResultsRegs() (argResultInts, argResultFloats []regalloc.RealReg) {
|
||||
return intParamResultRegs, floatParamResultRegs
|
||||
}
|
||||
|
||||
// LowerParams implements backend.FunctionABI.
|
||||
func (m *machine) LowerParams(args []ssa.Value) {
|
||||
a := m.currentABI
|
||||
|
||||
for i, ssaArg := range args {
|
||||
if !ssaArg.Valid() {
|
||||
continue
|
||||
}
|
||||
reg := m.compiler.VRegOf(ssaArg)
|
||||
arg := &a.Args[i]
|
||||
if arg.Kind == backend.ABIArgKindReg {
|
||||
m.InsertMove(reg, arg.Reg, arg.Type)
|
||||
} else {
|
||||
// TODO: we could use pair load if there's consecutive loads for the same type.
|
||||
//
|
||||
// (high address)
|
||||
// +-----------------+
|
||||
// | ....... |
|
||||
// | ret Y |
|
||||
// | ....... |
|
||||
// | ret 0 |
|
||||
// | arg X |
|
||||
// | ....... |
|
||||
// | arg 1 |
|
||||
// | arg 0 | <-|
|
||||
// | ReturnAddress | |
|
||||
// +-----------------+ |
|
||||
// | ........... | |
|
||||
// | clobbered M | | argStackOffset: is unknown at this point of compilation.
|
||||
// | ............ | |
|
||||
// | clobbered 0 | |
|
||||
// | spill slot N | |
|
||||
// | ........... | |
|
||||
// | spill slot 0 | |
|
||||
// SP---> +-----------------+ <-+
|
||||
// (low address)
|
||||
|
||||
bits := arg.Type.Bits()
|
||||
// At this point of compilation, we don't yet know how much space exists below the return address.
|
||||
// So we instruct the address mode to add the `argStackOffset` to the offset at the later phase of compilation.
|
||||
amode := addressMode{imm: arg.Offset, rn: spVReg, kind: addressModeKindArgStackSpace}
|
||||
load := m.allocateInstr()
|
||||
switch arg.Type {
|
||||
case ssa.TypeI32, ssa.TypeI64:
|
||||
load.asULoad(operandNR(reg), amode, bits)
|
||||
case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
|
||||
load.asFpuLoad(operandNR(reg), amode, bits)
|
||||
default:
|
||||
panic("BUG")
|
||||
}
|
||||
m.insert(load)
|
||||
m.unresolvedAddressModes = append(m.unresolvedAddressModes, load)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// LowerReturns lowers the given returns.
|
||||
func (m *machine) LowerReturns(rets []ssa.Value) {
|
||||
a := m.currentABI
|
||||
|
||||
l := len(rets) - 1
|
||||
for i := range rets {
|
||||
// Reverse order in order to avoid overwriting the stack returns existing in the return registers.
|
||||
ret := rets[l-i]
|
||||
r := &a.Rets[l-i]
|
||||
reg := m.compiler.VRegOf(ret)
|
||||
if def := m.compiler.ValueDefinition(ret); def.IsFromInstr() {
|
||||
// Constant instructions are inlined.
|
||||
if inst := def.Instr; inst.Constant() {
|
||||
val := inst.Return()
|
||||
valType := val.Type()
|
||||
v := inst.ConstantVal()
|
||||
m.insertLoadConstant(v, valType, reg)
|
||||
}
|
||||
}
|
||||
if r.Kind == backend.ABIArgKindReg {
|
||||
m.InsertMove(r.Reg, reg, ret.Type())
|
||||
} else {
|
||||
// TODO: we could use pair store if there's consecutive stores for the same type.
|
||||
//
|
||||
// (high address)
|
||||
// +-----------------+
|
||||
// | ....... |
|
||||
// | ret Y |
|
||||
// | ....... |
|
||||
// | ret 0 | <-+
|
||||
// | arg X | |
|
||||
// | ....... | |
|
||||
// | arg 1 | |
|
||||
// | arg 0 | |
|
||||
// | ReturnAddress | |
|
||||
// +-----------------+ |
|
||||
// | ........... | |
|
||||
// | spill slot M | | retStackOffset: is unknown at this point of compilation.
|
||||
// | ............ | |
|
||||
// | spill slot 2 | |
|
||||
// | spill slot 1 | |
|
||||
// | clobbered 0 | |
|
||||
// | clobbered 1 | |
|
||||
// | ........... | |
|
||||
// | clobbered N | |
|
||||
// SP---> +-----------------+ <-+
|
||||
// (low address)
|
||||
|
||||
bits := r.Type.Bits()
|
||||
|
||||
// At this point of compilation, we don't yet know how much space exists below the return address.
|
||||
// So we instruct the address mode to add the `retStackOffset` to the offset at the later phase of compilation.
|
||||
amode := addressMode{imm: r.Offset, rn: spVReg, kind: addressModeKindResultStackSpace}
|
||||
store := m.allocateInstr()
|
||||
store.asStore(operandNR(reg), amode, bits)
|
||||
m.insert(store)
|
||||
m.unresolvedAddressModes = append(m.unresolvedAddressModes, store)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// callerGenVRegToFunctionArg is the opposite of GenFunctionArgToVReg, which is used to generate the
|
||||
// caller side of the function call.
|
||||
func (m *machine) callerGenVRegToFunctionArg(a *backend.FunctionABI, argIndex int, reg regalloc.VReg, def *backend.SSAValueDefinition, slotBegin int64) {
|
||||
arg := &a.Args[argIndex]
|
||||
if def != nil && def.IsFromInstr() {
|
||||
// Constant instructions are inlined.
|
||||
if inst := def.Instr; inst.Constant() {
|
||||
val := inst.Return()
|
||||
valType := val.Type()
|
||||
v := inst.ConstantVal()
|
||||
m.insertLoadConstant(v, valType, reg)
|
||||
}
|
||||
}
|
||||
if arg.Kind == backend.ABIArgKindReg {
|
||||
m.InsertMove(arg.Reg, reg, arg.Type)
|
||||
} else {
|
||||
// TODO: we could use pair store if there's consecutive stores for the same type.
|
||||
//
|
||||
// Note that at this point, stack pointer is already adjusted.
|
||||
bits := arg.Type.Bits()
|
||||
amode := m.resolveAddressModeForOffset(arg.Offset-slotBegin, bits, spVReg, false)
|
||||
store := m.allocateInstr()
|
||||
store.asStore(operandNR(reg), amode, bits)
|
||||
m.insert(store)
|
||||
}
|
||||
}
|
||||
|
||||
func (m *machine) callerGenFunctionReturnVReg(a *backend.FunctionABI, retIndex int, reg regalloc.VReg, slotBegin int64) {
|
||||
r := &a.Rets[retIndex]
|
||||
if r.Kind == backend.ABIArgKindReg {
|
||||
m.InsertMove(reg, r.Reg, r.Type)
|
||||
} else {
|
||||
// TODO: we could use pair load if there's consecutive loads for the same type.
|
||||
amode := m.resolveAddressModeForOffset(a.ArgStackSize+r.Offset-slotBegin, r.Type.Bits(), spVReg, false)
|
||||
ldr := m.allocateInstr()
|
||||
switch r.Type {
|
||||
case ssa.TypeI32, ssa.TypeI64:
|
||||
ldr.asULoad(operandNR(reg), amode, r.Type.Bits())
|
||||
case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
|
||||
ldr.asFpuLoad(operandNR(reg), amode, r.Type.Bits())
|
||||
default:
|
||||
panic("BUG")
|
||||
}
|
||||
m.insert(ldr)
|
||||
}
|
||||
}
|
||||
|
||||
func (m *machine) resolveAddressModeForOffsetAndInsert(cur *instruction, offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) (*instruction, addressMode) {
|
||||
exct := m.executableContext
|
||||
exct.PendingInstructions = exct.PendingInstructions[:0]
|
||||
mode := m.resolveAddressModeForOffset(offset, dstBits, rn, allowTmpRegUse)
|
||||
for _, instr := range exct.PendingInstructions {
|
||||
cur = linkInstr(cur, instr)
|
||||
}
|
||||
return cur, mode
|
||||
}
|
||||
|
||||
func (m *machine) resolveAddressModeForOffset(offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) addressMode {
|
||||
if rn.RegType() != regalloc.RegTypeInt {
|
||||
panic("BUG: rn should be a pointer: " + formatVRegSized(rn, 64))
|
||||
}
|
||||
var amode addressMode
|
||||
if offsetFitsInAddressModeKindRegUnsignedImm12(dstBits, offset) {
|
||||
amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: rn, imm: offset}
|
||||
} else if offsetFitsInAddressModeKindRegSignedImm9(offset) {
|
||||
amode = addressMode{kind: addressModeKindRegSignedImm9, rn: rn, imm: offset}
|
||||
} else {
|
||||
var indexReg regalloc.VReg
|
||||
if allowTmpRegUse {
|
||||
m.lowerConstantI64(tmpRegVReg, offset)
|
||||
indexReg = tmpRegVReg
|
||||
} else {
|
||||
indexReg = m.compiler.AllocateVReg(ssa.TypeI64)
|
||||
m.lowerConstantI64(indexReg, offset)
|
||||
}
|
||||
amode = addressMode{kind: addressModeKindRegReg, rn: rn, rm: indexReg, extOp: extendOpUXTX /* indicates index rm is 64-bit */}
|
||||
}
|
||||
return amode
|
||||
}
|
||||
|
||||
func (m *machine) lowerCall(si *ssa.Instruction) {
|
||||
isDirectCall := si.Opcode() == ssa.OpcodeCall
|
||||
var indirectCalleePtr ssa.Value
|
||||
var directCallee ssa.FuncRef
|
||||
var sigID ssa.SignatureID
|
||||
var args []ssa.Value
|
||||
if isDirectCall {
|
||||
directCallee, sigID, args = si.CallData()
|
||||
} else {
|
||||
indirectCalleePtr, sigID, args, _ /* on arm64, the calling convention is compatible with the Go runtime */ = si.CallIndirectData()
|
||||
}
|
||||
calleeABI := m.compiler.GetFunctionABI(m.compiler.SSABuilder().ResolveSignature(sigID))
|
||||
|
||||
stackSlotSize := int64(calleeABI.AlignedArgResultStackSlotSize())
|
||||
if m.maxRequiredStackSizeForCalls < stackSlotSize+16 {
|
||||
m.maxRequiredStackSizeForCalls = stackSlotSize + 16 // return address frame.
|
||||
}
|
||||
|
||||
for i, arg := range args {
|
||||
reg := m.compiler.VRegOf(arg)
|
||||
def := m.compiler.ValueDefinition(arg)
|
||||
m.callerGenVRegToFunctionArg(calleeABI, i, reg, def, stackSlotSize)
|
||||
}
|
||||
|
||||
if isDirectCall {
|
||||
call := m.allocateInstr()
|
||||
call.asCall(directCallee, calleeABI)
|
||||
m.insert(call)
|
||||
} else {
|
||||
ptr := m.compiler.VRegOf(indirectCalleePtr)
|
||||
callInd := m.allocateInstr()
|
||||
callInd.asCallIndirect(ptr, calleeABI)
|
||||
m.insert(callInd)
|
||||
}
|
||||
|
||||
var index int
|
||||
r1, rs := si.Returns()
|
||||
if r1.Valid() {
|
||||
m.callerGenFunctionReturnVReg(calleeABI, 0, m.compiler.VRegOf(r1), stackSlotSize)
|
||||
index++
|
||||
}
|
||||
|
||||
for _, r := range rs {
|
||||
m.callerGenFunctionReturnVReg(calleeABI, index, m.compiler.VRegOf(r), stackSlotSize)
|
||||
index++
|
||||
}
|
||||
}
|
||||
|
||||
func (m *machine) insertAddOrSubStackPointer(rd regalloc.VReg, diff int64, add bool) {
|
||||
if imm12Operand, ok := asImm12Operand(uint64(diff)); ok {
|
||||
alu := m.allocateInstr()
|
||||
var ao aluOp
|
||||
if add {
|
||||
ao = aluOpAdd
|
||||
} else {
|
||||
ao = aluOpSub
|
||||
}
|
||||
alu.asALU(ao, operandNR(rd), operandNR(spVReg), imm12Operand, true)
|
||||
m.insert(alu)
|
||||
} else {
|
||||
m.lowerConstantI64(tmpRegVReg, diff)
|
||||
alu := m.allocateInstr()
|
||||
var ao aluOp
|
||||
if add {
|
||||
ao = aluOpAdd
|
||||
} else {
|
||||
ao = aluOpSub
|
||||
}
|
||||
alu.asALU(ao, operandNR(rd), operandNR(spVReg), operandNR(tmpRegVReg), true)
|
||||
m.insert(alu)
|
||||
}
|
||||
}
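The ABI code above hands the first eight integer and first eight floating-point values to x0..x7 / v0..v7 and pushes the rest into stack slots. A simplified classifier that mirrors that first-eight-in-registers rule; the types and names are stand-ins, and details such as v128 sizing and final alignment are deliberately omitted:

package main

import "fmt"

type kind int

const (
	kindInt kind = iota
	kindFloat
)

// loc says where one parameter lives: a register index or a stack offset.
type loc struct {
	inReg  bool
	reg    int
	offset int64
}

// assign mirrors the scheme above: the first 8 int values go to x0..x7, the
// first 8 float values to v0..v7, and everything else to 8-byte stack slots.
func assign(kinds []kind) []loc {
	locs := make([]loc, len(kinds))
	nextReg := map[kind]int{}
	var stack int64
	for i, k := range kinds {
		if nextReg[k] < 8 {
			locs[i] = loc{inReg: true, reg: nextReg[k]}
			nextReg[k]++
		} else {
			locs[i] = loc{offset: stack}
			stack += 8
		}
	}
	return locs
}

func main() {
	kinds := make([]kind, 10) // ten integer params: the last two spill to the stack
	for _, l := range assign(kinds) {
		fmt.Println(l)
	}
}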
9
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.go
generated
vendored
Normal file
@@ -0,0 +1,9 @@
package arm64

// entrypoint enters the machine code generated by this backend which begins with the preamble generated by functionABI.EmitGoEntryPreamble below.
// This implements wazevo.entrypoint, and see the comments there for detail.
func entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultPtr *uint64, goAllocatedStackSlicePtr uintptr)

// afterGoFunctionCallEntrypoint enters the machine code after growing the stack.
// This implements wazevo.afterGoFunctionCallEntrypoint, and see the comments there for detail.
func afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr)
29
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.s
generated
vendored
Normal file
@@ -0,0 +1,29 @@
//go:build arm64

#include "funcdata.h"
#include "textflag.h"

// See the comments on EmitGoEntryPreamble for what this function is supposed to do.
TEXT ·entrypoint(SB), NOSPLIT|NOFRAME, $0-48
	MOVD preambleExecutable+0(FP), R27
	MOVD functionExectuable+8(FP), R24
	MOVD executionContextPtr+16(FP), R0
	MOVD moduleContextPtr+24(FP), R1
	MOVD paramResultSlicePtr+32(FP), R19
	MOVD goAllocatedStackSlicePtr+40(FP), R26
	JMP (R27)

TEXT ·afterGoFunctionCallEntrypoint(SB), NOSPLIT|NOFRAME, $0-32
	MOVD goCallReturnAddress+0(FP), R20
	MOVD executionContextPtr+8(FP), R0
	MOVD stackPointer+16(FP), R19

	// Save the current FP(R29), SP and LR(R30) into the wazevo.executionContext (stored in R0).
	MOVD R29, 16(R0) // Store FP(R29) into [R0, #ExecutionContextOffsets.OriginalFramePointer]
	MOVD RSP, R27    // Move SP to R27 (temporary register) since SP cannot be stored directly in str instructions.
	MOVD R27, 24(R0) // Store R27 into [R0, #ExecutionContextOffsets.OriginalStackPointer]
	MOVD R30, 32(R0) // Store R30 into [R0, #ExecutionContextOffsets.GoReturnAddress]

	// Load the new stack pointer (which sits somewhere in Go-allocated stack) into SP.
	MOVD R19, RSP
	JMP (R20)
230
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go
generated
vendored
Normal file
@@ -0,0 +1,230 @@
package arm64
|
||||
|
||||
import (
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
|
||||
)
|
||||
|
||||
// CompileEntryPreamble implements backend.Machine. This assumes `entrypoint` function (in abi_go_entry_arm64.s) passes:
|
||||
//
|
||||
// 1. First (execution context ptr) and Second arguments are already passed in x0, and x1.
|
||||
// 2. param/result slice ptr in x19; the pointer to []uint64{} which is used to pass arguments and accept return values.
|
||||
// 3. Go-allocated stack slice ptr in x26.
|
||||
// 4. Function executable in x24.
|
||||
//
|
||||
// also SP and FP are correct Go-runtime-based values, and LR is the return address to the Go-side caller.
|
||||
func (m *machine) CompileEntryPreamble(signature *ssa.Signature) []byte {
|
||||
root := m.constructEntryPreamble(signature)
|
||||
m.encode(root)
|
||||
return m.compiler.Buf()
|
||||
}
|
||||
|
||||
var (
|
||||
executionContextPtrReg = x0VReg
|
||||
// callee-saved regs so that they can be used in the prologue and epilogue.
|
||||
paramResultSlicePtr = x19VReg
|
||||
savedExecutionContextPtr = x20VReg
|
||||
// goAllocatedStackPtr is not used in the epilogue.
|
||||
goAllocatedStackPtr = x26VReg
|
||||
// paramResultSliceCopied is not used in the epilogue.
|
||||
paramResultSliceCopied = x25VReg
|
||||
// functionExecutable is not used in the epilogue.
|
||||
functionExecutable = x24VReg
|
||||
)
|
||||
|
||||
func (m *machine) goEntryPreamblePassArg(cur *instruction, paramSlicePtr regalloc.VReg, arg *backend.ABIArg, argStartOffsetFromSP int64) *instruction {
|
||||
typ := arg.Type
|
||||
bits := typ.Bits()
|
||||
isStackArg := arg.Kind == backend.ABIArgKindStack
|
||||
|
||||
var loadTargetReg operand
|
||||
if !isStackArg {
|
||||
loadTargetReg = operandNR(arg.Reg)
|
||||
} else {
|
||||
switch typ {
|
||||
case ssa.TypeI32, ssa.TypeI64:
|
||||
loadTargetReg = operandNR(x15VReg)
|
||||
case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
|
||||
loadTargetReg = operandNR(v15VReg)
|
||||
default:
|
||||
panic("TODO?")
|
||||
}
|
||||
}
|
||||
|
||||
var postIndexImm int64
|
||||
if typ == ssa.TypeV128 {
|
||||
postIndexImm = 16 // v128 is represented as 2x64-bit in Go slice.
|
||||
} else {
|
||||
postIndexImm = 8
|
||||
}
|
||||
loadMode := addressMode{kind: addressModeKindPostIndex, rn: paramSlicePtr, imm: postIndexImm}
|
||||
|
||||
instr := m.allocateInstr()
|
||||
switch typ {
|
||||
case ssa.TypeI32:
|
||||
instr.asULoad(loadTargetReg, loadMode, 32)
|
||||
case ssa.TypeI64:
|
||||
instr.asULoad(loadTargetReg, loadMode, 64)
|
||||
case ssa.TypeF32:
|
||||
instr.asFpuLoad(loadTargetReg, loadMode, 32)
|
||||
case ssa.TypeF64:
|
||||
instr.asFpuLoad(loadTargetReg, loadMode, 64)
|
||||
case ssa.TypeV128:
|
||||
instr.asFpuLoad(loadTargetReg, loadMode, 128)
|
||||
}
|
||||
cur = linkInstr(cur, instr)
|
||||
|
||||
if isStackArg {
|
||||
var storeMode addressMode
|
||||
cur, storeMode = m.resolveAddressModeForOffsetAndInsert(cur, argStartOffsetFromSP+arg.Offset, bits, spVReg, true)
|
||||
toStack := m.allocateInstr()
|
||||
toStack.asStore(loadTargetReg, storeMode, bits)
|
||||
cur = linkInstr(cur, toStack)
|
||||
}
|
||||
return cur
|
||||
}
|
||||
|
||||
func (m *machine) goEntryPreamblePassResult(cur *instruction, resultSlicePtr regalloc.VReg, result *backend.ABIArg, resultStartOffsetFromSP int64) *instruction {
|
||||
isStackArg := result.Kind == backend.ABIArgKindStack
|
||||
typ := result.Type
|
||||
bits := typ.Bits()
|
||||
|
||||
var storeTargetReg operand
|
||||
if !isStackArg {
|
||||
storeTargetReg = operandNR(result.Reg)
|
||||
} else {
|
||||
switch typ {
|
||||
case ssa.TypeI32, ssa.TypeI64:
|
||||
storeTargetReg = operandNR(x15VReg)
|
||||
case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
|
||||
storeTargetReg = operandNR(v15VReg)
|
||||
default:
|
||||
panic("TODO?")
|
||||
}
|
||||
}
|
||||
|
||||
var postIndexImm int64
|
||||
if typ == ssa.TypeV128 {
|
||||
postIndexImm = 16 // v128 is represented as 2x64-bit in Go slice.
|
||||
} else {
|
||||
postIndexImm = 8
|
||||
}
|
||||
|
||||
if isStackArg {
|
||||
var loadMode addressMode
|
||||
cur, loadMode = m.resolveAddressModeForOffsetAndInsert(cur, resultStartOffsetFromSP+result.Offset, bits, spVReg, true)
|
||||
toReg := m.allocateInstr()
|
||||
switch typ {
|
||||
case ssa.TypeI32, ssa.TypeI64:
|
||||
toReg.asULoad(storeTargetReg, loadMode, bits)
|
||||
case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
|
||||
toReg.asFpuLoad(storeTargetReg, loadMode, bits)
|
||||
default:
|
||||
panic("TODO?")
|
||||
}
|
||||
cur = linkInstr(cur, toReg)
|
||||
}
|
||||
|
||||
mode := addressMode{kind: addressModeKindPostIndex, rn: resultSlicePtr, imm: postIndexImm}
|
||||
instr := m.allocateInstr()
|
||||
instr.asStore(storeTargetReg, mode, bits)
|
||||
cur = linkInstr(cur, instr)
|
||||
return cur
|
||||
}
|
||||
|
||||
func (m *machine) constructEntryPreamble(sig *ssa.Signature) (root *instruction) {
|
||||
abi := backend.FunctionABI{}
|
||||
abi.Init(sig, intParamResultRegs, floatParamResultRegs)
|
||||
|
||||
root = m.allocateNop()
|
||||
|
||||
//// ----------------------------------- prologue ----------------------------------- ////
|
||||
|
||||
// First, we save executionContextPtrReg into a callee-saved register so that it can be used in epilogue as well.
|
||||
// mov savedExecutionContextPtr, x0
|
||||
cur := m.move64(savedExecutionContextPtr, executionContextPtrReg, root)
|
||||
|
||||
// Next, save the current FP, SP and LR into the wazevo.executionContext:
|
||||
// str fp, [savedExecutionContextPtr, #OriginalFramePointer]
|
||||
// mov tmp, sp ;; sp cannot be str'ed directly.
|
||||
// str sp, [savedExecutionContextPtr, #OriginalStackPointer]
|
||||
// str lr, [savedExecutionContextPtr, #GoReturnAddress]
|
||||
cur = m.loadOrStoreAtExecutionContext(fpVReg, wazevoapi.ExecutionContextOffsetOriginalFramePointer, true, cur)
|
||||
cur = m.move64(tmpRegVReg, spVReg, cur)
|
||||
cur = m.loadOrStoreAtExecutionContext(tmpRegVReg, wazevoapi.ExecutionContextOffsetOriginalStackPointer, true, cur)
|
||||
cur = m.loadOrStoreAtExecutionContext(lrVReg, wazevoapi.ExecutionContextOffsetGoReturnAddress, true, cur)
|
||||
|
||||
// Then, move the Go-allocated stack pointer to SP:
|
||||
// mov sp, goAllocatedStackPtr
|
||||
cur = m.move64(spVReg, goAllocatedStackPtr, cur)
|
||||
|
||||
prReg := paramResultSlicePtr
|
||||
if len(abi.Args) > 2 && len(abi.Rets) > 0 {
|
||||
// paramResultSlicePtr is modified during the execution of goEntryPreamblePassArg,
|
||||
// so copy it to another reg.
|
||||
cur = m.move64(paramResultSliceCopied, paramResultSlicePtr, cur)
|
||||
prReg = paramResultSliceCopied
|
||||
}
|
||||
|
||||
stackSlotSize := int64(abi.AlignedArgResultStackSlotSize())
|
||||
for i := range abi.Args {
|
||||
if i < 2 {
|
||||
// module context ptr and execution context ptr are passed in x0 and x1 by the Go assembly function.
|
||||
continue
|
||||
}
|
||||
arg := &abi.Args[i]
|
||||
cur = m.goEntryPreamblePassArg(cur, prReg, arg, -stackSlotSize)
|
||||
}
|
||||
|
||||
// Call the real function.
|
||||
bl := m.allocateInstr()
|
||||
bl.asCallIndirect(functionExecutable, &abi)
|
||||
cur = linkInstr(cur, bl)
|
||||
|
||||
///// ----------------------------------- epilogue ----------------------------------- /////
|
||||
|
||||
// Store the register results into paramResultSlicePtr.
|
||||
for i := range abi.Rets {
|
||||
cur = m.goEntryPreamblePassResult(cur, paramResultSlicePtr, &abi.Rets[i], abi.ArgStackSize-stackSlotSize)
|
||||
}
|
||||
|
||||
// Finally, restore the FP, SP and LR, and return to the Go code.
|
||||
// ldr fp, [savedExecutionContextPtr, #OriginalFramePointer]
|
||||
// ldr tmp, [savedExecutionContextPtr, #OriginalStackPointer]
|
||||
// mov sp, tmp ;; sp cannot be str'ed directly.
|
||||
// ldr lr, [savedExecutionContextPtr, #GoReturnAddress]
|
||||
// ret ;; --> return to the Go code
|
||||
cur = m.loadOrStoreAtExecutionContext(fpVReg, wazevoapi.ExecutionContextOffsetOriginalFramePointer, false, cur)
|
||||
cur = m.loadOrStoreAtExecutionContext(tmpRegVReg, wazevoapi.ExecutionContextOffsetOriginalStackPointer, false, cur)
|
||||
cur = m.move64(spVReg, tmpRegVReg, cur)
|
||||
cur = m.loadOrStoreAtExecutionContext(lrVReg, wazevoapi.ExecutionContextOffsetGoReturnAddress, false, cur)
|
||||
retInst := m.allocateInstr()
|
||||
retInst.asRet()
|
||||
linkInstr(cur, retInst)
|
||||
return
|
||||
}
|
||||
|
||||
func (m *machine) move64(dst, src regalloc.VReg, prev *instruction) *instruction {
|
||||
instr := m.allocateInstr()
|
||||
instr.asMove64(dst, src)
|
||||
return linkInstr(prev, instr)
|
||||
}
|
||||
|
||||
func (m *machine) loadOrStoreAtExecutionContext(d regalloc.VReg, offset wazevoapi.Offset, store bool, prev *instruction) *instruction {
|
||||
instr := m.allocateInstr()
|
||||
mode := addressMode{kind: addressModeKindRegUnsignedImm12, rn: savedExecutionContextPtr, imm: offset.I64()}
|
||||
if store {
|
||||
instr.asStore(operandNR(d), mode, 64)
|
||||
} else {
|
||||
instr.asULoad(operandNR(d), mode, 64)
|
||||
}
|
||||
return linkInstr(prev, instr)
|
||||
}
|
||||
|
||||
func linkInstr(prev, next *instruction) *instruction {
|
||||
prev.next = next
|
||||
next.prev = prev
|
||||
return next
|
||||
}
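The entry preamble above reads arguments from, and writes results back into, a single []uint64 whose slots are 8 bytes each (16 for v128), with floats carried as their bit patterns. A hedged sketch of that packing convention from the Go side; the layout is inferred from the comments above and is not an exported wazero API:

package main

import (
	"fmt"
	"math"
)

func main() {
	// Pack an (i64, f64) argument pair into the shared param/result slice.
	params := []uint64{42, math.Float64bits(3.5)}

	// The callee would overwrite the same slots with its results; here we
	// just pretend it returned (i64+1, f64*2).
	params[0] = params[0] + 1
	params[1] = math.Float64bits(math.Float64frombits(params[1]) * 2)

	fmt.Println(params[0], math.Float64frombits(params[1])) // 43 7
}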
428
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go
generated
vendored
Normal file
@@ -0,0 +1,428 @@
package arm64
|
||||
|
||||
import (
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
|
||||
)
|
||||
|
||||
var calleeSavedRegistersSorted = []regalloc.VReg{
|
||||
x19VReg, x20VReg, x21VReg, x22VReg, x23VReg, x24VReg, x25VReg, x26VReg, x28VReg,
|
||||
v18VReg, v19VReg, v20VReg, v21VReg, v22VReg, v23VReg, v24VReg, v25VReg, v26VReg, v27VReg, v28VReg, v29VReg, v30VReg, v31VReg,
|
||||
}
|
||||
|
||||
// CompileGoFunctionTrampoline implements backend.Machine.
|
||||
func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *ssa.Signature, needModuleContextPtr bool) []byte {
|
||||
exct := m.executableContext
|
||||
argBegin := 1 // Skips exec context by default.
|
||||
if needModuleContextPtr {
|
||||
argBegin++
|
||||
}
|
||||
|
||||
abi := &backend.FunctionABI{}
|
||||
abi.Init(sig, intParamResultRegs, floatParamResultRegs)
|
||||
m.currentABI = abi
|
||||
|
||||
cur := m.allocateInstr()
|
||||
cur.asNop0()
|
||||
exct.RootInstr = cur
|
||||
|
||||
// Execution context is always the first argument.
|
||||
execCtrPtr := x0VReg
|
||||
|
||||
// In the following, we create the following stack layout:
|
||||
//
|
||||
// (high address)
|
||||
// SP ------> +-----------------+ <----+
|
||||
// | ....... | |
|
||||
// | ret Y | |
|
||||
// | ....... | |
|
||||
// | ret 0 | |
|
||||
// | arg X | | size_of_arg_ret
|
||||
// | ....... | |
|
||||
// | arg 1 | |
|
||||
// | arg 0 | <----+ <-------- originalArg0Reg
|
||||
// | size_of_arg_ret |
|
||||
// | ReturnAddress |
|
||||
// +-----------------+ <----+
|
||||
// | xxxx | | ;; might be padded to make it 16-byte aligned.
|
||||
// +--->| arg[N]/ret[M] | |
|
||||
// sliceSize| | ............ | | goCallStackSize
|
||||
// | | arg[1]/ret[1] | |
|
||||
// +--->| arg[0]/ret[0] | <----+ <-------- arg0ret0AddrReg
|
||||
// | sliceSize |
|
||||
// | frame_size |
|
||||
// +-----------------+
|
||||
// (low address)
|
||||
//
|
||||
// where the region of "arg[0]/ret[0] ... arg[N]/ret[M]" is the stack used by the Go functions,
|
||||
// therefore will be accessed as the usual []uint64. So that's where we need to pass/receive
|
||||
// the arguments/return values.
|
||||
|
||||
// First of all, update the SP and create the "ReturnAddress + size_of_arg_ret" slot.
|
||||
cur = m.createReturnAddrAndSizeOfArgRetSlot(cur)
|
||||
|
||||
const frameInfoSize = 16 // == frame_size + sliceSize.
|
||||
|
||||
// Next, we should allocate the stack for the Go function call if necessary.
|
||||
goCallStackSize, sliceSizeInBytes := backend.GoFunctionCallRequiredStackSize(sig, argBegin)
|
||||
cur = m.insertStackBoundsCheck(goCallStackSize+frameInfoSize, cur)
|
||||
|
||||
originalArg0Reg := x17VReg // Caller save, so we can use it for whatever we want.
|
||||
if m.currentABI.AlignedArgResultStackSlotSize() > 0 {
|
||||
// At this point, SP points to `ReturnAddress`, so add 16 to get the original arg 0 slot.
|
||||
cur = m.addsAddOrSubStackPointer(cur, originalArg0Reg, frameInfoSize, true)
|
||||
}
|
||||
|
||||
// Save the callee saved registers.
|
||||
cur = m.saveRegistersInExecutionContext(cur, calleeSavedRegistersSorted)
|
||||
|
||||
if needModuleContextPtr {
|
||||
offset := wazevoapi.ExecutionContextOffsetGoFunctionCallCalleeModuleContextOpaque.I64()
|
||||
if !offsetFitsInAddressModeKindRegUnsignedImm12(64, offset) {
|
||||
panic("BUG: too large or un-aligned offset for goFunctionCallCalleeModuleContextOpaque in execution context")
|
||||
}
|
||||
|
||||
// Module context is always the second argument.
|
||||
moduleCtrPtr := x1VReg
|
||||
store := m.allocateInstr()
|
||||
amode := addressMode{kind: addressModeKindRegUnsignedImm12, rn: execCtrPtr, imm: offset}
|
||||
store.asStore(operandNR(moduleCtrPtr), amode, 64)
|
||||
cur = linkInstr(cur, store)
|
||||
}
|
||||
|
||||
// Advances the stack pointer.
|
||||
cur = m.addsAddOrSubStackPointer(cur, spVReg, goCallStackSize, false)
|
||||
|
||||
// Copy the pointer to x15VReg.
|
||||
arg0ret0AddrReg := x15VReg // Caller save, so we can use it for whatever we want.
|
||||
copySp := m.allocateInstr()
|
||||
copySp.asMove64(arg0ret0AddrReg, spVReg)
|
||||
cur = linkInstr(cur, copySp)
|
||||
|
||||
// Next, we need to store all the arguments to the stack in the typical Wasm stack style.
|
||||
for i := range abi.Args[argBegin:] {
|
||||
arg := &abi.Args[argBegin+i]
|
||||
store := m.allocateInstr()
|
||||
var v regalloc.VReg
|
||||
if arg.Kind == backend.ABIArgKindReg {
|
||||
v = arg.Reg
|
||||
} else {
|
||||
cur, v = m.goFunctionCallLoadStackArg(cur, originalArg0Reg, arg,
|
||||
// Caller save, so we can use it for whatever we want.
|
||||
x11VReg, v11VReg)
|
||||
}
|
||||
|
||||
var sizeInBits byte
|
||||
if arg.Type == ssa.TypeV128 {
|
||||
sizeInBits = 128
|
||||
} else {
|
||||
sizeInBits = 64
|
||||
}
|
||||
store.asStore(operandNR(v),
|
||||
addressMode{
|
||||
kind: addressModeKindPostIndex,
|
||||
rn: arg0ret0AddrReg, imm: int64(sizeInBits / 8),
|
||||
}, sizeInBits)
|
||||
cur = linkInstr(cur, store)
|
||||
}
|
||||
|
||||
// Finally, now that we've advanced SP to arg[0]/ret[0], we allocate `frame_size + sliceSize`.
|
||||
var frameSizeReg, sliceSizeReg regalloc.VReg
|
||||
if goCallStackSize > 0 {
|
||||
cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, goCallStackSize)
|
||||
frameSizeReg = tmpRegVReg
|
||||
cur = m.lowerConstantI64AndInsert(cur, x16VReg, sliceSizeInBytes/8)
|
||||
sliceSizeReg = x16VReg
|
||||
} else {
|
||||
frameSizeReg = xzrVReg
|
||||
sliceSizeReg = xzrVReg
|
||||
}
|
||||
_amode := addressModePreOrPostIndex(spVReg, -16, true)
|
||||
storeP := m.allocateInstr()
|
||||
storeP.asStorePair64(frameSizeReg, sliceSizeReg, _amode)
|
||||
cur = linkInstr(cur, storeP)
|
||||
|
||||
// Set the exit status on the execution context.
|
||||
cur = m.setExitCode(cur, x0VReg, exitCode)
|
||||
|
||||
// Save the current stack pointer.
|
||||
cur = m.saveCurrentStackPointer(cur, x0VReg)
|
||||
|
||||
// Exit the execution.
|
||||
cur = m.storeReturnAddressAndExit(cur)
|
||||
|
||||
// After the call, we need to restore the callee saved registers.
|
||||
cur = m.restoreRegistersInExecutionContext(cur, calleeSavedRegistersSorted)
|
||||
|
||||
// Get the pointer to the arg[0]/ret[0]: We need to skip `frame_size + sliceSize`.
|
||||
if len(abi.Rets) > 0 {
|
||||
cur = m.addsAddOrSubStackPointer(cur, arg0ret0AddrReg, frameInfoSize, true)
|
||||
}
|
||||
|
||||
// Advances the SP so that it points to `ReturnAddress`.
|
||||
cur = m.addsAddOrSubStackPointer(cur, spVReg, frameInfoSize+goCallStackSize, true)
|
||||
ldr := m.allocateInstr()
|
||||
// And load the return address.
|
||||
ldr.asULoad(operandNR(lrVReg),
|
||||
addressModePreOrPostIndex(spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64)
|
||||
cur = linkInstr(cur, ldr)
|
||||
|
||||
originalRet0Reg := x17VReg // Caller save, so we can use it for whatever we want.
|
||||
if m.currentABI.RetStackSize > 0 {
|
||||
cur = m.addsAddOrSubStackPointer(cur, originalRet0Reg, m.currentABI.ArgStackSize, true)
|
||||
}
|
||||
|
||||
// Make the SP point to the original address (above the result slot).
|
||||
if s := int64(m.currentABI.AlignedArgResultStackSlotSize()); s > 0 {
|
||||
cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true)
|
||||
}
|
||||
|
||||
for i := range abi.Rets {
|
||||
r := &abi.Rets[i]
|
||||
if r.Kind == backend.ABIArgKindReg {
|
||||
loadIntoReg := m.allocateInstr()
|
||||
mode := addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg}
|
||||
switch r.Type {
|
||||
case ssa.TypeI32:
|
||||
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
|
||||
loadIntoReg.asULoad(operandNR(r.Reg), mode, 32)
|
||||
case ssa.TypeI64:
|
||||
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
|
||||
loadIntoReg.asULoad(operandNR(r.Reg), mode, 64)
|
||||
case ssa.TypeF32:
|
||||
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
|
||||
loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 32)
|
||||
case ssa.TypeF64:
|
||||
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
|
||||
loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 64)
|
||||
case ssa.TypeV128:
|
||||
mode.imm = 16
|
||||
loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 128)
|
||||
default:
|
||||
panic("TODO")
|
||||
}
|
||||
cur = linkInstr(cur, loadIntoReg)
|
||||
} else {
|
||||
// First we need to load the value to a temporary just like ^^.
|
||||
intTmp, floatTmp := x11VReg, v11VReg
|
||||
loadIntoTmpReg := m.allocateInstr()
|
||||
mode := addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg}
|
||||
var resultReg regalloc.VReg
|
||||
switch r.Type {
|
||||
case ssa.TypeI32:
|
||||
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
|
||||
loadIntoTmpReg.asULoad(operandNR(intTmp), mode, 32)
|
||||
resultReg = intTmp
|
||||
case ssa.TypeI64:
|
||||
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
|
||||
loadIntoTmpReg.asULoad(operandNR(intTmp), mode, 64)
|
||||
resultReg = intTmp
|
||||
case ssa.TypeF32:
|
||||
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
|
||||
loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 32)
|
||||
resultReg = floatTmp
|
||||
case ssa.TypeF64:
|
||||
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
|
||||
loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 64)
|
||||
resultReg = floatTmp
|
||||
case ssa.TypeV128:
|
||||
mode.imm = 16
|
||||
loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 128)
|
||||
resultReg = floatTmp
|
||||
default:
|
||||
panic("TODO")
|
||||
}
|
||||
cur = linkInstr(cur, loadIntoTmpReg)
|
||||
cur = m.goFunctionCallStoreStackResult(cur, originalRet0Reg, r, resultReg)
|
||||
}
|
||||
}
|
||||
|
||||
ret := m.allocateInstr()
|
||||
ret.asRet()
|
||||
linkInstr(cur, ret)
|
||||
|
||||
m.encode(m.executableContext.RootInstr)
|
||||
return m.compiler.Buf()
|
||||
}
|
||||
|
||||
func (m *machine) saveRegistersInExecutionContext(cur *instruction, regs []regalloc.VReg) *instruction {
|
||||
offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64()
|
||||
for _, v := range regs {
|
||||
store := m.allocateInstr()
|
||||
var sizeInBits byte
|
||||
switch v.RegType() {
|
||||
case regalloc.RegTypeInt:
|
||||
sizeInBits = 64
|
||||
case regalloc.RegTypeFloat:
|
||||
sizeInBits = 128
|
||||
}
|
||||
store.asStore(operandNR(v),
|
||||
addressMode{
|
||||
kind: addressModeKindRegUnsignedImm12,
|
||||
// Execution context is always the first argument.
|
||||
rn: x0VReg, imm: offset,
|
||||
}, sizeInBits)
|
||||
store.prev = cur
|
||||
cur.next = store
|
||||
cur = store
|
||||
offset += 16 // Imm12 must be aligned 16 for vector regs, so we unconditionally store regs at the offset of multiple of 16.
|
||||
}
|
||||
return cur
|
||||
}
|
||||
|
||||
func (m *machine) restoreRegistersInExecutionContext(cur *instruction, regs []regalloc.VReg) *instruction {
|
||||
offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64()
|
||||
for _, v := range regs {
|
||||
load := m.allocateInstr()
|
||||
var as func(dst operand, amode addressMode, sizeInBits byte)
|
||||
var sizeInBits byte
|
||||
switch v.RegType() {
|
||||
case regalloc.RegTypeInt:
|
||||
as = load.asULoad
|
||||
sizeInBits = 64
|
||||
case regalloc.RegTypeFloat:
|
||||
as = load.asFpuLoad
|
||||
sizeInBits = 128
|
||||
}
|
||||
as(operandNR(v),
|
||||
addressMode{
|
||||
kind: addressModeKindRegUnsignedImm12,
|
||||
// Execution context is always the first argument.
|
||||
rn: x0VReg, imm: offset,
|
||||
}, sizeInBits)
|
||||
cur = linkInstr(cur, load)
|
||||
offset += 16 // Imm12 must be aligned 16 for vector regs, so we unconditionally load regs at the offset of multiple of 16.
|
||||
}
|
||||
return cur
|
||||
}
|
||||
|
||||
func (m *machine) lowerConstantI64AndInsert(cur *instruction, dst regalloc.VReg, v int64) *instruction {
|
||||
exct := m.executableContext
|
||||
exct.PendingInstructions = exct.PendingInstructions[:0]
|
||||
m.lowerConstantI64(dst, v)
|
||||
for _, instr := range exct.PendingInstructions {
|
||||
cur = linkInstr(cur, instr)
|
||||
}
|
||||
return cur
|
||||
}
|
||||
|
||||
func (m *machine) lowerConstantI32AndInsert(cur *instruction, dst regalloc.VReg, v int32) *instruction {
|
||||
exct := m.executableContext
|
||||
exct.PendingInstructions = exct.PendingInstructions[:0]
|
||||
m.lowerConstantI32(dst, v)
|
||||
for _, instr := range exct.PendingInstructions {
|
||||
cur = linkInstr(cur, instr)
|
||||
}
|
||||
return cur
|
||||
}
|
||||
|
||||
func (m *machine) setExitCode(cur *instruction, execCtr regalloc.VReg, exitCode wazevoapi.ExitCode) *instruction {
|
||||
constReg := x17VReg // caller-saved, so we can use it.
|
||||
cur = m.lowerConstantI32AndInsert(cur, constReg, int32(exitCode))
|
||||
|
||||
// Set the exit status on the execution context.
|
||||
setExistStatus := m.allocateInstr()
|
||||
setExistStatus.asStore(operandNR(constReg),
|
||||
addressMode{
|
||||
kind: addressModeKindRegUnsignedImm12,
|
||||
rn: execCtr, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(),
|
||||
}, 32)
|
||||
cur = linkInstr(cur, setExistStatus)
|
||||
return cur
|
||||
}
|
||||
|
||||
func (m *machine) storeReturnAddressAndExit(cur *instruction) *instruction {
|
||||
// Read the return address into tmp, and store it in the execution context.
|
||||
adr := m.allocateInstr()
|
||||
adr.asAdr(tmpRegVReg, exitSequenceSize+8)
|
||||
cur = linkInstr(cur, adr)
|
||||
|
||||
storeReturnAddr := m.allocateInstr()
|
||||
storeReturnAddr.asStore(operandNR(tmpRegVReg),
|
||||
addressMode{
|
||||
kind: addressModeKindRegUnsignedImm12,
|
||||
// Execution context is always the first argument.
|
||||
rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(),
|
||||
}, 64)
|
||||
cur = linkInstr(cur, storeReturnAddr)
|
||||
|
||||
// Exit the execution.
|
||||
trapSeq := m.allocateInstr()
|
||||
trapSeq.asExitSequence(x0VReg)
|
||||
cur = linkInstr(cur, trapSeq)
|
||||
return cur
|
||||
}
|
||||
|
||||
func (m *machine) saveCurrentStackPointer(cur *instruction, execCtr regalloc.VReg) *instruction {
|
||||
// Save the current stack pointer:
|
||||
// mov tmp, sp,
|
||||
// str tmp, [exec_ctx, #stackPointerBeforeGoCall]
|
||||
movSp := m.allocateInstr()
|
||||
movSp.asMove64(tmpRegVReg, spVReg)
|
||||
cur = linkInstr(cur, movSp)
|
||||
|
||||
strSp := m.allocateInstr()
|
||||
strSp.asStore(operandNR(tmpRegVReg),
|
||||
addressMode{
|
||||
kind: addressModeKindRegUnsignedImm12,
|
||||
rn: execCtr, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(),
|
||||
}, 64)
|
||||
cur = linkInstr(cur, strSp)
|
||||
return cur
|
||||
}
|
||||
|
||||
func (m *machine) goFunctionCallLoadStackArg(cur *instruction, originalArg0Reg regalloc.VReg, arg *backend.ABIArg, intVReg, floatVReg regalloc.VReg) (*instruction, regalloc.VReg) {
|
||||
load := m.allocateInstr()
|
||||
var result regalloc.VReg
|
||||
mode := addressMode{kind: addressModeKindPostIndex, rn: originalArg0Reg}
|
||||
switch arg.Type {
|
||||
case ssa.TypeI32:
|
||||
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
|
||||
load.asULoad(operandNR(intVReg), mode, 32)
|
||||
result = intVReg
|
||||
case ssa.TypeI64:
|
||||
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
|
||||
load.asULoad(operandNR(intVReg), mode, 64)
|
||||
result = intVReg
|
||||
case ssa.TypeF32:
|
||||
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
|
||||
load.asFpuLoad(operandNR(floatVReg), mode, 32)
|
||||
result = floatVReg
|
||||
case ssa.TypeF64:
|
||||
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
|
||||
load.asFpuLoad(operandNR(floatVReg), mode, 64)
|
||||
result = floatVReg
|
||||
case ssa.TypeV128:
|
||||
mode.imm = 16
|
||||
load.asFpuLoad(operandNR(floatVReg), mode, 128)
|
||||
result = floatVReg
|
||||
default:
|
||||
panic("TODO")
|
||||
}
|
||||
|
||||
cur = linkInstr(cur, load)
|
||||
return cur, result
|
||||
}
|
||||
|
||||
func (m *machine) goFunctionCallStoreStackResult(cur *instruction, originalRet0Reg regalloc.VReg, result *backend.ABIArg, resultVReg regalloc.VReg) *instruction {
|
||||
store := m.allocateInstr()
|
||||
mode := addressMode{kind: addressModeKindPostIndex, rn: originalRet0Reg}
|
||||
var sizeInBits byte
|
||||
switch result.Type {
|
||||
case ssa.TypeI32, ssa.TypeF32:
|
||||
mode.imm = 8
|
||||
sizeInBits = 32
|
||||
case ssa.TypeI64, ssa.TypeF64:
|
||||
mode.imm = 8
|
||||
sizeInBits = 64
|
||||
case ssa.TypeV128:
|
||||
mode.imm = 16
|
||||
sizeInBits = 128
|
||||
default:
|
||||
panic("TODO")
|
||||
}
|
||||
store.asStore(operandNR(resultVReg), mode, sizeInBits)
|
||||
return linkInstr(cur, store)
|
||||
}
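GoFunctionCallRequiredStackSize, called above, has to reserve room for whichever of the argument list or the result list is larger, in 8-byte slots (16 for v128), padded to 16-byte alignment. A standalone sketch of that sizing rule under those assumptions; the real helper lives in the backend package and may differ in detail:

package main

import "fmt"

// requiredSize sketches the sizing rule: every value takes one 8-byte slot
// except v128 which takes two, the arg and result regions share the same
// space, and the total is padded to 16-byte alignment.
func requiredSize(argSlots, retSlots []int64) int64 {
	sum := func(slots []int64) (n int64) {
		for _, s := range slots {
			n += s
		}
		return
	}
	size := sum(argSlots)
	if r := sum(retSlots); r > size {
		size = r
	}
	return (size + 15) &^ 15
}

func main() {
	// Three i64 args (8 bytes each), one v128 result (16 bytes).
	fmt.Println(requiredSize([]int64{8, 8, 8}, []int64{16})) // 32
}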
215
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/cond.go
generated
vendored
Normal file
@@ -0,0 +1,215 @@
package arm64
|
||||
|
||||
import (
|
||||
"strconv"
|
||||
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
|
||||
)
|
||||
|
||||
type (
|
||||
cond uint64
|
||||
condKind byte
|
||||
)
|
||||
|
||||
const (
|
||||
// condKindRegisterZero represents a condition which checks if the register is zero.
|
||||
// This indicates that the instruction must be encoded as CBZ:
|
||||
// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/CBZ--Compare-and-Branch-on-Zero-
|
||||
condKindRegisterZero condKind = iota
|
||||
// condKindRegisterNotZero indicates that the instruction must be encoded as CBNZ:
|
||||
// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/CBNZ--Compare-and-Branch-on-Nonzero-
|
||||
condKindRegisterNotZero
|
||||
// condKindCondFlagSet indicates that the instruction must be encoded as B.cond:
|
||||
// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/B-cond--Branch-conditionally-
|
||||
condKindCondFlagSet
|
||||
)
|
||||
|
||||
// kind returns the kind of condition which is stored in the first two bits.
|
||||
func (c cond) kind() condKind {
|
||||
return condKind(c & 0b11)
|
||||
}
|
||||
|
||||
func (c cond) asUint64() uint64 {
|
||||
return uint64(c)
|
||||
}
|
||||
|
||||
// register returns the register for register conditions.
|
||||
// This panics if the condition is not a register condition (condKindRegisterZero or condKindRegisterNotZero).
|
||||
func (c cond) register() regalloc.VReg {
|
||||
if c.kind() != condKindRegisterZero && c.kind() != condKindRegisterNotZero {
|
||||
panic("condition is not a register")
|
||||
}
|
||||
return regalloc.VReg(c >> 2)
|
||||
}
|
||||
|
||||
func registerAsRegZeroCond(r regalloc.VReg) cond {
|
||||
return cond(r)<<2 | cond(condKindRegisterZero)
|
||||
}
|
||||
|
||||
func registerAsRegNotZeroCond(r regalloc.VReg) cond {
|
||||
return cond(r)<<2 | cond(condKindRegisterNotZero)
|
||||
}
|
||||
|
||||
func (c cond) flag() condFlag {
|
||||
if c.kind() != condKindCondFlagSet {
|
||||
panic("condition is not a flag")
|
||||
}
|
||||
return condFlag(c >> 2)
|
||||
}
|
||||
|
||||
func (c condFlag) asCond() cond {
|
||||
return cond(c)<<2 | cond(condKindCondFlagSet)
|
||||
}
|
||||
|
||||
// condFlag represents a condition flag for conditional branches.
|
||||
// The value matches the encoding of condition flags in the ARM64 instruction set.
|
||||
// https://developer.arm.com/documentation/den0024/a/The-A64-instruction-set/Data-processing-instructions/Conditional-instructions
|
||||
type condFlag uint8
|
||||
|
||||
const (
|
||||
eq condFlag = iota // eq represents "equal"
|
||||
ne // ne represents "not equal"
|
||||
hs // hs represents "higher or same"
|
||||
lo // lo represents "lower"
|
||||
mi // mi represents "minus or negative result"
|
||||
pl // pl represents "plus or positive result"
|
||||
vs // vs represents "overflow set"
|
||||
vc // vc represents "overflow clear"
|
||||
hi // hi represents "higher"
|
||||
ls // ls represents "lower or same"
|
||||
ge // ge represents "greater or equal"
|
||||
lt // lt represents "less than"
|
||||
gt // gt represents "greater than"
|
||||
le // le represents "less than or equal"
|
||||
al // al represents "always"
|
||||
nv // nv represents "never"
|
||||
)
|
||||
|
||||
// invert returns the inverted condition.
|
||||
func (c condFlag) invert() condFlag {
|
||||
switch c {
|
||||
case eq:
|
||||
return ne
|
||||
case ne:
|
||||
return eq
|
||||
case hs:
|
||||
return lo
|
||||
case lo:
|
||||
return hs
|
||||
case mi:
|
||||
return pl
|
||||
case pl:
|
||||
return mi
|
||||
case vs:
|
||||
return vc
|
||||
case vc:
|
||||
return vs
|
||||
case hi:
|
||||
return ls
|
||||
case ls:
|
||||
return hi
|
||||
case ge:
|
||||
return lt
|
||||
case lt:
|
||||
return ge
|
||||
case gt:
|
||||
return le
|
||||
case le:
|
||||
return gt
|
||||
case al:
|
||||
return nv
|
||||
case nv:
|
||||
return al
|
||||
default:
|
||||
panic(c)
|
||||
}
|
||||
}
|
||||
|
||||
// String implements fmt.Stringer.
|
||||
func (c condFlag) String() string {
|
||||
switch c {
|
||||
case eq:
|
||||
return "eq"
|
||||
case ne:
|
||||
return "ne"
|
||||
case hs:
|
||||
return "hs"
|
||||
case lo:
|
||||
return "lo"
|
||||
case mi:
|
||||
return "mi"
|
||||
case pl:
|
||||
return "pl"
|
||||
case vs:
|
||||
return "vs"
|
||||
case vc:
|
||||
return "vc"
|
||||
case hi:
|
||||
return "hi"
|
||||
case ls:
|
||||
return "ls"
|
||||
case ge:
|
||||
return "ge"
|
||||
case lt:
|
||||
return "lt"
|
||||
case gt:
|
||||
return "gt"
|
||||
case le:
|
||||
return "le"
|
||||
case al:
|
||||
return "al"
|
||||
case nv:
|
||||
return "nv"
|
||||
default:
|
||||
panic(strconv.Itoa(int(c)))
|
||||
}
|
||||
}
|
||||
|
||||
// condFlagFromSSAIntegerCmpCond returns the condition flag for the given ssa.IntegerCmpCond.
|
||||
func condFlagFromSSAIntegerCmpCond(c ssa.IntegerCmpCond) condFlag {
|
||||
switch c {
|
||||
case ssa.IntegerCmpCondEqual:
|
||||
return eq
|
||||
case ssa.IntegerCmpCondNotEqual:
|
||||
return ne
|
||||
case ssa.IntegerCmpCondSignedLessThan:
|
||||
return lt
|
||||
case ssa.IntegerCmpCondSignedGreaterThanOrEqual:
|
||||
return ge
|
||||
case ssa.IntegerCmpCondSignedGreaterThan:
|
||||
return gt
|
||||
case ssa.IntegerCmpCondSignedLessThanOrEqual:
|
||||
return le
|
||||
case ssa.IntegerCmpCondUnsignedLessThan:
|
||||
return lo
|
||||
case ssa.IntegerCmpCondUnsignedGreaterThanOrEqual:
|
||||
return hs
|
||||
case ssa.IntegerCmpCondUnsignedGreaterThan:
|
||||
return hi
|
||||
case ssa.IntegerCmpCondUnsignedLessThanOrEqual:
|
||||
return ls
|
||||
default:
|
||||
panic(c)
|
||||
}
|
||||
}
|
||||
|
||||
// condFlagFromSSAFloatCmpCond returns the condition flag for the given ssa.FloatCmpCond.
|
||||
func condFlagFromSSAFloatCmpCond(c ssa.FloatCmpCond) condFlag {
|
||||
switch c {
|
||||
case ssa.FloatCmpCondEqual:
|
||||
return eq
|
||||
case ssa.FloatCmpCondNotEqual:
|
||||
return ne
|
||||
case ssa.FloatCmpCondLessThan:
|
||||
return mi
|
||||
case ssa.FloatCmpCondLessThanOrEqual:
|
||||
return ls
|
||||
case ssa.FloatCmpCondGreaterThan:
|
||||
return gt
|
||||
case ssa.FloatCmpCondGreaterThanOrEqual:
|
||||
return ge
|
||||
default:
|
||||
panic(c)
|
||||
}
|
||||
}
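cond above packs its kind into the low two bits and the payload (a register or a condition flag) into the remaining bits, which is why kind() masks with 0b11 and the constructors shift by 2. A tiny round-trip check of the same shift/mask layout, using plain integers rather than the wazero types:

package main

import "fmt"

const (
	kindRegisterZero    = 0
	kindRegisterNotZero = 1
	kindCondFlagSet     = 2
)

// encode/decode mirror the cond layout above: payload<<2 | kind.
func encode(payload, kind uint64) uint64 { return payload<<2 | kind }
func kindOf(c uint64) uint64             { return c & 0b11 }
func payloadOf(c uint64) uint64          { return c >> 2 }

func main() {
	c := encode(30, kindRegisterNotZero) // e.g. "branch if register #30 is non-zero"
	fmt.Println(kindOf(c) == kindRegisterNotZero, payloadOf(c)) // true 30
}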
2545
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go
generated
vendored
Normal file
File diff suppressed because it is too large
2351
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go
generated
vendored
Normal file
File diff suppressed because it is too large
301
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go
generated
vendored
Normal file
301
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go
generated
vendored
Normal file
|
|
@ -0,0 +1,301 @@
|
|||
package arm64
|
||||
|
||||
import (
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
|
||||
)
|
||||
|
||||
// lowerConstant allocates a new VReg and inserts the instruction to load the constant value.
|
||||
func (m *machine) lowerConstant(instr *ssa.Instruction) (vr regalloc.VReg) {
|
||||
val := instr.Return()
|
||||
valType := val.Type()
|
||||
|
||||
vr = m.compiler.AllocateVReg(valType)
|
||||
v := instr.ConstantVal()
|
||||
m.insertLoadConstant(v, valType, vr)
|
||||
return
|
||||
}
|
||||
|
||||
// InsertLoadConstantBlockArg implements backend.Machine.
|
||||
func (m *machine) InsertLoadConstantBlockArg(instr *ssa.Instruction, vr regalloc.VReg) {
|
||||
val := instr.Return()
|
||||
valType := val.Type()
|
||||
v := instr.ConstantVal()
|
||||
load := m.allocateInstr()
|
||||
load.asLoadConstBlockArg(v, valType, vr)
|
||||
m.insert(load)
|
||||
}
|
||||
|
||||
func (m *machine) lowerLoadConstantBlockArgAfterRegAlloc(i *instruction) {
|
||||
v, typ, dst := i.loadConstBlockArgData()
|
||||
m.insertLoadConstant(v, typ, dst)
|
||||
}
|
||||
|
||||
func (m *machine) insertLoadConstant(v uint64, valType ssa.Type, vr regalloc.VReg) {
|
||||
if valType.Bits() < 64 { // Clear the redundant bits just in case it's unexpectedly sign-extended, etc.
|
||||
v = v & ((1 << valType.Bits()) - 1)
|
||||
}
|
||||
|
||||
switch valType {
|
||||
case ssa.TypeF32:
|
||||
loadF := m.allocateInstr()
|
||||
loadF.asLoadFpuConst32(vr, v)
|
||||
m.insert(loadF)
|
||||
case ssa.TypeF64:
|
||||
loadF := m.allocateInstr()
|
||||
loadF.asLoadFpuConst64(vr, v)
|
||||
m.insert(loadF)
|
||||
case ssa.TypeI32:
|
||||
if v == 0 {
|
||||
m.InsertMove(vr, xzrVReg, ssa.TypeI32)
|
||||
} else {
|
||||
m.lowerConstantI32(vr, int32(v))
|
||||
}
|
||||
case ssa.TypeI64:
|
||||
if v == 0 {
|
||||
m.InsertMove(vr, xzrVReg, ssa.TypeI64)
|
||||
} else {
|
||||
m.lowerConstantI64(vr, int64(v))
|
||||
}
|
||||
default:
|
||||
panic("TODO")
|
||||
}
|
||||
}
|
||||
|
||||
// The following logic is based on the old asm/arm64 package.
|
||||
// https://github.com/tetratelabs/wazero/blob/39f2ff23a6d609e10c82b9cc0b981f6de5b87a9c/internal/asm/arm64/impl.go
|
||||
|
||||
func (m *machine) lowerConstantI32(dst regalloc.VReg, c int32) {
|
||||
// Following the logic here:
|
||||
// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L1637
|
||||
ic := int64(uint32(c))
|
||||
if ic >= 0 && (ic <= 0xfff || (ic&0xfff) == 0 && (uint64(ic>>12) <= 0xfff)) {
|
||||
if isBitMaskImmediate(uint64(c), false) {
|
||||
m.lowerConstViaBitMaskImmediate(uint64(uint32(c)), dst, false)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
if t := const16bitAligned(int64(uint32(c))); t >= 0 {
|
||||
// If the const fits within a single 16-bit aligned chunk, for example, 0xffff, 0xffff_0000 or 0xffff_0000_0000_0000,
// we can load it with a single MOVZ.
|
||||
m.insertMOVZ(dst, uint64(uint32(c)>>(16*t)), t, false)
|
||||
} else if t := const16bitAligned(int64(^c)); t >= 0 {
|
||||
// Also, if the bitwise inverse of the const fits within a single 16-bit aligned chunk, do the same with MOVN.
|
||||
m.insertMOVN(dst, uint64(^c>>(16*t)), t, false)
|
||||
} else if isBitMaskImmediate(uint64(uint32(c)), false) {
|
||||
m.lowerConstViaBitMaskImmediate(uint64(c), dst, false)
|
||||
} else {
|
||||
// Otherwise, we use MOVZ and MOVK to load it.
|
||||
c16 := uint16(c)
|
||||
m.insertMOVZ(dst, uint64(c16), 0, false)
|
||||
c16 = uint16(uint32(c) >> 16)
|
||||
m.insertMOVK(dst, uint64(c16), 1, false)
|
||||
}
|
||||
}
|
||||
|
||||
func (m *machine) lowerConstantI64(dst regalloc.VReg, c int64) {
|
||||
// Following the logic here:
|
||||
// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L1798-L1852
|
||||
if c >= 0 && (c <= 0xfff || (c&0xfff) == 0 && (uint64(c>>12) <= 0xfff)) {
|
||||
if isBitMaskImmediate(uint64(c), true) {
|
||||
m.lowerConstViaBitMaskImmediate(uint64(c), dst, true)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
if t := const16bitAligned(c); t >= 0 {
|
||||
// If the const fits within a single 16-bit aligned chunk, for example, 0xffff, 0xffff_0000 or 0xffff_0000_0000_0000,
// we can load it with a single MOVZ.
|
||||
m.insertMOVZ(dst, uint64(c)>>(16*t), t, true)
|
||||
} else if t := const16bitAligned(^c); t >= 0 {
|
||||
// Also, if the bitwise inverse of the const fits within a single 16-bit aligned chunk, do the same with MOVN.
|
||||
m.insertMOVN(dst, uint64(^c)>>(16*t), t, true)
|
||||
} else if isBitMaskImmediate(uint64(c), true) {
|
||||
m.lowerConstViaBitMaskImmediate(uint64(c), dst, true)
|
||||
} else {
|
||||
m.load64bitConst(c, dst)
|
||||
}
|
||||
}
|
||||
|
||||
func (m *machine) lowerConstViaBitMaskImmediate(c uint64, dst regalloc.VReg, b64 bool) {
|
||||
instr := m.allocateInstr()
|
||||
instr.asALUBitmaskImm(aluOpOrr, dst, xzrVReg, c, b64)
|
||||
m.insert(instr)
|
||||
}
|
||||
|
||||
// isBitMaskImmediate determines if the value can be encoded as "bitmask immediate".
|
||||
//
|
||||
// Such an immediate is a 32-bit or 64-bit pattern viewed as a vector of identical elements of size e = 2, 4, 8, 16, 32, or 64 bits.
|
||||
// Each element contains the same sub-pattern: a single run of 1 to e-1 non-zero bits, rotated by 0 to e-1 bits.
|
||||
//
|
||||
// See https://developer.arm.com/documentation/dui0802/b/A64-General-Instructions/MOV--bitmask-immediate-
|
||||
func isBitMaskImmediate(x uint64, _64 bool) bool {
|
||||
// All zeros and ones are not "bitmask immediate" by definition.
|
||||
if x == 0 || (_64 && x == 0xffff_ffff_ffff_ffff) || (!_64 && x == 0xffff_ffff) {
|
||||
return false
|
||||
}
|
||||
|
||||
switch {
|
||||
case x != x>>32|x<<32:
|
||||
// e = 64
|
||||
case x != x>>16|x<<48:
|
||||
// e = 32 (x == x>>32|x<<32).
|
||||
// e.g. 0x00ff_ff00_00ff_ff00
|
||||
x = uint64(int32(x))
|
||||
case x != x>>8|x<<56:
|
||||
// e = 16 (x == x>>16|x<<48).
|
||||
// e.g. 0x00ff_00ff_00ff_00ff
|
||||
x = uint64(int16(x))
|
||||
case x != x>>4|x<<60:
|
||||
// e = 8 (x == x>>8|x<<56).
|
||||
// e.g. 0x0f0f_0f0f_0f0f_0f0f
|
||||
x = uint64(int8(x))
|
||||
default:
|
||||
// e = 4 or 2.
|
||||
return true
|
||||
}
|
||||
return sequenceOfSetbits(x) || sequenceOfSetbits(^x)
|
||||
}
|
||||
|
||||
// sequenceOfSetbits returns true if the number's set bits form one contiguous sequence of 1s.
// For example: 0b1110 -> true, 0b1010 -> false
|
||||
func sequenceOfSetbits(x uint64) bool {
|
||||
y := getLowestBit(x)
|
||||
// If x is a sequence of set bits, this results in a number
// with only one set bit (i.e. a power of two).
|
||||
y += x
|
||||
return (y-1)&y == 0
|
||||
}
|
||||
|
||||
func getLowestBit(x uint64) uint64 {
|
||||
return x & (^x + 1)
|
||||
}
|
||||
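For readers unfamiliar with AArch64 bitmask immediates, here is a minimal standalone sketch that mirrors the folding steps of isBitMaskImmediate and the contiguous-run test of sequenceOfSetbits/getLowestBit above, for 64-bit operands only; contiguousOnes and isBitMask64 are names local to this sketch.

package main

import "fmt"

// contiguousOnes reports whether v's set bits form one contiguous run (e.g. 0b0111_1000).
func contiguousOnes(v uint64) bool {
	y := v&(^v+1) + v // add the lowest set bit; a single run becomes a power of two.
	return (y-1)&y == 0
}

// isBitMask64 mirrors the folding steps of isBitMaskImmediate above for 64-bit operands.
func isBitMask64(x uint64) bool {
	if x == 0 || x == 0xffff_ffff_ffff_ffff {
		return false // all-zeros and all-ones are excluded by definition.
	}
	switch {
	case x != x>>32|x<<32: // element size e = 64: nothing to fold.
	case x != x>>16|x<<48: // e = 32: keep a single (sign-extended) element.
		x = uint64(int32(x))
	case x != x>>8|x<<56: // e = 16.
		x = uint64(int16(x))
	case x != x>>4|x<<60: // e = 8.
		x = uint64(int8(x))
	default: // e = 4 or 2 always satisfies the rotated-run rule.
		return true
	}
	return contiguousOnes(x) || contiguousOnes(^x)
}

func main() {
	for _, v := range []uint64{
		0x0000_0000_0000_00ff, // one run of 8 ones: encodable.
		0x00ff_00ff_00ff_00ff, // repeating 16-bit element: encodable.
		0x0000_0000_dead_beef, // arbitrary pattern: not encodable.
	} {
		fmt.Printf("%#x -> %v\n", v, isBitMask64(v))
	}
}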
|
||||
// const16bitAligned checks if the value fits entirely within a single 16-bit aligned chunk.
// If so, it returns the shift amount divided by 16; otherwise it returns -1.
|
||||
func const16bitAligned(v int64) (ret int) {
|
||||
ret = -1
|
||||
for s := 0; s < 64; s += 16 {
|
||||
if (uint64(v) &^ (uint64(0xffff) << uint(s))) == 0 {
|
||||
ret = s / 16
|
||||
break
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// load64bitConst loads a 64-bit constant into the register, following the same logic to decide how to load large 64-bit
|
||||
// consts as in the Go assembler.
|
||||
//
|
||||
// See https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L6632-L6759
|
||||
func (m *machine) load64bitConst(c int64, dst regalloc.VReg) {
|
||||
var bits [4]uint64
|
||||
var zeros, negs int
|
||||
for i := 0; i < 4; i++ {
|
||||
bits[i] = uint64(c) >> uint(i*16) & 0xffff
|
||||
if v := bits[i]; v == 0 {
|
||||
zeros++
|
||||
} else if v == 0xffff {
|
||||
negs++
|
||||
}
|
||||
}
|
||||
|
||||
if zeros == 3 {
|
||||
// one MOVZ instruction.
|
||||
for i, v := range bits {
|
||||
if v != 0 {
|
||||
m.insertMOVZ(dst, v, i, true)
|
||||
}
|
||||
}
|
||||
} else if negs == 3 {
|
||||
// one MOVN instruction.
|
||||
for i, v := range bits {
|
||||
if v != 0xffff {
|
||||
v = ^v
|
||||
m.insertMOVN(dst, v, i, true)
|
||||
}
|
||||
}
|
||||
} else if zeros == 2 {
|
||||
// one MOVZ then one MOVK.
|
||||
var movz bool
|
||||
for i, v := range bits {
|
||||
if !movz && v != 0 { // MOVZ.
|
||||
m.insertMOVZ(dst, v, i, true)
|
||||
movz = true
|
||||
} else if v != 0 {
|
||||
m.insertMOVK(dst, v, i, true)
|
||||
}
|
||||
}
|
||||
|
||||
} else if negs == 2 {
|
||||
// one MOVN then one MOVK.
|
||||
var movn bool
|
||||
for i, v := range bits { // Emit MOVN.
|
||||
if !movn && v != 0xffff {
|
||||
v = ^v
|
||||
// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN
|
||||
m.insertMOVN(dst, v, i, true)
|
||||
movn = true
|
||||
} else if v != 0xffff {
|
||||
m.insertMOVK(dst, v, i, true)
|
||||
}
|
||||
}
|
||||
|
||||
} else if zeros == 1 {
|
||||
// one MOVZ then two MOVK.
|
||||
var movz bool
|
||||
for i, v := range bits {
|
||||
if !movz && v != 0 { // MOVZ.
|
||||
m.insertMOVZ(dst, v, i, true)
|
||||
movz = true
|
||||
} else if v != 0 {
|
||||
m.insertMOVK(dst, v, i, true)
|
||||
}
|
||||
}
|
||||
|
||||
} else if negs == 1 {
|
||||
// one MOVN then two MOVK.
|
||||
var movn bool
|
||||
for i, v := range bits { // Emit MOVN.
|
||||
if !movn && v != 0xffff {
|
||||
v = ^v
|
||||
// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN
|
||||
m.insertMOVN(dst, v, i, true)
|
||||
movn = true
|
||||
} else if v != 0xffff {
|
||||
m.insertMOVK(dst, v, i, true)
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
// one MOVZ then up to three MOVK.
|
||||
var movz bool
|
||||
for i, v := range bits {
|
||||
if !movz && v != 0 { // MOVZ.
|
||||
m.insertMOVZ(dst, v, i, true)
|
||||
movz = true
|
||||
} else if v != 0 {
|
||||
m.insertMOVK(dst, v, i, true)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (m *machine) insertMOVZ(dst regalloc.VReg, v uint64, shift int, dst64 bool) {
|
||||
instr := m.allocateInstr()
|
||||
instr.asMOVZ(dst, v, uint64(shift), dst64)
|
||||
m.insert(instr)
|
||||
}
|
||||
|
||||
func (m *machine) insertMOVK(dst regalloc.VReg, v uint64, shift int, dst64 bool) {
|
||||
instr := m.allocateInstr()
|
||||
instr.asMOVK(dst, v, uint64(shift), dst64)
|
||||
m.insert(instr)
|
||||
}
|
||||
|
||||
func (m *machine) insertMOVN(dst regalloc.VReg, v uint64, shift int, dst64 bool) {
|
||||
instr := m.allocateInstr()
|
||||
instr.asMOVN(dst, v, uint64(shift), dst64)
|
||||
m.insert(instr)
|
||||
}
|
||||
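To make the halfword bookkeeping in load64bitConst above easier to follow, the sketch below prints one possible MOVZ/MOVN/MOVK sequence for a constant. It collapses the backend's five cases into two plans (MOVZ-first vs MOVN-first), so treat it as a simplified illustration rather than the exact emission logic.

package main

import "fmt"

// plan prints a MOVZ/MOVN/MOVK sequence that materializes c in x0.
func plan(c uint64) {
	var bits [4]uint64
	var zeros, negs int
	for i := 0; i < 4; i++ {
		bits[i] = c >> (16 * i) & 0xffff
		switch bits[i] {
		case 0:
			zeros++
		case 0xffff:
			negs++
		}
	}

	useMOVN := negs > zeros // mostly all-ones halfwords: start from ^0 via MOVN.
	skip := uint64(0)
	if useMOVN {
		skip = 0xffff
	}

	first := true
	for i, v := range bits {
		if v == skip {
			continue // this halfword already has the right value after the first instruction.
		}
		op := "movk"
		if first {
			first = false
			if useMOVN {
				op, v = "movn", ^v&0xffff // MOVN writes the bitwise NOT of its (shifted) immediate.
			} else {
				op = "movz"
			}
		}
		fmt.Printf("%s x0, #%#x, lsl #%d\n", op, v, 16*i)
	}
	if first { // c is 0 (or all-ones under the MOVN plan): still emit one instruction.
		if useMOVN {
			fmt.Println("movn x0, #0x0")
		} else {
			fmt.Println("movz x0, #0x0")
		}
	}
}

func main() {
	plan(0x0000_1234_0000_5678) // -> movz #0x5678, then movk #0x1234, lsl #32.
	plan(0xffff_ffff_ffff_89ab) // -> a single movn #0x7654, lsl #0.
}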
2221 vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go generated vendored Normal file (diff suppressed because it is too large)
350 vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr_operands.go generated vendored Normal file
@@ -0,0 +1,350 @@
package arm64
|
||||
|
||||
// This file contains the logic to "find and determine operands" for instructions.
|
||||
// In order to finalize the form of an operand, we might end up merging/eliminating
|
||||
// the source instructions into an operand whenever possible.
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
|
||||
)
|
||||
|
||||
type (
|
||||
// operand represents an operand of an instruction whose type is determined by the kind.
|
||||
operand struct {
|
||||
kind operandKind
|
||||
data, data2 uint64
|
||||
}
|
||||
operandKind byte
|
||||
)
|
||||
|
||||
// Here's the list of operand kinds. We use the abbreviation of the kind name not only for these consts,
|
||||
// but also names of functions which return the operand of the kind.
|
||||
const (
|
||||
// operandKindNR represents "NormalRegister" (NR). This is literally the register without any special operation unlike others.
|
||||
operandKindNR operandKind = iota
|
||||
// operandKindSR represents "Shifted Register" (SR). This is a register which is shifted by a constant.
|
||||
// Some of the arm64 instructions can take this kind of operand.
|
||||
operandKindSR
|
||||
// operandKindER represents "Extended Register" (ER). This is a register which is sign/zero-extended to a larger size.
|
||||
// Some of the arm64 instructions can take this kind of operand.
|
||||
operandKindER
|
||||
// operandKindImm12 represents "Immediate 12" (Imm12). This is a 12-bit immediate value which can be either shifted by 12 or not.
|
||||
// See asImm12 function for detail.
|
||||
operandKindImm12
|
||||
// operandKindShiftImm represents "Shifted Immediate" (ShiftImm) used by shift operations.
|
||||
operandKindShiftImm
|
||||
)
|
||||
|
||||
// format returns a string representation of the operand, with registers printed at the given size, for debugging.
|
||||
func (o operand) format(size byte) string {
|
||||
switch o.kind {
|
||||
case operandKindNR:
|
||||
return formatVRegSized(o.nr(), size)
|
||||
case operandKindSR:
|
||||
r, amt, sop := o.sr()
|
||||
return fmt.Sprintf("%s, %s #%d", formatVRegSized(r, size), sop, amt)
|
||||
case operandKindER:
|
||||
r, eop, _ := o.er()
|
||||
return fmt.Sprintf("%s %s", formatVRegSized(r, size), eop)
|
||||
case operandKindImm12:
|
||||
imm12, shiftBit := o.imm12()
|
||||
if shiftBit == 1 {
|
||||
return fmt.Sprintf("#%#x", uint64(imm12)<<12)
|
||||
} else {
|
||||
return fmt.Sprintf("#%#x", imm12)
|
||||
}
|
||||
default:
|
||||
panic(fmt.Sprintf("unknown operand kind: %d", o.kind))
|
||||
}
|
||||
}
|
||||
|
||||
// operandNR encodes the given VReg as an operand of operandKindNR.
|
||||
func operandNR(r regalloc.VReg) operand {
|
||||
return operand{kind: operandKindNR, data: uint64(r)}
|
||||
}
|
||||
|
||||
// nr decodes the underlying VReg assuming the operand is of operandKindNR.
|
||||
func (o operand) nr() regalloc.VReg {
|
||||
return regalloc.VReg(o.data)
|
||||
}
|
||||
|
||||
// operandER encodes the given VReg as an operand of operandKindER.
|
||||
func operandER(r regalloc.VReg, eop extendOp, to byte) operand {
|
||||
if to < 32 {
|
||||
panic("TODO?BUG?: when we need to extend to less than 32 bits?")
|
||||
}
|
||||
return operand{kind: operandKindER, data: uint64(r), data2: uint64(eop)<<32 | uint64(to)}
|
||||
}
|
||||
|
||||
// er decodes the underlying VReg, extend operation, and the target size assuming the operand is of operandKindER.
|
||||
func (o operand) er() (r regalloc.VReg, eop extendOp, to byte) {
|
||||
return regalloc.VReg(o.data), extendOp(o.data2>>32) & 0xff, byte(o.data2 & 0xff)
|
||||
}
|
||||
|
||||
// operandSR encodes the given VReg as an operand of operandKindSR.
|
||||
func operandSR(r regalloc.VReg, amt byte, sop shiftOp) operand {
|
||||
return operand{kind: operandKindSR, data: uint64(r), data2: uint64(amt)<<32 | uint64(sop)}
|
||||
}
|
||||
|
||||
// sr decodes the underlying VReg, shift amount, and shift operation assuming the operand is of operandKindSR.
|
||||
func (o operand) sr() (r regalloc.VReg, amt byte, sop shiftOp) {
|
||||
return regalloc.VReg(o.data), byte(o.data2>>32) & 0xff, shiftOp(o.data2) & 0xff
|
||||
}
|
||||
|
||||
// operandImm12 encodes the given imm12 as an operand of operandKindImm12.
|
||||
func operandImm12(imm12 uint16, shiftBit byte) operand {
|
||||
return operand{kind: operandKindImm12, data: uint64(imm12) | uint64(shiftBit)<<32}
|
||||
}
|
||||
|
||||
// imm12 decodes the underlying imm12 data assuming the operand is of operandKindImm12.
|
||||
func (o operand) imm12() (v uint16, shiftBit byte) {
|
||||
return uint16(o.data), byte(o.data >> 32)
|
||||
}
|
||||
|
||||
// operandShiftImm encodes the given amount as an operand of operandKindShiftImm.
|
||||
func operandShiftImm(amount byte) operand {
|
||||
return operand{kind: operandKindShiftImm, data: uint64(amount)}
|
||||
}
|
||||
|
||||
// shiftImm decodes the underlying shift amount data assuming the operand is of operandKindShiftImm.
|
||||
func (o operand) shiftImm() byte {
|
||||
return byte(o.data)
|
||||
}
|
||||
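The operand encoders above pack several fields into data/data2; as a sanity check of that layout, here is a standalone round-trip sketch of the shifted-register packing used by operandSR and sr. packedSR and the value 0 standing in for shiftOpLSL are assumptions local to this sketch.

package main

import "fmt"

// packedSR keeps the shift amount in bits 32..39 of data2 and the shift-op code in the low byte.
type packedSR struct{ data2 uint64 }

func pack(amt, sop byte) packedSR {
	return packedSR{data2: uint64(amt)<<32 | uint64(sop)}
}

func (p packedSR) unpack() (amt, sop byte) {
	// The &0xff mirrors the decode above; the byte conversion already truncates.
	return byte(p.data2>>32) & 0xff, byte(p.data2) & 0xff
}

func main() {
	p := pack(12, 0) // 0 stands in for shiftOpLSL; the real constant lives in the instruction definitions.
	amt, sop := p.unpack()
	fmt.Println(amt, sop) // 12 0: the round trip is lossless because the fields don't overlap.
}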
|
||||
// reg returns the register of the operand if applicable.
|
||||
func (o operand) reg() regalloc.VReg {
|
||||
switch o.kind {
|
||||
case operandKindNR:
|
||||
return o.nr()
|
||||
case operandKindSR:
|
||||
r, _, _ := o.sr()
|
||||
return r
|
||||
case operandKindER:
|
||||
r, _, _ := o.er()
|
||||
return r
|
||||
case operandKindImm12:
|
||||
// Does not have a register.
|
||||
case operandKindShiftImm:
|
||||
// Does not have a register.
|
||||
default:
|
||||
panic(o.kind)
|
||||
}
|
||||
return regalloc.VRegInvalid
|
||||
}
|
||||
|
||||
func (o operand) realReg() regalloc.RealReg {
|
||||
return o.nr().RealReg()
|
||||
}
|
||||
|
||||
func (o operand) assignReg(v regalloc.VReg) operand {
|
||||
switch o.kind {
|
||||
case operandKindNR:
|
||||
return operandNR(v)
|
||||
case operandKindSR:
|
||||
_, amt, sop := o.sr()
|
||||
return operandSR(v, amt, sop)
|
||||
case operandKindER:
|
||||
_, eop, to := o.er()
|
||||
return operandER(v, eop, to)
|
||||
case operandKindImm12:
|
||||
// Does not have a register.
|
||||
case operandKindShiftImm:
|
||||
// Does not have a register.
|
||||
}
|
||||
panic(o.kind)
|
||||
}
|
||||
|
||||
// getOperand_Imm12_ER_SR_NR returns an operand of operandKindImm12, operandKindER, operandKindSR, or operandKindNR from the given value (defined by `def`).
//
// `mode` is used to extend the operand if the bit length is smaller than mode.bits().
// If the operand can be expressed as operandKindImm12, `mode` is ignored.
|
||||
func (m *machine) getOperand_Imm12_ER_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) {
|
||||
if def.IsFromBlockParam() {
|
||||
return operandNR(def.BlkParamVReg)
|
||||
}
|
||||
|
||||
instr := def.Instr
|
||||
if instr.Opcode() == ssa.OpcodeIconst {
|
||||
if imm12Op, ok := asImm12Operand(instr.ConstantVal()); ok {
|
||||
instr.MarkLowered()
|
||||
return imm12Op
|
||||
}
|
||||
}
|
||||
return m.getOperand_ER_SR_NR(def, mode)
|
||||
}
|
||||
|
||||
// getOperand_MaybeNegatedImm12_ER_SR_NR is almost the same as getOperand_Imm12_ER_SR_NR, but this might negate the immediate value.
|
||||
// If the immediate value is negated, the second return value is true, otherwise always false.
|
||||
func (m *machine) getOperand_MaybeNegatedImm12_ER_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand, negatedImm12 bool) {
|
||||
if def.IsFromBlockParam() {
|
||||
return operandNR(def.BlkParamVReg), false
|
||||
}
|
||||
|
||||
instr := def.Instr
|
||||
if instr.Opcode() == ssa.OpcodeIconst {
|
||||
c := instr.ConstantVal()
|
||||
if imm12Op, ok := asImm12Operand(c); ok {
|
||||
instr.MarkLowered()
|
||||
return imm12Op, false
|
||||
}
|
||||
|
||||
signExtended := int64(c)
|
||||
if def.SSAValue().Type().Bits() == 32 {
|
||||
signExtended = (signExtended << 32) >> 32
|
||||
}
|
||||
negatedWithoutSign := -signExtended
|
||||
if imm12Op, ok := asImm12Operand(uint64(negatedWithoutSign)); ok {
|
||||
instr.MarkLowered()
|
||||
return imm12Op, true
|
||||
}
|
||||
}
|
||||
return m.getOperand_ER_SR_NR(def, mode), false
|
||||
}
|
||||
|
||||
// getOperand_ER_SR_NR returns an operand of either operandKindER, operandKindSR, or operandKindNR from the given value (defined by `def`).
//
// `mode` is used to extend the operand if the bit length is smaller than mode.bits().
|
||||
func (m *machine) getOperand_ER_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) {
|
||||
if def.IsFromBlockParam() {
|
||||
return operandNR(def.BlkParamVReg)
|
||||
}
|
||||
|
||||
if m.compiler.MatchInstr(def, ssa.OpcodeSExtend) || m.compiler.MatchInstr(def, ssa.OpcodeUExtend) {
|
||||
extInstr := def.Instr
|
||||
|
||||
signed := extInstr.Opcode() == ssa.OpcodeSExtend
|
||||
innerExtFromBits, innerExtToBits := extInstr.ExtendFromToBits()
|
||||
modeBits, modeSigned := mode.bits(), mode.signed()
|
||||
if mode == extModeNone || innerExtToBits == modeBits {
|
||||
eop := extendOpFrom(signed, innerExtFromBits)
|
||||
extArg := m.getOperand_NR(m.compiler.ValueDefinition(extInstr.Arg()), extModeNone)
|
||||
op = operandER(extArg.nr(), eop, innerExtToBits)
|
||||
extInstr.MarkLowered()
|
||||
return
|
||||
}
|
||||
|
||||
if innerExtToBits > modeBits {
|
||||
panic("BUG?TODO?: need the results of inner extension to be larger than the mode")
|
||||
}
|
||||
|
||||
switch {
|
||||
case (!signed && !modeSigned) || (signed && modeSigned):
|
||||
// Two sign/zero extensions are equivalent to one sign/zero extension for the larger size.
|
||||
eop := extendOpFrom(modeSigned, innerExtFromBits)
|
||||
op = operandER(m.compiler.VRegOf(extInstr.Arg()), eop, modeBits)
|
||||
extInstr.MarkLowered()
|
||||
case (signed && !modeSigned) || (!signed && modeSigned):
|
||||
// We need to {sign, zero}-extend the result of the {zero,sign} extension.
|
||||
eop := extendOpFrom(modeSigned, innerExtToBits)
|
||||
op = operandER(m.compiler.VRegOf(extInstr.Return()), eop, modeBits)
|
||||
// Note that we failed to merge the inner extension instruction this case.
|
||||
}
|
||||
return
|
||||
}
|
||||
return m.getOperand_SR_NR(def, mode)
|
||||
}
|
||||
|
||||
// getOperand_SR_NR returns an operand of either operandKindSR or operandKindNR from the given value (defined by `def`).
//
// `mode` is used to extend the operand if the bit length is smaller than mode.bits().
|
||||
func (m *machine) getOperand_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) {
|
||||
if def.IsFromBlockParam() {
|
||||
return operandNR(def.BlkParamVReg)
|
||||
}
|
||||
|
||||
if m.compiler.MatchInstr(def, ssa.OpcodeIshl) {
|
||||
// Check if the shift amount is defined by a constant instruction.
|
||||
targetVal, amountVal := def.Instr.Arg2()
|
||||
targetVReg := m.getOperand_NR(m.compiler.ValueDefinition(targetVal), extModeNone).nr()
|
||||
amountDef := m.compiler.ValueDefinition(amountVal)
|
||||
if amountDef.IsFromInstr() && amountDef.Instr.Constant() {
|
||||
// If that is the case, we can use the shifted register operand (SR).
|
||||
c := byte(amountDef.Instr.ConstantVal()) & (targetVal.Type().Bits() - 1) // Clears the unnecessary bits.
|
||||
def.Instr.MarkLowered()
|
||||
amountDef.Instr.MarkLowered()
|
||||
return operandSR(targetVReg, c, shiftOpLSL)
|
||||
}
|
||||
}
|
||||
return m.getOperand_NR(def, mode)
|
||||
}
|
||||
|
||||
// getOperand_ShiftImm_NR returns an operand of either operandKindShiftImm or operandKindNR from the given value (defined by `def`).
|
||||
func (m *machine) getOperand_ShiftImm_NR(def *backend.SSAValueDefinition, mode extMode, shiftBitWidth byte) (op operand) {
|
||||
if def.IsFromBlockParam() {
|
||||
return operandNR(def.BlkParamVReg)
|
||||
}
|
||||
|
||||
instr := def.Instr
|
||||
if instr.Constant() {
|
||||
amount := byte(instr.ConstantVal()) & (shiftBitWidth - 1) // Clears the unnecessary bits.
|
||||
return operandShiftImm(amount)
|
||||
}
|
||||
return m.getOperand_NR(def, mode)
|
||||
}
|
||||
|
||||
// getOperand_NR returns an operand of operandKindNR from the given value (defined by `def`).
//
// `mode` is used to extend the operand if the bit length is smaller than mode.bits().
|
||||
func (m *machine) getOperand_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) {
|
||||
var v regalloc.VReg
|
||||
if def.IsFromBlockParam() {
|
||||
v = def.BlkParamVReg
|
||||
} else {
|
||||
instr := def.Instr
|
||||
if instr.Constant() {
|
||||
// We inline all the constant instructions so that we could reduce the register usage.
|
||||
v = m.lowerConstant(instr)
|
||||
instr.MarkLowered()
|
||||
} else {
|
||||
if n := def.N; n == 0 {
|
||||
v = m.compiler.VRegOf(instr.Return())
|
||||
} else {
|
||||
_, rs := instr.Returns()
|
||||
v = m.compiler.VRegOf(rs[n-1])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
r := v
|
||||
switch inBits := def.SSAValue().Type().Bits(); {
|
||||
case mode == extModeNone:
|
||||
case inBits == 32 && (mode == extModeZeroExtend32 || mode == extModeSignExtend32):
|
||||
case inBits == 32 && mode == extModeZeroExtend64:
|
||||
extended := m.compiler.AllocateVReg(ssa.TypeI64)
|
||||
ext := m.allocateInstr()
|
||||
ext.asExtend(extended, v, 32, 64, false)
|
||||
m.insert(ext)
|
||||
r = extended
|
||||
case inBits == 32 && mode == extModeSignExtend64:
|
||||
extended := m.compiler.AllocateVReg(ssa.TypeI64)
|
||||
ext := m.allocateInstr()
|
||||
ext.asExtend(extended, v, 32, 64, true)
|
||||
m.insert(ext)
|
||||
r = extended
|
||||
case inBits == 64 && (mode == extModeZeroExtend64 || mode == extModeSignExtend64):
|
||||
}
|
||||
return operandNR(r)
|
||||
}
|
||||
|
||||
func asImm12Operand(val uint64) (op operand, ok bool) {
|
||||
v, shiftBit, ok := asImm12(val)
|
||||
if !ok {
|
||||
return operand{}, false
|
||||
}
|
||||
return operandImm12(v, shiftBit), true
|
||||
}
|
||||
|
||||
func asImm12(val uint64) (v uint16, shiftBit byte, ok bool) {
|
||||
const mask1, mask2 uint64 = 0xfff, 0xfff_000
|
||||
if val&^mask1 == 0 {
|
||||
return uint16(val), 0, true
|
||||
} else if val&^mask2 == 0 {
|
||||
return uint16(val >> 12), 1, true
|
||||
} else {
|
||||
return 0, 0, false
|
||||
}
|
||||
}
|
||||
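asImm12 above implements the standard ARM64 ALU-immediate rule: a 12-bit value, optionally shifted left by 12. Here is a standalone sketch of the same check run over a few sample constants (the function name is reused locally for illustration only).

package main

import "fmt"

// asImm12 reports whether val can be encoded as a 12-bit immediate, possibly shifted by 12.
func asImm12(val uint64) (v uint16, shift byte, ok bool) {
	const mask1, mask2 uint64 = 0xfff, 0xfff_000
	switch {
	case val&^mask1 == 0:
		return uint16(val), 0, true // fits in the low 12 bits.
	case val&^mask2 == 0:
		return uint16(val >> 12), 1, true // fits in bits 12..23: encode with LSL #12.
	default:
		return 0, 0, false
	}
}

func main() {
	for _, c := range []uint64{0xabc, 0xabc000, 0xabc123, 0x1_000_000} {
		v, s, ok := asImm12(c)
		fmt.Printf("%#x -> imm12=%#x shift=%d ok=%v\n", c, v, s, ok)
	}
}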
440 vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_mem.go generated vendored Normal file
@@ -0,0 +1,440 @@
package arm64
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
|
||||
)
|
||||
|
||||
type (
|
||||
// addressMode represents an ARM64 addressing mode.
|
||||
//
|
||||
// https://developer.arm.com/documentation/102374/0101/Loads-and-stores---addressing
|
||||
// TODO: use the bit-packed layout like operand struct.
|
||||
addressMode struct {
|
||||
kind addressModeKind
|
||||
rn, rm regalloc.VReg
|
||||
extOp extendOp
|
||||
imm int64
|
||||
}
|
||||
|
||||
// addressModeKind represents the kind of ARM64 addressing mode.
|
||||
addressModeKind byte
|
||||
)
|
||||
|
||||
const (
|
||||
// addressModeKindRegScaledExtended takes a base register and an index register. The index register is sign/zero-extended,
|
||||
// and then scaled by bits(type)/8.
|
||||
//
|
||||
// e.g.
|
||||
// - ldrh w1, [x2, w3, SXTW #1] ;; sign-extended and scaled by 2 (== LSL #1)
|
||||
// - strh w1, [x2, w3, UXTW #1] ;; zero-extended and scaled by 2 (== LSL #1)
|
||||
// - ldr w1, [x2, w3, SXTW #2] ;; sign-extended and scaled by 4 (== LSL #2)
|
||||
// - str x1, [x2, w3, UXTW #3] ;; zero-extended and scaled by 8 (== LSL #3)
|
||||
//
|
||||
// See the following pages:
|
||||
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--register---Load-Register-Halfword--register--
|
||||
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--register---Load-Register--register--
|
||||
addressModeKindRegScaledExtended addressModeKind = iota
|
||||
|
||||
// addressModeKindRegScaled is the same as addressModeKindRegScaledExtended, but without the extension.
|
||||
addressModeKindRegScaled
|
||||
|
||||
// addressModeKindRegExtended is the same as addressModeKindRegScaledExtended, but without the scale factor.
|
||||
addressModeKindRegExtended
|
||||
|
||||
// addressModeKindRegReg takes a base register and an index register. The index register is not either scaled or extended.
|
||||
addressModeKindRegReg
|
||||
|
||||
// addressModeKindRegSignedImm9 takes a base register and a 9-bit "signed" immediate offset (-256 to 255).
|
||||
// The immediate will be sign-extended, and be added to the base register.
|
||||
// This is a.k.a. "unscaled" since the immediate is not scaled.
|
||||
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDUR--Load-Register--unscaled--
|
||||
addressModeKindRegSignedImm9
|
||||
|
||||
// addressModeKindRegUnsignedImm12 takes a base register and a 12-bit "unsigned" immediate offset. scaled by
|
||||
// the size of the type. In other words, the actual offset will be imm12 * bits(type)/8.
|
||||
// See "Unsigned offset" in the following pages:
|
||||
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate--
|
||||
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate--
|
||||
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate--
|
||||
addressModeKindRegUnsignedImm12
|
||||
|
||||
// addressModeKindPostIndex takes a base register and a 9-bit "signed" immediate offset.
|
||||
// After the load/store, the base register will be updated by the offset.
|
||||
//
|
||||
// Note that when this is used for pair load/store, the offset will be 7-bit "signed" immediate offset.
|
||||
//
|
||||
// See "Post-index" in the following pages for examples:
|
||||
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate--
|
||||
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate--
|
||||
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate--
|
||||
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers-
|
||||
addressModeKindPostIndex
|
||||
|
||||
// addressModeKindPreIndex takes a base register and a 9-bit "signed" immediate offset.
|
||||
// Before the load/store, the base register will be updated by the offset.
|
||||
//
|
||||
// Note that when this is used for pair load/store, the offset will be 7-bit "signed" immediate offset.
|
||||
//
|
||||
// See "Pre-index" in the following pages for examples:
|
||||
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate--
|
||||
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate--
|
||||
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate--
|
||||
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers-
|
||||
addressModeKindPreIndex
|
||||
|
||||
// addressModeKindArgStackSpace is used to resolve the address of the argument stack space
// existing right above the stack pointer. Since we don't know the exact stack space needed for a function
// at this compilation phase, this is used as a placeholder and further lowered to a real addressing mode like above.
|
||||
addressModeKindArgStackSpace
|
||||
|
||||
// addressModeKindResultStackSpace is used to resolve the address of the result stack space
// existing right above the stack pointer. Since we don't know the exact stack space needed for a function
// at this compilation phase, this is used as a placeholder and further lowered to a real addressing mode like above.
|
||||
addressModeKindResultStackSpace
|
||||
)
|
||||
|
||||
func (a addressMode) format(dstSizeBits byte) (ret string) {
|
||||
base := formatVRegSized(a.rn, 64)
|
||||
if rn := a.rn; rn.RegType() != regalloc.RegTypeInt {
|
||||
panic("invalid base register type: " + a.rn.RegType().String())
|
||||
} else if rn.IsRealReg() && v0 <= a.rn.RealReg() && a.rn.RealReg() <= v30 {
|
||||
panic("BUG: likely a bug in reg alloc or reset behavior")
|
||||
}
|
||||
|
||||
switch a.kind {
|
||||
case addressModeKindRegScaledExtended:
|
||||
amount := a.sizeInBitsToShiftAmount(dstSizeBits)
|
||||
ret = fmt.Sprintf("[%s, %s, %s #%#x]", base, formatVRegSized(a.rm, a.indexRegBits()), a.extOp, amount)
|
||||
case addressModeKindRegScaled:
|
||||
amount := a.sizeInBitsToShiftAmount(dstSizeBits)
|
||||
ret = fmt.Sprintf("[%s, %s, lsl #%#x]", base, formatVRegSized(a.rm, a.indexRegBits()), amount)
|
||||
case addressModeKindRegExtended:
|
||||
ret = fmt.Sprintf("[%s, %s, %s]", base, formatVRegSized(a.rm, a.indexRegBits()), a.extOp)
|
||||
case addressModeKindRegReg:
|
||||
ret = fmt.Sprintf("[%s, %s]", base, formatVRegSized(a.rm, a.indexRegBits()))
|
||||
case addressModeKindRegSignedImm9:
|
||||
if a.imm != 0 {
|
||||
ret = fmt.Sprintf("[%s, #%#x]", base, a.imm)
|
||||
} else {
|
||||
ret = fmt.Sprintf("[%s]", base)
|
||||
}
|
||||
case addressModeKindRegUnsignedImm12:
|
||||
if a.imm != 0 {
|
||||
ret = fmt.Sprintf("[%s, #%#x]", base, a.imm)
|
||||
} else {
|
||||
ret = fmt.Sprintf("[%s]", base)
|
||||
}
|
||||
case addressModeKindPostIndex:
|
||||
ret = fmt.Sprintf("[%s], #%#x", base, a.imm)
|
||||
case addressModeKindPreIndex:
|
||||
ret = fmt.Sprintf("[%s, #%#x]!", base, a.imm)
|
||||
case addressModeKindArgStackSpace:
|
||||
ret = fmt.Sprintf("[#arg_space, #%#x]", a.imm)
|
||||
case addressModeKindResultStackSpace:
|
||||
ret = fmt.Sprintf("[#ret_space, #%#x]", a.imm)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func addressModePreOrPostIndex(rn regalloc.VReg, imm int64, preIndex bool) addressMode {
|
||||
if !offsetFitsInAddressModeKindRegSignedImm9(imm) {
|
||||
panic(fmt.Sprintf("BUG: offset %#x does not fit in addressModeKindRegSignedImm9", imm))
|
||||
}
|
||||
if preIndex {
|
||||
return addressMode{kind: addressModeKindPreIndex, rn: rn, imm: imm}
|
||||
} else {
|
||||
return addressMode{kind: addressModeKindPostIndex, rn: rn, imm: imm}
|
||||
}
|
||||
}
|
||||
|
||||
func offsetFitsInAddressModeKindRegUnsignedImm12(dstSizeInBits byte, offset int64) bool {
|
||||
divisor := int64(dstSizeInBits) / 8
|
||||
return 0 < offset && offset%divisor == 0 && offset/divisor < 4096
|
||||
}
|
||||
|
||||
func offsetFitsInAddressModeKindRegSignedImm9(offset int64) bool {
|
||||
return -256 <= offset && offset <= 255
|
||||
}
|
||||
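The two predicates above decide between the scaled unsigned-imm12 form and the unscaled signed-imm9 form. A standalone sketch showing the same arithmetic on a few offsets (function names are local to the sketch):

package main

import "fmt"

// fitsUImm12 mirrors offsetFitsInAddressModeKindRegUnsignedImm12: the offset must be a
// positive multiple of the access size, and the scaled value must fit in 12 bits.
func fitsUImm12(sizeInBits byte, offset int64) bool {
	d := int64(sizeInBits) / 8
	return 0 < offset && offset%d == 0 && offset/d < 4096
}

// fitsSImm9 mirrors offsetFitsInAddressModeKindRegSignedImm9 (the "unscaled" form).
func fitsSImm9(offset int64) bool { return -256 <= offset && offset <= 255 }

func main() {
	fmt.Println(fitsUImm12(64, 32760)) // true: 32760/8 = 4095, the largest scaled offset for 8-byte accesses.
	fmt.Println(fitsUImm12(64, 4))     // false: not a multiple of 8, so imm9 or a register index is used instead.
	fmt.Println(fitsSImm9(-16))        // true: small negative offsets use the unscaled form.
}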
|
||||
func (a addressMode) indexRegBits() byte {
|
||||
bits := a.extOp.srcBits()
|
||||
if bits != 32 && bits != 64 {
|
||||
panic("invalid index register for address mode. it must be either 32 or 64 bits")
|
||||
}
|
||||
return bits
|
||||
}
|
||||
|
||||
func (a addressMode) sizeInBitsToShiftAmount(sizeInBits byte) (lsl byte) {
|
||||
switch sizeInBits {
|
||||
case 8:
|
||||
lsl = 0
|
||||
case 16:
|
||||
lsl = 1
|
||||
case 32:
|
||||
lsl = 2
|
||||
case 64:
|
||||
lsl = 3
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func extLoadSignSize(op ssa.Opcode) (size byte, signed bool) {
|
||||
switch op {
|
||||
case ssa.OpcodeUload8:
|
||||
size, signed = 8, false
|
||||
case ssa.OpcodeUload16:
|
||||
size, signed = 16, false
|
||||
case ssa.OpcodeUload32:
|
||||
size, signed = 32, false
|
||||
case ssa.OpcodeSload8:
|
||||
size, signed = 8, true
|
||||
case ssa.OpcodeSload16:
|
||||
size, signed = 16, true
|
||||
case ssa.OpcodeSload32:
|
||||
size, signed = 32, true
|
||||
default:
|
||||
panic("BUG")
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (m *machine) lowerExtLoad(op ssa.Opcode, ptr ssa.Value, offset uint32, ret regalloc.VReg) {
|
||||
size, signed := extLoadSignSize(op)
|
||||
amode := m.lowerToAddressMode(ptr, offset, size)
|
||||
load := m.allocateInstr()
|
||||
if signed {
|
||||
load.asSLoad(operandNR(ret), amode, size)
|
||||
} else {
|
||||
load.asULoad(operandNR(ret), amode, size)
|
||||
}
|
||||
m.insert(load)
|
||||
}
|
||||
|
||||
func (m *machine) lowerLoad(ptr ssa.Value, offset uint32, typ ssa.Type, ret ssa.Value) {
|
||||
amode := m.lowerToAddressMode(ptr, offset, typ.Bits())
|
||||
|
||||
dst := m.compiler.VRegOf(ret)
|
||||
load := m.allocateInstr()
|
||||
switch typ {
|
||||
case ssa.TypeI32, ssa.TypeI64:
|
||||
load.asULoad(operandNR(dst), amode, typ.Bits())
|
||||
case ssa.TypeF32, ssa.TypeF64:
|
||||
load.asFpuLoad(operandNR(dst), amode, typ.Bits())
|
||||
case ssa.TypeV128:
|
||||
load.asFpuLoad(operandNR(dst), amode, 128)
|
||||
default:
|
||||
panic("TODO")
|
||||
}
|
||||
m.insert(load)
|
||||
}
|
||||
|
||||
func (m *machine) lowerLoadSplat(ptr ssa.Value, offset uint32, lane ssa.VecLane, ret ssa.Value) {
|
||||
// vecLoad1R has offset address mode (base+imm) only for post index, so we simply add the offset to the base.
|
||||
base := m.getOperand_NR(m.compiler.ValueDefinition(ptr), extModeNone).nr()
|
||||
offsetReg := m.compiler.AllocateVReg(ssa.TypeI64)
|
||||
m.lowerConstantI64(offsetReg, int64(offset))
|
||||
addedBase := m.addReg64ToReg64(base, offsetReg)
|
||||
|
||||
rd := operandNR(m.compiler.VRegOf(ret))
|
||||
|
||||
ld1r := m.allocateInstr()
|
||||
ld1r.asVecLoad1R(rd, operandNR(addedBase), ssaLaneToArrangement(lane))
|
||||
m.insert(ld1r)
|
||||
}
|
||||
|
||||
func (m *machine) lowerStore(si *ssa.Instruction) {
|
||||
// TODO: merge consecutive stores into a single pair store instruction.
|
||||
value, ptr, offset, storeSizeInBits := si.StoreData()
|
||||
amode := m.lowerToAddressMode(ptr, offset, storeSizeInBits)
|
||||
|
||||
valueOp := m.getOperand_NR(m.compiler.ValueDefinition(value), extModeNone)
|
||||
store := m.allocateInstr()
|
||||
store.asStore(valueOp, amode, storeSizeInBits)
|
||||
m.insert(store)
|
||||
}
|
||||
|
||||
// lowerToAddressMode converts a pointer to an addressMode that can be used as an operand for load/store instructions.
|
||||
func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32, size byte) (amode addressMode) {
|
||||
// TODO: currently the instruction selection logic doesn't support addressModeKindRegScaledExtended and
|
||||
// addressModeKindRegScaled since collectAddends doesn't take ssa.OpcodeIshl into account. This should be fixed
|
||||
// to support more efficient address resolution.
|
||||
|
||||
a32s, a64s, offset := m.collectAddends(ptr)
|
||||
offset += int64(offsetBase)
|
||||
return m.lowerToAddressModeFromAddends(a32s, a64s, size, offset)
|
||||
}
|
||||
|
||||
// lowerToAddressModeFromAddends creates an addressMode from a list of addends collected by collectAddends.
|
||||
// During the construction, this might emit additional instructions.
|
||||
//
|
||||
// Extracted as a separate function for easy testing.
|
||||
func (m *machine) lowerToAddressModeFromAddends(a32s *wazevoapi.Queue[addend32], a64s *wazevoapi.Queue[regalloc.VReg], size byte, offset int64) (amode addressMode) {
|
||||
switch a64sExist, a32sExist := !a64s.Empty(), !a32s.Empty(); {
|
||||
case a64sExist && a32sExist:
|
||||
var base regalloc.VReg
|
||||
base = a64s.Dequeue()
|
||||
var a32 addend32
|
||||
a32 = a32s.Dequeue()
|
||||
amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: a32.r, extOp: a32.ext}
|
||||
case a64sExist && offsetFitsInAddressModeKindRegUnsignedImm12(size, offset):
|
||||
var base regalloc.VReg
|
||||
base = a64s.Dequeue()
|
||||
amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: offset}
|
||||
offset = 0
|
||||
case a64sExist && offsetFitsInAddressModeKindRegSignedImm9(offset):
|
||||
var base regalloc.VReg
|
||||
base = a64s.Dequeue()
|
||||
amode = addressMode{kind: addressModeKindRegSignedImm9, rn: base, imm: offset}
|
||||
offset = 0
|
||||
case a64sExist:
|
||||
var base regalloc.VReg
|
||||
base = a64s.Dequeue()
|
||||
if !a64s.Empty() {
|
||||
index := a64s.Dequeue()
|
||||
amode = addressMode{kind: addressModeKindRegReg, rn: base, rm: index, extOp: extendOpUXTX /* indicates index reg is 64-bit */}
|
||||
} else {
|
||||
amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0}
|
||||
}
|
||||
case a32sExist:
|
||||
base32 := a32s.Dequeue()
|
||||
|
||||
// First we need a 64-bit base register.
|
||||
base := m.compiler.AllocateVReg(ssa.TypeI64)
|
||||
baseExt := m.allocateInstr()
|
||||
var signed bool
|
||||
if base32.ext == extendOpSXTW {
|
||||
signed = true
|
||||
}
|
||||
baseExt.asExtend(base, base32.r, 32, 64, signed)
|
||||
m.insert(baseExt)
|
||||
|
||||
if !a32s.Empty() {
|
||||
index := a32s.Dequeue()
|
||||
amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: index.r, extOp: index.ext}
|
||||
} else {
|
||||
amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0}
|
||||
}
|
||||
default: // Only static offsets.
|
||||
tmpReg := m.compiler.AllocateVReg(ssa.TypeI64)
|
||||
m.lowerConstantI64(tmpReg, offset)
|
||||
amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: tmpReg, imm: 0}
|
||||
offset = 0
|
||||
}
|
||||
|
||||
baseReg := amode.rn
|
||||
if offset > 0 {
|
||||
baseReg = m.addConstToReg64(baseReg, offset) // baseReg += offset
|
||||
}
|
||||
|
||||
for !a64s.Empty() {
|
||||
a64 := a64s.Dequeue()
|
||||
baseReg = m.addReg64ToReg64(baseReg, a64) // baseReg += a64
|
||||
}
|
||||
|
||||
for !a32s.Empty() {
|
||||
a32 := a32s.Dequeue()
|
||||
baseReg = m.addRegToReg64Ext(baseReg, a32.r, a32.ext) // baseReg += (a32 extended to 64-bit)
|
||||
}
|
||||
amode.rn = baseReg
|
||||
return
|
||||
}
|
||||
|
||||
var addendsMatchOpcodes = [4]ssa.Opcode{ssa.OpcodeUExtend, ssa.OpcodeSExtend, ssa.OpcodeIadd, ssa.OpcodeIconst}
|
||||
|
||||
func (m *machine) collectAddends(ptr ssa.Value) (addends32 *wazevoapi.Queue[addend32], addends64 *wazevoapi.Queue[regalloc.VReg], offset int64) {
|
||||
m.addendsWorkQueue.Reset()
|
||||
m.addends32.Reset()
|
||||
m.addends64.Reset()
|
||||
m.addendsWorkQueue.Enqueue(ptr)
|
||||
|
||||
for !m.addendsWorkQueue.Empty() {
|
||||
v := m.addendsWorkQueue.Dequeue()
|
||||
|
||||
def := m.compiler.ValueDefinition(v)
|
||||
switch op := m.compiler.MatchInstrOneOf(def, addendsMatchOpcodes[:]); op {
|
||||
case ssa.OpcodeIadd:
|
||||
// If the addend is an add, we recursively collect its operands.
|
||||
x, y := def.Instr.Arg2()
|
||||
m.addendsWorkQueue.Enqueue(x)
|
||||
m.addendsWorkQueue.Enqueue(y)
|
||||
def.Instr.MarkLowered()
|
||||
case ssa.OpcodeIconst:
|
||||
// If the addend is constant, we just statically merge it into the offset.
|
||||
ic := def.Instr
|
||||
u64 := ic.ConstantVal()
|
||||
if ic.Return().Type().Bits() == 32 {
|
||||
offset += int64(int32(u64)) // sign-extend.
|
||||
} else {
|
||||
offset += int64(u64)
|
||||
}
|
||||
def.Instr.MarkLowered()
|
||||
case ssa.OpcodeUExtend, ssa.OpcodeSExtend:
|
||||
input := def.Instr.Arg()
|
||||
if input.Type().Bits() != 32 {
|
||||
panic("illegal size: " + input.Type().String())
|
||||
}
|
||||
|
||||
var ext extendOp
|
||||
if op == ssa.OpcodeUExtend {
|
||||
ext = extendOpUXTW
|
||||
} else {
|
||||
ext = extendOpSXTW
|
||||
}
|
||||
|
||||
inputDef := m.compiler.ValueDefinition(input)
|
||||
constInst := inputDef.IsFromInstr() && inputDef.Instr.Constant()
|
||||
switch {
|
||||
case constInst && ext == extendOpUXTW:
|
||||
// Zero-extension of a 32-bit constant can be merged into the offset.
|
||||
offset += int64(uint32(inputDef.Instr.ConstantVal()))
|
||||
case constInst && ext == extendOpSXTW:
|
||||
// Sign-extension of a 32-bit constant can be merged into the offset.
|
||||
offset += int64(int32(inputDef.Instr.ConstantVal())) // sign-extend!
|
||||
default:
|
||||
m.addends32.Enqueue(addend32{r: m.getOperand_NR(inputDef, extModeNone).nr(), ext: ext})
|
||||
}
|
||||
def.Instr.MarkLowered()
|
||||
continue
|
||||
default:
|
||||
// If the addend is not one of them, we simply use it as-is (without merging!), optionally zero-extending it.
|
||||
m.addends64.Enqueue(m.getOperand_NR(def, extModeZeroExtend64 /* optional zero ext */).nr())
|
||||
}
|
||||
}
|
||||
return &m.addends32, &m.addends64, offset
|
||||
}
|
||||
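One subtle point in collectAddends above is the ssa.OpcodeIconst case: a 32-bit constant is sign-extended before being folded into the static offset, while a 64-bit constant is added as-is. A minimal sketch of just that folding step (foldConst is a local name, not a backend function):

package main

import "fmt"

// foldConst adds a constant addend into the running offset, sign-extending 32-bit values.
func foldConst(offset int64, raw uint64, is32 bool) int64 {
	if is32 {
		return offset + int64(int32(raw)) // sign-extend the low 32 bits.
	}
	return offset + int64(raw)
}

func main() {
	fmt.Println(foldConst(100, 0xffff_fff0, true))  // 84: 0xfffffff0 is -16 as a 32-bit constant.
	fmt.Println(foldConst(100, 0xffff_fff0, false)) // 4294967380: as a 64-bit constant it is +0xfffffff0.
}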
|
||||
func (m *machine) addConstToReg64(r regalloc.VReg, c int64) (rd regalloc.VReg) {
|
||||
rd = m.compiler.AllocateVReg(ssa.TypeI64)
|
||||
alu := m.allocateInstr()
|
||||
if imm12Op, ok := asImm12Operand(uint64(c)); ok {
|
||||
alu.asALU(aluOpAdd, operandNR(rd), operandNR(r), imm12Op, true)
|
||||
} else if imm12Op, ok = asImm12Operand(uint64(-c)); ok {
|
||||
alu.asALU(aluOpSub, operandNR(rd), operandNR(r), imm12Op, true)
|
||||
} else {
|
||||
tmp := m.compiler.AllocateVReg(ssa.TypeI64)
|
||||
m.load64bitConst(c, tmp)
|
||||
alu.asALU(aluOpAdd, operandNR(rd), operandNR(r), operandNR(tmp), true)
|
||||
}
|
||||
m.insert(alu)
|
||||
return
|
||||
}
|
||||
|
||||
func (m *machine) addReg64ToReg64(rn, rm regalloc.VReg) (rd regalloc.VReg) {
|
||||
rd = m.compiler.AllocateVReg(ssa.TypeI64)
|
||||
alu := m.allocateInstr()
|
||||
alu.asALU(aluOpAdd, operandNR(rd), operandNR(rn), operandNR(rm), true)
|
||||
m.insert(alu)
|
||||
return
|
||||
}
|
||||
|
||||
func (m *machine) addRegToReg64Ext(rn, rm regalloc.VReg, ext extendOp) (rd regalloc.VReg) {
|
||||
rd = m.compiler.AllocateVReg(ssa.TypeI64)
|
||||
alu := m.allocateInstr()
|
||||
alu.asALU(aluOpAdd, operandNR(rd), operandNR(rn), operandER(rm, ext, 64), true)
|
||||
m.insert(alu)
|
||||
return
|
||||
}
|
||||
515 vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go generated vendored Normal file
@@ -0,0 +1,515 @@
package arm64
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
|
||||
)
|
||||
|
||||
type (
|
||||
// machine implements backend.Machine.
|
||||
machine struct {
|
||||
compiler backend.Compiler
|
||||
executableContext *backend.ExecutableContextT[instruction]
|
||||
currentABI *backend.FunctionABI
|
||||
|
||||
regAlloc regalloc.Allocator
|
||||
regAllocFn *backend.RegAllocFunction[*instruction, *machine]
|
||||
|
||||
// addendsWorkQueue is used during address lowering, defined here for reuse.
|
||||
addendsWorkQueue wazevoapi.Queue[ssa.Value]
|
||||
addends32 wazevoapi.Queue[addend32]
|
||||
// addends64 is used during address lowering, defined here for reuse.
|
||||
addends64 wazevoapi.Queue[regalloc.VReg]
|
||||
unresolvedAddressModes []*instruction
|
||||
|
||||
// condBrRelocs holds the conditional branches which need offset relocation.
|
||||
condBrRelocs []condBrReloc
|
||||
|
||||
// jmpTableTargets holds the labels of the jump table targets.
|
||||
jmpTableTargets [][]uint32
|
||||
|
||||
// spillSlotSize is the size of the stack slot in bytes used for spilling registers.
|
||||
// During the execution of the function, the stack looks like:
|
||||
//
|
||||
//
|
||||
// (high address)
|
||||
// +-----------------+
|
||||
// | ....... |
|
||||
// | ret Y |
|
||||
// | ....... |
|
||||
// | ret 0 |
|
||||
// | arg X |
|
||||
// | ....... |
|
||||
// | arg 1 |
|
||||
// | arg 0 |
|
||||
// | xxxxx |
|
||||
// | ReturnAddress |
|
||||
// +-----------------+ <<-|
|
||||
// | ........... | |
|
||||
// | spill slot M | | <--- spillSlotSize
|
||||
// | ............ | |
|
||||
// | spill slot 2 | |
|
||||
// | spill slot 1 | <<-+
|
||||
// | clobbered N |
|
||||
// | ........... |
|
||||
// | clobbered 1 |
|
||||
// | clobbered 0 |
|
||||
// SP---> +-----------------+
|
||||
// (low address)
|
||||
//
|
||||
// and it represents the size of the space between FP and the first spilled slot. This must be a multiple of 16.
|
||||
// Also note that this is only known after register allocation.
|
||||
spillSlotSize int64
|
||||
spillSlots map[regalloc.VRegID]int64 // regalloc.VRegID to offset.
|
||||
// clobberedRegs holds real-register backed VRegs saved at the function prologue, and restored at the epilogue.
|
||||
clobberedRegs []regalloc.VReg
|
||||
|
||||
maxRequiredStackSizeForCalls int64
|
||||
stackBoundsCheckDisabled bool
|
||||
|
||||
regAllocStarted bool
|
||||
}
|
||||
|
||||
addend32 struct {
|
||||
r regalloc.VReg
|
||||
ext extendOp
|
||||
}
|
||||
|
||||
condBrReloc struct {
|
||||
cbr *instruction
|
||||
// currentLabelPos is the labelPosition within which condBr is defined.
|
||||
currentLabelPos *labelPosition
|
||||
// Next block's labelPosition.
|
||||
nextLabel label
|
||||
offset int64
|
||||
}
|
||||
|
||||
labelPosition = backend.LabelPosition[instruction]
|
||||
label = backend.Label
|
||||
)
|
||||
|
||||
const (
|
||||
labelReturn = backend.LabelReturn
|
||||
labelInvalid = backend.LabelInvalid
|
||||
)
|
||||
|
||||
// NewBackend returns a new backend for arm64.
|
||||
func NewBackend() backend.Machine {
|
||||
m := &machine{
|
||||
spillSlots: make(map[regalloc.VRegID]int64),
|
||||
executableContext: newExecutableContext(),
|
||||
regAlloc: regalloc.NewAllocator(regInfo),
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
func newExecutableContext() *backend.ExecutableContextT[instruction] {
|
||||
return backend.NewExecutableContextT[instruction](resetInstruction, setNext, setPrev, asNop0)
|
||||
}
|
||||
|
||||
// ExecutableContext implements backend.Machine.
|
||||
func (m *machine) ExecutableContext() backend.ExecutableContext {
|
||||
return m.executableContext
|
||||
}
|
||||
|
||||
// RegAlloc implements backend.Machine Function.
|
||||
func (m *machine) RegAlloc() {
|
||||
rf := m.regAllocFn
|
||||
for _, pos := range m.executableContext.OrderedBlockLabels {
|
||||
rf.AddBlock(pos.SB, pos.L, pos.Begin, pos.End)
|
||||
}
|
||||
|
||||
m.regAllocStarted = true
|
||||
m.regAlloc.DoAllocation(rf)
|
||||
// Now that we know the final spill slot size, we must align spillSlotSize to 16 bytes.
|
||||
m.spillSlotSize = (m.spillSlotSize + 15) &^ 15
|
||||
}
|
||||
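The last line of RegAlloc rounds spillSlotSize up to a multiple of 16, since AArch64 requires the stack pointer to stay 16-byte aligned. A tiny standalone sketch of that rounding expression:

package main

import "fmt"

// align16 rounds n up to the next multiple of 16, exactly like (spillSlotSize + 15) &^ 15 above.
func align16(n int64) int64 { return (n + 15) &^ 15 }

func main() {
	for _, n := range []int64{0, 8, 16, 24, 40} {
		fmt.Printf("%d -> %d\n", n, align16(n)) // 0->0, 8->16, 16->16, 24->32, 40->48.
	}
}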
|
||||
// Reset implements backend.Machine.
|
||||
func (m *machine) Reset() {
|
||||
m.clobberedRegs = m.clobberedRegs[:0]
|
||||
for key := range m.spillSlots {
|
||||
m.clobberedRegs = append(m.clobberedRegs, regalloc.VReg(key))
|
||||
}
|
||||
for _, key := range m.clobberedRegs {
|
||||
delete(m.spillSlots, regalloc.VRegID(key))
|
||||
}
|
||||
m.clobberedRegs = m.clobberedRegs[:0]
|
||||
m.regAllocStarted = false
|
||||
m.regAlloc.Reset()
|
||||
m.regAllocFn.Reset()
|
||||
m.spillSlotSize = 0
|
||||
m.unresolvedAddressModes = m.unresolvedAddressModes[:0]
|
||||
m.maxRequiredStackSizeForCalls = 0
|
||||
m.executableContext.Reset()
|
||||
m.jmpTableTargets = m.jmpTableTargets[:0]
|
||||
}
|
||||
|
||||
// SetCurrentABI implements backend.Machine SetCurrentABI.
|
||||
func (m *machine) SetCurrentABI(abi *backend.FunctionABI) {
|
||||
m.currentABI = abi
|
||||
}
|
||||
|
||||
// DisableStackCheck implements backend.Machine DisableStackCheck.
|
||||
func (m *machine) DisableStackCheck() {
|
||||
m.stackBoundsCheckDisabled = true
|
||||
}
|
||||
|
||||
// SetCompiler implements backend.Machine.
|
||||
func (m *machine) SetCompiler(ctx backend.Compiler) {
|
||||
m.compiler = ctx
|
||||
m.regAllocFn = backend.NewRegAllocFunction[*instruction, *machine](m, ctx.SSABuilder(), ctx)
|
||||
}
|
||||
|
||||
func (m *machine) insert(i *instruction) {
|
||||
ectx := m.executableContext
|
||||
ectx.PendingInstructions = append(ectx.PendingInstructions, i)
|
||||
}
|
||||
|
||||
func (m *machine) insertBrTargetLabel() label {
|
||||
nop, l := m.allocateBrTarget()
|
||||
m.insert(nop)
|
||||
return l
|
||||
}
|
||||
|
||||
func (m *machine) allocateBrTarget() (nop *instruction, l label) {
|
||||
ectx := m.executableContext
|
||||
l = ectx.AllocateLabel()
|
||||
nop = m.allocateInstr()
|
||||
nop.asNop0WithLabel(l)
|
||||
pos := ectx.AllocateLabelPosition(l)
|
||||
pos.Begin, pos.End = nop, nop
|
||||
ectx.LabelPositions[l] = pos
|
||||
return
|
||||
}
|
||||
|
||||
// allocateInstr allocates an instruction.
|
||||
func (m *machine) allocateInstr() *instruction {
|
||||
instr := m.executableContext.InstructionPool.Allocate()
|
||||
if !m.regAllocStarted {
|
||||
instr.addedBeforeRegAlloc = true
|
||||
}
|
||||
return instr
|
||||
}
|
||||
|
||||
func resetInstruction(i *instruction) {
|
||||
*i = instruction{}
|
||||
}
|
||||
|
||||
func (m *machine) allocateNop() *instruction {
|
||||
instr := m.allocateInstr()
|
||||
instr.asNop0()
|
||||
return instr
|
||||
}
|
||||
|
||||
func (m *machine) resolveAddressingMode(arg0offset, ret0offset int64, i *instruction) {
|
||||
amode := &i.amode
|
||||
switch amode.kind {
|
||||
case addressModeKindResultStackSpace:
|
||||
amode.imm += ret0offset
|
||||
case addressModeKindArgStackSpace:
|
||||
amode.imm += arg0offset
|
||||
default:
|
||||
panic("BUG")
|
||||
}
|
||||
|
||||
var sizeInBits byte
|
||||
switch i.kind {
|
||||
case store8, uLoad8:
|
||||
sizeInBits = 8
|
||||
case store16, uLoad16:
|
||||
sizeInBits = 16
|
||||
case store32, fpuStore32, uLoad32, fpuLoad32:
|
||||
sizeInBits = 32
|
||||
case store64, fpuStore64, uLoad64, fpuLoad64:
|
||||
sizeInBits = 64
|
||||
case fpuStore128, fpuLoad128:
|
||||
sizeInBits = 128
|
||||
default:
|
||||
panic("BUG")
|
||||
}
|
||||
|
||||
if offsetFitsInAddressModeKindRegUnsignedImm12(sizeInBits, amode.imm) {
|
||||
amode.kind = addressModeKindRegUnsignedImm12
|
||||
} else {
|
||||
// In this case, we load the offset into the temporary register,
// and then use it as the index register.
|
||||
newPrev := m.lowerConstantI64AndInsert(i.prev, tmpRegVReg, amode.imm)
|
||||
linkInstr(newPrev, i)
|
||||
*amode = addressMode{kind: addressModeKindRegReg, rn: amode.rn, rm: tmpRegVReg, extOp: extendOpUXTX /* indicates rm reg is 64-bit */}
|
||||
}
|
||||
}
|
||||
|
||||
// resolveRelativeAddresses resolves the relative addresses before encoding.
|
||||
func (m *machine) resolveRelativeAddresses(ctx context.Context) {
|
||||
ectx := m.executableContext
|
||||
for {
|
||||
if len(m.unresolvedAddressModes) > 0 {
|
||||
arg0offset, ret0offset := m.arg0OffsetFromSP(), m.ret0OffsetFromSP()
|
||||
for _, i := range m.unresolvedAddressModes {
|
||||
m.resolveAddressingMode(arg0offset, ret0offset, i)
|
||||
}
|
||||
}
|
||||
|
||||
// Reuse the slice to gather the unresolved conditional branches.
|
||||
m.condBrRelocs = m.condBrRelocs[:0]
|
||||
|
||||
var fn string
|
||||
var fnIndex int
|
||||
var labelToSSABlockID map[label]ssa.BasicBlockID
|
||||
if wazevoapi.PerfMapEnabled {
|
||||
fn = wazevoapi.GetCurrentFunctionName(ctx)
|
||||
labelToSSABlockID = make(map[label]ssa.BasicBlockID)
|
||||
for i, l := range ectx.SsaBlockIDToLabels {
|
||||
labelToSSABlockID[l] = ssa.BasicBlockID(i)
|
||||
}
|
||||
fnIndex = wazevoapi.GetCurrentFunctionIndex(ctx)
|
||||
}
|
||||
|
||||
// Next, in order to determine the offsets of relative jumps, we have to calculate the size of each label.
|
||||
var offset int64
|
||||
for i, pos := range ectx.OrderedBlockLabels {
|
||||
pos.BinaryOffset = offset
|
||||
var size int64
|
||||
for cur := pos.Begin; ; cur = cur.next {
|
||||
switch cur.kind {
|
||||
case nop0:
|
||||
l := cur.nop0Label()
|
||||
if pos, ok := ectx.LabelPositions[l]; ok {
|
||||
pos.BinaryOffset = offset + size
|
||||
}
|
||||
case condBr:
|
||||
if !cur.condBrOffsetResolved() {
|
||||
var nextLabel label
|
||||
if i < len(ectx.OrderedBlockLabels)-1 {
|
||||
// Note: this is only used when the block ends with fallthrough,
// therefore it can safely be assumed that the next block exists when it's needed.
|
||||
nextLabel = ectx.OrderedBlockLabels[i+1].L
|
||||
}
|
||||
m.condBrRelocs = append(m.condBrRelocs, condBrReloc{
|
||||
cbr: cur, currentLabelPos: pos, offset: offset + size,
|
||||
nextLabel: nextLabel,
|
||||
})
|
||||
}
|
||||
}
|
||||
size += cur.size()
|
||||
if cur == pos.End {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if wazevoapi.PerfMapEnabled {
|
||||
if size > 0 {
|
||||
l := pos.L
|
||||
var labelStr string
|
||||
if blkID, ok := labelToSSABlockID[l]; ok {
|
||||
labelStr = fmt.Sprintf("%s::SSA_Block[%s]", l, blkID)
|
||||
} else {
|
||||
labelStr = l.String()
|
||||
}
|
||||
wazevoapi.PerfMap.AddModuleEntry(fnIndex, offset, uint64(size), fmt.Sprintf("%s:::::%s", fn, labelStr))
|
||||
}
|
||||
}
|
||||
offset += size
|
||||
}
|
||||
|
||||
// Before resolving any offsets, we need to check if all the conditional branches can be resolved.
|
||||
var needRerun bool
|
||||
for i := range m.condBrRelocs {
|
||||
reloc := &m.condBrRelocs[i]
|
||||
cbr := reloc.cbr
|
||||
offset := reloc.offset
|
||||
|
||||
target := cbr.condBrLabel()
|
||||
offsetOfTarget := ectx.LabelPositions[target].BinaryOffset
|
||||
diff := offsetOfTarget - offset
|
||||
if divided := diff >> 2; divided < minSignedInt19 || divided > maxSignedInt19 {
|
||||
// In this case the conditional branch target is too far to encode directly. We place the trampoline instructions at the end of the current block,
// and jump to it.
|
||||
m.insertConditionalJumpTrampoline(cbr, reloc.currentLabelPos, reloc.nextLabel)
|
||||
// Then, we need to rerun this loop to fix up the label offsets
// as they have changed after the trampoline is inserted.
|
||||
needRerun = true
|
||||
}
|
||||
}
|
||||
if needRerun {
|
||||
if wazevoapi.PerfMapEnabled {
|
||||
wazevoapi.PerfMap.Clear()
|
||||
}
|
||||
} else {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
var currentOffset int64
|
||||
for cur := ectx.RootInstr; cur != nil; cur = cur.next {
|
||||
switch cur.kind {
|
||||
case br:
|
||||
target := cur.brLabel()
|
||||
offsetOfTarget := ectx.LabelPositions[target].BinaryOffset
|
||||
diff := offsetOfTarget - currentOffset
|
||||
divided := diff >> 2
|
||||
if divided < minSignedInt26 || divided > maxSignedInt26 {
|
||||
// This means the currently compiled single function is extremely large.
|
||||
panic("too large function that requires branch relocation of large unconditional branch larger than 26-bit range")
|
||||
}
|
||||
cur.brOffsetResolve(diff)
|
||||
case condBr:
|
||||
if !cur.condBrOffsetResolved() {
|
||||
target := cur.condBrLabel()
|
||||
offsetOfTarget := ectx.LabelPositions[target].BinaryOffset
|
||||
diff := offsetOfTarget - currentOffset
|
||||
if divided := diff >> 2; divided < minSignedInt19 || divided > maxSignedInt19 {
|
||||
panic("BUG: branch relocation for large conditional branch larger than 19-bit range must be handled properly")
|
||||
}
|
||||
cur.condBrOffsetResolve(diff)
|
||||
}
|
||||
case brTableSequence:
|
||||
tableIndex := cur.u1
|
||||
targets := m.jmpTableTargets[tableIndex]
|
||||
for i := range targets {
|
||||
l := label(targets[i])
|
||||
offsetOfTarget := ectx.LabelPositions[l].BinaryOffset
|
||||
diff := offsetOfTarget - (currentOffset + brTableSequenceOffsetTableBegin)
|
||||
targets[i] = uint32(diff)
|
||||
}
|
||||
cur.brTableSequenceOffsetsResolved()
|
||||
case emitSourceOffsetInfo:
|
||||
m.compiler.AddSourceOffsetInfo(currentOffset, cur.sourceOffsetInfo())
|
||||
}
|
||||
currentOffset += cur.size()
|
||||
}
|
||||
}
|
||||
|
||||
const (
|
||||
maxSignedInt26 = 1<<25 - 1
|
||||
minSignedInt26 = -(1 << 25)
|
||||
|
||||
maxSignedInt19 = 1<<18 - 1
|
||||
minSignedInt19 = -(1 << 18)
|
||||
)
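// fitsInBranchRange is an illustrative helper, not part of the vendored wazero
// source: it restates the checks above, where a byte offset is divided by 4
// (AArch64 branch immediates count 4-byte instruction words) and must fit the
// signed immediate width of the branch: 19 bits for conditional branches,
// 26 bits for unconditional ones.
func fitsInBranchRange(byteDiff int64, immBits uint) bool {
	word := byteDiff >> 2 // branch immediates count instruction words, not bytes.
	maxImm := int64(1)<<(immBits-1) - 1
	minImm := -(int64(1) << (immBits - 1))
	return minImm <= word && word <= maxImm
}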
|
||||
|
||||
func (m *machine) insertConditionalJumpTrampoline(cbr *instruction, currentBlk *labelPosition, nextLabel label) {
|
||||
cur := currentBlk.End
|
||||
originalTarget := cbr.condBrLabel()
|
||||
endNext := cur.next
|
||||
|
||||
if cur.kind != br {
|
||||
// If the current block ends with an unconditional branch, we can just insert the trampoline after it.
// Otherwise, we need to insert a "skip" instruction to jump over the trampoline instructions.
|
||||
skip := m.allocateInstr()
|
||||
skip.asBr(nextLabel)
|
||||
cur = linkInstr(cur, skip)
|
||||
}
|
||||
|
||||
cbrNewTargetInstr, cbrNewTargetLabel := m.allocateBrTarget()
|
||||
cbr.setCondBrTargets(cbrNewTargetLabel)
|
||||
cur = linkInstr(cur, cbrNewTargetInstr)
|
||||
|
||||
// Then insert the unconditional branch to the original, which should be possible to get encoded
|
||||
// as 26-bit offset should be enough for any practical application.
|
||||
br := m.allocateInstr()
|
||||
br.asBr(originalTarget)
|
||||
cur = linkInstr(cur, br)
|
||||
|
||||
// Update the end of the current block.
|
||||
currentBlk.End = cur
|
||||
|
||||
linkInstr(cur, endNext)
|
||||
}
|
||||
|
||||
// Format implements backend.Machine.
|
||||
func (m *machine) Format() string {
|
||||
ectx := m.executableContext
|
||||
begins := map[*instruction]label{}
|
||||
for l, pos := range ectx.LabelPositions {
|
||||
begins[pos.Begin] = l
|
||||
}
|
||||
|
||||
irBlocks := map[label]ssa.BasicBlockID{}
|
||||
for i, l := range ectx.SsaBlockIDToLabels {
|
||||
irBlocks[l] = ssa.BasicBlockID(i)
|
||||
}
|
||||
|
||||
var lines []string
|
||||
for cur := ectx.RootInstr; cur != nil; cur = cur.next {
|
||||
if l, ok := begins[cur]; ok {
|
||||
var labelStr string
|
||||
if blkID, ok := irBlocks[l]; ok {
|
||||
labelStr = fmt.Sprintf("%s (SSA Block: %s):", l, blkID)
|
||||
} else {
|
||||
labelStr = fmt.Sprintf("%s:", l)
|
||||
}
|
||||
lines = append(lines, labelStr)
|
||||
}
|
||||
if cur.kind == nop0 {
|
||||
continue
|
||||
}
|
||||
lines = append(lines, "\t"+cur.String())
|
||||
}
|
||||
return "\n" + strings.Join(lines, "\n") + "\n"
|
||||
}
|
||||
|
||||
// InsertReturn implements backend.Machine.
|
||||
func (m *machine) InsertReturn() {
|
||||
i := m.allocateInstr()
|
||||
i.asRet()
|
||||
m.insert(i)
|
||||
}
|
||||
|
||||
func (m *machine) getVRegSpillSlotOffsetFromSP(id regalloc.VRegID, size byte) int64 {
|
||||
offset, ok := m.spillSlots[id]
|
||||
if !ok {
|
||||
offset = m.spillSlotSize
|
||||
// TODO: this should be aligned depending on the `size` to use Imm12 offset load/store as much as possible.
|
||||
m.spillSlots[id] = offset
|
||||
m.spillSlotSize += int64(size)
|
||||
}
|
||||
return offset + 16 // spill slot starts above the clobbered registers and the frame size.
|
||||
}
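// exampleSpillSlotLayout is an illustrative sketch, not part of the vendored
// wazero source: spill slot offsets are handed out bump-allocator style, so
// consecutive requests for fresh (hypothetical) VRegIDs return increasing
// offsets, each biased by the 16-byte frame_size slot that sits below the
// spill area.
func exampleSpillSlotLayout(m *machine) []int64 {
	offsets := make([]int64, 0, 3)
	for _, id := range []regalloc.VRegID{100, 101, 102} {
		offsets = append(offsets, m.getVRegSpillSlotOffsetFromSP(id, 8))
	}
	return offsets // [16, 24, 32] when the spill area starts out empty.
}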
|
||||
|
||||
func (m *machine) clobberedRegSlotSize() int64 {
|
||||
return int64(len(m.clobberedRegs) * 16)
|
||||
}
|
||||
|
||||
func (m *machine) arg0OffsetFromSP() int64 {
|
||||
return m.frameSize() +
|
||||
16 + // 16-byte aligned return address
|
||||
16 // frame size saved below the clobbered registers.
|
||||
}
|
||||
|
||||
func (m *machine) ret0OffsetFromSP() int64 {
|
||||
return m.arg0OffsetFromSP() + m.currentABI.ArgStackSize
|
||||
}
|
||||
|
||||
func (m *machine) requiredStackSize() int64 {
|
||||
return m.maxRequiredStackSizeForCalls +
|
||||
m.frameSize() +
|
||||
16 + // 16-byte aligned return address.
|
||||
16 // frame size saved below the clobbered registers.
|
||||
}
|
||||
|
||||
func (m *machine) frameSize() int64 {
|
||||
s := m.clobberedRegSlotSize() + m.spillSlotSize
|
||||
if s&0xf != 0 {
|
||||
panic(fmt.Errorf("BUG: frame size %d is not 16-byte aligned", s))
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
func (m *machine) addJmpTableTarget(targets []ssa.BasicBlock) (index int) {
|
||||
// TODO: reuse the slice!
|
||||
labels := make([]uint32, len(targets))
|
||||
for j, target := range targets {
|
||||
labels[j] = uint32(m.executableContext.GetOrAllocateSSABlockLabel(target))
|
||||
}
|
||||
index = len(m.jmpTableTargets)
|
||||
m.jmpTableTargets = append(m.jmpTableTargets, labels)
|
||||
return
|
||||
}
|
||||
469
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go
generated
vendored
Normal file
@ -0,0 +1,469 @@
package arm64
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
|
||||
)
|
||||
|
||||
// PostRegAlloc implements backend.Machine.
|
||||
func (m *machine) PostRegAlloc() {
|
||||
m.setupPrologue()
|
||||
m.postRegAlloc()
|
||||
}
|
||||
|
||||
// setupPrologue initializes the prologue of the function.
|
||||
func (m *machine) setupPrologue() {
|
||||
ectx := m.executableContext
|
||||
|
||||
cur := ectx.RootInstr
|
||||
prevInitInst := cur.next
|
||||
|
||||
//
|
||||
// (high address) (high address)
|
||||
// SP----> +-----------------+ +------------------+ <----+
|
||||
// | ....... | | ....... | |
|
||||
// | ret Y | | ret Y | |
|
||||
// | ....... | | ....... | |
|
||||
// | ret 0 | | ret 0 | |
|
||||
// | arg X | | arg X | | size_of_arg_ret.
|
||||
// | ....... | ====> | ....... | |
|
||||
// | arg 1 | | arg 1 | |
|
||||
// | arg 0 | | arg 0 | <----+
|
||||
// |-----------------| | size_of_arg_ret |
|
||||
// | return address |
|
||||
// +------------------+ <---- SP
|
||||
// (low address) (low address)
|
||||
|
||||
// Saves the return address (lr) and the size_of_arg_ret below the SP.
|
||||
// size_of_arg_ret is used for stack unwinding.
|
||||
cur = m.createReturnAddrAndSizeOfArgRetSlot(cur)
|
||||
|
||||
if !m.stackBoundsCheckDisabled {
|
||||
cur = m.insertStackBoundsCheck(m.requiredStackSize(), cur)
|
||||
}
|
||||
|
||||
// Decrement SP if spillSlotSize > 0.
|
||||
if m.spillSlotSize == 0 && len(m.spillSlots) != 0 {
|
||||
panic(fmt.Sprintf("BUG: spillSlotSize=%d, spillSlots=%v\n", m.spillSlotSize, m.spillSlots))
|
||||
}
|
||||
|
||||
if regs := m.clobberedRegs; len(regs) > 0 {
|
||||
//
|
||||
// (high address) (high address)
|
||||
// +-----------------+ +-----------------+
|
||||
// | ....... | | ....... |
|
||||
// | ret Y | | ret Y |
|
||||
// | ....... | | ....... |
|
||||
// | ret 0 | | ret 0 |
|
||||
// | arg X | | arg X |
|
||||
// | ....... | | ....... |
|
||||
// | arg 1 | | arg 1 |
|
||||
// | arg 0 | | arg 0 |
|
||||
// | size_of_arg_ret | | size_of_arg_ret |
|
||||
// | ReturnAddress | | ReturnAddress |
|
||||
// SP----> +-----------------+ ====> +-----------------+
|
||||
// (low address) | clobbered M |
|
||||
// | ............ |
|
||||
// | clobbered 0 |
|
||||
// +-----------------+ <----- SP
|
||||
// (low address)
|
||||
//
|
||||
_amode := addressModePreOrPostIndex(spVReg,
|
||||
-16, // stack pointer must be 16-byte aligned.
|
||||
true, // Decrement before store.
|
||||
)
|
||||
for _, vr := range regs {
|
||||
// TODO: pair stores to reduce the number of instructions.
|
||||
store := m.allocateInstr()
|
||||
store.asStore(operandNR(vr), _amode, regTypeToRegisterSizeInBits(vr.RegType()))
|
||||
cur = linkInstr(cur, store)
|
||||
}
|
||||
}
|
||||
|
||||
if size := m.spillSlotSize; size > 0 {
|
||||
// Check if size is 16-byte aligned.
|
||||
if size&0xf != 0 {
|
||||
panic(fmt.Errorf("BUG: spill slot size %d is not 16-byte aligned", size))
|
||||
}
|
||||
|
||||
cur = m.addsAddOrSubStackPointer(cur, spVReg, size, false)
|
||||
|
||||
// At this point, the stack looks like:
|
||||
//
|
||||
// (high address)
|
||||
// +------------------+
|
||||
// | ....... |
|
||||
// | ret Y |
|
||||
// | ....... |
|
||||
// | ret 0 |
|
||||
// | arg X |
|
||||
// | ....... |
|
||||
// | arg 1 |
|
||||
// | arg 0 |
|
||||
// | size_of_arg_ret |
|
||||
// | ReturnAddress |
|
||||
// +------------------+
|
||||
// | clobbered M |
|
||||
// | ............ |
|
||||
// | clobbered 0 |
|
||||
// | spill slot N |
|
||||
// | ............ |
|
||||
// | spill slot 2 |
|
||||
// | spill slot 0 |
|
||||
// SP----> +------------------+
|
||||
// (low address)
|
||||
}
|
||||
|
||||
// We push the frame size into the stack to make it possible to unwind stack:
|
||||
//
|
||||
//
|
||||
// (high address) (high address)
|
||||
// +-----------------+ +-----------------+
|
||||
// | ....... | | ....... |
|
||||
// | ret Y | | ret Y |
|
||||
// | ....... | | ....... |
|
||||
// | ret 0 | | ret 0 |
|
||||
// | arg X | | arg X |
|
||||
// | ....... | | ....... |
|
||||
// | arg 1 | | arg 1 |
|
||||
// | arg 0 | | arg 0 |
|
||||
// | size_of_arg_ret | | size_of_arg_ret |
|
||||
// | ReturnAddress | | ReturnAddress |
|
||||
// +-----------------+ ==> +-----------------+ <----+
|
||||
// | clobbered M | | clobbered M | |
|
||||
// | ............ | | ............ | |
|
||||
// | clobbered 2 | | clobbered 2 | |
|
||||
// | clobbered 1 | | clobbered 1 | | frame size
|
||||
// | clobbered 0 | | clobbered 0 | |
|
||||
// | spill slot N | | spill slot N | |
|
||||
// | ............ | | ............ | |
|
||||
// | spill slot 0 | | spill slot 0 | <----+
|
||||
// SP---> +-----------------+ | xxxxxx | ;; unused space to make it 16-byte aligned.
|
||||
// | frame_size |
|
||||
// +-----------------+ <---- SP
|
||||
// (low address)
|
||||
//
|
||||
cur = m.createFrameSizeSlot(cur, m.frameSize())
|
||||
|
||||
linkInstr(cur, prevInitInst)
|
||||
}
|
||||
|
||||
func (m *machine) createReturnAddrAndSizeOfArgRetSlot(cur *instruction) *instruction {
|
||||
// First we decrement the stack pointer to point to the arg0 slot.
|
||||
var sizeOfArgRetReg regalloc.VReg
|
||||
s := int64(m.currentABI.AlignedArgResultStackSlotSize())
|
||||
if s > 0 {
|
||||
cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, s)
|
||||
sizeOfArgRetReg = tmpRegVReg
|
||||
|
||||
subSp := m.allocateInstr()
|
||||
subSp.asALU(aluOpSub, operandNR(spVReg), operandNR(spVReg), operandNR(sizeOfArgRetReg), true)
|
||||
cur = linkInstr(cur, subSp)
|
||||
} else {
|
||||
sizeOfArgRetReg = xzrVReg
|
||||
}
|
||||
|
||||
// Saves the return address (lr) and the size_of_arg_ret below the SP.
|
||||
// size_of_arg_ret is used for stack unwinding.
|
||||
pstr := m.allocateInstr()
|
||||
amode := addressModePreOrPostIndex(spVReg, -16, true /* decrement before store */)
|
||||
pstr.asStorePair64(lrVReg, sizeOfArgRetReg, amode)
|
||||
cur = linkInstr(cur, pstr)
|
||||
return cur
|
||||
}
|
||||
|
||||
func (m *machine) createFrameSizeSlot(cur *instruction, s int64) *instruction {
|
||||
var frameSizeReg regalloc.VReg
|
||||
if s > 0 {
|
||||
cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, s)
|
||||
frameSizeReg = tmpRegVReg
|
||||
} else {
|
||||
frameSizeReg = xzrVReg
|
||||
}
|
||||
_amode := addressModePreOrPostIndex(spVReg,
|
||||
-16, // stack pointer must be 16-byte aligned.
|
||||
true, // Decrement before store.
|
||||
)
|
||||
store := m.allocateInstr()
|
||||
store.asStore(operandNR(frameSizeReg), _amode, 64)
|
||||
cur = linkInstr(cur, store)
|
||||
return cur
|
||||
}
|
||||
|
||||
// postRegAlloc does multiple things while walking through the instructions:
|
||||
// 1. Removes the redundant copy instruction.
|
||||
// 2. Inserts the epilogue.
|
||||
func (m *machine) postRegAlloc() {
|
||||
ectx := m.executableContext
|
||||
for cur := ectx.RootInstr; cur != nil; cur = cur.next {
|
||||
switch cur.kind {
|
||||
case ret:
|
||||
m.setupEpilogueAfter(cur.prev)
|
||||
case loadConstBlockArg:
|
||||
lc := cur
|
||||
next := lc.next
|
||||
m.executableContext.PendingInstructions = m.executableContext.PendingInstructions[:0]
|
||||
m.lowerLoadConstantBlockArgAfterRegAlloc(lc)
|
||||
for _, instr := range m.executableContext.PendingInstructions {
|
||||
cur = linkInstr(cur, instr)
|
||||
}
|
||||
linkInstr(cur, next)
|
||||
m.executableContext.PendingInstructions = m.executableContext.PendingInstructions[:0]
|
||||
default:
|
||||
// Removes the redundant copy instruction.
|
||||
if cur.IsCopy() && cur.rn.realReg() == cur.rd.realReg() {
|
||||
prev, next := cur.prev, cur.next
|
||||
// Remove the copy instruction.
|
||||
prev.next = next
|
||||
if next != nil {
|
||||
next.prev = prev
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (m *machine) setupEpilogueAfter(cur *instruction) {
|
||||
prevNext := cur.next
|
||||
|
||||
// We've stored the frame size in the prologue, and now that we are about to return from this function, we won't need it anymore.
|
||||
cur = m.addsAddOrSubStackPointer(cur, spVReg, 16, true)
|
||||
|
||||
if s := m.spillSlotSize; s > 0 {
|
||||
// Adjust SP to the original value:
|
||||
//
|
||||
// (high address) (high address)
|
||||
// +-----------------+ +-----------------+
|
||||
// | ....... | | ....... |
|
||||
// | ret Y | | ret Y |
|
||||
// | ....... | | ....... |
|
||||
// | ret 0 | | ret 0 |
|
||||
// | arg X | | arg X |
|
||||
// | ....... | | ....... |
|
||||
// | arg 1 | | arg 1 |
|
||||
// | arg 0 | | arg 0 |
|
||||
// | xxxxx | | xxxxx |
|
||||
// | ReturnAddress | | ReturnAddress |
|
||||
// +-----------------+ ====> +-----------------+
|
||||
// | clobbered M | | clobbered M |
|
||||
// | ............ | | ............ |
|
||||
// | clobbered 1 | | clobbered 1 |
|
||||
// | clobbered 0 | | clobbered 0 |
|
||||
// | spill slot N | +-----------------+ <---- SP
|
||||
// | ............ |
|
||||
// | spill slot 0 |
|
||||
// SP---> +-----------------+
|
||||
// (low address)
|
||||
//
|
||||
cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true)
|
||||
}
|
||||
|
||||
// First we need to restore the clobbered registers.
|
||||
if len(m.clobberedRegs) > 0 {
|
||||
// (high address)
|
||||
// +-----------------+ +-----------------+
|
||||
// | ....... | | ....... |
|
||||
// | ret Y | | ret Y |
|
||||
// | ....... | | ....... |
|
||||
// | ret 0 | | ret 0 |
|
||||
// | arg X | | arg X |
|
||||
// | ....... | | ....... |
|
||||
// | arg 1 | | arg 1 |
|
||||
// | arg 0 | | arg 0 |
|
||||
// | xxxxx | | xxxxx |
|
||||
// | ReturnAddress | | ReturnAddress |
|
||||
// +-----------------+ ========> +-----------------+ <---- SP
|
||||
// | clobbered M |
|
||||
// | ........... |
|
||||
// | clobbered 1 |
|
||||
// | clobbered 0 |
|
||||
// SP---> +-----------------+
|
||||
// (low address)
|
||||
|
||||
l := len(m.clobberedRegs) - 1
|
||||
for i := range m.clobberedRegs {
|
||||
vr := m.clobberedRegs[l-i] // reverse order to restore.
|
||||
load := m.allocateInstr()
|
||||
amode := addressModePreOrPostIndex(spVReg,
|
||||
16, // stack pointer must be 16-byte aligned.
|
||||
false, // Increment after load.
|
||||
)
|
||||
// TODO: pair loads to reduce the number of instructions.
|
||||
switch regTypeToRegisterSizeInBits(vr.RegType()) {
|
||||
case 64: // save int reg.
|
||||
load.asULoad(operandNR(vr), amode, 64)
|
||||
case 128: // save vector reg.
|
||||
load.asFpuLoad(operandNR(vr), amode, 128)
|
||||
}
|
||||
cur = linkInstr(cur, load)
|
||||
}
|
||||
}
|
||||
|
||||
// Reload the return address (lr).
|
||||
//
|
||||
// +-----------------+ +-----------------+
|
||||
// | ....... | | ....... |
|
||||
// | ret Y | | ret Y |
|
||||
// | ....... | | ....... |
|
||||
// | ret 0 | | ret 0 |
|
||||
// | arg X | | arg X |
|
||||
// | ....... | ===> | ....... |
|
||||
// | arg 1 | | arg 1 |
|
||||
// | arg 0 | | arg 0 |
|
||||
// | xxxxx | +-----------------+ <---- SP
|
||||
// | ReturnAddress |
|
||||
// SP----> +-----------------+
|
||||
|
||||
ldr := m.allocateInstr()
|
||||
ldr.asULoad(operandNR(lrVReg),
|
||||
addressModePreOrPostIndex(spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64)
|
||||
cur = linkInstr(cur, ldr)
|
||||
|
||||
if s := int64(m.currentABI.AlignedArgResultStackSlotSize()); s > 0 {
|
||||
cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true)
|
||||
}
|
||||
|
||||
linkInstr(cur, prevNext)
|
||||
}
|
||||
|
||||
// saveRequiredRegs is the set of registers that must be saved/restored during growing stack when there's insufficient
|
||||
// stack space left. Basically this is the combination of CalleeSavedRegisters plus argument registers except for x0,
|
||||
// which always points to the execution context whenever the native code is entered from Go.
|
||||
var saveRequiredRegs = []regalloc.VReg{
|
||||
x1VReg, x2VReg, x3VReg, x4VReg, x5VReg, x6VReg, x7VReg,
|
||||
x19VReg, x20VReg, x21VReg, x22VReg, x23VReg, x24VReg, x25VReg, x26VReg, x28VReg, lrVReg,
|
||||
v0VReg, v1VReg, v2VReg, v3VReg, v4VReg, v5VReg, v6VReg, v7VReg,
|
||||
v18VReg, v19VReg, v20VReg, v21VReg, v22VReg, v23VReg, v24VReg, v25VReg, v26VReg, v27VReg, v28VReg, v29VReg, v30VReg, v31VReg,
|
||||
}
|
||||
|
||||
// insertStackBoundsCheck will insert the instructions after `cur` to check the
|
||||
// stack bounds, and if there is not sufficient space for the function,
// exit the execution and try growing the stack in the Go world.
|
||||
//
|
||||
// TODO: we should be able to share the instructions across all the functions to reduce the size of the compiled executable.
|
||||
func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instruction) *instruction {
|
||||
if requiredStackSize%16 != 0 {
|
||||
panic("BUG")
|
||||
}
|
||||
|
||||
if immm12op, ok := asImm12Operand(uint64(requiredStackSize)); ok {
|
||||
// sub tmp, sp, #requiredStackSize
|
||||
sub := m.allocateInstr()
|
||||
sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), immm12op, true)
|
||||
cur = linkInstr(cur, sub)
|
||||
} else {
|
||||
// In this case, we first load the requiredStackSize into the temporary register,
|
||||
cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize)
|
||||
// Then subtract it.
|
||||
sub := m.allocateInstr()
|
||||
sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), operandNR(tmpRegVReg), true)
|
||||
cur = linkInstr(cur, sub)
|
||||
}
|
||||
|
||||
tmp2 := x11VReg // Caller save, so it is safe to use it here in the prologue.
|
||||
|
||||
// ldr tmp2, [executionContext #StackBottomPtr]
|
||||
ldr := m.allocateInstr()
|
||||
ldr.asULoad(operandNR(tmp2), addressMode{
|
||||
kind: addressModeKindRegUnsignedImm12,
|
||||
rn: x0VReg, // execution context is always the first argument.
|
||||
imm: wazevoapi.ExecutionContextOffsetStackBottomPtr.I64(),
|
||||
}, 64)
|
||||
cur = linkInstr(cur, ldr)
|
||||
|
||||
// subs xzr, tmp, tmp2
|
||||
subs := m.allocateInstr()
|
||||
subs.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpRegVReg), operandNR(tmp2), true)
|
||||
cur = linkInstr(cur, subs)
|
||||
|
||||
// b.ge #imm
|
||||
cbr := m.allocateInstr()
|
||||
cbr.asCondBr(ge.asCond(), labelInvalid, false /* ignored */)
|
||||
cur = linkInstr(cur, cbr)
|
||||
|
||||
// Set the required stack size and set it to the exec context.
|
||||
{
|
||||
// First load the requiredStackSize into the temporary register,
|
||||
cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize)
|
||||
setRequiredStackSize := m.allocateInstr()
|
||||
setRequiredStackSize.asStore(operandNR(tmpRegVReg),
|
||||
addressMode{
|
||||
kind: addressModeKindRegUnsignedImm12,
|
||||
// Execution context is always the first argument.
|
||||
rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.I64(),
|
||||
}, 64)
|
||||
|
||||
cur = linkInstr(cur, setRequiredStackSize)
|
||||
}
|
||||
|
||||
ldrAddress := m.allocateInstr()
|
||||
ldrAddress.asULoad(operandNR(tmpRegVReg), addressMode{
|
||||
kind: addressModeKindRegUnsignedImm12,
|
||||
rn: x0VReg, // execution context is always the first argument
|
||||
imm: wazevoapi.ExecutionContextOffsetStackGrowCallTrampolineAddress.I64(),
|
||||
}, 64)
|
||||
cur = linkInstr(cur, ldrAddress)
|
||||
|
||||
// Then jumps to the stack grow call sequence's address, meaning
|
||||
// transferring the control to the code compiled by CompileStackGrowCallSequence.
|
||||
bl := m.allocateInstr()
|
||||
bl.asCallIndirect(tmpRegVReg, nil)
|
||||
cur = linkInstr(cur, bl)
|
||||
|
||||
// Now that we know the entire code, we can finalize how many bytes
|
||||
// we have to skip when the stack size is sufficient.
|
||||
var cbrOffset int64
|
||||
for _cur := cbr; ; _cur = _cur.next {
|
||||
cbrOffset += _cur.size()
|
||||
if _cur == cur {
|
||||
break
|
||||
}
|
||||
}
|
||||
cbr.condBrOffsetResolve(cbrOffset)
|
||||
return cur
|
||||
}
|
||||
|
||||
// CompileStackGrowCallSequence implements backend.Machine.
|
||||
func (m *machine) CompileStackGrowCallSequence() []byte {
|
||||
ectx := m.executableContext
|
||||
|
||||
cur := m.allocateInstr()
|
||||
cur.asNop0()
|
||||
ectx.RootInstr = cur
|
||||
|
||||
// Save the callee saved and argument registers.
|
||||
cur = m.saveRegistersInExecutionContext(cur, saveRequiredRegs)
|
||||
|
||||
// Save the current stack pointer.
|
||||
cur = m.saveCurrentStackPointer(cur, x0VReg)
|
||||
|
||||
// Set the exit status on the execution context.
|
||||
cur = m.setExitCode(cur, x0VReg, wazevoapi.ExitCodeGrowStack)
|
||||
|
||||
// Exit the execution.
|
||||
cur = m.storeReturnAddressAndExit(cur)
|
||||
|
||||
// After the exit, restore the saved registers.
|
||||
cur = m.restoreRegistersInExecutionContext(cur, saveRequiredRegs)
|
||||
|
||||
// Then go back to the original address of this stack grow call.
|
||||
ret := m.allocateInstr()
|
||||
ret.asRet()
|
||||
linkInstr(cur, ret)
|
||||
|
||||
m.encode(ectx.RootInstr)
|
||||
return m.compiler.Buf()
|
||||
}
|
||||
|
||||
func (m *machine) addsAddOrSubStackPointer(cur *instruction, rd regalloc.VReg, diff int64, add bool) *instruction {
|
||||
ectx := m.executableContext
|
||||
|
||||
ectx.PendingInstructions = ectx.PendingInstructions[:0]
|
||||
m.insertAddOrSubStackPointer(rd, diff, add)
|
||||
for _, inserted := range ectx.PendingInstructions {
|
||||
cur = linkInstr(cur, inserted)
|
||||
}
|
||||
return cur
|
||||
}
|
||||
152
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go
generated
vendored
Normal file
@ -0,0 +1,152 @@
package arm64
|
||||
|
||||
// This file implements the interfaces required for register allocations. See backend.RegAllocFunctionMachine.
|
||||
|
||||
import (
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
|
||||
)
|
||||
|
||||
// ClobberedRegisters implements backend.RegAllocFunctionMachine.
|
||||
func (m *machine) ClobberedRegisters(regs []regalloc.VReg) {
|
||||
m.clobberedRegs = append(m.clobberedRegs[:0], regs...)
|
||||
}
|
||||
|
||||
// Swap implements backend.RegAllocFunctionMachine.
|
||||
func (m *machine) Swap(cur *instruction, x1, x2, tmp regalloc.VReg) {
|
||||
prevNext := cur.next
|
||||
var mov1, mov2, mov3 *instruction
|
||||
if x1.RegType() == regalloc.RegTypeInt {
|
||||
if !tmp.Valid() {
|
||||
tmp = tmpRegVReg
|
||||
}
|
||||
mov1 = m.allocateInstr().asMove64(tmp, x1)
|
||||
mov2 = m.allocateInstr().asMove64(x1, x2)
|
||||
mov3 = m.allocateInstr().asMove64(x2, tmp)
|
||||
cur = linkInstr(cur, mov1)
|
||||
cur = linkInstr(cur, mov2)
|
||||
cur = linkInstr(cur, mov3)
|
||||
linkInstr(cur, prevNext)
|
||||
} else {
|
||||
if !tmp.Valid() {
|
||||
r2 := x2.RealReg()
|
||||
// Temporarily spill x1 to stack.
|
||||
cur = m.InsertStoreRegisterAt(x1, cur, true).prev
|
||||
// Then move x2 to x1.
|
||||
cur = linkInstr(cur, m.allocateInstr().asFpuMov128(x1, x2))
|
||||
linkInstr(cur, prevNext)
|
||||
// Then reload the original value on x1 from stack to r2.
|
||||
m.InsertReloadRegisterAt(x1.SetRealReg(r2), cur, true)
|
||||
} else {
|
||||
mov1 = m.allocateInstr().asFpuMov128(tmp, x1)
|
||||
mov2 = m.allocateInstr().asFpuMov128(x1, x2)
|
||||
mov3 = m.allocateInstr().asFpuMov128(x2, tmp)
|
||||
cur = linkInstr(cur, mov1)
|
||||
cur = linkInstr(cur, mov2)
|
||||
cur = linkInstr(cur, mov3)
|
||||
linkInstr(cur, prevNext)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// InsertMoveBefore implements backend.RegAllocFunctionMachine.
|
||||
func (m *machine) InsertMoveBefore(dst, src regalloc.VReg, instr *instruction) {
|
||||
typ := src.RegType()
|
||||
if typ != dst.RegType() {
|
||||
panic("BUG: src and dst must have the same type")
|
||||
}
|
||||
|
||||
mov := m.allocateInstr()
|
||||
if typ == regalloc.RegTypeInt {
|
||||
mov.asMove64(dst, src)
|
||||
} else {
|
||||
mov.asFpuMov128(dst, src)
|
||||
}
|
||||
|
||||
cur := instr.prev
|
||||
prevNext := cur.next
|
||||
cur = linkInstr(cur, mov)
|
||||
linkInstr(cur, prevNext)
|
||||
}
|
||||
|
||||
// SSABlockLabel implements backend.RegAllocFunctionMachine.
|
||||
func (m *machine) SSABlockLabel(id ssa.BasicBlockID) backend.Label {
|
||||
return m.executableContext.SsaBlockIDToLabels[id]
|
||||
}
|
||||
|
||||
// InsertStoreRegisterAt implements backend.RegAllocFunctionMachine.
|
||||
func (m *machine) InsertStoreRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction {
|
||||
if !v.IsRealReg() {
|
||||
panic("BUG: VReg must be backed by real reg to be stored")
|
||||
}
|
||||
|
||||
typ := m.compiler.TypeOf(v)
|
||||
|
||||
var prevNext, cur *instruction
|
||||
if after {
|
||||
cur, prevNext = instr, instr.next
|
||||
} else {
|
||||
cur, prevNext = instr.prev, instr
|
||||
}
|
||||
|
||||
offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size())
|
||||
var amode addressMode
|
||||
cur, amode = m.resolveAddressModeForOffsetAndInsert(cur, offsetFromSP, typ.Bits(), spVReg, true)
|
||||
store := m.allocateInstr()
|
||||
store.asStore(operandNR(v), amode, typ.Bits())
|
||||
|
||||
cur = linkInstr(cur, store)
|
||||
return linkInstr(cur, prevNext)
|
||||
}
|
||||
|
||||
// InsertReloadRegisterAt implements backend.RegAllocFunctionMachine.
|
||||
func (m *machine) InsertReloadRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction {
|
||||
if !v.IsRealReg() {
|
||||
panic("BUG: VReg must be backed by real reg to be stored")
|
||||
}
|
||||
|
||||
typ := m.compiler.TypeOf(v)
|
||||
|
||||
var prevNext, cur *instruction
|
||||
if after {
|
||||
cur, prevNext = instr, instr.next
|
||||
} else {
|
||||
cur, prevNext = instr.prev, instr
|
||||
}
|
||||
|
||||
offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size())
|
||||
var amode addressMode
|
||||
cur, amode = m.resolveAddressModeForOffsetAndInsert(cur, offsetFromSP, typ.Bits(), spVReg, true)
|
||||
load := m.allocateInstr()
|
||||
switch typ {
|
||||
case ssa.TypeI32, ssa.TypeI64:
|
||||
load.asULoad(operandNR(v), amode, typ.Bits())
|
||||
case ssa.TypeF32, ssa.TypeF64:
|
||||
load.asFpuLoad(operandNR(v), amode, typ.Bits())
|
||||
case ssa.TypeV128:
|
||||
load.asFpuLoad(operandNR(v), amode, 128)
|
||||
default:
|
||||
panic("TODO")
|
||||
}
|
||||
|
||||
cur = linkInstr(cur, load)
|
||||
return linkInstr(cur, prevNext)
|
||||
}
|
||||
|
||||
// LastInstrForInsertion implements backend.RegAllocFunctionMachine.
|
||||
func (m *machine) LastInstrForInsertion(begin, end *instruction) *instruction {
|
||||
cur := end
|
||||
for cur.kind == nop0 {
|
||||
cur = cur.prev
|
||||
if cur == begin {
|
||||
return end
|
||||
}
|
||||
}
|
||||
switch cur.kind {
|
||||
case br:
|
||||
return cur
|
||||
default:
|
||||
return end
|
||||
}
|
||||
}
|
||||
117
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_relocation.go
generated
vendored
Normal file
@ -0,0 +1,117 @@
package arm64
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"math"
|
||||
"sort"
|
||||
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
|
||||
)
|
||||
|
||||
const (
|
||||
// trampolineCallSize is the size of the trampoline instruction sequence for each function in an island.
|
||||
trampolineCallSize = 4*4 + 4 // Four instructions + 32-bit immediate.
|
||||
|
||||
// Unconditional branch offset is encoded as divided by 4 in imm26.
|
||||
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/BL--Branch-with-Link-?lang=en
|
||||
|
||||
maxUnconditionalBranchOffset = maxSignedInt26 * 4
|
||||
minUnconditionalBranchOffset = minSignedInt26 * 4
|
||||
|
||||
// trampolineIslandInterval is the range of the trampoline island.
|
||||
// Half of the range is used for the trampoline island, and the other half is used for the function.
|
||||
trampolineIslandInterval = maxUnconditionalBranchOffset / 2
|
||||
|
||||
// maxNumFunctions explicitly specifies the maximum number of functions that can be allowed in a single executable.
|
||||
maxNumFunctions = trampolineIslandInterval >> 6
|
||||
|
||||
// maxFunctionExecutableSize is the maximum size of a function that can exist in a trampoline island.
|
||||
// Conservatively set to 1/4 of the trampoline island interval.
|
||||
maxFunctionExecutableSize = trampolineIslandInterval >> 2
|
||||
)
|
||||
|
||||
// CallTrampolineIslandInfo implements backend.Machine CallTrampolineIslandInfo.
|
||||
func (m *machine) CallTrampolineIslandInfo(numFunctions int) (interval, size int, err error) {
|
||||
if numFunctions > maxNumFunctions {
|
||||
return 0, 0, fmt.Errorf("too many functions: %d > %d", numFunctions, maxNumFunctions)
|
||||
}
|
||||
return trampolineIslandInterval, trampolineCallSize * numFunctions, nil
|
||||
}
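// exampleTrampolineIslandBudget is an illustrative sketch, not part of the
// vendored wazero source: for a hypothetical module with 10,000 functions,
// each island reserves one 20-byte trampoline (trampolineCallSize) per
// function, and an island is emitted every trampolineIslandInterval bytes of
// machine code.
func exampleTrampolineIslandBudget(m *machine) (interval, islandSize int) {
	interval, islandSize, err := m.CallTrampolineIslandInfo(10_000)
	if err != nil {
		panic(err)
	}
	return interval, islandSize
}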
|
||||
|
||||
// ResolveRelocations implements backend.Machine ResolveRelocations.
|
||||
func (m *machine) ResolveRelocations(
|
||||
refToBinaryOffset []int,
|
||||
executable []byte,
|
||||
relocations []backend.RelocationInfo,
|
||||
callTrampolineIslandOffsets []int,
|
||||
) {
|
||||
for _, islandOffset := range callTrampolineIslandOffsets {
|
||||
encodeCallTrampolineIsland(refToBinaryOffset, islandOffset, executable)
|
||||
}
|
||||
|
||||
for _, r := range relocations {
|
||||
instrOffset := r.Offset
|
||||
calleeFnOffset := refToBinaryOffset[r.FuncRef]
|
||||
diff := int64(calleeFnOffset) - (instrOffset)
|
||||
// Check if the diff is within the range of the branch instruction.
|
||||
if diff < minUnconditionalBranchOffset || diff > maxUnconditionalBranchOffset {
|
||||
// Find the near trampoline island from callTrampolineIslandOffsets.
|
||||
islandOffset := searchTrampolineIsland(callTrampolineIslandOffsets, int(instrOffset))
|
||||
islandTargetOffset := islandOffset + trampolineCallSize*int(r.FuncRef)
|
||||
diff = int64(islandTargetOffset) - (instrOffset)
|
||||
if diff < minUnconditionalBranchOffset || diff > maxUnconditionalBranchOffset {
|
||||
panic("BUG in trampoline placement")
|
||||
}
|
||||
}
|
||||
binary.LittleEndian.PutUint32(executable[instrOffset:instrOffset+4], encodeUnconditionalBranch(true, diff))
|
||||
}
|
||||
}
|
||||
|
||||
// encodeCallTrampolineIsland encodes a trampoline island for the given functions.
|
||||
// Each island consists of a trampoline instruction sequence for each function.
|
||||
// Each trampoline instruction sequence consists of 4 instructions + 32-bit immediate.
|
||||
func encodeCallTrampolineIsland(refToBinaryOffset []int, islandOffset int, executable []byte) {
|
||||
for i := 0; i < len(refToBinaryOffset); i++ {
|
||||
trampolineOffset := islandOffset + trampolineCallSize*i
|
||||
|
||||
fnOffset := refToBinaryOffset[i]
|
||||
diff := fnOffset - (trampolineOffset + 16)
|
||||
if diff > math.MaxInt32 || diff < math.MinInt32 {
|
||||
// Even amd64 cannot handle this case: 4GB is too big.
|
||||
panic("too big binary")
|
||||
}
|
||||
|
||||
// The tmpReg, tmpReg2 is safe to overwrite (in fact any caller-saved register is safe to use).
|
||||
tmpReg, tmpReg2 := regNumberInEncoding[tmpRegVReg.RealReg()], regNumberInEncoding[x11]
|
||||
|
||||
// adr tmpReg, PC+16: load the address of #diff into tmpReg.
|
||||
binary.LittleEndian.PutUint32(executable[trampolineOffset:], encodeAdr(tmpReg, 16))
|
||||
// ldrsw tmpReg2, [tmpReg]: Load #diff into tmpReg2.
|
||||
binary.LittleEndian.PutUint32(executable[trampolineOffset+4:],
|
||||
encodeLoadOrStore(sLoad32, tmpReg2, addressMode{kind: addressModeKindRegUnsignedImm12, rn: tmpRegVReg}))
|
||||
// add tmpReg, tmpReg2, tmpReg: add #diff to the address of #diff, getting the absolute address of the function.
|
||||
binary.LittleEndian.PutUint32(executable[trampolineOffset+8:],
|
||||
encodeAluRRR(aluOpAdd, tmpReg, tmpReg, tmpReg2, true, false))
|
||||
// br tmpReg: branch to the function without overwriting the link register.
|
||||
binary.LittleEndian.PutUint32(executable[trampolineOffset+12:], encodeUnconditionalBranchReg(tmpReg, false))
|
||||
// #diff
|
||||
binary.LittleEndian.PutUint32(executable[trampolineOffset+16:], uint32(diff))
|
||||
}
|
||||
}
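// Illustrative layout note, not part of the vendored wazero source: each
// 20-byte trampoline written above decomposes roughly into four instructions
// followed by the 32-bit displacement they consume (tmpReg is x27, tmpReg2 is x11):
//
//	offset+0:  adr   x27, +16       // x27 = address of the displacement word below
//	offset+4:  ldrsw x11, [x27]     // x11 = sign-extended 32-bit displacement
//	offset+8:  add   x27, x27, x11  // x27 = absolute address of the target function
//	offset+12: br    x27            // jump without clobbering the link register
//	offset+16: .word diff           // diff = fnOffset - (trampolineOffset + 16)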
|
||||
|
||||
// searchTrampolineIsland finds the nearest trampoline island from callTrampolineIslandOffsets.
|
||||
// Note that even if the offset is in the middle of two islands, it returns the latter one.
|
||||
// That is ok because the island is always placed in the middle of the range.
|
||||
//
|
||||
// precondition: callTrampolineIslandOffsets is sorted in ascending order.
|
||||
func searchTrampolineIsland(callTrampolineIslandOffsets []int, offset int) int {
|
||||
l := len(callTrampolineIslandOffsets)
|
||||
n := sort.Search(l, func(i int) bool {
|
||||
return callTrampolineIslandOffsets[i] >= offset
|
||||
})
|
||||
if n == l {
|
||||
n = l - 1
|
||||
}
|
||||
return callTrampolineIslandOffsets[n]
|
||||
}
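// exampleSearchTrampolineIsland is an illustrative usage sketch, not part of
// the vendored wazero source: given islands sorted in ascending order, the
// lookup returns the first island at or after the call site, and clamps to the
// last island when the call site lies beyond all of them.
func exampleSearchTrampolineIsland() (ahead, clamped int) {
	islands := []int{1 << 20, 3 << 20, 5 << 20} // hypothetical offsets, sorted ascending as required.
	ahead = searchTrampolineIsland(islands, 2<<20)   // 3<<20: the next island ahead of the call site.
	clamped = searchTrampolineIsland(islands, 6<<20) // 5<<20: past the last island, so clamp to it.
	return
}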
|
||||
397
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/reg.go
generated
vendored
Normal file
@ -0,0 +1,397 @@
package arm64
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
|
||||
)
|
||||
|
||||
// Arm64-specific registers.
|
||||
//
|
||||
// See https://developer.arm.com/documentation/dui0801/a/Overview-of-AArch64-state/Predeclared-core-register-names-in-AArch64-state
|
||||
|
||||
const (
|
||||
// General purpose registers. Note that we do not distinguish wn and xn registers
|
||||
// because they are the same from the perspective of register allocator, and
|
||||
// the size can be determined by the type of the instruction.
|
||||
|
||||
x0 = regalloc.RealRegInvalid + 1 + iota
|
||||
x1
|
||||
x2
|
||||
x3
|
||||
x4
|
||||
x5
|
||||
x6
|
||||
x7
|
||||
x8
|
||||
x9
|
||||
x10
|
||||
x11
|
||||
x12
|
||||
x13
|
||||
x14
|
||||
x15
|
||||
x16
|
||||
x17
|
||||
x18
|
||||
x19
|
||||
x20
|
||||
x21
|
||||
x22
|
||||
x23
|
||||
x24
|
||||
x25
|
||||
x26
|
||||
x27
|
||||
x28
|
||||
x29
|
||||
x30
|
||||
|
||||
// Vector registers. Note that we do not distinguish vn and dn, ... registers
|
||||
// because they are the same from the perspective of register allocator, and
|
||||
// the size can be determined by the type of the instruction.
|
||||
|
||||
v0
|
||||
v1
|
||||
v2
|
||||
v3
|
||||
v4
|
||||
v5
|
||||
v6
|
||||
v7
|
||||
v8
|
||||
v9
|
||||
v10
|
||||
v11
|
||||
v12
|
||||
v13
|
||||
v14
|
||||
v15
|
||||
v16
|
||||
v17
|
||||
v18
|
||||
v19
|
||||
v20
|
||||
v21
|
||||
v22
|
||||
v23
|
||||
v24
|
||||
v25
|
||||
v26
|
||||
v27
|
||||
v28
|
||||
v29
|
||||
v30
|
||||
v31
|
||||
|
||||
// Special registers
|
||||
|
||||
xzr
|
||||
sp
|
||||
lr = x30
|
||||
fp = x29
|
||||
tmp = x27
|
||||
)
|
||||
|
||||
var (
|
||||
x0VReg = regalloc.FromRealReg(x0, regalloc.RegTypeInt)
|
||||
x1VReg = regalloc.FromRealReg(x1, regalloc.RegTypeInt)
|
||||
x2VReg = regalloc.FromRealReg(x2, regalloc.RegTypeInt)
|
||||
x3VReg = regalloc.FromRealReg(x3, regalloc.RegTypeInt)
|
||||
x4VReg = regalloc.FromRealReg(x4, regalloc.RegTypeInt)
|
||||
x5VReg = regalloc.FromRealReg(x5, regalloc.RegTypeInt)
|
||||
x6VReg = regalloc.FromRealReg(x6, regalloc.RegTypeInt)
|
||||
x7VReg = regalloc.FromRealReg(x7, regalloc.RegTypeInt)
|
||||
x8VReg = regalloc.FromRealReg(x8, regalloc.RegTypeInt)
|
||||
x9VReg = regalloc.FromRealReg(x9, regalloc.RegTypeInt)
|
||||
x10VReg = regalloc.FromRealReg(x10, regalloc.RegTypeInt)
|
||||
x11VReg = regalloc.FromRealReg(x11, regalloc.RegTypeInt)
|
||||
x12VReg = regalloc.FromRealReg(x12, regalloc.RegTypeInt)
|
||||
x13VReg = regalloc.FromRealReg(x13, regalloc.RegTypeInt)
|
||||
x14VReg = regalloc.FromRealReg(x14, regalloc.RegTypeInt)
|
||||
x15VReg = regalloc.FromRealReg(x15, regalloc.RegTypeInt)
|
||||
x16VReg = regalloc.FromRealReg(x16, regalloc.RegTypeInt)
|
||||
x17VReg = regalloc.FromRealReg(x17, regalloc.RegTypeInt)
|
||||
x18VReg = regalloc.FromRealReg(x18, regalloc.RegTypeInt)
|
||||
x19VReg = regalloc.FromRealReg(x19, regalloc.RegTypeInt)
|
||||
x20VReg = regalloc.FromRealReg(x20, regalloc.RegTypeInt)
|
||||
x21VReg = regalloc.FromRealReg(x21, regalloc.RegTypeInt)
|
||||
x22VReg = regalloc.FromRealReg(x22, regalloc.RegTypeInt)
|
||||
x23VReg = regalloc.FromRealReg(x23, regalloc.RegTypeInt)
|
||||
x24VReg = regalloc.FromRealReg(x24, regalloc.RegTypeInt)
|
||||
x25VReg = regalloc.FromRealReg(x25, regalloc.RegTypeInt)
|
||||
x26VReg = regalloc.FromRealReg(x26, regalloc.RegTypeInt)
|
||||
x27VReg = regalloc.FromRealReg(x27, regalloc.RegTypeInt)
|
||||
x28VReg = regalloc.FromRealReg(x28, regalloc.RegTypeInt)
|
||||
x29VReg = regalloc.FromRealReg(x29, regalloc.RegTypeInt)
|
||||
x30VReg = regalloc.FromRealReg(x30, regalloc.RegTypeInt)
|
||||
v0VReg = regalloc.FromRealReg(v0, regalloc.RegTypeFloat)
|
||||
v1VReg = regalloc.FromRealReg(v1, regalloc.RegTypeFloat)
|
||||
v2VReg = regalloc.FromRealReg(v2, regalloc.RegTypeFloat)
|
||||
v3VReg = regalloc.FromRealReg(v3, regalloc.RegTypeFloat)
|
||||
v4VReg = regalloc.FromRealReg(v4, regalloc.RegTypeFloat)
|
||||
v5VReg = regalloc.FromRealReg(v5, regalloc.RegTypeFloat)
|
||||
v6VReg = regalloc.FromRealReg(v6, regalloc.RegTypeFloat)
|
||||
v7VReg = regalloc.FromRealReg(v7, regalloc.RegTypeFloat)
|
||||
v8VReg = regalloc.FromRealReg(v8, regalloc.RegTypeFloat)
|
||||
v9VReg = regalloc.FromRealReg(v9, regalloc.RegTypeFloat)
|
||||
v10VReg = regalloc.FromRealReg(v10, regalloc.RegTypeFloat)
|
||||
v11VReg = regalloc.FromRealReg(v11, regalloc.RegTypeFloat)
|
||||
v12VReg = regalloc.FromRealReg(v12, regalloc.RegTypeFloat)
|
||||
v13VReg = regalloc.FromRealReg(v13, regalloc.RegTypeFloat)
|
||||
v14VReg = regalloc.FromRealReg(v14, regalloc.RegTypeFloat)
|
||||
v15VReg = regalloc.FromRealReg(v15, regalloc.RegTypeFloat)
|
||||
v16VReg = regalloc.FromRealReg(v16, regalloc.RegTypeFloat)
|
||||
v17VReg = regalloc.FromRealReg(v17, regalloc.RegTypeFloat)
|
||||
v18VReg = regalloc.FromRealReg(v18, regalloc.RegTypeFloat)
|
||||
v19VReg = regalloc.FromRealReg(v19, regalloc.RegTypeFloat)
|
||||
v20VReg = regalloc.FromRealReg(v20, regalloc.RegTypeFloat)
|
||||
v21VReg = regalloc.FromRealReg(v21, regalloc.RegTypeFloat)
|
||||
v22VReg = regalloc.FromRealReg(v22, regalloc.RegTypeFloat)
|
||||
v23VReg = regalloc.FromRealReg(v23, regalloc.RegTypeFloat)
|
||||
v24VReg = regalloc.FromRealReg(v24, regalloc.RegTypeFloat)
|
||||
v25VReg = regalloc.FromRealReg(v25, regalloc.RegTypeFloat)
|
||||
v26VReg = regalloc.FromRealReg(v26, regalloc.RegTypeFloat)
|
||||
v27VReg = regalloc.FromRealReg(v27, regalloc.RegTypeFloat)
|
||||
// lr (link register) holds the return address at the function entry.
|
||||
lrVReg = x30VReg
|
||||
// tmpReg is used to perform spill/load on large stack offsets, and load large constants.
|
||||
// Therefore, be cautious about using this register in the middle of compilation, especially before register allocation.
|
||||
// This is the same as golang/go, but it's only described in the source code:
|
||||
// https://github.com/golang/go/blob/18e17e2cb12837ea2c8582ecdb0cc780f49a1aac/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go#L59
|
||||
// https://github.com/golang/go/blob/18e17e2cb12837ea2c8582ecdb0cc780f49a1aac/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go#L13-L15
|
||||
tmpRegVReg = regalloc.FromRealReg(tmp, regalloc.RegTypeInt)
|
||||
v28VReg = regalloc.FromRealReg(v28, regalloc.RegTypeFloat)
|
||||
v29VReg = regalloc.FromRealReg(v29, regalloc.RegTypeFloat)
|
||||
v30VReg = regalloc.FromRealReg(v30, regalloc.RegTypeFloat)
|
||||
v31VReg = regalloc.FromRealReg(v31, regalloc.RegTypeFloat)
|
||||
xzrVReg = regalloc.FromRealReg(xzr, regalloc.RegTypeInt)
|
||||
spVReg = regalloc.FromRealReg(sp, regalloc.RegTypeInt)
|
||||
fpVReg = regalloc.FromRealReg(fp, regalloc.RegTypeInt)
|
||||
)
|
||||
|
||||
var regNames = [...]string{
|
||||
x0: "x0",
|
||||
x1: "x1",
|
||||
x2: "x2",
|
||||
x3: "x3",
|
||||
x4: "x4",
|
||||
x5: "x5",
|
||||
x6: "x6",
|
||||
x7: "x7",
|
||||
x8: "x8",
|
||||
x9: "x9",
|
||||
x10: "x10",
|
||||
x11: "x11",
|
||||
x12: "x12",
|
||||
x13: "x13",
|
||||
x14: "x14",
|
||||
x15: "x15",
|
||||
x16: "x16",
|
||||
x17: "x17",
|
||||
x18: "x18",
|
||||
x19: "x19",
|
||||
x20: "x20",
|
||||
x21: "x21",
|
||||
x22: "x22",
|
||||
x23: "x23",
|
||||
x24: "x24",
|
||||
x25: "x25",
|
||||
x26: "x26",
|
||||
x27: "x27",
|
||||
x28: "x28",
|
||||
x29: "x29",
|
||||
x30: "x30",
|
||||
xzr: "xzr",
|
||||
sp: "sp",
|
||||
v0: "v0",
|
||||
v1: "v1",
|
||||
v2: "v2",
|
||||
v3: "v3",
|
||||
v4: "v4",
|
||||
v5: "v5",
|
||||
v6: "v6",
|
||||
v7: "v7",
|
||||
v8: "v8",
|
||||
v9: "v9",
|
||||
v10: "v10",
|
||||
v11: "v11",
|
||||
v12: "v12",
|
||||
v13: "v13",
|
||||
v14: "v14",
|
||||
v15: "v15",
|
||||
v16: "v16",
|
||||
v17: "v17",
|
||||
v18: "v18",
|
||||
v19: "v19",
|
||||
v20: "v20",
|
||||
v21: "v21",
|
||||
v22: "v22",
|
||||
v23: "v23",
|
||||
v24: "v24",
|
||||
v25: "v25",
|
||||
v26: "v26",
|
||||
v27: "v27",
|
||||
v28: "v28",
|
||||
v29: "v29",
|
||||
v30: "v30",
|
||||
v31: "v31",
|
||||
}
|
||||
|
||||
func formatVRegSized(r regalloc.VReg, size byte) (ret string) {
|
||||
if r.IsRealReg() {
|
||||
ret = regNames[r.RealReg()]
|
||||
switch ret[0] {
|
||||
case 'x':
|
||||
switch size {
|
||||
case 32:
|
||||
ret = strings.Replace(ret, "x", "w", 1)
|
||||
case 64:
|
||||
default:
|
||||
panic("BUG: invalid register size: " + strconv.Itoa(int(size)))
|
||||
}
|
||||
case 'v':
|
||||
switch size {
|
||||
case 32:
|
||||
ret = strings.Replace(ret, "v", "s", 1)
|
||||
case 64:
|
||||
ret = strings.Replace(ret, "v", "d", 1)
|
||||
case 128:
|
||||
ret = strings.Replace(ret, "v", "q", 1)
|
||||
default:
|
||||
panic("BUG: invalid register size")
|
||||
}
|
||||
}
|
||||
} else {
|
||||
switch r.RegType() {
|
||||
case regalloc.RegTypeInt:
|
||||
switch size {
|
||||
case 32:
|
||||
ret = fmt.Sprintf("w%d?", r.ID())
|
||||
case 64:
|
||||
ret = fmt.Sprintf("x%d?", r.ID())
|
||||
default:
|
||||
panic("BUG: invalid register size: " + strconv.Itoa(int(size)))
|
||||
}
|
||||
case regalloc.RegTypeFloat:
|
||||
switch size {
|
||||
case 32:
|
||||
ret = fmt.Sprintf("s%d?", r.ID())
|
||||
case 64:
|
||||
ret = fmt.Sprintf("d%d?", r.ID())
|
||||
case 128:
|
||||
ret = fmt.Sprintf("q%d?", r.ID())
|
||||
default:
|
||||
panic("BUG: invalid register size")
|
||||
}
|
||||
default:
|
||||
panic(fmt.Sprintf("BUG: invalid register type: %d for %s", r.RegType(), r))
|
||||
}
|
||||
}
|
||||
return
|
||||
}
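// exampleFormatVRegSized is an illustrative sketch, not part of the vendored
// wazero source: the same real register renders under different names
// depending on the access size passed in.
func exampleFormatVRegSized() []string {
	return []string{
		formatVRegSized(x0VReg, 32),  // "w0"
		formatVRegSized(x0VReg, 64),  // "x0"
		formatVRegSized(v0VReg, 64),  // "d0"
		formatVRegSized(v0VReg, 128), // "q0"
	}
}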
|
||||
|
||||
func formatVRegWidthVec(r regalloc.VReg, width vecArrangement) (ret string) {
|
||||
var id string
|
||||
wspec := strings.ToLower(width.String())
|
||||
if r.IsRealReg() {
|
||||
id = regNames[r.RealReg()][1:]
|
||||
} else {
|
||||
id = fmt.Sprintf("%d?", r.ID())
|
||||
}
|
||||
ret = fmt.Sprintf("%s%s", wspec, id)
|
||||
return
|
||||
}
|
||||
|
||||
func formatVRegVec(r regalloc.VReg, arr vecArrangement, index vecIndex) (ret string) {
|
||||
id := fmt.Sprintf("v%d?", r.ID())
|
||||
if r.IsRealReg() {
|
||||
id = regNames[r.RealReg()]
|
||||
}
|
||||
ret = fmt.Sprintf("%s.%s", id, strings.ToLower(arr.String()))
|
||||
if index != vecIndexNone {
|
||||
ret += fmt.Sprintf("[%d]", index)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func regTypeToRegisterSizeInBits(r regalloc.RegType) byte {
|
||||
switch r {
|
||||
case regalloc.RegTypeInt:
|
||||
return 64
|
||||
case regalloc.RegTypeFloat:
|
||||
return 128
|
||||
default:
|
||||
panic("BUG: invalid register type")
|
||||
}
|
||||
}
|
||||
|
||||
var regNumberInEncoding = [...]uint32{
|
||||
x0: 0,
|
||||
x1: 1,
|
||||
x2: 2,
|
||||
x3: 3,
|
||||
x4: 4,
|
||||
x5: 5,
|
||||
x6: 6,
|
||||
x7: 7,
|
||||
x8: 8,
|
||||
x9: 9,
|
||||
x10: 10,
|
||||
x11: 11,
|
||||
x12: 12,
|
||||
x13: 13,
|
||||
x14: 14,
|
||||
x15: 15,
|
||||
x16: 16,
|
||||
x17: 17,
|
||||
x18: 18,
|
||||
x19: 19,
|
||||
x20: 20,
|
||||
x21: 21,
|
||||
x22: 22,
|
||||
x23: 23,
|
||||
x24: 24,
|
||||
x25: 25,
|
||||
x26: 26,
|
||||
x27: 27,
|
||||
x28: 28,
|
||||
x29: 29,
|
||||
x30: 30,
|
||||
xzr: 31,
|
||||
sp: 31,
|
||||
v0: 0,
|
||||
v1: 1,
|
||||
v2: 2,
|
||||
v3: 3,
|
||||
v4: 4,
|
||||
v5: 5,
|
||||
v6: 6,
|
||||
v7: 7,
|
||||
v8: 8,
|
||||
v9: 9,
|
||||
v10: 10,
|
||||
v11: 11,
|
||||
v12: 12,
|
||||
v13: 13,
|
||||
v14: 14,
|
||||
v15: 15,
|
||||
v16: 16,
|
||||
v17: 17,
|
||||
v18: 18,
|
||||
v19: 19,
|
||||
v20: 20,
|
||||
v21: 21,
|
||||
v22: 22,
|
||||
v23: 23,
|
||||
v24: 24,
|
||||
v25: 25,
|
||||
v26: 26,
|
||||
v27: 27,
|
||||
v28: 28,
|
||||
v29: 29,
|
||||
v30: 30,
|
||||
v31: 31,
|
||||
}
|
||||
90
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/unwind_stack.go
generated
vendored
Normal file
@ -0,0 +1,90 @@
package arm64
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"reflect"
|
||||
"unsafe"
|
||||
|
||||
"github.com/tetratelabs/wazero/internal/wasmdebug"
|
||||
)
|
||||
|
||||
// UnwindStack implements wazevo.unwindStack.
|
||||
func UnwindStack(sp, _, top uintptr, returnAddresses []uintptr) []uintptr {
|
||||
l := int(top - sp)
|
||||
|
||||
var stackBuf []byte
|
||||
{
|
||||
// TODO: use unsafe.Slice after floor version is set to Go 1.20.
|
||||
hdr := (*reflect.SliceHeader)(unsafe.Pointer(&stackBuf))
|
||||
hdr.Data = sp
|
||||
hdr.Len = l
|
||||
hdr.Cap = l
|
||||
}
|
||||
|
||||
for i := uint64(0); i < uint64(l); {
|
||||
// (high address)
|
||||
// +-----------------+
|
||||
// | ....... |
|
||||
// | ret Y | <----+
|
||||
// | ....... | |
|
||||
// | ret 0 | |
|
||||
// | arg X | | size_of_arg_ret
|
||||
// | ....... | |
|
||||
// | arg 1 | |
|
||||
// | arg 0 | <----+
|
||||
// | size_of_arg_ret |
|
||||
// | ReturnAddress |
|
||||
// +-----------------+ <----+
|
||||
// | ........... | |
|
||||
// | spill slot M | |
|
||||
// | ............ | |
|
||||
// | spill slot 2 | |
|
||||
// | spill slot 1 | | frame size
|
||||
// | spill slot 0 | |
|
||||
// | clobbered N | |
|
||||
// | ............ | |
|
||||
// | clobbered 0 | <----+
|
||||
// | xxxxxx | ;; unused space to make it 16-byte aligned.
|
||||
// | frame_size |
|
||||
// +-----------------+ <---- SP
|
||||
// (low address)
|
||||
|
||||
frameSize := binary.LittleEndian.Uint64(stackBuf[i:])
|
||||
i += frameSize +
|
||||
16 // frame size + aligned space.
|
||||
retAddr := binary.LittleEndian.Uint64(stackBuf[i:])
|
||||
i += 8 // ret addr.
|
||||
sizeOfArgRet := binary.LittleEndian.Uint64(stackBuf[i:])
|
||||
i += 8 + sizeOfArgRet
|
||||
returnAddresses = append(returnAddresses, uintptr(retAddr))
|
||||
if len(returnAddresses) == wasmdebug.MaxFrames {
|
||||
break
|
||||
}
|
||||
}
|
||||
return returnAddresses
|
||||
}
|
||||
|
||||
// GoCallStackView implements wazevo.goCallStackView.
|
||||
func GoCallStackView(stackPointerBeforeGoCall *uint64) []uint64 {
|
||||
// (high address)
|
||||
// +-----------------+ <----+
|
||||
// | xxxxxxxxxxx | | ;; optional unused space to make it 16-byte aligned.
|
||||
// ^ | arg[N]/ret[M] | |
|
||||
// sliceSize | | ............ | | sliceSize
|
||||
// | | arg[1]/ret[1] | |
|
||||
// v | arg[0]/ret[0] | <----+
|
||||
// | sliceSize |
|
||||
// | frame_size |
|
||||
// +-----------------+ <---- stackPointerBeforeGoCall
|
||||
// (low address)
|
||||
ptr := unsafe.Pointer(stackPointerBeforeGoCall)
|
||||
size := *(*uint64)(unsafe.Add(ptr, 8))
|
||||
var view []uint64
|
||||
{
|
||||
sh := (*reflect.SliceHeader)(unsafe.Pointer(&view))
|
||||
sh.Data = uintptr(unsafe.Add(ptr, 16)) // skips the (frame_size, sliceSize).
|
||||
sh.Len = int(size)
|
||||
sh.Cap = int(size)
|
||||
}
|
||||
return view
|
||||
}
|
||||
100
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/machine.go
generated
vendored
Normal file
@ -0,0 +1,100 @@
package backend
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
|
||||
)
|
||||
|
||||
type (
|
||||
// Machine is a backend for a specific ISA machine.
|
||||
Machine interface {
|
||||
ExecutableContext() ExecutableContext
|
||||
|
||||
// DisableStackCheck disables the stack check for the current compilation for debugging/testing.
|
||||
DisableStackCheck()
|
||||
|
||||
// SetCurrentABI initializes the FunctionABI for the given signature.
|
||||
SetCurrentABI(abi *FunctionABI)
|
||||
|
||||
// SetCompiler sets the compilation context used for the lifetime of Machine.
|
||||
// This is only called once per Machine, i.e. before the first compilation.
|
||||
SetCompiler(Compiler)
|
||||
|
||||
// LowerSingleBranch is called when the compilation of the given single branch is started.
|
||||
LowerSingleBranch(b *ssa.Instruction)
|
||||
|
||||
// LowerConditionalBranch is called when the compilation of the given conditional branch is started.
|
||||
LowerConditionalBranch(b *ssa.Instruction)
|
||||
|
||||
// LowerInstr is called for each instruction in the given block except for the ones marked as already lowered
|
||||
// via Compiler.MarkLowered. The order is reverse, i.e. from the last instruction to the first one.
|
||||
//
|
||||
// Note: this can lower multiple instructions (which produce the inputs) at once whenever it's possible
|
||||
// for optimization.
|
||||
LowerInstr(*ssa.Instruction)
|
||||
|
||||
// Reset resets the machine state for the next compilation.
|
||||
Reset()
|
||||
|
||||
// InsertMove inserts a move instruction from src to dst whose type is typ.
|
||||
InsertMove(dst, src regalloc.VReg, typ ssa.Type)
|
||||
|
||||
// InsertReturn inserts the return instruction to return from the current function.
|
||||
InsertReturn()
|
||||
|
||||
// InsertLoadConstantBlockArg inserts the instruction(s) to load the constant value into the given regalloc.VReg.
|
||||
InsertLoadConstantBlockArg(instr *ssa.Instruction, vr regalloc.VReg)
|
||||
|
||||
// Format returns the string representation of the currently compiled machine code.
|
||||
// This is only for testing purpose.
|
||||
Format() string
|
||||
|
||||
// RegAlloc does the register allocation after lowering.
|
||||
RegAlloc()
|
||||
|
||||
// PostRegAlloc does the post register allocation, e.g. setting up prologue/epilogue, redundant move elimination, etc.
|
||||
PostRegAlloc()
|
||||
|
||||
// ResolveRelocations resolves the relocations after emitting machine code.
|
||||
// * refToBinaryOffset: the map from the function reference (ssa.FuncRef) to the executable offset.
|
||||
// * executable: the binary to resolve the relocations.
|
||||
// * relocations: the relocations to resolve.
|
||||
// * callTrampolineIslandOffsets: the offsets of the trampoline islands in the executable.
|
||||
ResolveRelocations(
|
||||
refToBinaryOffset []int,
|
||||
executable []byte,
|
||||
relocations []RelocationInfo,
|
||||
callTrampolineIslandOffsets []int,
|
||||
)
|
||||
|
||||
// Encode encodes the machine instructions to the Compiler.
|
||||
Encode(ctx context.Context) error
|
||||
|
||||
// CompileGoFunctionTrampoline compiles the trampoline function to call a Go function of the given exit code and signature.
|
||||
CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *ssa.Signature, needModuleContextPtr bool) []byte
|
||||
|
||||
// CompileStackGrowCallSequence returns the sequence of instructions shared by all functions to
|
||||
// call the stack grow builtin function.
|
||||
CompileStackGrowCallSequence() []byte
|
||||
|
||||
// CompileEntryPreamble returns the sequence of instructions shared by multiple functions to
|
||||
// enter the function from Go.
|
||||
CompileEntryPreamble(signature *ssa.Signature) []byte
|
||||
|
||||
// LowerParams lowers the given parameters.
|
||||
LowerParams(params []ssa.Value)
|
||||
|
||||
// LowerReturns lowers the given returns.
|
||||
LowerReturns(returns []ssa.Value)
|
||||
|
||||
// ArgsResultsRegs returns the registers used for arguments and return values.
|
||||
ArgsResultsRegs() (argResultInts, argResultFloats []regalloc.RealReg)
|
||||
|
||||
// CallTrampolineIslandInfo returns the interval of the offset where the trampoline island is placed, and
|
||||
// the size of the trampoline island. If islandSize is zero, the trampoline island is not used on this machine.
|
||||
CallTrampolineIslandInfo(numFunctions int) (interval, islandSize int, err error)
|
||||
}
|
||||
)
|
||||
319
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc.go
generated
vendored
Normal file
@ -0,0 +1,319 @@
package backend

import (
	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
	"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)

// RegAllocFunctionMachine is the interface for the machine specific logic that will be used in RegAllocFunction.
type RegAllocFunctionMachine[I regalloc.InstrConstraint] interface {
	// InsertMoveBefore inserts the move instruction from src to dst before the given instruction.
	InsertMoveBefore(dst, src regalloc.VReg, instr I)
	// InsertStoreRegisterAt inserts the instruction(s) to store the given virtual register at the given instruction.
	// If after is true, the instruction(s) will be inserted after the given instruction, otherwise before.
	InsertStoreRegisterAt(v regalloc.VReg, instr I, after bool) I
	// InsertReloadRegisterAt inserts the instruction(s) to reload the given virtual register at the given instruction.
	// If after is true, the instruction(s) will be inserted after the given instruction, otherwise before.
	InsertReloadRegisterAt(v regalloc.VReg, instr I, after bool) I
	// ClobberedRegisters is called when the register allocation is done and the clobbered registers are known.
	ClobberedRegisters(regs []regalloc.VReg)
	// Swap swaps the two virtual registers after the given instruction.
	Swap(cur I, x1, x2, tmp regalloc.VReg)
	// LastInstrForInsertion implements LastInstrForInsertion of regalloc.Function. See its comment for details.
	LastInstrForInsertion(begin, end I) I
	// SSABlockLabel returns the label of the given ssa.BasicBlockID.
	SSABlockLabel(id ssa.BasicBlockID) Label
}

type (
	// RegAllocFunction implements regalloc.Function.
	RegAllocFunction[I regalloc.InstrConstraint, m RegAllocFunctionMachine[I]] struct {
		m   m
		ssb ssa.Builder
		c   Compiler
		// iter is the iterator for reversePostOrderBlocks.
		iter                   int
		reversePostOrderBlocks []RegAllocBlock[I, m]
		// labelToRegAllocBlockIndex maps label to the index of reversePostOrderBlocks.
		labelToRegAllocBlockIndex map[Label]int
		loopNestingForestRoots    []ssa.BasicBlock
	}

	// RegAllocBlock implements regalloc.Block.
	RegAllocBlock[I regalloc.InstrConstraint, m RegAllocFunctionMachine[I]] struct {
		// f is the function this block belongs to. Used to reuse the regAllocFunctionImpl.predsSlice slice for Defs() and Uses().
		f                           *RegAllocFunction[I, m]
		sb                          ssa.BasicBlock
		l                           Label
		begin, end                  I
		loopNestingForestChildren   []ssa.BasicBlock
		cur                         I
		id                          int
		cachedLastInstrForInsertion I
	}
)

// NewRegAllocFunction returns a new RegAllocFunction.
func NewRegAllocFunction[I regalloc.InstrConstraint, M RegAllocFunctionMachine[I]](m M, ssb ssa.Builder, c Compiler) *RegAllocFunction[I, M] {
	return &RegAllocFunction[I, M]{
		m:                         m,
		ssb:                       ssb,
		c:                         c,
		labelToRegAllocBlockIndex: make(map[Label]int),
	}
}

// AddBlock adds a new block to the function.
func (f *RegAllocFunction[I, M]) AddBlock(sb ssa.BasicBlock, l Label, begin, end I) {
	i := len(f.reversePostOrderBlocks)
	f.reversePostOrderBlocks = append(f.reversePostOrderBlocks, RegAllocBlock[I, M]{
		f:     f,
		sb:    sb,
		l:     l,
		begin: begin,
		end:   end,
		id:    int(sb.ID()),
	})
	f.labelToRegAllocBlockIndex[l] = i
}

// Reset resets the function for the next compilation.
func (f *RegAllocFunction[I, M]) Reset() {
	f.reversePostOrderBlocks = f.reversePostOrderBlocks[:0]
	f.iter = 0
}

// StoreRegisterAfter implements regalloc.Function StoreRegisterAfter.
func (f *RegAllocFunction[I, M]) StoreRegisterAfter(v regalloc.VReg, instr regalloc.Instr) {
	m := f.m
	m.InsertStoreRegisterAt(v, instr.(I), true)
}

// ReloadRegisterBefore implements regalloc.Function ReloadRegisterBefore.
func (f *RegAllocFunction[I, M]) ReloadRegisterBefore(v regalloc.VReg, instr regalloc.Instr) {
	m := f.m
	m.InsertReloadRegisterAt(v, instr.(I), false)
}

// ReloadRegisterAfter implements regalloc.Function ReloadRegisterAfter.
func (f *RegAllocFunction[I, M]) ReloadRegisterAfter(v regalloc.VReg, instr regalloc.Instr) {
	m := f.m
	m.InsertReloadRegisterAt(v, instr.(I), true)
}

// StoreRegisterBefore implements regalloc.Function StoreRegisterBefore.
func (f *RegAllocFunction[I, M]) StoreRegisterBefore(v regalloc.VReg, instr regalloc.Instr) {
	m := f.m
	m.InsertStoreRegisterAt(v, instr.(I), false)
}

// ClobberedRegisters implements regalloc.Function ClobberedRegisters.
func (f *RegAllocFunction[I, M]) ClobberedRegisters(regs []regalloc.VReg) {
	f.m.ClobberedRegisters(regs)
}

// SwapBefore implements regalloc.Function SwapBefore.
func (f *RegAllocFunction[I, M]) SwapBefore(x1, x2, tmp regalloc.VReg, instr regalloc.Instr) {
	f.m.Swap(instr.Prev().(I), x1, x2, tmp)
}

// PostOrderBlockIteratorBegin implements regalloc.Function PostOrderBlockIteratorBegin.
func (f *RegAllocFunction[I, M]) PostOrderBlockIteratorBegin() regalloc.Block {
	f.iter = len(f.reversePostOrderBlocks) - 1
	return f.PostOrderBlockIteratorNext()
}

// PostOrderBlockIteratorNext implements regalloc.Function PostOrderBlockIteratorNext.
func (f *RegAllocFunction[I, M]) PostOrderBlockIteratorNext() regalloc.Block {
	if f.iter < 0 {
		return nil
	}
	b := &f.reversePostOrderBlocks[f.iter]
	f.iter--
	return b
}

// ReversePostOrderBlockIteratorBegin implements regalloc.Function ReversePostOrderBlockIteratorBegin.
func (f *RegAllocFunction[I, M]) ReversePostOrderBlockIteratorBegin() regalloc.Block {
	f.iter = 0
	return f.ReversePostOrderBlockIteratorNext()
}

// ReversePostOrderBlockIteratorNext implements regalloc.Function ReversePostOrderBlockIteratorNext.
func (f *RegAllocFunction[I, M]) ReversePostOrderBlockIteratorNext() regalloc.Block {
	if f.iter >= len(f.reversePostOrderBlocks) {
		return nil
	}
	b := &f.reversePostOrderBlocks[f.iter]
	f.iter++
	return b
}

// LoopNestingForestRoots implements regalloc.Function LoopNestingForestRoots.
func (f *RegAllocFunction[I, M]) LoopNestingForestRoots() int {
	f.loopNestingForestRoots = f.ssb.LoopNestingForestRoots()
	return len(f.loopNestingForestRoots)
}

// LoopNestingForestRoot implements regalloc.Function LoopNestingForestRoot.
func (f *RegAllocFunction[I, M]) LoopNestingForestRoot(i int) regalloc.Block {
	blk := f.loopNestingForestRoots[i]
	l := f.m.SSABlockLabel(blk.ID())
	index := f.labelToRegAllocBlockIndex[l]
	return &f.reversePostOrderBlocks[index]
}

// InsertMoveBefore implements regalloc.Function InsertMoveBefore.
func (f *RegAllocFunction[I, M]) InsertMoveBefore(dst, src regalloc.VReg, instr regalloc.Instr) {
	f.m.InsertMoveBefore(dst, src, instr.(I))
}

// LowestCommonAncestor implements regalloc.Function LowestCommonAncestor.
func (f *RegAllocFunction[I, M]) LowestCommonAncestor(blk1, blk2 regalloc.Block) regalloc.Block {
	ret := f.ssb.LowestCommonAncestor(blk1.(*RegAllocBlock[I, M]).sb, blk2.(*RegAllocBlock[I, M]).sb)
	l := f.m.SSABlockLabel(ret.ID())
	index := f.labelToRegAllocBlockIndex[l]
	return &f.reversePostOrderBlocks[index]
}

// Idom implements regalloc.Function Idom.
func (f *RegAllocFunction[I, M]) Idom(blk regalloc.Block) regalloc.Block {
	builder := f.ssb
	idom := builder.Idom(blk.(*RegAllocBlock[I, M]).sb)
	if idom == nil {
		panic("BUG: idom must not be nil")
	}
	l := f.m.SSABlockLabel(idom.ID())
	index := f.labelToRegAllocBlockIndex[l]
	return &f.reversePostOrderBlocks[index]
}

// ID implements regalloc.Block.
func (r *RegAllocBlock[I, m]) ID() int32 { return int32(r.id) }

// BlockParams implements regalloc.Block.
func (r *RegAllocBlock[I, m]) BlockParams(regs *[]regalloc.VReg) []regalloc.VReg {
	c := r.f.c
	*regs = (*regs)[:0]
	for i := 0; i < r.sb.Params(); i++ {
		v := c.VRegOf(r.sb.Param(i))
		*regs = append(*regs, v)
	}
	return *regs
}

// InstrIteratorBegin implements regalloc.Block.
func (r *RegAllocBlock[I, m]) InstrIteratorBegin() regalloc.Instr {
	r.cur = r.begin
	return r.cur
}

// InstrIteratorNext implements regalloc.Block.
func (r *RegAllocBlock[I, m]) InstrIteratorNext() regalloc.Instr {
	for {
		if r.cur == r.end {
			return nil
		}
		instr := r.cur.Next()
		r.cur = instr.(I)
		if instr == nil {
			return nil
		} else if instr.AddedBeforeRegAlloc() {
			// Only concerned about the instruction added before regalloc.
			return instr
		}
	}
}

// InstrRevIteratorBegin implements regalloc.Block.
func (r *RegAllocBlock[I, m]) InstrRevIteratorBegin() regalloc.Instr {
	r.cur = r.end
	return r.cur
}

// InstrRevIteratorNext implements regalloc.Block.
func (r *RegAllocBlock[I, m]) InstrRevIteratorNext() regalloc.Instr {
	for {
		if r.cur == r.begin {
			return nil
		}
		instr := r.cur.Prev()
		r.cur = instr.(I)
		if instr == nil {
			return nil
		} else if instr.AddedBeforeRegAlloc() {
			// Only concerned about the instruction added before regalloc.
			return instr
		}
	}
}

// FirstInstr implements regalloc.Block.
func (r *RegAllocBlock[I, m]) FirstInstr() regalloc.Instr {
	return r.begin
}

// EndInstr implements regalloc.Block.
func (r *RegAllocBlock[I, m]) EndInstr() regalloc.Instr {
	return r.end
}

// LastInstrForInsertion implements regalloc.Block.
func (r *RegAllocBlock[I, m]) LastInstrForInsertion() regalloc.Instr {
	var nil I
	if r.cachedLastInstrForInsertion == nil {
		r.cachedLastInstrForInsertion = r.f.m.LastInstrForInsertion(r.begin, r.end)
	}
	return r.cachedLastInstrForInsertion
}

// Preds implements regalloc.Block.
func (r *RegAllocBlock[I, m]) Preds() int { return r.sb.Preds() }

// Pred implements regalloc.Block.
func (r *RegAllocBlock[I, m]) Pred(i int) regalloc.Block {
	sb := r.sb
	pred := sb.Pred(i)
	l := r.f.m.SSABlockLabel(pred.ID())
	index := r.f.labelToRegAllocBlockIndex[l]
	return &r.f.reversePostOrderBlocks[index]
}

// Entry implements regalloc.Block.
func (r *RegAllocBlock[I, m]) Entry() bool { return r.sb.EntryBlock() }

// Succs implements regalloc.Block.
func (r *RegAllocBlock[I, m]) Succs() int {
	return r.sb.Succs()
}

// Succ implements regalloc.Block.
func (r *RegAllocBlock[I, m]) Succ(i int) regalloc.Block {
	sb := r.sb
	succ := sb.Succ(i)
	if succ.ReturnBlock() {
		return nil
	}
	l := r.f.m.SSABlockLabel(succ.ID())
	index := r.f.labelToRegAllocBlockIndex[l]
	return &r.f.reversePostOrderBlocks[index]
}

// LoopHeader implements regalloc.Block.
func (r *RegAllocBlock[I, m]) LoopHeader() bool {
	return r.sb.LoopHeader()
}

// LoopNestingForestChildren implements regalloc.Block.
func (r *RegAllocBlock[I, m]) LoopNestingForestChildren() int {
	r.loopNestingForestChildren = r.sb.LoopNestingForestChildren()
	return len(r.loopNestingForestChildren)
}

// LoopNestingForestChild implements regalloc.Block.
func (r *RegAllocBlock[I, m]) LoopNestingForestChild(i int) regalloc.Block {
	blk := r.loopNestingForestChildren[i]
	l := r.f.m.SSABlockLabel(blk.ID())
	index := r.f.labelToRegAllocBlockIndex[l]
	return &r.f.reversePostOrderBlocks[index]
}
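One subtlety worth calling out in LastInstrForInsertion above is the `var nil I` line: because I is only known to be comparable (via regalloc.InstrConstraint), the predeclared nil cannot be used directly in a comparison against a value of type I, so the code shadows nil with I's zero value. The following standalone illustration of the same trick uses a made-up instr type, not the real backend types.

package main

import "fmt"

// instr is a stand-in for a backend instruction type; the real code uses
// machine-specific instruction pointers constrained by regalloc.InstrConstraint.
type instr struct{ name string }

// lastCached mimics the caching pattern in LastInstrForInsertion: I is only
// known to be comparable, so the predeclared nil cannot be compared against it
// directly. Shadowing nil with I's zero value makes `*cached == nil` a
// comparison between two I values, which the comparable constraint allows.
func lastCached[I comparable](cached *I, compute func() I) I {
	var nil I // zero value of I; for pointer instantiations this is the nil pointer.
	if *cached == nil {
		*cached = compute()
	}
	return *cached
}

func main() {
	var cache *instr
	got := lastCached(&cache, func() *instr { return &instr{name: "br"} })
	fmt.Println(got.name) // br, computed once and cached.
	_ = lastCached(&cache, func() *instr { panic("not called again") })
}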
136
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/api.go
generated
vendored
Normal file
@ -0,0 +1,136 @@
package regalloc

import "fmt"

// These interfaces are implemented by ISA-specific backends to abstract away the details, and allow the register
// allocators to work on any ISA.
//
// TODO: the interfaces are not stabilized yet, especially x64 will need some changes. E.g. x64 has an addressing mode
// where index can be in memory. That kind of info will be useful to reduce the register pressure, and should be leveraged
// by the register allocators, like https://docs.rs/regalloc2/latest/regalloc2/enum.OperandConstraint.html

type (
	// Function is the top-level interface to do register allocation, which corresponds to a CFG containing
	// Block(s).
	Function interface {
		// PostOrderBlockIteratorBegin returns the first block in the post-order traversal of the CFG.
		// In other words, the last blocks in the CFG will be returned first.
		PostOrderBlockIteratorBegin() Block
		// PostOrderBlockIteratorNext returns the next block in the post-order traversal of the CFG.
		PostOrderBlockIteratorNext() Block
		// ReversePostOrderBlockIteratorBegin returns the first block in the reverse post-order traversal of the CFG.
		// In other words, the first blocks in the CFG will be returned first.
		ReversePostOrderBlockIteratorBegin() Block
		// ReversePostOrderBlockIteratorNext returns the next block in the reverse post-order traversal of the CFG.
		ReversePostOrderBlockIteratorNext() Block
		// ClobberedRegisters conveys the registers clobbered by this function.
		ClobberedRegisters([]VReg)
		// LoopNestingForestRoots returns the number of roots of the loop nesting forest in a function.
		LoopNestingForestRoots() int
		// LoopNestingForestRoot returns the i-th root of the loop nesting forest in a function.
		LoopNestingForestRoot(i int) Block
		// LowestCommonAncestor returns the lowest common ancestor of two blocks in the dominator tree.
		LowestCommonAncestor(blk1, blk2 Block) Block
		// Idom returns the immediate dominator of the given block.
		Idom(blk Block) Block

		// The following methods are for rewriting the function.

		// SwapBefore swaps the two virtual registers right before the given instruction.
		SwapBefore(x1, x2, tmp VReg, instr Instr)
		// StoreRegisterBefore inserts store instruction(s) before the given instruction for the given virtual register.
		StoreRegisterBefore(v VReg, instr Instr)
		// StoreRegisterAfter inserts store instruction(s) after the given instruction for the given virtual register.
		StoreRegisterAfter(v VReg, instr Instr)
		// ReloadRegisterBefore inserts reload instruction(s) before the given instruction for the given virtual register.
		ReloadRegisterBefore(v VReg, instr Instr)
		// ReloadRegisterAfter inserts reload instruction(s) after the given instruction for the given virtual register.
		ReloadRegisterAfter(v VReg, instr Instr)
		// InsertMoveBefore inserts move instruction(s) before the given instruction for the given virtual registers.
		InsertMoveBefore(dst, src VReg, instr Instr)
	}

	// Block is a basic block in the CFG of a function, and it consists of multiple instructions, and predecessor Block(s).
	Block interface {
		// ID returns the unique identifier of this block which is ordered in the reverse post-order traversal of the CFG.
		ID() int32
		// BlockParams returns the virtual registers used as the parameters of this block.
		BlockParams(*[]VReg) []VReg
		// InstrIteratorBegin returns the first instruction in this block. Instructions added after lowering must be skipped.
		// Note: multiple Instr(s) will not be held at the same time, so it's safe to use the same impl for the return Instr.
		InstrIteratorBegin() Instr
		// InstrIteratorNext returns the next instruction in this block. Instructions added after lowering must be skipped.
		// Note: multiple Instr(s) will not be held at the same time, so it's safe to use the same impl for the return Instr.
		InstrIteratorNext() Instr
		// InstrRevIteratorBegin is the same as InstrIteratorBegin, but in the reverse order.
		InstrRevIteratorBegin() Instr
		// InstrRevIteratorNext is the same as InstrIteratorNext, but in the reverse order.
		InstrRevIteratorNext() Instr
		// FirstInstr returns the first instruction in this block; instructions will be inserted after it.
		FirstInstr() Instr
		// EndInstr returns the end instruction in this block.
		EndInstr() Instr
		// LastInstrForInsertion returns the last instruction in this block; instructions will be inserted before it.
		// Such insertions only happen when we need to insert spill/reload instructions to adjust the merge edges.
		// At the time of register allocation, all the critical edges are already split, so there is no need
		// to worry about the case where a branching instruction has multiple successors.
		// Therefore, usually, it is the nop instruction, but if the block ends with an unconditional branching, then it returns
		// the unconditional branch, not the nop. In other words, it is either a nop or an unconditional branch.
		LastInstrForInsertion() Instr
		// Preds returns the number of predecessors of this block in the CFG.
		Preds() int
		// Pred returns the i-th predecessor of this block in the CFG.
		Pred(i int) Block
		// Entry returns true if this block is the entry block.
		Entry() bool
		// Succs returns the number of successors of this block in the CFG.
		Succs() int
		// Succ returns the i-th successor of this block in the CFG.
		Succ(i int) Block
		// LoopHeader returns true if this block is a loop header.
		LoopHeader() bool
		// LoopNestingForestChildren returns the number of children of this block in the loop nesting forest.
		LoopNestingForestChildren() int
		// LoopNestingForestChild returns the i-th child of this block in the loop nesting forest.
		LoopNestingForestChild(i int) Block
	}

	// Instr is an instruction in a block, abstracting away the underlying ISA.
	Instr interface {
		fmt.Stringer
		// Next returns the next instruction in the same block.
		Next() Instr
		// Prev returns the previous instruction in the same block.
		Prev() Instr
		// Defs returns the virtual registers defined by this instruction.
		Defs(*[]VReg) []VReg
		// Uses returns the virtual registers used by this instruction.
		// Note: multiple returned []VReg will not be held at the same time, so it's safe to use the same slice for this.
		Uses(*[]VReg) []VReg
		// AssignUse assigns the RealReg-allocated virtual register used by this instruction at the given index.
		AssignUse(index int, v VReg)
		// AssignDef assigns a RealReg-allocated virtual register defined by this instruction.
		// This only accepts one register because we don't allocate registers for multi-def instructions (i.e. call instructions).
		AssignDef(VReg)
		// IsCopy returns true if this instruction is a move instruction between two registers.
		// If true, the instruction is of the form dst = src, and if the src and dst do not interfere with each other,
		// we could coalesce them, and hence the copy can be eliminated from the final code.
		IsCopy() bool
		// IsCall returns true if this instruction is a call instruction. The result is used to insert
		// caller saved register spills and restores.
		IsCall() bool
		// IsIndirectCall returns true if this instruction is an indirect call instruction which calls a function pointer.
		// The result is used to insert caller saved register spills and restores.
		IsIndirectCall() bool
		// IsReturn returns true if this instruction is a return instruction.
		IsReturn() bool
		// AddedBeforeRegAlloc returns true if this instruction was added before register allocation.
		AddedBeforeRegAlloc() bool
	}

	// InstrConstraint is an interface for arch-specific instruction constraints.
	InstrConstraint interface {
		comparable
		Instr
	}
)
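The Begin/Next iterator protocol above (return the next element, or nil once exhausted) is shared by the block and instruction iterators. The following self-contained sketch mirrors that protocol over a plain slice with an index cursor, the same shape RegAllocFunction uses for its block iterators; the block and blockIter types here are simplified stand-ins, not the real regalloc interfaces.

package main

import "fmt"

// block is a simplified stand-in for regalloc.Block.
type block struct{ id int32 }

// blockIter mimics the PostOrder iterator pair: Begin resets a cursor and
// returns the first element, Next advances and returns nil once the
// underlying slice (held in reverse post-order) is exhausted.
type blockIter struct {
	reversePostOrder []*block
	cursor           int
}

// postOrderBegin starts from the last block: post-order over a reverse
// post-order slice is just a backwards walk.
func (it *blockIter) postOrderBegin() *block {
	it.cursor = len(it.reversePostOrder) - 1
	return it.postOrderNext()
}

func (it *blockIter) postOrderNext() *block {
	if it.cursor < 0 {
		return nil
	}
	b := it.reversePostOrder[it.cursor]
	it.cursor--
	return b
}

func main() {
	it := &blockIter{reversePostOrder: []*block{{id: 0}, {id: 1}, {id: 2}}}
	for b := it.postOrderBegin(); b != nil; b = it.postOrderNext() {
		fmt.Println(b.id) // 2, 1, 0
	}
}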
123
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/reg.go
generated
vendored
Normal file
@ -0,0 +1,123 @@
package regalloc

import (
	"fmt"

	"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)

// VReg represents a register which is assigned to an SSA value. This is used to represent a register in the backend.
// A VReg may or may not be a physical register, and the physical register info can be obtained via RealReg.
type VReg uint64

// VRegID is the lower 32 bits of VReg, which is the pure identifier of VReg without RealReg info.
type VRegID uint32

// RealReg returns the RealReg of this VReg.
func (v VReg) RealReg() RealReg {
	return RealReg(v >> 32)
}

// IsRealReg returns true if this VReg is backed by a physical register.
func (v VReg) IsRealReg() bool {
	return v.RealReg() != RealRegInvalid
}

// FromRealReg returns a VReg from the given RealReg and RegType.
// This is used to represent a specific pre-colored register in the backend.
func FromRealReg(r RealReg, typ RegType) VReg {
	rid := VRegID(r)
	if rid > vRegIDReservedForRealNum {
		panic(fmt.Sprintf("invalid real reg %d", r))
	}
	return VReg(r).SetRealReg(r).SetRegType(typ)
}

// SetRealReg sets the RealReg of this VReg and returns the updated VReg.
func (v VReg) SetRealReg(r RealReg) VReg {
	return VReg(r)<<32 | (v & 0xff_00_ffffffff)
}

// RegType returns the RegType of this VReg.
func (v VReg) RegType() RegType {
	return RegType(v >> 40)
}

// SetRegType sets the RegType of this VReg and returns the updated VReg.
func (v VReg) SetRegType(t RegType) VReg {
	return VReg(t)<<40 | (v & 0x00_ff_ffffffff)
}

// ID returns the VRegID of this VReg.
func (v VReg) ID() VRegID {
	return VRegID(v & 0xffffffff)
}

// Valid returns true if this VReg is valid.
func (v VReg) Valid() bool {
	return v.ID() != vRegIDInvalid && v.RegType() != RegTypeInvalid
}

// RealReg represents a physical register.
type RealReg byte

const RealRegInvalid RealReg = 0

const (
	vRegIDInvalid            VRegID = 1 << 31
	VRegIDNonReservedBegin          = vRegIDReservedForRealNum
	vRegIDReservedForRealNum VRegID = 128
	VRegInvalid                     = VReg(vRegIDInvalid)
)

// String implements fmt.Stringer.
func (r RealReg) String() string {
	switch r {
	case RealRegInvalid:
		return "invalid"
	default:
		return fmt.Sprintf("r%d", r)
	}
}

// String implements fmt.Stringer.
func (v VReg) String() string {
	if v.IsRealReg() {
		return fmt.Sprintf("r%d", v.ID())
	}
	return fmt.Sprintf("v%d?", v.ID())
}

// RegType represents the type of a register.
type RegType byte

const (
	RegTypeInvalid RegType = iota
	RegTypeInt
	RegTypeFloat
	NumRegType
)

// String implements fmt.Stringer.
func (r RegType) String() string {
	switch r {
	case RegTypeInt:
		return "int"
	case RegTypeFloat:
		return "float"
	default:
		return "invalid"
	}
}

// RegTypeOf returns the RegType of the given ssa.Type.
func RegTypeOf(p ssa.Type) RegType {
	switch p {
	case ssa.TypeI32, ssa.TypeI64:
		return RegTypeInt
	case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
		return RegTypeFloat
	default:
		panic("invalid type")
	}
}
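The accessors above fix a concrete bit layout for VReg: bits 0-31 hold the VRegID, bits 32-39 the RealReg, and bits 40-47 the RegType. Below is a small standalone check of that packing, using the same shifts and masks as the methods above; the vreg type and the regTypeInt constant are local stand-ins copied here only for illustration.

package main

import "fmt"

// vreg mirrors regalloc.VReg's layout: | unused | RegType (8 bits) | RealReg (8 bits) | VRegID (32 bits) |.
type vreg uint64

func (v vreg) id() uint32             { return uint32(v & 0xffffffff) }
func (v vreg) realReg() byte          { return byte(v >> 32) }
func (v vreg) regType() byte          { return byte(v >> 40) }
func (v vreg) setRealReg(r byte) vreg { return vreg(r)<<32 | (v & 0xff_00_ffffffff) }
func (v vreg) setRegType(t byte) vreg { return vreg(t)<<40 | (v & 0x00_ff_ffffffff) }

func main() {
	const regTypeInt = 1 // mirrors RegTypeInt's iota position.
	v := vreg(42).setRealReg(3).setRegType(regTypeInt)
	fmt.Println(v.id(), v.realReg(), v.regType()) // 42 3 1
}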
1212
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go
generated
vendored
Normal file
File diff suppressed because it is too large
108
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go
generated
vendored
Normal file
@ -0,0 +1,108 @@
package regalloc

import (
	"fmt"
	"strings"
)

// NewRegSet returns a new RegSet with the given registers.
func NewRegSet(regs ...RealReg) RegSet {
	var ret RegSet
	for _, r := range regs {
		ret = ret.add(r)
	}
	return ret
}

// RegSet represents a set of registers.
type RegSet uint64

func (rs RegSet) format(info *RegisterInfo) string { //nolint:unused
	var ret []string
	for i := 0; i < 64; i++ {
		if rs&(1<<uint(i)) != 0 {
			ret = append(ret, info.RealRegName(RealReg(i)))
		}
	}
	return strings.Join(ret, ", ")
}

func (rs RegSet) has(r RealReg) bool {
	return rs&(1<<uint(r)) != 0
}

func (rs RegSet) add(r RealReg) RegSet {
	if r >= 64 {
		return rs
	}
	return rs | 1<<uint(r)
}

func (rs RegSet) Range(f func(allocatedRealReg RealReg)) {
	for i := 0; i < 64; i++ {
		if rs&(1<<uint(i)) != 0 {
			f(RealReg(i))
		}
	}
}

type regInUseSet struct {
	set RegSet
	vrs [64]VReg
}

func (rs *regInUseSet) reset() {
	rs.set = 0
	for i := range rs.vrs {
		rs.vrs[i] = VRegInvalid
	}
}

func (rs *regInUseSet) format(info *RegisterInfo) string { //nolint:unused
	var ret []string
	for i := 0; i < 64; i++ {
		if rs.set&(1<<uint(i)) != 0 {
			vr := rs.vrs[i]
			ret = append(ret, fmt.Sprintf("(%s->v%d)", info.RealRegName(RealReg(i)), vr.ID()))
		}
	}
	return strings.Join(ret, ", ")
}

func (rs *regInUseSet) has(r RealReg) bool {
	if r >= 64 {
		return false
	}
	return rs.set&(1<<uint(r)) != 0
}

func (rs *regInUseSet) get(r RealReg) VReg {
	if r >= 64 {
		return VRegInvalid
	}
	return rs.vrs[r]
}

func (rs *regInUseSet) remove(r RealReg) {
	if r >= 64 {
		return
	}
	rs.set &= ^(1 << uint(r))
	rs.vrs[r] = VRegInvalid
}

func (rs *regInUseSet) add(r RealReg, vr VReg) {
	if r >= 64 {
		return
	}
	rs.set |= 1 << uint(r)
	rs.vrs[r] = vr
}

func (rs *regInUseSet) range_(f func(allocatedRealReg RealReg, vr VReg)) {
	for i := 0; i < 64; i++ {
		if rs.set&(1<<uint(i)) != 0 {
			f(RealReg(i), rs.vrs[i])
		}
	}
}
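RegSet above is a plain 64-bit bitset keyed by RealReg, so membership, insertion, and iteration are single mask operations. The following minimal standalone mirror of add/has/Range (simplified, without the RegisterInfo-based formatting) shows the semantics; regSet and rangeOver are local stand-ins, not the vendored API.

package main

import "fmt"

// regSet mirrors regalloc.RegSet: bit i set means real register i is present.
type regSet uint64

func (rs regSet) add(r uint) regSet {
	if r >= 64 {
		return rs // out-of-range registers are silently ignored, as in the original.
	}
	return rs | 1<<r
}

func (rs regSet) has(r uint) bool { return r < 64 && rs&(1<<r) != 0 }

func (rs regSet) rangeOver(f func(r uint)) {
	for i := uint(0); i < 64; i++ {
		if rs&(1<<i) != 0 {
			f(i)
		}
	}
}

func main() {
	var rs regSet
	rs = rs.add(1).add(5).add(70)                 // 70 is ignored.
	fmt.Println(rs.has(5), rs.has(70))            // true false
	rs.rangeOver(func(r uint) { fmt.Println(r) }) // 1, 5
}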
43
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/vdef.go
generated
vendored
Normal file
@ -0,0 +1,43 @@
package backend

import (
	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
	"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)

// SSAValueDefinition represents a definition of an SSA value.
type SSAValueDefinition struct {
	// BlockParamValue is valid if Instr == nil
	BlockParamValue ssa.Value

	// BlkParamVReg is valid if Instr == nil
	BlkParamVReg regalloc.VReg

	// Instr is not nil if this is a definition from an instruction.
	Instr *ssa.Instruction
	// N is the index of the return value in the instr's return values list.
	N int
	// RefCount is the number of references to the result.
	RefCount int
}

func (d *SSAValueDefinition) IsFromInstr() bool {
	return d.Instr != nil
}

func (d *SSAValueDefinition) IsFromBlockParam() bool {
	return d.Instr == nil
}

func (d *SSAValueDefinition) SSAValue() ssa.Value {
	if d.IsFromBlockParam() {
		return d.BlockParamValue
	} else {
		r, rs := d.Instr.Returns()
		if d.N == 0 {
			return r
		} else {
			return rs[d.N-1]
		}
	}
}
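SSAValue above encodes a small convention: N == 0 selects the instruction's primary return value, while N >= 1 indexes into the secondary returns at rs[N-1]. A tiny stand-in showing that indexing; fakeInstr and returnN are hypothetical, not ssa.Instruction.

package main

import "fmt"

// fakeInstr is a stand-in for ssa.Instruction: one primary return plus extras.
type fakeInstr struct {
	primary string
	rest    []string
}

// returnN mirrors SSAValueDefinition.SSAValue's instruction branch.
func returnN(instr *fakeInstr, n int) string {
	if n == 0 {
		return instr.primary
	}
	return instr.rest[n-1]
}

func main() {
	instr := &fakeInstr{primary: "v10", rest: []string{"v11", "v12"}}
	fmt.Println(returnN(instr, 0), returnN(instr, 2)) // v10 v12
}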
722
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/call_engine.go
generated
vendored
Normal file
@ -0,0 +1,722 @@
package wazevo
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"reflect"
|
||||
"runtime"
|
||||
"sync/atomic"
|
||||
"unsafe"
|
||||
|
||||
"github.com/tetratelabs/wazero/api"
|
||||
"github.com/tetratelabs/wazero/experimental"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
|
||||
"github.com/tetratelabs/wazero/internal/expctxkeys"
|
||||
"github.com/tetratelabs/wazero/internal/internalapi"
|
||||
"github.com/tetratelabs/wazero/internal/wasm"
|
||||
"github.com/tetratelabs/wazero/internal/wasmdebug"
|
||||
"github.com/tetratelabs/wazero/internal/wasmruntime"
|
||||
)
|
||||
|
||||
type (
|
||||
// callEngine implements api.Function.
|
||||
callEngine struct {
|
||||
internalapi.WazeroOnly
|
||||
stack []byte
|
||||
// stackTop is the pointer to the *aligned* top of the stack. This must be updated
|
||||
// whenever the stack is changed. This is passed to the assembly function
|
||||
// at the very beginning of api.Function Call/CallWithStack.
|
||||
stackTop uintptr
|
||||
// executable is the pointer to the executable code for this function.
|
||||
executable *byte
|
||||
preambleExecutable *byte
|
||||
// parent is the *moduleEngine from which this callEngine is created.
|
||||
parent *moduleEngine
|
||||
// indexInModule is the index of the function in the module.
|
||||
indexInModule wasm.Index
|
||||
// sizeOfParamResultSlice is the size of the parameter/result slice.
|
||||
sizeOfParamResultSlice int
|
||||
requiredParams int
|
||||
// execCtx holds various information to be read/written by assembly functions.
|
||||
execCtx executionContext
|
||||
// execCtxPtr holds the pointer to the executionContext which doesn't change after callEngine is created.
|
||||
execCtxPtr uintptr
|
||||
numberOfResults int
|
||||
stackIteratorImpl stackIterator
|
||||
}
|
||||
|
||||
// executionContext is the struct to be read/written by assembly functions.
|
||||
executionContext struct {
|
||||
// exitCode holds the wazevoapi.ExitCode describing the state of the function execution.
|
||||
exitCode wazevoapi.ExitCode
|
||||
// callerModuleContextPtr holds the moduleContextOpaque for Go function calls.
|
||||
callerModuleContextPtr *byte
|
||||
// originalFramePointer holds the original frame pointer of the caller of the assembly function.
|
||||
originalFramePointer uintptr
|
||||
// originalStackPointer holds the original stack pointer of the caller of the assembly function.
|
||||
originalStackPointer uintptr
|
||||
// goReturnAddress holds the return address to go back to the caller of the assembly function.
|
||||
goReturnAddress uintptr
|
||||
// stackBottomPtr holds the pointer to the bottom of the stack.
|
||||
stackBottomPtr *byte
|
||||
// goCallReturnAddress holds the return address to go back to the caller of the Go function.
|
||||
goCallReturnAddress *byte
|
||||
// stackPointerBeforeGoCall holds the stack pointer before calling a Go function.
|
||||
stackPointerBeforeGoCall *uint64
|
||||
// stackGrowRequiredSize holds the required size of stack grow.
|
||||
stackGrowRequiredSize uintptr
|
||||
// memoryGrowTrampolineAddress holds the address of memory grow trampoline function.
|
||||
memoryGrowTrampolineAddress *byte
|
||||
// stackGrowCallTrampolineAddress holds the address of stack grow trampoline function.
|
||||
stackGrowCallTrampolineAddress *byte
|
||||
// checkModuleExitCodeTrampolineAddress holds the address of check-module-exit-code function.
|
||||
checkModuleExitCodeTrampolineAddress *byte
|
||||
// savedRegisters is the opaque spaces for save/restore registers.
|
||||
// We want to align 16 bytes for each register, so we use [64][2]uint64.
|
||||
savedRegisters [64][2]uint64
|
||||
// goFunctionCallCalleeModuleContextOpaque is the pointer to the target Go function's moduleContextOpaque.
|
||||
goFunctionCallCalleeModuleContextOpaque uintptr
|
||||
// tableGrowTrampolineAddress holds the address of table grow trampoline function.
|
||||
tableGrowTrampolineAddress *byte
|
||||
// refFuncTrampolineAddress holds the address of ref-func trampoline function.
|
||||
refFuncTrampolineAddress *byte
|
||||
// memmoveAddress holds the address of memmove function implemented by Go runtime. See memmove.go.
|
||||
memmoveAddress uintptr
|
||||
// framePointerBeforeGoCall holds the frame pointer before calling a Go function. Note: only used in amd64.
|
||||
framePointerBeforeGoCall uintptr
|
||||
// memoryWait32TrampolineAddress holds the address of memory_wait32 trampoline function.
|
||||
memoryWait32TrampolineAddress *byte
|
||||
// memoryWait64TrampolineAddress holds the address of memory_wait64 trampoline function.
|
||||
memoryWait64TrampolineAddress *byte
|
||||
// memoryNotifyTrampolineAddress holds the address of the memory_notify trampoline function.
|
||||
memoryNotifyTrampolineAddress *byte
|
||||
}
|
||||
)
|
||||
|
||||
func (c *callEngine) requiredInitialStackSize() int {
|
||||
const initialStackSizeDefault = 10240
|
||||
stackSize := initialStackSizeDefault
|
||||
paramResultInBytes := c.sizeOfParamResultSlice * 8 * 2 // * 8 because uint64 is 8 bytes, and *2 because we need both separated param/result slots.
|
||||
required := paramResultInBytes + 32 + 16 // 32 is enough to accommodate the call frame info, and 16 exists just in case when []byte is not aligned to 16 bytes.
|
||||
if required > stackSize {
|
||||
stackSize = required
|
||||
}
|
||||
return stackSize
|
||||
}
|
||||
|
||||
func (c *callEngine) init() {
|
||||
stackSize := c.requiredInitialStackSize()
|
||||
if wazevoapi.StackGuardCheckEnabled {
|
||||
stackSize += wazevoapi.StackGuardCheckGuardPageSize
|
||||
}
|
||||
c.stack = make([]byte, stackSize)
|
||||
c.stackTop = alignedStackTop(c.stack)
|
||||
if wazevoapi.StackGuardCheckEnabled {
|
||||
c.execCtx.stackBottomPtr = &c.stack[wazevoapi.StackGuardCheckGuardPageSize]
|
||||
} else {
|
||||
c.execCtx.stackBottomPtr = &c.stack[0]
|
||||
}
|
||||
c.execCtxPtr = uintptr(unsafe.Pointer(&c.execCtx))
|
||||
}
|
||||
|
||||
// alignedStackTop returns the 16-byte-aligned top of the given stack.
|
||||
// 16 bytes should be good for all platforms (arm64/amd64).
|
||||
func alignedStackTop(s []byte) uintptr {
|
||||
stackAddr := uintptr(unsafe.Pointer(&s[len(s)-1]))
|
||||
return stackAddr - (stackAddr & (16 - 1))
|
||||
}
|
||||
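alignedStackTop above rounds the address of the last byte of the Go-allocated stack down to a 16-byte boundary with addr - (addr & 15). The rounding can be checked in isolation; the snippet below uses plain integers (no unsafe), since only the arithmetic matters, and alignDown16 is a local stand-in.

package main

import "fmt"

// alignDown16 mirrors the rounding in alignedStackTop: clear the low 4 bits
// so the result is the greatest 16-byte-aligned address not above addr.
func alignDown16(addr uint64) uint64 {
	return addr - (addr & (16 - 1))
}

func main() {
	for _, addr := range []uint64{0x1000, 0x1001, 0x100f, 0x1010} {
		fmt.Printf("%#x -> %#x\n", addr, alignDown16(addr))
	}
	// 0x1000 -> 0x1000, 0x1001 -> 0x1000, 0x100f -> 0x1000, 0x1010 -> 0x1010
}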
|
||||
// Definition implements api.Function.
|
||||
func (c *callEngine) Definition() api.FunctionDefinition {
|
||||
return c.parent.module.Source.FunctionDefinition(c.indexInModule)
|
||||
}
|
||||
|
||||
// Call implements api.Function.
|
||||
func (c *callEngine) Call(ctx context.Context, params ...uint64) ([]uint64, error) {
|
||||
if c.requiredParams != len(params) {
|
||||
return nil, fmt.Errorf("expected %d params, but passed %d", c.requiredParams, len(params))
|
||||
}
|
||||
paramResultSlice := make([]uint64, c.sizeOfParamResultSlice)
|
||||
copy(paramResultSlice, params)
|
||||
if err := c.callWithStack(ctx, paramResultSlice); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return paramResultSlice[:c.numberOfResults], nil
|
||||
}
|
||||
|
||||
func (c *callEngine) addFrame(builder wasmdebug.ErrorBuilder, addr uintptr) (def api.FunctionDefinition, listener experimental.FunctionListener) {
|
||||
eng := c.parent.parent.parent
|
||||
cm := eng.compiledModuleOfAddr(addr)
|
||||
if cm == nil {
|
||||
// In this case, the module might have been closed and deleted from the engine.
|
||||
// We fall back to searching the imported modules that can be referenced from this callEngine.
|
||||
|
||||
// First, we check itself.
|
||||
if checkAddrInBytes(addr, c.parent.parent.executable) {
|
||||
cm = c.parent.parent
|
||||
} else {
|
||||
// Otherwise, search all imported modules. TODO: maybe recursive, but not sure it's useful in practice.
|
||||
p := c.parent
|
||||
for i := range p.importedFunctions {
|
||||
candidate := p.importedFunctions[i].me.parent
|
||||
if checkAddrInBytes(addr, candidate.executable) {
|
||||
cm = candidate
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if cm != nil {
|
||||
index := cm.functionIndexOf(addr)
|
||||
def = cm.module.FunctionDefinition(cm.module.ImportFunctionCount + index)
|
||||
var sources []string
|
||||
if dw := cm.module.DWARFLines; dw != nil {
|
||||
sourceOffset := cm.getSourceOffset(addr)
|
||||
sources = dw.Line(sourceOffset)
|
||||
}
|
||||
builder.AddFrame(def.DebugName(), def.ParamTypes(), def.ResultTypes(), sources)
|
||||
if len(cm.listeners) > 0 {
|
||||
listener = cm.listeners[index]
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// CallWithStack implements api.Function.
|
||||
func (c *callEngine) CallWithStack(ctx context.Context, paramResultStack []uint64) (err error) {
|
||||
if c.sizeOfParamResultSlice > len(paramResultStack) {
|
||||
return fmt.Errorf("need %d params, but stack size is %d", c.sizeOfParamResultSlice, len(paramResultStack))
|
||||
}
|
||||
return c.callWithStack(ctx, paramResultStack)
|
||||
}
|
||||
|
||||
// CallWithStack implements api.Function.
|
||||
func (c *callEngine) callWithStack(ctx context.Context, paramResultStack []uint64) (err error) {
|
||||
snapshotEnabled := ctx.Value(expctxkeys.EnableSnapshotterKey{}) != nil
|
||||
if snapshotEnabled {
|
||||
ctx = context.WithValue(ctx, expctxkeys.SnapshotterKey{}, c)
|
||||
}
|
||||
|
||||
if wazevoapi.StackGuardCheckEnabled {
|
||||
defer func() {
|
||||
wazevoapi.CheckStackGuardPage(c.stack)
|
||||
}()
|
||||
}
|
||||
|
||||
p := c.parent
|
||||
ensureTermination := p.parent.ensureTermination
|
||||
m := p.module
|
||||
if ensureTermination {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
// If the provided context is already done, close the module and return the error.
|
||||
m.CloseWithCtxErr(ctx)
|
||||
return m.FailIfClosed()
|
||||
default:
|
||||
}
|
||||
}
|
||||
|
||||
var paramResultPtr *uint64
|
||||
if len(paramResultStack) > 0 {
|
||||
paramResultPtr = ¶mResultStack[0]
|
||||
}
|
||||
defer func() {
|
||||
r := recover()
|
||||
if s, ok := r.(*snapshot); ok {
|
||||
// A snapshot that wasn't handled was created by a different call engine possibly from a nested wasm invocation,
|
||||
// let it propagate up to be handled by the caller.
|
||||
panic(s)
|
||||
}
|
||||
if r != nil {
|
||||
type listenerForAbort struct {
|
||||
def api.FunctionDefinition
|
||||
lsn experimental.FunctionListener
|
||||
}
|
||||
|
||||
var listeners []listenerForAbort
|
||||
builder := wasmdebug.NewErrorBuilder()
|
||||
def, lsn := c.addFrame(builder, uintptr(unsafe.Pointer(c.execCtx.goCallReturnAddress)))
|
||||
if lsn != nil {
|
||||
listeners = append(listeners, listenerForAbort{def, lsn})
|
||||
}
|
||||
returnAddrs := unwindStack(
|
||||
uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)),
|
||||
c.execCtx.framePointerBeforeGoCall,
|
||||
c.stackTop,
|
||||
nil,
|
||||
)
|
||||
for _, retAddr := range returnAddrs[:len(returnAddrs)-1] { // the last return addr is the trampoline, so we skip it.
|
||||
def, lsn = c.addFrame(builder, retAddr)
|
||||
if lsn != nil {
|
||||
listeners = append(listeners, listenerForAbort{def, lsn})
|
||||
}
|
||||
}
|
||||
err = builder.FromRecovered(r)
|
||||
|
||||
for _, lsn := range listeners {
|
||||
lsn.lsn.Abort(ctx, m, lsn.def, err)
|
||||
}
|
||||
} else {
|
||||
if err != wasmruntime.ErrRuntimeStackOverflow { // Stack overflow shouldn't be raised as a panic (to avoid extreme stack unwinding).
|
||||
err = c.parent.module.FailIfClosed()
|
||||
}
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
// Ensures that we can reuse this callEngine even after an error.
|
||||
c.execCtx.exitCode = wazevoapi.ExitCodeOK
|
||||
}
|
||||
}()
|
||||
|
||||
if ensureTermination {
|
||||
done := m.CloseModuleOnCanceledOrTimeout(ctx)
|
||||
defer done()
|
||||
}
|
||||
|
||||
if c.stackTop&(16-1) != 0 {
|
||||
panic("BUG: stack must be aligned to 16 bytes")
|
||||
}
|
||||
entrypoint(c.preambleExecutable, c.executable, c.execCtxPtr, c.parent.opaquePtr, paramResultPtr, c.stackTop)
|
||||
for {
|
||||
switch ec := c.execCtx.exitCode; ec & wazevoapi.ExitCodeMask {
|
||||
case wazevoapi.ExitCodeOK:
|
||||
return nil
|
||||
case wazevoapi.ExitCodeGrowStack:
|
||||
oldsp := uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall))
|
||||
oldTop := c.stackTop
|
||||
oldStack := c.stack
|
||||
var newsp, newfp uintptr
|
||||
if wazevoapi.StackGuardCheckEnabled {
|
||||
newsp, newfp, err = c.growStackWithGuarded()
|
||||
} else {
|
||||
newsp, newfp, err = c.growStack()
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
adjustClonedStack(oldsp, oldTop, newsp, newfp, c.stackTop)
|
||||
// Old stack must be alive until the new stack is adjusted.
|
||||
runtime.KeepAlive(oldStack)
|
||||
c.execCtx.exitCode = wazevoapi.ExitCodeOK
|
||||
afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, newsp, newfp)
|
||||
case wazevoapi.ExitCodeGrowMemory:
|
||||
mod := c.callerModuleInstance()
|
||||
mem := mod.MemoryInstance
|
||||
s := goCallStackView(c.execCtx.stackPointerBeforeGoCall)
|
||||
argRes := &s[0]
|
||||
if res, ok := mem.Grow(uint32(*argRes)); !ok {
|
||||
*argRes = uint64(0xffffffff) // = -1 in signed 32-bit integer.
|
||||
} else {
|
||||
*argRes = uint64(res)
|
||||
calleeOpaque := opaqueViewFromPtr(uintptr(unsafe.Pointer(c.execCtx.callerModuleContextPtr)))
|
||||
if mod.Source.MemorySection != nil { // Local memory.
|
||||
putLocalMemory(calleeOpaque, 8 /* local memory begins at 8 */, mem)
|
||||
} else {
|
||||
// Imported memory's owner at offset 16 of the callerModuleContextPtr.
|
||||
opaquePtr := uintptr(binary.LittleEndian.Uint64(calleeOpaque[16:]))
|
||||
importedMemOwner := opaqueViewFromPtr(opaquePtr)
|
||||
putLocalMemory(importedMemOwner, 8 /* local memory begins at 8 */, mem)
|
||||
}
|
||||
}
|
||||
c.execCtx.exitCode = wazevoapi.ExitCodeOK
|
||||
afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall)
|
||||
case wazevoapi.ExitCodeTableGrow:
|
||||
mod := c.callerModuleInstance()
|
||||
s := goCallStackView(c.execCtx.stackPointerBeforeGoCall)
|
||||
tableIndex, num, ref := uint32(s[0]), uint32(s[1]), uintptr(s[2])
|
||||
table := mod.Tables[tableIndex]
|
||||
s[0] = uint64(uint32(int32(table.Grow(num, ref))))
|
||||
c.execCtx.exitCode = wazevoapi.ExitCodeOK
|
||||
afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr,
|
||||
uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall)
|
||||
case wazevoapi.ExitCodeCallGoFunction:
|
||||
index := wazevoapi.GoFunctionIndexFromExitCode(ec)
|
||||
f := hostModuleGoFuncFromOpaque[api.GoFunction](index, c.execCtx.goFunctionCallCalleeModuleContextOpaque)
|
||||
func() {
|
||||
if snapshotEnabled {
|
||||
defer snapshotRecoverFn(c)
|
||||
}
|
||||
f.Call(ctx, goCallStackView(c.execCtx.stackPointerBeforeGoCall))
|
||||
}()
|
||||
// Back to the native code.
|
||||
c.execCtx.exitCode = wazevoapi.ExitCodeOK
|
||||
afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr,
|
||||
uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall)
|
||||
case wazevoapi.ExitCodeCallGoFunctionWithListener:
|
||||
index := wazevoapi.GoFunctionIndexFromExitCode(ec)
|
||||
f := hostModuleGoFuncFromOpaque[api.GoFunction](index, c.execCtx.goFunctionCallCalleeModuleContextOpaque)
|
||||
listeners := hostModuleListenersSliceFromOpaque(c.execCtx.goFunctionCallCalleeModuleContextOpaque)
|
||||
s := goCallStackView(c.execCtx.stackPointerBeforeGoCall)
|
||||
// Call Listener.Before.
|
||||
callerModule := c.callerModuleInstance()
|
||||
listener := listeners[index]
|
||||
hostModule := hostModuleFromOpaque(c.execCtx.goFunctionCallCalleeModuleContextOpaque)
|
||||
def := hostModule.FunctionDefinition(wasm.Index(index))
|
||||
listener.Before(ctx, callerModule, def, s, c.stackIterator(true))
|
||||
// Call into the Go function.
|
||||
func() {
|
||||
if snapshotEnabled {
|
||||
defer snapshotRecoverFn(c)
|
||||
}
|
||||
f.Call(ctx, s)
|
||||
}()
|
||||
// Call Listener.After.
|
||||
listener.After(ctx, callerModule, def, s)
|
||||
// Back to the native code.
|
||||
c.execCtx.exitCode = wazevoapi.ExitCodeOK
|
||||
afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr,
|
||||
uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall)
|
||||
case wazevoapi.ExitCodeCallGoModuleFunction:
|
||||
index := wazevoapi.GoFunctionIndexFromExitCode(ec)
|
||||
f := hostModuleGoFuncFromOpaque[api.GoModuleFunction](index, c.execCtx.goFunctionCallCalleeModuleContextOpaque)
|
||||
mod := c.callerModuleInstance()
|
||||
func() {
|
||||
if snapshotEnabled {
|
||||
defer snapshotRecoverFn(c)
|
||||
}
|
||||
f.Call(ctx, mod, goCallStackView(c.execCtx.stackPointerBeforeGoCall))
|
||||
}()
|
||||
// Back to the native code.
|
||||
c.execCtx.exitCode = wazevoapi.ExitCodeOK
|
||||
afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr,
|
||||
uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall)
|
||||
case wazevoapi.ExitCodeCallGoModuleFunctionWithListener:
|
||||
index := wazevoapi.GoFunctionIndexFromExitCode(ec)
|
||||
f := hostModuleGoFuncFromOpaque[api.GoModuleFunction](index, c.execCtx.goFunctionCallCalleeModuleContextOpaque)
|
||||
listeners := hostModuleListenersSliceFromOpaque(c.execCtx.goFunctionCallCalleeModuleContextOpaque)
|
||||
s := goCallStackView(c.execCtx.stackPointerBeforeGoCall)
|
||||
// Call Listener.Before.
|
||||
callerModule := c.callerModuleInstance()
|
||||
listener := listeners[index]
|
||||
hostModule := hostModuleFromOpaque(c.execCtx.goFunctionCallCalleeModuleContextOpaque)
|
||||
def := hostModule.FunctionDefinition(wasm.Index(index))
|
||||
listener.Before(ctx, callerModule, def, s, c.stackIterator(true))
|
||||
// Call into the Go function.
|
||||
func() {
|
||||
if snapshotEnabled {
|
||||
defer snapshotRecoverFn(c)
|
||||
}
|
||||
f.Call(ctx, callerModule, s)
|
||||
}()
|
||||
// Call Listener.After.
|
||||
listener.After(ctx, callerModule, def, s)
|
||||
// Back to the native code.
|
||||
c.execCtx.exitCode = wazevoapi.ExitCodeOK
|
||||
afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr,
|
||||
uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall)
|
||||
case wazevoapi.ExitCodeCallListenerBefore:
|
||||
stack := goCallStackView(c.execCtx.stackPointerBeforeGoCall)
|
||||
index := wasm.Index(stack[0])
|
||||
mod := c.callerModuleInstance()
|
||||
listener := mod.Engine.(*moduleEngine).listeners[index]
|
||||
def := mod.Source.FunctionDefinition(index + mod.Source.ImportFunctionCount)
|
||||
listener.Before(ctx, mod, def, stack[1:], c.stackIterator(false))
|
||||
c.execCtx.exitCode = wazevoapi.ExitCodeOK
|
||||
afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr,
|
||||
uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall)
|
||||
case wazevoapi.ExitCodeCallListenerAfter:
|
||||
stack := goCallStackView(c.execCtx.stackPointerBeforeGoCall)
|
||||
index := wasm.Index(stack[0])
|
||||
mod := c.callerModuleInstance()
|
||||
listener := mod.Engine.(*moduleEngine).listeners[index]
|
||||
def := mod.Source.FunctionDefinition(index + mod.Source.ImportFunctionCount)
|
||||
listener.After(ctx, mod, def, stack[1:])
|
||||
c.execCtx.exitCode = wazevoapi.ExitCodeOK
|
||||
afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr,
|
||||
uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall)
|
||||
case wazevoapi.ExitCodeCheckModuleExitCode:
|
||||
// Note: this operation must be done in Go, not native code. The reason is that
|
||||
// native code cannot be preempted and that means it can block forever if there are not
|
||||
// enough OS threads (which we don't have control over).
|
||||
if err := m.FailIfClosed(); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
c.execCtx.exitCode = wazevoapi.ExitCodeOK
|
||||
afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr,
|
||||
uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall)
|
||||
case wazevoapi.ExitCodeRefFunc:
|
||||
mod := c.callerModuleInstance()
|
||||
s := goCallStackView(c.execCtx.stackPointerBeforeGoCall)
|
||||
funcIndex := wasm.Index(s[0])
|
||||
ref := mod.Engine.FunctionInstanceReference(funcIndex)
|
||||
s[0] = uint64(ref)
|
||||
c.execCtx.exitCode = wazevoapi.ExitCodeOK
|
||||
afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr,
|
||||
uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall)
|
||||
case wazevoapi.ExitCodeMemoryWait32:
|
||||
mod := c.callerModuleInstance()
|
||||
mem := mod.MemoryInstance
|
||||
if !mem.Shared {
|
||||
panic(wasmruntime.ErrRuntimeExpectedSharedMemory)
|
||||
}
|
||||
|
||||
s := goCallStackView(c.execCtx.stackPointerBeforeGoCall)
|
||||
timeout, exp, addr := int64(s[0]), uint32(s[1]), uintptr(s[2])
|
||||
base := uintptr(unsafe.Pointer(&mem.Buffer[0]))
|
||||
|
||||
offset := uint32(addr - base)
|
||||
res := mem.Wait32(offset, exp, timeout, func(mem *wasm.MemoryInstance, offset uint32) uint32 {
|
||||
addr := unsafe.Add(unsafe.Pointer(&mem.Buffer[0]), offset)
|
||||
return atomic.LoadUint32((*uint32)(addr))
|
||||
})
|
||||
s[0] = res
|
||||
c.execCtx.exitCode = wazevoapi.ExitCodeOK
|
||||
afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr,
|
||||
uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall)
|
||||
case wazevoapi.ExitCodeMemoryWait64:
|
||||
mod := c.callerModuleInstance()
|
||||
mem := mod.MemoryInstance
|
||||
if !mem.Shared {
|
||||
panic(wasmruntime.ErrRuntimeExpectedSharedMemory)
|
||||
}
|
||||
|
||||
s := goCallStackView(c.execCtx.stackPointerBeforeGoCall)
|
||||
timeout, exp, addr := int64(s[0]), uint64(s[1]), uintptr(s[2])
|
||||
base := uintptr(unsafe.Pointer(&mem.Buffer[0]))
|
||||
|
||||
offset := uint32(addr - base)
|
||||
res := mem.Wait64(offset, exp, timeout, func(mem *wasm.MemoryInstance, offset uint32) uint64 {
|
||||
addr := unsafe.Add(unsafe.Pointer(&mem.Buffer[0]), offset)
|
||||
return atomic.LoadUint64((*uint64)(addr))
|
||||
})
|
||||
s[0] = uint64(res)
|
||||
c.execCtx.exitCode = wazevoapi.ExitCodeOK
|
||||
afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr,
|
||||
uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall)
|
||||
case wazevoapi.ExitCodeMemoryNotify:
|
||||
mod := c.callerModuleInstance()
|
||||
mem := mod.MemoryInstance
|
||||
|
||||
s := goCallStackView(c.execCtx.stackPointerBeforeGoCall)
|
||||
count, addr := uint32(s[0]), s[1]
|
||||
offset := uint32(uintptr(addr) - uintptr(unsafe.Pointer(&mem.Buffer[0])))
|
||||
res := mem.Notify(offset, count)
|
||||
s[0] = uint64(res)
|
||||
c.execCtx.exitCode = wazevoapi.ExitCodeOK
|
||||
afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr,
|
||||
uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall)
|
||||
case wazevoapi.ExitCodeUnreachable:
|
||||
panic(wasmruntime.ErrRuntimeUnreachable)
|
||||
case wazevoapi.ExitCodeMemoryOutOfBounds:
|
||||
panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess)
|
||||
case wazevoapi.ExitCodeTableOutOfBounds:
|
||||
panic(wasmruntime.ErrRuntimeInvalidTableAccess)
|
||||
case wazevoapi.ExitCodeIndirectCallNullPointer:
|
||||
panic(wasmruntime.ErrRuntimeInvalidTableAccess)
|
||||
case wazevoapi.ExitCodeIndirectCallTypeMismatch:
|
||||
panic(wasmruntime.ErrRuntimeIndirectCallTypeMismatch)
|
||||
case wazevoapi.ExitCodeIntegerOverflow:
|
||||
panic(wasmruntime.ErrRuntimeIntegerOverflow)
|
||||
case wazevoapi.ExitCodeIntegerDivisionByZero:
|
||||
panic(wasmruntime.ErrRuntimeIntegerDivideByZero)
|
||||
case wazevoapi.ExitCodeInvalidConversionToInteger:
|
||||
panic(wasmruntime.ErrRuntimeInvalidConversionToInteger)
|
||||
case wazevoapi.ExitCodeUnalignedAtomic:
|
||||
panic(wasmruntime.ErrRuntimeUnalignedAtomic)
|
||||
default:
|
||||
panic("BUG")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (c *callEngine) callerModuleInstance() *wasm.ModuleInstance {
|
||||
return moduleInstanceFromOpaquePtr(c.execCtx.callerModuleContextPtr)
|
||||
}
|
||||
|
||||
func opaqueViewFromPtr(ptr uintptr) []byte {
|
||||
var opaque []byte
|
||||
sh := (*reflect.SliceHeader)(unsafe.Pointer(&opaque))
|
||||
sh.Data = ptr
|
||||
setSliceLimits(sh, 24, 24)
|
||||
return opaque
|
||||
}
|
||||
|
||||
const callStackCeiling = uintptr(50000000) // in uint64 (8 bytes) == 400000000 bytes in total == 400mb.
|
||||
|
||||
func (c *callEngine) growStackWithGuarded() (newSP uintptr, newFP uintptr, err error) {
|
||||
if wazevoapi.StackGuardCheckEnabled {
|
||||
wazevoapi.CheckStackGuardPage(c.stack)
|
||||
}
|
||||
newSP, newFP, err = c.growStack()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
if wazevoapi.StackGuardCheckEnabled {
|
||||
c.execCtx.stackBottomPtr = &c.stack[wazevoapi.StackGuardCheckGuardPageSize]
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// growStack grows the stack, and returns the new stack pointer.
|
||||
func (c *callEngine) growStack() (newSP, newFP uintptr, err error) {
|
||||
currentLen := uintptr(len(c.stack))
|
||||
if callStackCeiling < currentLen {
|
||||
err = wasmruntime.ErrRuntimeStackOverflow
|
||||
return
|
||||
}
|
||||
|
||||
newLen := 2*currentLen + c.execCtx.stackGrowRequiredSize + 16 // Stack might be aligned to 16 bytes, so add 16 bytes just in case.
|
||||
newSP, newFP, c.stackTop, c.stack = c.cloneStack(newLen)
|
||||
c.execCtx.stackBottomPtr = &c.stack[0]
|
||||
return
|
||||
}
|
||||
|
||||
func (c *callEngine) cloneStack(l uintptr) (newSP, newFP, newTop uintptr, newStack []byte) {
|
||||
newStack = make([]byte, l)
|
||||
|
||||
relSp := c.stackTop - uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall))
|
||||
relFp := c.stackTop - c.execCtx.framePointerBeforeGoCall
|
||||
|
||||
// Copy the existing contents in the previous Go-allocated stack into the new one.
|
||||
var prevStackAligned, newStackAligned []byte
|
||||
{
|
||||
sh := (*reflect.SliceHeader)(unsafe.Pointer(&prevStackAligned))
|
||||
sh.Data = c.stackTop - relSp
|
||||
setSliceLimits(sh, relSp, relSp)
|
||||
}
|
||||
newTop = alignedStackTop(newStack)
|
||||
{
|
||||
newSP = newTop - relSp
|
||||
newFP = newTop - relFp
|
||||
sh := (*reflect.SliceHeader)(unsafe.Pointer(&newStackAligned))
|
||||
sh.Data = newSP
|
||||
setSliceLimits(sh, relSp, relSp)
|
||||
}
|
||||
copy(newStackAligned, prevStackAligned)
|
||||
return
|
||||
}
|
||||
|
||||
func (c *callEngine) stackIterator(onHostCall bool) experimental.StackIterator {
|
||||
c.stackIteratorImpl.reset(c, onHostCall)
|
||||
return &c.stackIteratorImpl
|
||||
}
|
||||
|
||||
// stackIterator implements experimental.StackIterator.
|
||||
type stackIterator struct {
|
||||
retAddrs []uintptr
|
||||
retAddrCursor int
|
||||
eng *engine
|
||||
pc uint64
|
||||
|
||||
currentDef *wasm.FunctionDefinition
|
||||
}
|
||||
|
||||
func (si *stackIterator) reset(c *callEngine, onHostCall bool) {
|
||||
if onHostCall {
|
||||
si.retAddrs = append(si.retAddrs[:0], uintptr(unsafe.Pointer(c.execCtx.goCallReturnAddress)))
|
||||
} else {
|
||||
si.retAddrs = si.retAddrs[:0]
|
||||
}
|
||||
si.retAddrs = unwindStack(uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall, c.stackTop, si.retAddrs)
|
||||
si.retAddrs = si.retAddrs[:len(si.retAddrs)-1] // the last return addr is the trampoline, so we skip it.
|
||||
si.retAddrCursor = 0
|
||||
si.eng = c.parent.parent.parent
|
||||
}
|
||||
|
||||
// Next implements the same method as documented on experimental.StackIterator.
|
||||
func (si *stackIterator) Next() bool {
|
||||
if si.retAddrCursor >= len(si.retAddrs) {
|
||||
return false
|
||||
}
|
||||
|
||||
addr := si.retAddrs[si.retAddrCursor]
|
||||
cm := si.eng.compiledModuleOfAddr(addr)
|
||||
if cm != nil {
|
||||
index := cm.functionIndexOf(addr)
|
||||
def := cm.module.FunctionDefinition(cm.module.ImportFunctionCount + index)
|
||||
si.currentDef = def
|
||||
si.retAddrCursor++
|
||||
si.pc = uint64(addr)
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// ProgramCounter implements the same method as documented on experimental.StackIterator.
|
||||
func (si *stackIterator) ProgramCounter() experimental.ProgramCounter {
|
||||
return experimental.ProgramCounter(si.pc)
|
||||
}
|
||||
|
||||
// Function implements the same method as documented on experimental.StackIterator.
|
||||
func (si *stackIterator) Function() experimental.InternalFunction {
|
||||
return si
|
||||
}
|
||||
|
||||
// Definition implements the same method as documented on experimental.InternalFunction.
|
||||
func (si *stackIterator) Definition() api.FunctionDefinition {
|
||||
return si.currentDef
|
||||
}
|
||||
|
||||
// SourceOffsetForPC implements the same method as documented on experimental.InternalFunction.
|
||||
func (si *stackIterator) SourceOffsetForPC(pc experimental.ProgramCounter) uint64 {
|
||||
upc := uintptr(pc)
|
||||
cm := si.eng.compiledModuleOfAddr(upc)
|
||||
return cm.getSourceOffset(upc)
|
||||
}
|
||||
|
||||
// snapshot implements experimental.Snapshot
|
||||
type snapshot struct {
|
||||
sp, fp, top uintptr
|
||||
returnAddress *byte
|
||||
stack []byte
|
||||
savedRegisters [64][2]uint64
|
||||
ret []uint64
|
||||
c *callEngine
|
||||
}
|
||||
|
||||
// Snapshot implements the same method as documented on experimental.Snapshotter.
|
||||
func (c *callEngine) Snapshot() experimental.Snapshot {
|
||||
returnAddress := c.execCtx.goCallReturnAddress
|
||||
oldTop, oldSp := c.stackTop, uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall))
|
||||
newSP, newFP, newTop, newStack := c.cloneStack(uintptr(len(c.stack)) + 16)
|
||||
adjustClonedStack(oldSp, oldTop, newSP, newFP, newTop)
|
||||
return &snapshot{
|
||||
sp: newSP,
|
||||
fp: newFP,
|
||||
top: newTop,
|
||||
savedRegisters: c.execCtx.savedRegisters,
|
||||
returnAddress: returnAddress,
|
||||
stack: newStack,
|
||||
c: c,
|
||||
}
|
||||
}
|
||||
|
||||
// Restore implements the same method as documented on experimental.Snapshot.
|
||||
func (s *snapshot) Restore(ret []uint64) {
|
||||
s.ret = ret
|
||||
panic(s)
|
||||
}
|
||||
|
||||
func (s *snapshot) doRestore() {
|
||||
spp := *(**uint64)(unsafe.Pointer(&s.sp))
|
||||
view := goCallStackView(spp)
|
||||
copy(view, s.ret)
|
||||
|
||||
c := s.c
|
||||
c.stack = s.stack
|
||||
c.stackTop = s.top
|
||||
ec := &c.execCtx
|
||||
ec.stackBottomPtr = &c.stack[0]
|
||||
ec.stackPointerBeforeGoCall = spp
|
||||
ec.framePointerBeforeGoCall = s.fp
|
||||
ec.goCallReturnAddress = s.returnAddress
|
||||
ec.savedRegisters = s.savedRegisters
|
||||
}
|
||||
|
||||
// Error implements the error interface.
|
||||
func (s *snapshot) Error() string {
|
||||
return "unhandled snapshot restore, this generally indicates restore was called from a different " +
|
||||
"exported function invocation than snapshot"
|
||||
}
|
||||
|
||||
func snapshotRecoverFn(c *callEngine) {
|
||||
if r := recover(); r != nil {
|
||||
if s, ok := r.(*snapshot); ok && s.c == c {
|
||||
s.doRestore()
|
||||
} else {
|
||||
panic(r)
|
||||
}
|
||||
}
|
||||
}
|
||||
843
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/engine.go
generated
vendored
Normal file
|
|
@ -0,0 +1,843 @@
|
|||
package wazevo
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/hex"
|
||||
"errors"
|
||||
"fmt"
|
||||
"runtime"
|
||||
"sort"
|
||||
"sync"
|
||||
"unsafe"
|
||||
|
||||
"github.com/tetratelabs/wazero/api"
|
||||
"github.com/tetratelabs/wazero/experimental"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/frontend"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
|
||||
"github.com/tetratelabs/wazero/internal/filecache"
|
||||
"github.com/tetratelabs/wazero/internal/platform"
|
||||
"github.com/tetratelabs/wazero/internal/version"
|
||||
"github.com/tetratelabs/wazero/internal/wasm"
|
||||
)
|
||||
|
||||
type (
|
||||
// engine implements wasm.Engine.
|
||||
engine struct {
|
||||
wazeroVersion string
|
||||
fileCache filecache.Cache
|
||||
compiledModules map[wasm.ModuleID]*compiledModule
|
||||
// sortedCompiledModules is a list of compiled modules sorted by the initial address of the executable.
|
||||
sortedCompiledModules []*compiledModule
|
||||
mux sync.RWMutex
|
||||
// sharedFunctions holds compiled functions shared by all modules.
|
||||
sharedFunctions *sharedFunctions
|
||||
// setFinalizer defaults to runtime.SetFinalizer, but is overridable for tests.
|
||||
setFinalizer func(obj interface{}, finalizer interface{})
|
||||
|
||||
// The following are reused for compiling shared functions.
|
||||
machine backend.Machine
|
||||
be backend.Compiler
|
||||
}
|
||||
|
||||
sharedFunctions struct {
|
||||
// memoryGrowExecutable is a compiled trampoline executable for memory.grow builtin function.
|
||||
memoryGrowExecutable []byte
|
||||
// checkModuleExitCode is a compiled trampoline executable for checking module instance exit code. This
|
||||
// is used when ensureTermination is true.
|
||||
checkModuleExitCode []byte
|
||||
// stackGrowExecutable is a compiled executable for growing stack builtin function.
|
||||
stackGrowExecutable []byte
|
||||
// tableGrowExecutable is a compiled trampoline executable for table.grow builtin function.
|
||||
tableGrowExecutable []byte
|
||||
// refFuncExecutable is a compiled trampoline executable for ref.func builtin function.
|
||||
refFuncExecutable []byte
|
||||
// memoryWait32Executable is a compiled trampoline executable for memory.wait32 builtin function
|
||||
memoryWait32Executable []byte
|
||||
// memoryWait64Executable is a compiled trampoline executable for memory.wait64 builtin function
|
||||
memoryWait64Executable []byte
|
||||
// memoryNotifyExecutable is a compiled trampoline executable for memory.notify builtin function
|
||||
memoryNotifyExecutable []byte
|
||||
listenerBeforeTrampolines map[*wasm.FunctionType][]byte
|
||||
listenerAfterTrampolines map[*wasm.FunctionType][]byte
|
||||
}
|
||||
|
||||
// compiledModule is a compiled variant of a wasm.Module, ready to be used for instantiation.
|
||||
compiledModule struct {
|
||||
*executables
|
||||
// functionOffsets maps a local function index to the offset in the executable.
|
||||
functionOffsets []int
|
||||
parent *engine
|
||||
module *wasm.Module
|
||||
ensureTermination bool
|
||||
listeners []experimental.FunctionListener
|
||||
listenerBeforeTrampolines []*byte
|
||||
listenerAfterTrampolines []*byte
|
||||
|
||||
// The following are only available for non-host modules.
|
||||
|
||||
offsets wazevoapi.ModuleContextOffsetData
|
||||
sharedFunctions *sharedFunctions
|
||||
sourceMap sourceMap
|
||||
}
|
||||
|
||||
executables struct {
|
||||
executable []byte
|
||||
entryPreambles [][]byte
|
||||
}
|
||||
)
|
||||
|
||||
// sourceMap is a mapping from the offset of the executable to the offset of the original wasm binary.
|
||||
type sourceMap struct {
|
||||
// executableOffsets is a sorted list of offsets of the executable. This is index-correlated with wasmBinaryOffsets,
|
||||
// in other words executableOffsets[i] is the offset of the executable which corresponds to the offset of a Wasm
|
||||
// binary pointed by wasmBinaryOffsets[i].
|
||||
executableOffsets []uintptr
|
||||
// wasmBinaryOffsets is the counterpart of executableOffsets.
|
||||
wasmBinaryOffsets []uint64
|
||||
}
|
||||
|
||||
var _ wasm.Engine = (*engine)(nil)
|
||||
|
||||
// NewEngine returns the implementation of wasm.Engine.
|
||||
func NewEngine(ctx context.Context, _ api.CoreFeatures, fc filecache.Cache) wasm.Engine {
|
||||
machine := newMachine()
|
||||
be := backend.NewCompiler(ctx, machine, ssa.NewBuilder())
|
||||
e := &engine{
|
||||
compiledModules: make(map[wasm.ModuleID]*compiledModule),
|
||||
setFinalizer: runtime.SetFinalizer,
|
||||
machine: machine,
|
||||
be: be,
|
||||
fileCache: fc,
|
||||
wazeroVersion: version.GetWazeroVersion(),
|
||||
}
|
||||
e.compileSharedFunctions()
|
||||
return e
|
||||
}
|
||||
|
||||
// CompileModule implements wasm.Engine.
|
||||
func (e *engine) CompileModule(ctx context.Context, module *wasm.Module, listeners []experimental.FunctionListener, ensureTermination bool) (err error) {
|
||||
if wazevoapi.PerfMapEnabled {
|
||||
wazevoapi.PerfMap.Lock()
|
||||
defer wazevoapi.PerfMap.Unlock()
|
||||
}
|
||||
|
||||
if _, ok, err := e.getCompiledModule(module, listeners, ensureTermination); ok { // cache hit!
|
||||
return nil
|
||||
} else if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if wazevoapi.DeterministicCompilationVerifierEnabled {
|
||||
ctx = wazevoapi.NewDeterministicCompilationVerifierContext(ctx, len(module.CodeSection))
|
||||
}
|
||||
cm, err := e.compileModule(ctx, module, listeners, ensureTermination)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err = e.addCompiledModule(module, cm); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if wazevoapi.DeterministicCompilationVerifierEnabled {
|
||||
for i := 0; i < wazevoapi.DeterministicCompilationVerifyingIter; i++ {
|
||||
_, err := e.compileModule(ctx, module, listeners, ensureTermination)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(listeners) > 0 {
|
||||
cm.listeners = listeners
|
||||
cm.listenerBeforeTrampolines = make([]*byte, len(module.TypeSection))
|
||||
cm.listenerAfterTrampolines = make([]*byte, len(module.TypeSection))
|
||||
for i := range module.TypeSection {
|
||||
typ := &module.TypeSection[i]
|
||||
before, after := e.getListenerTrampolineForType(typ)
|
||||
cm.listenerBeforeTrampolines[i] = before
|
||||
cm.listenerAfterTrampolines[i] = after
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (exec *executables) compileEntryPreambles(m *wasm.Module, machine backend.Machine, be backend.Compiler) {
|
||||
exec.entryPreambles = make([][]byte, len(m.TypeSection))
|
||||
for i := range m.TypeSection {
|
||||
typ := &m.TypeSection[i]
|
||||
sig := frontend.SignatureForWasmFunctionType(typ)
|
||||
be.Init()
|
||||
buf := machine.CompileEntryPreamble(&sig)
|
||||
executable := mmapExecutable(buf)
|
||||
exec.entryPreambles[i] = executable
|
||||
|
||||
if wazevoapi.PerfMapEnabled {
|
||||
wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&executable[0])),
|
||||
uint64(len(executable)), fmt.Sprintf("entry_preamble::type=%s", typ.String()))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (e *engine) compileModule(ctx context.Context, module *wasm.Module, listeners []experimental.FunctionListener, ensureTermination bool) (*compiledModule, error) {
|
||||
withListener := len(listeners) > 0
|
||||
cm := &compiledModule{
|
||||
offsets: wazevoapi.NewModuleContextOffsetData(module, withListener), parent: e, module: module,
|
||||
ensureTermination: ensureTermination,
|
||||
executables: &executables{},
|
||||
}
|
||||
|
||||
if module.IsHostModule {
|
||||
return e.compileHostModule(ctx, module, listeners)
|
||||
}
|
||||
|
||||
importedFns, localFns := int(module.ImportFunctionCount), len(module.FunctionSection)
|
||||
if localFns == 0 {
|
||||
return cm, nil
|
||||
}
|
||||
|
||||
rels := make([]backend.RelocationInfo, 0)
|
||||
refToBinaryOffset := make([]int, importedFns+localFns)
|
||||
|
||||
if wazevoapi.DeterministicCompilationVerifierEnabled {
|
||||
// The compilation must be deterministic regardless of the order of functions being compiled.
|
||||
wazevoapi.DeterministicCompilationVerifierRandomizeIndexes(ctx)
|
||||
}
|
||||
|
||||
needSourceInfo := module.DWARFLines != nil
|
||||
|
||||
// Create new compiler instances, which are reused for each function.
|
||||
ssaBuilder := ssa.NewBuilder()
|
||||
fe := frontend.NewFrontendCompiler(module, ssaBuilder, &cm.offsets, ensureTermination, withListener, needSourceInfo)
|
||||
machine := newMachine()
|
||||
be := backend.NewCompiler(ctx, machine, ssaBuilder)
|
||||
|
||||
cm.executables.compileEntryPreambles(module, machine, be)
|
||||
|
||||
totalSize := 0 // Total binary size of the executable.
|
||||
cm.functionOffsets = make([]int, localFns)
|
||||
bodies := make([][]byte, localFns)
|
||||
|
||||
// Trampoline relocation related variables.
|
||||
trampolineInterval, callTrampolineIslandSize, err := machine.CallTrampolineIslandInfo(localFns)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
needCallTrampoline := callTrampolineIslandSize > 0
|
||||
var callTrampolineIslandOffsets []int // Holds the offsets of trampoline islands.
|
||||
|
||||
for i := range module.CodeSection {
|
||||
if wazevoapi.DeterministicCompilationVerifierEnabled {
|
||||
i = wazevoapi.DeterministicCompilationVerifierGetRandomizedLocalFunctionIndex(ctx, i)
|
||||
}
|
||||
|
||||
fidx := wasm.Index(i + importedFns)
|
||||
|
||||
if wazevoapi.NeedFunctionNameInContext {
|
||||
def := module.FunctionDefinition(fidx)
|
||||
name := def.DebugName()
|
||||
if len(def.ExportNames()) > 0 {
|
||||
name = def.ExportNames()[0]
|
||||
}
|
||||
ctx = wazevoapi.SetCurrentFunctionName(ctx, i, fmt.Sprintf("[%d/%d]%s", i, len(module.CodeSection)-1, name))
|
||||
}
|
||||
|
||||
needListener := len(listeners) > 0 && listeners[i] != nil
|
||||
body, relsPerFunc, err := e.compileLocalWasmFunction(ctx, module, wasm.Index(i), fe, ssaBuilder, be, needListener)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("compile function %d/%d: %v", i, len(module.CodeSection)-1, err)
|
||||
}
|
||||
|
||||
// Align 16-bytes boundary.
|
||||
totalSize = (totalSize + 15) &^ 15
|
||||
cm.functionOffsets[i] = totalSize
|
||||
|
||||
if needSourceInfo {
|
||||
// At the beginning of the function, we add the offset of the function body so that
|
||||
// we can resolve the source location of the call site for the before-listener call.
|
||||
cm.sourceMap.executableOffsets = append(cm.sourceMap.executableOffsets, uintptr(totalSize))
|
||||
cm.sourceMap.wasmBinaryOffsets = append(cm.sourceMap.wasmBinaryOffsets, module.CodeSection[i].BodyOffsetInCodeSection)
|
||||
|
||||
for _, info := range be.SourceOffsetInfo() {
|
||||
cm.sourceMap.executableOffsets = append(cm.sourceMap.executableOffsets, uintptr(totalSize)+uintptr(info.ExecutableOffset))
|
||||
cm.sourceMap.wasmBinaryOffsets = append(cm.sourceMap.wasmBinaryOffsets, uint64(info.SourceOffset))
|
||||
}
|
||||
}
|
||||
|
||||
fref := frontend.FunctionIndexToFuncRef(fidx)
|
||||
refToBinaryOffset[fref] = totalSize
|
||||
|
||||
// At this point, relocation offsets are relative to the start of the function body,
|
||||
// so we adjust it to the start of the executable.
|
||||
for _, r := range relsPerFunc {
|
||||
r.Offset += int64(totalSize)
|
||||
rels = append(rels, r)
|
||||
}
|
||||
|
||||
bodies[i] = body
|
||||
totalSize += len(body)
|
||||
if wazevoapi.PrintMachineCodeHexPerFunction {
|
||||
fmt.Printf("[[[machine code for %s]]]\n%s\n\n", wazevoapi.GetCurrentFunctionName(ctx), hex.EncodeToString(body))
|
||||
}
|
||||
|
||||
if needCallTrampoline {
|
||||
// If the total size exceeds the trampoline interval, we need to add a trampoline island.
|
||||
if totalSize/trampolineInterval > len(callTrampolineIslandOffsets) {
|
||||
callTrampolineIslandOffsets = append(callTrampolineIslandOffsets, totalSize)
|
||||
totalSize += callTrampolineIslandSize
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Allocate executable memory and then copy the generated machine code.
|
||||
executable, err := platform.MmapCodeSegment(totalSize)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
cm.executable = executable
|
||||
|
||||
for i, b := range bodies {
|
||||
offset := cm.functionOffsets[i]
|
||||
copy(executable[offset:], b)
|
||||
}
|
||||
|
||||
if wazevoapi.PerfMapEnabled {
|
||||
wazevoapi.PerfMap.Flush(uintptr(unsafe.Pointer(&executable[0])), cm.functionOffsets)
|
||||
}
|
||||
|
||||
if needSourceInfo {
|
||||
for i := range cm.sourceMap.executableOffsets {
|
||||
cm.sourceMap.executableOffsets[i] += uintptr(unsafe.Pointer(&cm.executable[0]))
|
||||
}
|
||||
}
|
||||
|
||||
// Resolve relocations for local function calls.
|
||||
if len(rels) > 0 {
|
||||
machine.ResolveRelocations(refToBinaryOffset, executable, rels, callTrampolineIslandOffsets)
|
||||
}
|
||||
|
||||
if runtime.GOARCH == "arm64" {
|
||||
// On arm64, we cannot have the memory mapped rwx at the same time, so we remap it as read-execute.
|
||||
if err = platform.MprotectRX(executable); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
cm.sharedFunctions = e.sharedFunctions
|
||||
e.setFinalizer(cm.executables, executablesFinalizer)
|
||||
return cm, nil
|
||||
}
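The trampoline-island handling inside the loop above inserts an island every time the accumulated code size crosses another multiple of the interval reported by CallTrampolineIslandInfo. A hedged sketch of just that placement logic, using only per-function body sizes, an interval, and an island size (illustration, not part of the vendored file):

// placeTrampolineIslands returns the offsets at which trampoline islands
// would be inserted for the given body sizes; illustration only.
func placeTrampolineIslands(bodySizes []int, interval, islandSize int) (offsets []int) {
	totalSize := 0
	for _, size := range bodySizes {
		totalSize = (totalSize + 15) &^ 15 // 16-byte alignment, as in the loop above
		totalSize += size
		if totalSize/interval > len(offsets) {
			offsets = append(offsets, totalSize)
			totalSize += islandSize
		}
	}
	return offsets
}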
|
||||
|
||||
func (e *engine) compileLocalWasmFunction(
|
||||
ctx context.Context,
|
||||
module *wasm.Module,
|
||||
localFunctionIndex wasm.Index,
|
||||
fe *frontend.Compiler,
|
||||
ssaBuilder ssa.Builder,
|
||||
be backend.Compiler,
|
||||
needListener bool,
|
||||
) (body []byte, rels []backend.RelocationInfo, err error) {
|
||||
typIndex := module.FunctionSection[localFunctionIndex]
|
||||
typ := &module.TypeSection[typIndex]
|
||||
codeSeg := &module.CodeSection[localFunctionIndex]
|
||||
|
||||
// Initializes both frontend and backend compilers.
|
||||
fe.Init(localFunctionIndex, typIndex, typ, codeSeg.LocalTypes, codeSeg.Body, needListener, codeSeg.BodyOffsetInCodeSection)
|
||||
be.Init()
|
||||
|
||||
// Lower Wasm to SSA.
|
||||
fe.LowerToSSA()
|
||||
if wazevoapi.PrintSSA && wazevoapi.PrintEnabledIndex(ctx) {
|
||||
fmt.Printf("[[[SSA for %s]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), ssaBuilder.Format())
|
||||
}
|
||||
|
||||
if wazevoapi.DeterministicCompilationVerifierEnabled {
|
||||
wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "SSA", ssaBuilder.Format())
|
||||
}
|
||||
|
||||
// Run SSA-level optimization passes.
|
||||
ssaBuilder.RunPasses()
|
||||
|
||||
if wazevoapi.PrintOptimizedSSA && wazevoapi.PrintEnabledIndex(ctx) {
|
||||
fmt.Printf("[[[Optimized SSA for %s]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), ssaBuilder.Format())
|
||||
}
|
||||
|
||||
if wazevoapi.DeterministicCompilationVerifierEnabled {
|
||||
wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "Optimized SSA", ssaBuilder.Format())
|
||||
}
|
||||
|
||||
// Now our ssaBuilder contains the necessary information to further lower them to
|
||||
// machine code.
|
||||
original, rels, err := be.Compile(ctx)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("ssa->machine code: %v", err)
|
||||
}
|
||||
|
||||
// TODO: optimize as zero copy.
|
||||
copied := make([]byte, len(original))
|
||||
copy(copied, original)
|
||||
return copied, rels, nil
|
||||
}
|
||||
|
||||
func (e *engine) compileHostModule(ctx context.Context, module *wasm.Module, listeners []experimental.FunctionListener) (*compiledModule, error) {
|
||||
machine := newMachine()
|
||||
be := backend.NewCompiler(ctx, machine, ssa.NewBuilder())
|
||||
|
||||
num := len(module.CodeSection)
|
||||
cm := &compiledModule{module: module, listeners: listeners, executables: &executables{}}
|
||||
cm.functionOffsets = make([]int, num)
|
||||
totalSize := 0 // Total binary size of the executable.
|
||||
bodies := make([][]byte, num)
|
||||
var sig ssa.Signature
|
||||
for i := range module.CodeSection {
|
||||
totalSize = (totalSize + 15) &^ 15
|
||||
cm.functionOffsets[i] = totalSize
|
||||
|
||||
typIndex := module.FunctionSection[i]
|
||||
typ := &module.TypeSection[typIndex]
|
||||
|
||||
// This limit can be relaxed as long as the index still fits in an ExitCode, as encoded by wazevoapi.ExitCodeCallGoModuleFunctionWithIndex.
|
||||
// However, 1 << 16 should be large enough for a real use case.
|
||||
const hostFunctionNumMaximum = 1 << 16
|
||||
if i >= hostFunctionNumMaximum {
|
||||
return nil, fmt.Errorf("too many host functions (maximum %d)", hostFunctionNumMaximum)
|
||||
}
|
||||
|
||||
sig.ID = ssa.SignatureID(typIndex) // This is important since we reuse the `machine` which caches the ABI based on the SignatureID.
|
||||
sig.Params = append(sig.Params[:0],
|
||||
ssa.TypeI64, // First argument must be exec context.
|
||||
ssa.TypeI64, // The second argument is the moduleContextOpaque of this host module.
|
||||
)
|
||||
for _, t := range typ.Params {
|
||||
sig.Params = append(sig.Params, frontend.WasmTypeToSSAType(t))
|
||||
}
|
||||
|
||||
sig.Results = sig.Results[:0]
|
||||
for _, t := range typ.Results {
|
||||
sig.Results = append(sig.Results, frontend.WasmTypeToSSAType(t))
|
||||
}
|
||||
|
||||
c := &module.CodeSection[i]
|
||||
if c.GoFunc == nil {
|
||||
panic("BUG: GoFunc must be set for host module")
|
||||
}
|
||||
|
||||
withListener := len(listeners) > 0 && listeners[i] != nil
|
||||
var exitCode wazevoapi.ExitCode
|
||||
fn := c.GoFunc
|
||||
switch fn.(type) {
|
||||
case api.GoModuleFunction:
|
||||
exitCode = wazevoapi.ExitCodeCallGoModuleFunctionWithIndex(i, withListener)
|
||||
case api.GoFunction:
|
||||
exitCode = wazevoapi.ExitCodeCallGoFunctionWithIndex(i, withListener)
|
||||
}
|
||||
|
||||
be.Init()
|
||||
machine.CompileGoFunctionTrampoline(exitCode, &sig, true)
|
||||
if err := be.Finalize(ctx); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
body := be.Buf()
|
||||
|
||||
if wazevoapi.PerfMapEnabled {
|
||||
name := module.FunctionDefinition(wasm.Index(i)).DebugName()
|
||||
wazevoapi.PerfMap.AddModuleEntry(i,
|
||||
int64(totalSize),
|
||||
uint64(len(body)),
|
||||
fmt.Sprintf("trampoline:%s", name))
|
||||
}
|
||||
|
||||
// TODO: optimize as zero copy.
|
||||
copied := make([]byte, len(body))
|
||||
copy(copied, body)
|
||||
bodies[i] = copied
|
||||
totalSize += len(body)
|
||||
}
|
||||
|
||||
if totalSize == 0 {
|
||||
// Empty module.
|
||||
return cm, nil
|
||||
}
|
||||
|
||||
// Allocate executable memory and then copy the generated machine code.
|
||||
executable, err := platform.MmapCodeSegment(totalSize)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
cm.executable = executable
|
||||
|
||||
for i, b := range bodies {
|
||||
offset := cm.functionOffsets[i]
|
||||
copy(executable[offset:], b)
|
||||
}
|
||||
|
||||
if wazevoapi.PerfMapEnabled {
|
||||
wazevoapi.PerfMap.Flush(uintptr(unsafe.Pointer(&executable[0])), cm.functionOffsets)
|
||||
}
|
||||
|
||||
if runtime.GOARCH == "arm64" {
|
||||
// On arm64, we cannot have the memory mapped rwx at the same time, so we remap it as read-execute.
|
||||
if err = platform.MprotectRX(executable); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
e.setFinalizer(cm.executables, executablesFinalizer)
|
||||
return cm, nil
|
||||
}
|
||||
|
||||
// Close implements wasm.Engine.
|
||||
func (e *engine) Close() (err error) {
|
||||
e.mux.Lock()
|
||||
defer e.mux.Unlock()
|
||||
e.sortedCompiledModules = nil
|
||||
e.compiledModules = nil
|
||||
e.sharedFunctions = nil
|
||||
return nil
|
||||
}
|
||||
|
||||
// CompiledModuleCount implements wasm.Engine.
|
||||
func (e *engine) CompiledModuleCount() uint32 {
|
||||
e.mux.RLock()
|
||||
defer e.mux.RUnlock()
|
||||
return uint32(len(e.compiledModules))
|
||||
}
|
||||
|
||||
// DeleteCompiledModule implements wasm.Engine.
|
||||
func (e *engine) DeleteCompiledModule(m *wasm.Module) {
|
||||
e.mux.Lock()
|
||||
defer e.mux.Unlock()
|
||||
cm, ok := e.compiledModules[m.ID]
|
||||
if ok {
|
||||
if len(cm.executable) > 0 {
|
||||
e.deleteCompiledModuleFromSortedList(cm)
|
||||
}
|
||||
delete(e.compiledModules, m.ID)
|
||||
}
|
||||
}
|
||||
|
||||
func (e *engine) addCompiledModuleToSortedList(cm *compiledModule) {
|
||||
ptr := uintptr(unsafe.Pointer(&cm.executable[0]))
|
||||
|
||||
index := sort.Search(len(e.sortedCompiledModules), func(i int) bool {
|
||||
return uintptr(unsafe.Pointer(&e.sortedCompiledModules[i].executable[0])) >= ptr
|
||||
})
|
||||
e.sortedCompiledModules = append(e.sortedCompiledModules, nil)
|
||||
copy(e.sortedCompiledModules[index+1:], e.sortedCompiledModules[index:])
|
||||
e.sortedCompiledModules[index] = cm
|
||||
}
|
||||
|
||||
func (e *engine) deleteCompiledModuleFromSortedList(cm *compiledModule) {
|
||||
ptr := uintptr(unsafe.Pointer(&cm.executable[0]))
|
||||
|
||||
index := sort.Search(len(e.sortedCompiledModules), func(i int) bool {
|
||||
return uintptr(unsafe.Pointer(&e.sortedCompiledModules[i].executable[0])) >= ptr
|
||||
})
|
||||
if index >= len(e.sortedCompiledModules) {
|
||||
return
|
||||
}
|
||||
copy(e.sortedCompiledModules[index:], e.sortedCompiledModules[index+1:])
|
||||
e.sortedCompiledModules = e.sortedCompiledModules[:len(e.sortedCompiledModules)-1]
|
||||
}
|
||||
|
||||
func (e *engine) compiledModuleOfAddr(addr uintptr) *compiledModule {
|
||||
e.mux.RLock()
|
||||
defer e.mux.RUnlock()
|
||||
|
||||
index := sort.Search(len(e.sortedCompiledModules), func(i int) bool {
|
||||
return uintptr(unsafe.Pointer(&e.sortedCompiledModules[i].executable[0])) > addr
|
||||
})
|
||||
index -= 1
|
||||
if index < 0 {
|
||||
return nil
|
||||
}
|
||||
candidate := e.sortedCompiledModules[index]
|
||||
if checkAddrInBytes(addr, candidate.executable) {
|
||||
// If a module has already been deleted, the module found by the search may be wrong, hence the range check.
|
||||
return candidate
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func checkAddrInBytes(addr uintptr, b []byte) bool {
|
||||
return uintptr(unsafe.Pointer(&b[0])) <= addr && addr <= uintptr(unsafe.Pointer(&b[len(b)-1]))
|
||||
}
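Both the sorted insert above (addCompiledModuleToSortedList) and the lookup (compiledModuleOfAddr) follow the standard sort.Search pattern over executables ordered by base address. A self-contained sketch of the lookup half over plain address ranges, using only the standard sort package; illustration, not part of the vendored file:

// codeRange is a stand-in for one compiled module's executable mapping.
type codeRange struct{ start, end uintptr } // inclusive [start, end]

// findRange returns the index of the range containing addr, or -1.
// ranges must be sorted by start, like sortedCompiledModules above.
func findRange(ranges []codeRange, addr uintptr) int {
	// Index of the first range whose start is strictly greater than addr...
	i := sort.Search(len(ranges), func(i int) bool { return ranges[i].start > addr })
	i-- // ...so the candidate, if any, is the one just before it.
	if i < 0 || addr > ranges[i].end {
		return -1
	}
	return i
}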
|
||||
|
||||
// NewModuleEngine implements wasm.Engine.
|
||||
func (e *engine) NewModuleEngine(m *wasm.Module, mi *wasm.ModuleInstance) (wasm.ModuleEngine, error) {
|
||||
me := &moduleEngine{}
|
||||
|
||||
// Note: imported functions are resolved in moduleEngine.ResolveImportedFunction.
|
||||
me.importedFunctions = make([]importedFunction, m.ImportFunctionCount)
|
||||
|
||||
compiled, ok := e.getCompiledModuleFromMemory(m)
|
||||
if !ok {
|
||||
return nil, errors.New("source module must be compiled before instantiation")
|
||||
}
|
||||
me.parent = compiled
|
||||
me.module = mi
|
||||
me.listeners = compiled.listeners
|
||||
|
||||
if m.IsHostModule {
|
||||
me.opaque = buildHostModuleOpaque(m, compiled.listeners)
|
||||
me.opaquePtr = &me.opaque[0]
|
||||
} else {
|
||||
if size := compiled.offsets.TotalSize; size != 0 {
|
||||
opaque := newAlignedOpaque(size)
|
||||
me.opaque = opaque
|
||||
me.opaquePtr = &opaque[0]
|
||||
}
|
||||
}
|
||||
return me, nil
|
||||
}
|
||||
|
||||
func (e *engine) compileSharedFunctions() {
|
||||
e.sharedFunctions = &sharedFunctions{
|
||||
listenerBeforeTrampolines: make(map[*wasm.FunctionType][]byte),
|
||||
listenerAfterTrampolines: make(map[*wasm.FunctionType][]byte),
|
||||
}
|
||||
|
||||
e.be.Init()
|
||||
{
|
||||
src := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeGrowMemory, &ssa.Signature{
|
||||
Params: []ssa.Type{ssa.TypeI64 /* exec context */, ssa.TypeI32},
|
||||
Results: []ssa.Type{ssa.TypeI32},
|
||||
}, false)
|
||||
e.sharedFunctions.memoryGrowExecutable = mmapExecutable(src)
|
||||
if wazevoapi.PerfMapEnabled {
|
||||
exe := e.sharedFunctions.memoryGrowExecutable
|
||||
wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "memory_grow_trampoline")
|
||||
}
|
||||
}
|
||||
|
||||
e.be.Init()
|
||||
{
|
||||
src := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeTableGrow, &ssa.Signature{
|
||||
Params: []ssa.Type{ssa.TypeI64 /* exec context */, ssa.TypeI32 /* table index */, ssa.TypeI32 /* num */, ssa.TypeI64 /* ref */},
|
||||
Results: []ssa.Type{ssa.TypeI32},
|
||||
}, false)
|
||||
e.sharedFunctions.tableGrowExecutable = mmapExecutable(src)
|
||||
if wazevoapi.PerfMapEnabled {
|
||||
exe := e.sharedFunctions.tableGrowExecutable
|
||||
wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "table_grow_trampoline")
|
||||
}
|
||||
}
|
||||
|
||||
e.be.Init()
|
||||
{
|
||||
src := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeCheckModuleExitCode, &ssa.Signature{
|
||||
Params: []ssa.Type{ssa.TypeI32 /* exec context */},
|
||||
Results: []ssa.Type{ssa.TypeI32},
|
||||
}, false)
|
||||
e.sharedFunctions.checkModuleExitCode = mmapExecutable(src)
|
||||
if wazevoapi.PerfMapEnabled {
|
||||
exe := e.sharedFunctions.checkModuleExitCode
|
||||
wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "check_module_exit_code_trampoline")
|
||||
}
|
||||
}
|
||||
|
||||
e.be.Init()
|
||||
{
|
||||
src := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeRefFunc, &ssa.Signature{
|
||||
Params: []ssa.Type{ssa.TypeI64 /* exec context */, ssa.TypeI32 /* function index */},
|
||||
Results: []ssa.Type{ssa.TypeI64}, // returns the function reference.
|
||||
}, false)
|
||||
e.sharedFunctions.refFuncExecutable = mmapExecutable(src)
|
||||
if wazevoapi.PerfMapEnabled {
|
||||
exe := e.sharedFunctions.refFuncExecutable
|
||||
wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "ref_func_trampoline")
|
||||
}
|
||||
}
|
||||
|
||||
e.be.Init()
|
||||
{
|
||||
src := e.machine.CompileStackGrowCallSequence()
|
||||
e.sharedFunctions.stackGrowExecutable = mmapExecutable(src)
|
||||
if wazevoapi.PerfMapEnabled {
|
||||
exe := e.sharedFunctions.stackGrowExecutable
|
||||
wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "stack_grow_trampoline")
|
||||
}
|
||||
}
|
||||
|
||||
e.be.Init()
|
||||
{
|
||||
src := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeMemoryWait32, &ssa.Signature{
|
||||
// exec context, timeout, expected, addr
|
||||
Params: []ssa.Type{ssa.TypeI64, ssa.TypeI64, ssa.TypeI32, ssa.TypeI64},
|
||||
// Returns the status.
|
||||
Results: []ssa.Type{ssa.TypeI32},
|
||||
}, false)
|
||||
e.sharedFunctions.memoryWait32Executable = mmapExecutable(src)
|
||||
if wazevoapi.PerfMapEnabled {
|
||||
exe := e.sharedFunctions.memoryWait32Executable
|
||||
wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "memory_wait32_trampoline")
|
||||
}
|
||||
}
|
||||
|
||||
e.be.Init()
|
||||
{
|
||||
src := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeMemoryWait64, &ssa.Signature{
|
||||
// exec context, timeout, expected, addr
|
||||
Params: []ssa.Type{ssa.TypeI64, ssa.TypeI64, ssa.TypeI64, ssa.TypeI64},
|
||||
// Returns the status.
|
||||
Results: []ssa.Type{ssa.TypeI32},
|
||||
}, false)
|
||||
e.sharedFunctions.memoryWait64Executable = mmapExecutable(src)
|
||||
if wazevoapi.PerfMapEnabled {
|
||||
exe := e.sharedFunctions.memoryWait64Executable
|
||||
wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "memory_wait64_trampoline")
|
||||
}
|
||||
}
|
||||
|
||||
e.be.Init()
|
||||
{
|
||||
src := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeMemoryNotify, &ssa.Signature{
|
||||
// exec context, count, addr
|
||||
Params: []ssa.Type{ssa.TypeI64, ssa.TypeI32, ssa.TypeI64},
|
||||
// Returns the number notified.
|
||||
Results: []ssa.Type{ssa.TypeI32},
|
||||
}, false)
|
||||
e.sharedFunctions.memoryNotifyExecutable = mmapExecutable(src)
|
||||
if wazevoapi.PerfMapEnabled {
|
||||
exe := e.sharedFunctions.memoryNotifyExecutable
|
||||
wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "memory_notify_trampoline")
|
||||
}
|
||||
}
|
||||
|
||||
e.setFinalizer(e.sharedFunctions, sharedFunctionsFinalizer)
|
||||
}
|
||||
|
||||
func sharedFunctionsFinalizer(sf *sharedFunctions) {
|
||||
if err := platform.MunmapCodeSegment(sf.memoryGrowExecutable); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
if err := platform.MunmapCodeSegment(sf.checkModuleExitCode); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
if err := platform.MunmapCodeSegment(sf.stackGrowExecutable); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
if err := platform.MunmapCodeSegment(sf.tableGrowExecutable); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
if err := platform.MunmapCodeSegment(sf.refFuncExecutable); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
if err := platform.MunmapCodeSegment(sf.memoryWait32Executable); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
if err := platform.MunmapCodeSegment(sf.memoryWait64Executable); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
if err := platform.MunmapCodeSegment(sf.memoryNotifyExecutable); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
for _, f := range sf.listenerBeforeTrampolines {
|
||||
if err := platform.MunmapCodeSegment(f); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
for _, f := range sf.listenerAfterTrampolines {
|
||||
if err := platform.MunmapCodeSegment(f); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
|
||||
sf.memoryGrowExecutable = nil
|
||||
sf.checkModuleExitCode = nil
|
||||
sf.stackGrowExecutable = nil
|
||||
sf.tableGrowExecutable = nil
|
||||
sf.refFuncExecutable = nil
|
||||
sf.memoryWait32Executable = nil
|
||||
sf.memoryWait64Executable = nil
|
||||
sf.memoryNotifyExecutable = nil
|
||||
sf.listenerBeforeTrampolines = nil
|
||||
sf.listenerAfterTrampolines = nil
|
||||
}
|
||||
|
||||
func executablesFinalizer(exec *executables) {
|
||||
if len(exec.executable) > 0 {
|
||||
if err := platform.MunmapCodeSegment(exec.executable); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
exec.executable = nil
|
||||
|
||||
for _, f := range exec.entryPreambles {
|
||||
if err := platform.MunmapCodeSegment(f); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
exec.entryPreambles = nil
|
||||
}
|
||||
|
||||
func mmapExecutable(src []byte) []byte {
|
||||
executable, err := platform.MmapCodeSegment(len(src))
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
copy(executable, src)
|
||||
|
||||
if runtime.GOARCH == "arm64" {
|
||||
// On arm64, we cannot have the memory mapped rwx at the same time, so we remap it as read-execute.
|
||||
if err = platform.MprotectRX(executable); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
return executable
|
||||
}
|
||||
|
||||
func (cm *compiledModule) functionIndexOf(addr uintptr) wasm.Index {
|
||||
addr -= uintptr(unsafe.Pointer(&cm.executable[0]))
|
||||
offset := cm.functionOffsets
|
||||
index := sort.Search(len(offset), func(i int) bool {
|
||||
return offset[i] > int(addr)
|
||||
})
|
||||
index--
|
||||
if index < 0 {
|
||||
panic("BUG")
|
||||
}
|
||||
return wasm.Index(index)
|
||||
}
|
||||
|
||||
func (e *engine) getListenerTrampolineForType(functionType *wasm.FunctionType) (before, after *byte) {
|
||||
e.mux.Lock()
|
||||
defer e.mux.Unlock()
|
||||
|
||||
beforeBuf, ok := e.sharedFunctions.listenerBeforeTrampolines[functionType]
|
||||
afterBuf := e.sharedFunctions.listenerAfterTrampolines[functionType]
|
||||
if ok {
|
||||
return &beforeBuf[0], &afterBuf[0]
|
||||
}
|
||||
|
||||
beforeSig, afterSig := frontend.SignatureForListener(functionType)
|
||||
|
||||
e.be.Init()
|
||||
buf := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeCallListenerBefore, beforeSig, false)
|
||||
beforeBuf = mmapExecutable(buf)
|
||||
|
||||
e.be.Init()
|
||||
buf = e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeCallListenerAfter, afterSig, false)
|
||||
afterBuf = mmapExecutable(buf)
|
||||
|
||||
e.sharedFunctions.listenerBeforeTrampolines[functionType] = beforeBuf
|
||||
e.sharedFunctions.listenerAfterTrampolines[functionType] = afterBuf
|
||||
return &beforeBuf[0], &afterBuf[0]
|
||||
}
|
||||
|
||||
func (cm *compiledModule) getSourceOffset(pc uintptr) uint64 {
|
||||
offsets := cm.sourceMap.executableOffsets
|
||||
if len(offsets) == 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
index := sort.Search(len(offsets), func(i int) bool {
|
||||
return offsets[i] >= pc
|
||||
})
|
||||
|
||||
index--
|
||||
if index < 0 {
|
||||
return 0
|
||||
}
|
||||
return cm.sourceMap.wasmBinaryOffsets[index]
|
||||
}
|
||||
296
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/engine_cache.go
generated
vendored
Normal file
|
|
@ -0,0 +1,296 @@
|
|||
package wazevo
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"hash/crc32"
|
||||
"io"
|
||||
"runtime"
|
||||
"unsafe"
|
||||
|
||||
"github.com/tetratelabs/wazero/experimental"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
|
||||
"github.com/tetratelabs/wazero/internal/filecache"
|
||||
"github.com/tetratelabs/wazero/internal/platform"
|
||||
"github.com/tetratelabs/wazero/internal/u32"
|
||||
"github.com/tetratelabs/wazero/internal/u64"
|
||||
"github.com/tetratelabs/wazero/internal/wasm"
|
||||
)
|
||||
|
||||
var crc = crc32.MakeTable(crc32.Castagnoli)
|
||||
|
||||
// fileCacheKey returns a key for the file cache.
|
||||
// In order to avoid collisions with the existing compiler, we do not use m.ID directly,
|
||||
// but instead rehash it together with the magic bytes.
|
||||
func fileCacheKey(m *wasm.Module) (ret filecache.Key) {
|
||||
s := sha256.New()
|
||||
s.Write(m.ID[:])
|
||||
s.Write(magic)
|
||||
s.Sum(ret[:0])
|
||||
return
|
||||
}
|
||||
|
||||
func (e *engine) addCompiledModule(module *wasm.Module, cm *compiledModule) (err error) {
|
||||
e.addCompiledModuleToMemory(module, cm)
|
||||
if !module.IsHostModule && e.fileCache != nil {
|
||||
err = e.addCompiledModuleToCache(module, cm)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (e *engine) getCompiledModule(module *wasm.Module, listeners []experimental.FunctionListener, ensureTermination bool) (cm *compiledModule, ok bool, err error) {
|
||||
cm, ok = e.getCompiledModuleFromMemory(module)
|
||||
if ok {
|
||||
return
|
||||
}
|
||||
cm, ok, err = e.getCompiledModuleFromCache(module)
|
||||
if ok {
|
||||
cm.parent = e
|
||||
cm.module = module
|
||||
cm.sharedFunctions = e.sharedFunctions
|
||||
cm.ensureTermination = ensureTermination
|
||||
cm.offsets = wazevoapi.NewModuleContextOffsetData(module, len(listeners) > 0)
|
||||
if len(listeners) > 0 {
|
||||
cm.listeners = listeners
|
||||
cm.listenerBeforeTrampolines = make([]*byte, len(module.TypeSection))
|
||||
cm.listenerAfterTrampolines = make([]*byte, len(module.TypeSection))
|
||||
for i := range module.TypeSection {
|
||||
typ := &module.TypeSection[i]
|
||||
before, after := e.getListenerTrampolineForType(typ)
|
||||
cm.listenerBeforeTrampolines[i] = before
|
||||
cm.listenerAfterTrampolines[i] = after
|
||||
}
|
||||
}
|
||||
e.addCompiledModuleToMemory(module, cm)
|
||||
ssaBuilder := ssa.NewBuilder()
|
||||
machine := newMachine()
|
||||
be := backend.NewCompiler(context.Background(), machine, ssaBuilder)
|
||||
cm.executables.compileEntryPreambles(module, machine, be)
|
||||
|
||||
// Set the finalizer.
|
||||
e.setFinalizer(cm.executables, executablesFinalizer)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (e *engine) addCompiledModuleToMemory(m *wasm.Module, cm *compiledModule) {
|
||||
e.mux.Lock()
|
||||
defer e.mux.Unlock()
|
||||
e.compiledModules[m.ID] = cm
|
||||
if len(cm.executable) > 0 {
|
||||
e.addCompiledModuleToSortedList(cm)
|
||||
}
|
||||
}
|
||||
|
||||
func (e *engine) getCompiledModuleFromMemory(module *wasm.Module) (cm *compiledModule, ok bool) {
|
||||
e.mux.RLock()
|
||||
defer e.mux.RUnlock()
|
||||
cm, ok = e.compiledModules[module.ID]
|
||||
return
|
||||
}
|
||||
|
||||
func (e *engine) addCompiledModuleToCache(module *wasm.Module, cm *compiledModule) (err error) {
|
||||
if e.fileCache == nil || module.IsHostModule {
|
||||
return
|
||||
}
|
||||
err = e.fileCache.Add(fileCacheKey(module), serializeCompiledModule(e.wazeroVersion, cm))
|
||||
return
|
||||
}
|
||||
|
||||
func (e *engine) getCompiledModuleFromCache(module *wasm.Module) (cm *compiledModule, hit bool, err error) {
|
||||
if e.fileCache == nil || module.IsHostModule {
|
||||
return
|
||||
}
|
||||
|
||||
// Check if the entries exist in the external cache.
|
||||
var cached io.ReadCloser
|
||||
cached, hit, err = e.fileCache.Get(fileCacheKey(module))
|
||||
if !hit || err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
// Otherwise, we have a hit in the external cache.
|
||||
// We deserialize the compiled module from `cached`.
|
||||
var staleCache bool
|
||||
// Note: cached.Close is ensured to be called in deserializeCompiledModule.
|
||||
cm, staleCache, err = deserializeCompiledModule(e.wazeroVersion, cached)
|
||||
if err != nil {
|
||||
hit = false
|
||||
return
|
||||
} else if staleCache {
|
||||
return nil, false, e.fileCache.Delete(fileCacheKey(module))
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
var magic = []byte{'W', 'A', 'Z', 'E', 'V', 'O'}
|
||||
|
||||
func serializeCompiledModule(wazeroVersion string, cm *compiledModule) io.Reader {
|
||||
buf := bytes.NewBuffer(nil)
|
||||
// First 6 bytes: WAZEVO header.
|
||||
buf.Write(magic)
|
||||
// Next 1 byte: length of the version string.
|
||||
buf.WriteByte(byte(len(wazeroVersion)))
|
||||
// Version of wazero.
|
||||
buf.WriteString(wazeroVersion)
|
||||
// Number of compiled functions (== locally defined functions in the module): 4 bytes.
|
||||
buf.Write(u32.LeBytes(uint32(len(cm.functionOffsets))))
|
||||
for _, offset := range cm.functionOffsets {
|
||||
// The offset of this function in the executable (8 bytes).
|
||||
buf.Write(u64.LeBytes(uint64(offset)))
|
||||
}
|
||||
// The length of code segment (8 bytes).
|
||||
buf.Write(u64.LeBytes(uint64(len(cm.executable))))
|
||||
// Append the native code.
|
||||
buf.Write(cm.executable)
|
||||
// Append checksum.
|
||||
checksum := crc32.Checksum(cm.executable, crc)
|
||||
buf.Write(u32.LeBytes(checksum))
|
||||
if sm := cm.sourceMap; len(sm.executableOffsets) > 0 {
|
||||
buf.WriteByte(1) // indicates that source map is present.
|
||||
l := len(sm.wasmBinaryOffsets)
|
||||
buf.Write(u64.LeBytes(uint64(l)))
|
||||
executableAddr := uintptr(unsafe.Pointer(&cm.executable[0]))
|
||||
for i := 0; i < l; i++ {
|
||||
buf.Write(u64.LeBytes(sm.wasmBinaryOffsets[i]))
|
||||
// executableOffsets holds absolute addresses, so we subtract executableAddr.
|
||||
buf.Write(u64.LeBytes(uint64(sm.executableOffsets[i] - executableAddr)))
|
||||
}
|
||||
} else {
|
||||
buf.WriteByte(0) // indicates that source map is not present.
|
||||
}
|
||||
return bytes.NewReader(buf.Bytes())
|
||||
}
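For reference, the byte layout written by serializeCompiledModule above (and read back by deserializeCompiledModule below) can be summarized as follows; the field sizes are taken directly from the writes above, and the summary itself is illustrative rather than part of the vendored file:

// Serialized layout, in order:
//
//	magic               6 bytes ("WAZEVO")
//	version length      1 byte
//	version             <version length> bytes
//	function count      4 bytes, little-endian
//	function offsets    8 bytes each, little-endian (one per local function)
//	executable length   8 bytes, little-endian
//	executable          native code bytes
//	checksum            4 bytes, CRC-32 (Castagnoli) of the executable, little-endian
//	source map flag     1 byte (1 if present, else 0)
//	source map entries  pairs of 8-byte wasm offset and 8-byte executable-relative offset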
|
||||
|
||||
func deserializeCompiledModule(wazeroVersion string, reader io.ReadCloser) (cm *compiledModule, staleCache bool, err error) {
|
||||
defer reader.Close()
|
||||
cacheHeaderSize := len(magic) + 1 /* version size */ + len(wazeroVersion) + 4 /* number of functions */
|
||||
|
||||
// Read the header before the native code.
|
||||
header := make([]byte, cacheHeaderSize)
|
||||
n, err := reader.Read(header)
|
||||
if err != nil {
|
||||
return nil, false, fmt.Errorf("compilationcache: error reading header: %v", err)
|
||||
}
|
||||
|
||||
if n != cacheHeaderSize {
|
||||
return nil, false, fmt.Errorf("compilationcache: invalid header length: %d", n)
|
||||
}
|
||||
|
||||
if !bytes.Equal(header[:len(magic)], magic) {
|
||||
return nil, false, fmt.Errorf(
|
||||
"compilationcache: invalid magic number: got %s but want %s", magic, header[:len(magic)])
|
||||
}
|
||||
|
||||
// Check the version compatibility.
|
||||
versionSize := int(header[len(magic)])
|
||||
|
||||
cachedVersionBegin, cachedVersionEnd := len(magic)+1, len(magic)+1+versionSize
|
||||
if cachedVersionEnd >= len(header) {
|
||||
staleCache = true
|
||||
return
|
||||
} else if cachedVersion := string(header[cachedVersionBegin:cachedVersionEnd]); cachedVersion != wazeroVersion {
|
||||
staleCache = true
|
||||
return
|
||||
}
|
||||
|
||||
functionsNum := binary.LittleEndian.Uint32(header[len(header)-4:])
|
||||
cm = &compiledModule{functionOffsets: make([]int, functionsNum), executables: &executables{}}
|
||||
|
||||
var eightBytes [8]byte
|
||||
for i := uint32(0); i < functionsNum; i++ {
|
||||
// Read the offset of each function in the executable.
|
||||
var offset uint64
|
||||
if offset, err = readUint64(reader, &eightBytes); err != nil {
|
||||
err = fmt.Errorf("compilationcache: error reading func[%d] executable offset: %v", i, err)
|
||||
return
|
||||
}
|
||||
cm.functionOffsets[i] = int(offset)
|
||||
}
|
||||
|
||||
executableLen, err := readUint64(reader, &eightBytes)
|
||||
if err != nil {
|
||||
err = fmt.Errorf("compilationcache: error reading executable size: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
if executableLen > 0 {
|
||||
executable, err := platform.MmapCodeSegment(int(executableLen))
|
||||
if err != nil {
|
||||
err = fmt.Errorf("compilationcache: error mmapping executable (len=%d): %v", executableLen, err)
|
||||
return nil, false, err
|
||||
}
|
||||
|
||||
_, err = io.ReadFull(reader, executable)
|
||||
if err != nil {
|
||||
err = fmt.Errorf("compilationcache: error reading executable (len=%d): %v", executableLen, err)
|
||||
return nil, false, err
|
||||
}
|
||||
|
||||
expected := crc32.Checksum(executable, crc)
|
||||
if _, err = io.ReadFull(reader, eightBytes[:4]); err != nil {
|
||||
return nil, false, fmt.Errorf("compilationcache: could not read checksum: %v", err)
|
||||
} else if checksum := binary.LittleEndian.Uint32(eightBytes[:4]); expected != checksum {
|
||||
return nil, false, fmt.Errorf("compilationcache: checksum mismatch (expected %d, got %d)", expected, checksum)
|
||||
}
|
||||
|
||||
if runtime.GOARCH == "arm64" {
|
||||
// On arm64, we cannot have the memory mapped rwx at the same time, so we remap it as read-execute.
|
||||
if err = platform.MprotectRX(executable); err != nil {
|
||||
return nil, false, err
|
||||
}
|
||||
}
|
||||
cm.executable = executable
|
||||
}
|
||||
|
||||
if _, err := io.ReadFull(reader, eightBytes[:1]); err != nil {
|
||||
return nil, false, fmt.Errorf("compilationcache: error reading source map presence: %v", err)
|
||||
}
|
||||
|
||||
if eightBytes[0] == 1 {
|
||||
sm := &cm.sourceMap
|
||||
sourceMapLen, err := readUint64(reader, &eightBytes)
|
||||
if err != nil {
|
||||
err = fmt.Errorf("compilationcache: error reading source map length: %v", err)
|
||||
return nil, false, err
|
||||
}
|
||||
executableOffset := uintptr(unsafe.Pointer(&cm.executable[0]))
|
||||
for i := uint64(0); i < sourceMapLen; i++ {
|
||||
wasmBinaryOffset, err := readUint64(reader, &eightBytes)
|
||||
if err != nil {
|
||||
err = fmt.Errorf("compilationcache: error reading source map[%d] wasm binary offset: %v", i, err)
|
||||
return nil, false, err
|
||||
}
|
||||
executableRelativeOffset, err := readUint64(reader, &eightBytes)
|
||||
if err != nil {
|
||||
err = fmt.Errorf("compilationcache: error reading source map[%d] executable offset: %v", i, err)
|
||||
return nil, false, err
|
||||
}
|
||||
sm.wasmBinaryOffsets = append(sm.wasmBinaryOffsets, wasmBinaryOffset)
|
||||
// executableOffsets holds absolute addresses, so we add executableOffset back.
|
||||
sm.executableOffsets = append(sm.executableOffsets, uintptr(executableRelativeOffset)+executableOffset)
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// readUint64 strictly reads a uint64 in little-endian byte order, using the
|
||||
// given array as a buffer. This returns io.EOF if less than 8 bytes were read.
|
||||
func readUint64(reader io.Reader, b *[8]byte) (uint64, error) {
|
||||
s := b[0:8]
|
||||
n, err := reader.Read(s)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
} else if n < 8 { // more strict than reader.Read
|
||||
return 0, io.EOF
|
||||
}
|
||||
|
||||
// Read the u64 from the underlying buffer.
|
||||
ret := binary.LittleEndian.Uint64(s)
|
||||
return ret, nil
|
||||
}
|
||||
15
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/entrypoint_amd64.go
generated
vendored
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
//go:build amd64 && !tinygo
|
||||
|
||||
package wazevo
|
||||
|
||||
import _ "unsafe"
|
||||
|
||||
// entrypoint is implemented by the backend.
|
||||
//
|
||||
//go:linkname entrypoint github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64.entrypoint
|
||||
func entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultStackPtr *uint64, goAllocatedStackSlicePtr uintptr)
|
||||
|
||||
// entrypoint is implemented by the backend.
|
||||
//
|
||||
//go:linkname afterGoFunctionCallEntrypoint github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64.afterGoFunctionCallEntrypoint
|
||||
func afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr)
|
||||
15
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/entrypoint_arm64.go
generated
vendored
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
//go:build arm64 && !tinygo
|
||||
|
||||
package wazevo
|
||||
|
||||
import _ "unsafe"
|
||||
|
||||
// entrypoint is implemented by the backend.
|
||||
//
|
||||
//go:linkname entrypoint github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64.entrypoint
|
||||
func entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultStackPtr *uint64, goAllocatedStackSlicePtr uintptr)
|
||||
|
||||
// entrypoint is implemented by the backend.
|
||||
//
|
||||
//go:linkname afterGoFunctionCallEntrypoint github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64.afterGoFunctionCallEntrypoint
|
||||
func afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr)
|
||||
15
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/entrypoint_others.go
generated
vendored
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
//go:build (!arm64 && !amd64) || tinygo
|
||||
|
||||
package wazevo
|
||||
|
||||
import (
|
||||
"runtime"
|
||||
)
|
||||
|
||||
func entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultStackPtr *uint64, goAllocatedStackSlicePtr uintptr) {
|
||||
panic(runtime.GOARCH)
|
||||
}
|
||||
|
||||
func afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr) {
|
||||
panic(runtime.GOARCH)
|
||||
}
|
||||
594
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/frontend.go
generated
vendored
Normal file
|
|
@ -0,0 +1,594 @@
|
|||
// Package frontend implements the translation of WebAssembly to SSA IR using the ssa package.
|
||||
package frontend
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"math"
|
||||
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
|
||||
"github.com/tetratelabs/wazero/internal/wasm"
|
||||
)
|
||||
|
||||
// Compiler is in charge of lowering Wasm to SSA IR, and does the optimization
|
||||
// on top of it in an architecture-independent way.
|
||||
type Compiler struct {
|
||||
// Per-module data that is used across all functions.
|
||||
|
||||
m *wasm.Module
|
||||
offset *wazevoapi.ModuleContextOffsetData
|
||||
// ssaBuilder is a ssa.Builder used by this frontend.
|
||||
ssaBuilder ssa.Builder
|
||||
signatures map[*wasm.FunctionType]*ssa.Signature
|
||||
listenerSignatures map[*wasm.FunctionType][2]*ssa.Signature
|
||||
memoryGrowSig ssa.Signature
|
||||
memoryWait32Sig ssa.Signature
|
||||
memoryWait64Sig ssa.Signature
|
||||
memoryNotifySig ssa.Signature
|
||||
checkModuleExitCodeSig ssa.Signature
|
||||
tableGrowSig ssa.Signature
|
||||
refFuncSig ssa.Signature
|
||||
memmoveSig ssa.Signature
|
||||
ensureTermination bool
|
||||
|
||||
// The following are reset per function.
|
||||
|
||||
// wasmLocalToVariable maps the index (considered as wasm.Index of locals)
|
||||
// to the corresponding ssa.Variable.
|
||||
wasmLocalToVariable [] /* local index to */ ssa.Variable
|
||||
wasmLocalFunctionIndex wasm.Index
|
||||
wasmFunctionTypeIndex wasm.Index
|
||||
wasmFunctionTyp *wasm.FunctionType
|
||||
wasmFunctionLocalTypes []wasm.ValueType
|
||||
wasmFunctionBody []byte
|
||||
wasmFunctionBodyOffsetInCodeSection uint64
|
||||
memoryBaseVariable, memoryLenVariable ssa.Variable
|
||||
needMemory bool
|
||||
memoryShared bool
|
||||
globalVariables []ssa.Variable
|
||||
globalVariablesTypes []ssa.Type
|
||||
mutableGlobalVariablesIndexes []wasm.Index // index to ^.
|
||||
needListener bool
|
||||
needSourceOffsetInfo bool
|
||||
// br is reused during lowering.
|
||||
br *bytes.Reader
|
||||
loweringState loweringState
|
||||
|
||||
knownSafeBounds [] /* ssa.ValueID to */ knownSafeBound
|
||||
knownSafeBoundsSet []ssa.ValueID
|
||||
|
||||
knownSafeBoundsAtTheEndOfBlocks [] /* ssa.BlockID to */ knownSafeBoundsAtTheEndOfBlock
|
||||
varLengthKnownSafeBoundWithIDPool wazevoapi.VarLengthPool[knownSafeBoundWithID]
|
||||
|
||||
execCtxPtrValue, moduleCtxPtrValue ssa.Value
|
||||
|
||||
// The following are reused for the known safe bounds analysis.
|
||||
|
||||
pointers []int
|
||||
bounds [][]knownSafeBoundWithID
|
||||
}
|
||||
|
||||
type (
|
||||
// knownSafeBound represents a known safe bound for a value.
|
||||
knownSafeBound struct {
|
||||
// bound is a constant upper bound for the value.
|
||||
bound uint64
|
||||
// absoluteAddr is the absolute address of the value.
|
||||
absoluteAddr ssa.Value
|
||||
}
|
||||
// knownSafeBoundWithID is a knownSafeBound with the ID of the value.
|
||||
knownSafeBoundWithID struct {
|
||||
knownSafeBound
|
||||
id ssa.ValueID
|
||||
}
|
||||
knownSafeBoundsAtTheEndOfBlock = wazevoapi.VarLength[knownSafeBoundWithID]
|
||||
)
|
||||
|
||||
var knownSafeBoundsAtTheEndOfBlockNil = wazevoapi.NewNilVarLength[knownSafeBoundWithID]()
|
||||
|
||||
// NewFrontendCompiler returns a frontend Compiler.
|
||||
func NewFrontendCompiler(m *wasm.Module, ssaBuilder ssa.Builder, offset *wazevoapi.ModuleContextOffsetData, ensureTermination bool, listenerOn bool, sourceInfo bool) *Compiler {
|
||||
c := &Compiler{
|
||||
m: m,
|
||||
ssaBuilder: ssaBuilder,
|
||||
br: bytes.NewReader(nil),
|
||||
offset: offset,
|
||||
ensureTermination: ensureTermination,
|
||||
needSourceOffsetInfo: sourceInfo,
|
||||
varLengthKnownSafeBoundWithIDPool: wazevoapi.NewVarLengthPool[knownSafeBoundWithID](),
|
||||
}
|
||||
c.declareSignatures(listenerOn)
|
||||
return c
|
||||
}
|
||||
|
||||
func (c *Compiler) declareSignatures(listenerOn bool) {
|
||||
m := c.m
|
||||
c.signatures = make(map[*wasm.FunctionType]*ssa.Signature, len(m.TypeSection)+2)
|
||||
if listenerOn {
|
||||
c.listenerSignatures = make(map[*wasm.FunctionType][2]*ssa.Signature, len(m.TypeSection))
|
||||
}
|
||||
for i := range m.TypeSection {
|
||||
wasmSig := &m.TypeSection[i]
|
||||
sig := SignatureForWasmFunctionType(wasmSig)
|
||||
sig.ID = ssa.SignatureID(i)
|
||||
c.signatures[wasmSig] = &sig
|
||||
c.ssaBuilder.DeclareSignature(&sig)
|
||||
|
||||
if listenerOn {
|
||||
beforeSig, afterSig := SignatureForListener(wasmSig)
|
||||
beforeSig.ID = ssa.SignatureID(i) + ssa.SignatureID(len(m.TypeSection))
|
||||
afterSig.ID = ssa.SignatureID(i) + ssa.SignatureID(len(m.TypeSection))*2
|
||||
c.listenerSignatures[wasmSig] = [2]*ssa.Signature{beforeSig, afterSig}
|
||||
c.ssaBuilder.DeclareSignature(beforeSig)
|
||||
c.ssaBuilder.DeclareSignature(afterSig)
|
||||
}
|
||||
}
|
||||
|
||||
begin := ssa.SignatureID(len(m.TypeSection))
|
||||
if listenerOn {
|
||||
begin *= 3
|
||||
}
|
||||
c.memoryGrowSig = ssa.Signature{
|
||||
ID: begin,
|
||||
// Takes the execution context and the number of pages to grow by.
|
||||
Params: []ssa.Type{ssa.TypeI64, ssa.TypeI32},
|
||||
// Returns the previous size in pages.
|
||||
Results: []ssa.Type{ssa.TypeI32},
|
||||
}
|
||||
c.ssaBuilder.DeclareSignature(&c.memoryGrowSig)
|
||||
|
||||
c.checkModuleExitCodeSig = ssa.Signature{
|
||||
ID: c.memoryGrowSig.ID + 1,
|
||||
// Only takes execution context.
|
||||
Params: []ssa.Type{ssa.TypeI64},
|
||||
}
|
||||
c.ssaBuilder.DeclareSignature(&c.checkModuleExitCodeSig)
|
||||
|
||||
c.tableGrowSig = ssa.Signature{
|
||||
ID: c.checkModuleExitCodeSig.ID + 1,
|
||||
Params: []ssa.Type{ssa.TypeI64 /* exec context */, ssa.TypeI32 /* table index */, ssa.TypeI32 /* num */, ssa.TypeI64 /* ref */},
|
||||
// Returns the previous size.
|
||||
Results: []ssa.Type{ssa.TypeI32},
|
||||
}
|
||||
c.ssaBuilder.DeclareSignature(&c.tableGrowSig)
|
||||
|
||||
c.refFuncSig = ssa.Signature{
|
||||
ID: c.tableGrowSig.ID + 1,
|
||||
Params: []ssa.Type{ssa.TypeI64 /* exec context */, ssa.TypeI32 /* func index */},
|
||||
// Returns the function reference.
|
||||
Results: []ssa.Type{ssa.TypeI64},
|
||||
}
|
||||
c.ssaBuilder.DeclareSignature(&c.refFuncSig)
|
||||
|
||||
c.memmoveSig = ssa.Signature{
|
||||
ID: c.refFuncSig.ID + 1,
|
||||
// dst, src, and the byte count.
|
||||
Params: []ssa.Type{ssa.TypeI64, ssa.TypeI64, ssa.TypeI64},
|
||||
}
|
||||
|
||||
c.ssaBuilder.DeclareSignature(&c.memmoveSig)
|
||||
|
||||
c.memoryWait32Sig = ssa.Signature{
|
||||
ID: c.memmoveSig.ID + 1,
|
||||
// exec context, timeout, expected, addr
|
||||
Params: []ssa.Type{ssa.TypeI64, ssa.TypeI64, ssa.TypeI32, ssa.TypeI64},
|
||||
// Returns the status.
|
||||
Results: []ssa.Type{ssa.TypeI32},
|
||||
}
|
||||
c.ssaBuilder.DeclareSignature(&c.memoryWait32Sig)
|
||||
|
||||
c.memoryWait64Sig = ssa.Signature{
|
||||
ID: c.memoryWait32Sig.ID + 1,
|
||||
// exec context, timeout, expected, addr
|
||||
Params: []ssa.Type{ssa.TypeI64, ssa.TypeI64, ssa.TypeI64, ssa.TypeI64},
|
||||
// Returns the status.
|
||||
Results: []ssa.Type{ssa.TypeI32},
|
||||
}
|
||||
c.ssaBuilder.DeclareSignature(&c.memoryWait64Sig)
|
||||
|
||||
c.memoryNotifySig = ssa.Signature{
|
||||
ID: c.memoryWait64Sig.ID + 1,
|
||||
// exec context, count, addr
|
||||
Params: []ssa.Type{ssa.TypeI64, ssa.TypeI32, ssa.TypeI64},
|
||||
// Returns the number notified.
|
||||
Results: []ssa.Type{ssa.TypeI32},
|
||||
}
|
||||
c.ssaBuilder.DeclareSignature(&c.memoryNotifySig)
|
||||
}
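// Illustrative sketch (not part of the vendored source): a minimal view of the signature-ID layout
// produced by declareSignatures above. Wasm type i gets ID i; with listeners enabled, its before/after
// listener signatures get i+N and i+2N (N = len(TypeSection)); the runtime-helper signatures
// (memory.grow, exit-code check, table.grow, ...) are numbered consecutively after that block.
func exampleSignatureIDLayout(numTypes int, listenerOn bool) (firstHelperID int) {
	firstHelperID = numTypes
	if listenerOn {
		firstHelperID *= 3
	}
	// e.g. 4 Wasm types without listeners -> helper IDs start at 4; with listeners -> 12.
	return firstHelperID
}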
|
||||
|
||||
// SignatureForWasmFunctionType returns the ssa.Signature for the given wasm.FunctionType.
|
||||
func SignatureForWasmFunctionType(typ *wasm.FunctionType) ssa.Signature {
|
||||
sig := ssa.Signature{
|
||||
// +2 to pass moduleContextPtr and executionContextPtr. See the inline comment in LowerToSSA.
|
||||
Params: make([]ssa.Type, len(typ.Params)+2),
|
||||
Results: make([]ssa.Type, len(typ.Results)),
|
||||
}
|
||||
sig.Params[0] = executionContextPtrTyp
|
||||
sig.Params[1] = moduleContextPtrTyp
|
||||
for j, typ := range typ.Params {
|
||||
sig.Params[j+2] = WasmTypeToSSAType(typ)
|
||||
}
|
||||
for j, typ := range typ.Results {
|
||||
sig.Results[j] = WasmTypeToSSAType(typ)
|
||||
}
|
||||
return sig
|
||||
}
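// Illustrative sketch (not part of the vendored source): how a concrete Wasm function type maps
// through SignatureForWasmFunctionType. For (i32, f64) -> i64 the resulting ssa.Signature has
// Params = [I64 /* exec ctx */, I64 /* module ctx */, I32, F64] and Results = [I64].
func exampleSignatureForWasmFunctionType() ssa.Signature {
	return SignatureForWasmFunctionType(&wasm.FunctionType{
		Params:  []wasm.ValueType{wasm.ValueTypeI32, wasm.ValueTypeF64},
		Results: []wasm.ValueType{wasm.ValueTypeI64},
	})
}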
|
||||
|
||||
// Init initializes the state of the frontend Compiler and makes it ready for the next function.
|
||||
func (c *Compiler) Init(idx, typIndex wasm.Index, typ *wasm.FunctionType, localTypes []wasm.ValueType, body []byte, needListener bool, bodyOffsetInCodeSection uint64) {
|
||||
c.ssaBuilder.Init(c.signatures[typ])
|
||||
c.loweringState.reset()
|
||||
|
||||
c.wasmFunctionTypeIndex = typIndex
|
||||
c.wasmLocalFunctionIndex = idx
|
||||
c.wasmFunctionTyp = typ
|
||||
c.wasmFunctionLocalTypes = localTypes
|
||||
c.wasmFunctionBody = body
|
||||
c.wasmFunctionBodyOffsetInCodeSection = bodyOffsetInCodeSection
|
||||
c.needListener = needListener
|
||||
c.clearSafeBounds()
|
||||
c.varLengthKnownSafeBoundWithIDPool.Reset()
|
||||
c.knownSafeBoundsAtTheEndOfBlocks = c.knownSafeBoundsAtTheEndOfBlocks[:0]
|
||||
}
|
||||
|
||||
// Note: this assumes 64-bit platform (I believe we won't have 32-bit backend ;)).
|
||||
const executionContextPtrTyp, moduleContextPtrTyp = ssa.TypeI64, ssa.TypeI64
|
||||
|
||||
// LowerToSSA lowers the current function to SSA function which will be held by ssaBuilder.
|
||||
// After calling this, the caller will be able to access the SSA info in *Compiler.ssaBuilder.
|
||||
//
|
||||
// Note that this only does the naive lowering and does not do any optimization; the caller is expected to do so.
|
||||
func (c *Compiler) LowerToSSA() {
|
||||
builder := c.ssaBuilder
|
||||
|
||||
// Set up the entry block.
|
||||
entryBlock := builder.AllocateBasicBlock()
|
||||
builder.SetCurrentBlock(entryBlock)
|
||||
|
||||
// Functions always take two parameters in addition to Wasm-level parameters:
|
||||
//
|
||||
// 1. executionContextPtr: pointer to the *executionContext in wazevo package.
|
||||
// This is used to exit the execution in the face of a trap, and also for host function calls.
|
||||
//
|
||||
// 2. moduleContextPtr: pointer to the *moduleContextOpaque in wazevo package.
|
||||
// This will be used to access memory, etc. Also, this will be used during host function calls.
|
||||
//
|
||||
// Note: it's clear that sometimes a function won't need them. For example,
|
||||
// if the function doesn't trap and doesn't make function call, then
|
||||
// we might be able to eliminate the parameter. However, if that function
|
||||
// can be called via call_indirect, then we cannot eliminate because the
|
||||
// signature won't match with the expected one.
|
||||
// TODO: maybe there's some way to do this optimization without glitches, but so far I have no clue about the feasibility.
|
||||
//
|
||||
// Note: In Wasmtime or many other runtimes, moduleContextPtr is called "vmContext". Also note that `moduleContextPtr`
|
||||
// is wazero-specific since other runtimes can naturally use the OS-level signal to do this job thanks to the fact that
|
||||
// they can use the native stack, whereas wazero cannot use the goroutine stack and has to use a Go-runtime-allocated []byte as a stack.
|
||||
c.execCtxPtrValue = entryBlock.AddParam(builder, executionContextPtrTyp)
|
||||
c.moduleCtxPtrValue = entryBlock.AddParam(builder, moduleContextPtrTyp)
|
||||
builder.AnnotateValue(c.execCtxPtrValue, "exec_ctx")
|
||||
builder.AnnotateValue(c.moduleCtxPtrValue, "module_ctx")
|
||||
|
||||
for i, typ := range c.wasmFunctionTyp.Params {
|
||||
st := WasmTypeToSSAType(typ)
|
||||
variable := builder.DeclareVariable(st)
|
||||
value := entryBlock.AddParam(builder, st)
|
||||
builder.DefineVariable(variable, value, entryBlock)
|
||||
c.setWasmLocalVariable(wasm.Index(i), variable)
|
||||
}
|
||||
c.declareWasmLocals(entryBlock)
|
||||
c.declareNecessaryVariables()
|
||||
|
||||
c.lowerBody(entryBlock)
|
||||
}
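// Illustrative example (not part of the vendored source): for a Wasm function of type (i32) -> i32,
// the entry block set up by LowerToSSA formats roughly as
//
//	blk0: (exec_ctx:i64, module_ctx:i64, v2:i32)
//
// i.e. the two context pointers always appear as the first two block parameters, followed by the
// Wasm-level parameters. (The exact value numbering here is an assumption for illustration only.)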
|
||||
|
||||
// localVariable returns the SSA variable for the given Wasm local index.
|
||||
func (c *Compiler) localVariable(index wasm.Index) ssa.Variable {
|
||||
return c.wasmLocalToVariable[index]
|
||||
}
|
||||
|
||||
func (c *Compiler) setWasmLocalVariable(index wasm.Index, variable ssa.Variable) {
|
||||
idx := int(index)
|
||||
if idx >= len(c.wasmLocalToVariable) {
|
||||
c.wasmLocalToVariable = append(c.wasmLocalToVariable, make([]ssa.Variable, idx+1-len(c.wasmLocalToVariable))...)
|
||||
}
|
||||
c.wasmLocalToVariable[idx] = variable
|
||||
}
|
||||
|
||||
// declareWasmLocals declares the SSA variables for the Wasm locals.
|
||||
func (c *Compiler) declareWasmLocals(entry ssa.BasicBlock) {
|
||||
localCount := wasm.Index(len(c.wasmFunctionTyp.Params))
|
||||
for i, typ := range c.wasmFunctionLocalTypes {
|
||||
st := WasmTypeToSSAType(typ)
|
||||
variable := c.ssaBuilder.DeclareVariable(st)
|
||||
c.setWasmLocalVariable(wasm.Index(i)+localCount, variable)
|
||||
|
||||
zeroInst := c.ssaBuilder.AllocateInstruction()
|
||||
switch st {
|
||||
case ssa.TypeI32:
|
||||
zeroInst.AsIconst32(0)
|
||||
case ssa.TypeI64:
|
||||
zeroInst.AsIconst64(0)
|
||||
case ssa.TypeF32:
|
||||
zeroInst.AsF32const(0)
|
||||
case ssa.TypeF64:
|
||||
zeroInst.AsF64const(0)
|
||||
case ssa.TypeV128:
|
||||
zeroInst.AsVconst(0, 0)
|
||||
default:
|
||||
panic("TODO: " + wasm.ValueTypeName(typ))
|
||||
}
|
||||
|
||||
c.ssaBuilder.InsertInstruction(zeroInst)
|
||||
value := zeroInst.Return()
|
||||
c.ssaBuilder.DefineVariable(variable, value, entry)
|
||||
}
|
||||
}
|
||||
|
||||
func (c *Compiler) declareNecessaryVariables() {
|
||||
if c.needMemory = c.m.MemorySection != nil; c.needMemory {
|
||||
c.memoryShared = c.m.MemorySection.IsShared
|
||||
} else if c.needMemory = c.m.ImportMemoryCount > 0; c.needMemory {
|
||||
for _, imp := range c.m.ImportSection {
|
||||
if imp.Type == wasm.ExternTypeMemory {
|
||||
c.memoryShared = imp.DescMem.IsShared
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if c.needMemory {
|
||||
c.memoryBaseVariable = c.ssaBuilder.DeclareVariable(ssa.TypeI64)
|
||||
c.memoryLenVariable = c.ssaBuilder.DeclareVariable(ssa.TypeI64)
|
||||
}
|
||||
|
||||
c.globalVariables = c.globalVariables[:0]
|
||||
c.mutableGlobalVariablesIndexes = c.mutableGlobalVariablesIndexes[:0]
|
||||
c.globalVariablesTypes = c.globalVariablesTypes[:0]
|
||||
for _, imp := range c.m.ImportSection {
|
||||
if imp.Type == wasm.ExternTypeGlobal {
|
||||
desc := imp.DescGlobal
|
||||
c.declareWasmGlobal(desc.ValType, desc.Mutable)
|
||||
}
|
||||
}
|
||||
for _, g := range c.m.GlobalSection {
|
||||
desc := g.Type
|
||||
c.declareWasmGlobal(desc.ValType, desc.Mutable)
|
||||
}
|
||||
|
||||
// TODO: add tables.
|
||||
}
|
||||
|
||||
func (c *Compiler) declareWasmGlobal(typ wasm.ValueType, mutable bool) {
|
||||
var st ssa.Type
|
||||
switch typ {
|
||||
case wasm.ValueTypeI32:
|
||||
st = ssa.TypeI32
|
||||
case wasm.ValueTypeI64,
|
||||
// Both externref and funcref are represented as I64 since we only support 64-bit platforms.
|
||||
wasm.ValueTypeExternref, wasm.ValueTypeFuncref:
|
||||
st = ssa.TypeI64
|
||||
case wasm.ValueTypeF32:
|
||||
st = ssa.TypeF32
|
||||
case wasm.ValueTypeF64:
|
||||
st = ssa.TypeF64
|
||||
case wasm.ValueTypeV128:
|
||||
st = ssa.TypeV128
|
||||
default:
|
||||
panic("TODO: " + wasm.ValueTypeName(typ))
|
||||
}
|
||||
v := c.ssaBuilder.DeclareVariable(st)
|
||||
index := wasm.Index(len(c.globalVariables))
|
||||
c.globalVariables = append(c.globalVariables, v)
|
||||
c.globalVariablesTypes = append(c.globalVariablesTypes, st)
|
||||
if mutable {
|
||||
c.mutableGlobalVariablesIndexes = append(c.mutableGlobalVariablesIndexes, index)
|
||||
}
|
||||
}
|
||||
|
||||
// WasmTypeToSSAType converts wasm.ValueType to ssa.Type.
|
||||
func WasmTypeToSSAType(vt wasm.ValueType) ssa.Type {
|
||||
switch vt {
|
||||
case wasm.ValueTypeI32:
|
||||
return ssa.TypeI32
|
||||
case wasm.ValueTypeI64,
|
||||
// Both externref and funcref are represented as I64 since we only support 64-bit platforms.
|
||||
wasm.ValueTypeExternref, wasm.ValueTypeFuncref:
|
||||
return ssa.TypeI64
|
||||
case wasm.ValueTypeF32:
|
||||
return ssa.TypeF32
|
||||
case wasm.ValueTypeF64:
|
||||
return ssa.TypeF64
|
||||
case wasm.ValueTypeV128:
|
||||
return ssa.TypeV128
|
||||
default:
|
||||
panic("TODO: " + wasm.ValueTypeName(vt))
|
||||
}
|
||||
}
|
||||
|
||||
// addBlockParamsFromWasmTypes adds the block parameters to the given block.
|
||||
func (c *Compiler) addBlockParamsFromWasmTypes(tps []wasm.ValueType, blk ssa.BasicBlock) {
|
||||
for _, typ := range tps {
|
||||
st := WasmTypeToSSAType(typ)
|
||||
blk.AddParam(c.ssaBuilder, st)
|
||||
}
|
||||
}
|
||||
|
||||
// formatBuilder outputs the constructed SSA function as a string with source information.
|
||||
func (c *Compiler) formatBuilder() string {
|
||||
return c.ssaBuilder.Format()
|
||||
}
|
||||
|
||||
// SignatureForListener returns the signatures for the listener functions.
|
||||
func SignatureForListener(wasmSig *wasm.FunctionType) (*ssa.Signature, *ssa.Signature) {
|
||||
beforeSig := &ssa.Signature{}
|
||||
beforeSig.Params = make([]ssa.Type, len(wasmSig.Params)+2)
|
||||
beforeSig.Params[0] = ssa.TypeI64 // Execution context.
|
||||
beforeSig.Params[1] = ssa.TypeI32 // Function index.
|
||||
for i, p := range wasmSig.Params {
|
||||
beforeSig.Params[i+2] = WasmTypeToSSAType(p)
|
||||
}
|
||||
afterSig := &ssa.Signature{}
|
||||
afterSig.Params = make([]ssa.Type, len(wasmSig.Results)+2)
|
||||
afterSig.Params[0] = ssa.TypeI64 // Execution context.
|
||||
afterSig.Params[1] = ssa.TypeI32 // Function index.
|
||||
for i, p := range wasmSig.Results {
|
||||
afterSig.Params[i+2] = WasmTypeToSSAType(p)
|
||||
}
|
||||
return beforeSig, afterSig
|
||||
}
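// Illustrative example (not part of the vendored source): for a Wasm type (i32) -> (i64, f32),
// SignatureForListener yields
//
//	before: Params = [I64 /* exec ctx */, I32 /* func index */, I32]      // the Wasm params
//	after:  Params = [I64 /* exec ctx */, I32 /* func index */, I64, F32] // the Wasm results
//
// Neither listener signature declares SSA results; listeners are called for their side effects only.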
|
||||
|
||||
// getKnownSafeBound returns the knownSafeBound entry recorded for the given value, or nil if no entry has been allocated for it.
|
||||
func (c *Compiler) getKnownSafeBound(v ssa.ValueID) *knownSafeBound {
|
||||
if int(v) >= len(c.knownSafeBounds) {
|
||||
return nil
|
||||
}
|
||||
return &c.knownSafeBounds[v]
|
||||
}
|
||||
|
||||
// recordKnownSafeBound records the given safe bound for the given value.
|
||||
func (c *Compiler) recordKnownSafeBound(v ssa.ValueID, safeBound uint64, absoluteAddr ssa.Value) {
|
||||
if int(v) >= len(c.knownSafeBounds) {
|
||||
c.knownSafeBounds = append(c.knownSafeBounds, make([]knownSafeBound, v+1)...)
|
||||
}
|
||||
|
||||
if existing := c.knownSafeBounds[v]; existing.bound == 0 {
|
||||
c.knownSafeBounds[v] = knownSafeBound{
|
||||
bound: safeBound,
|
||||
absoluteAddr: absoluteAddr,
|
||||
}
|
||||
c.knownSafeBoundsSet = append(c.knownSafeBoundsSet, v)
|
||||
} else if safeBound > existing.bound {
|
||||
c.knownSafeBounds[v].bound = safeBound
|
||||
}
|
||||
}
|
||||
|
||||
// clearSafeBounds clears the known safe bounds.
|
||||
func (c *Compiler) clearSafeBounds() {
|
||||
for _, v := range c.knownSafeBoundsSet {
|
||||
ptr := &c.knownSafeBounds[v]
|
||||
ptr.bound = 0
|
||||
ptr.absoluteAddr = ssa.ValueInvalid
|
||||
}
|
||||
c.knownSafeBoundsSet = c.knownSafeBoundsSet[:0]
|
||||
}
|
||||
|
||||
// resetAbsoluteAddressInSafeBounds resets the absolute addresses recorded in the known safe bounds.
|
||||
func (c *Compiler) resetAbsoluteAddressInSafeBounds() {
|
||||
for _, v := range c.knownSafeBoundsSet {
|
||||
ptr := &c.knownSafeBounds[v]
|
||||
ptr.absoluteAddr = ssa.ValueInvalid
|
||||
}
|
||||
}
|
||||
|
||||
func (k *knownSafeBound) valid() bool {
|
||||
return k != nil && k.bound > 0
|
||||
}
|
||||
|
||||
func (c *Compiler) allocateVarLengthValues(_cap int, vs ...ssa.Value) ssa.Values {
|
||||
builder := c.ssaBuilder
|
||||
pool := builder.VarLengthPool()
|
||||
args := pool.Allocate(_cap)
|
||||
args = args.Append(builder.VarLengthPool(), vs...)
|
||||
return args
|
||||
}
|
||||
|
||||
func (c *Compiler) finalizeKnownSafeBoundsAtTheEndOfBlock(bID ssa.BasicBlockID) {
|
||||
_bID := int(bID)
|
||||
if l := len(c.knownSafeBoundsAtTheEndOfBlocks); _bID >= l {
|
||||
c.knownSafeBoundsAtTheEndOfBlocks = append(c.knownSafeBoundsAtTheEndOfBlocks,
|
||||
make([]knownSafeBoundsAtTheEndOfBlock, _bID+1-len(c.knownSafeBoundsAtTheEndOfBlocks))...)
|
||||
for i := l; i < len(c.knownSafeBoundsAtTheEndOfBlocks); i++ {
|
||||
c.knownSafeBoundsAtTheEndOfBlocks[i] = knownSafeBoundsAtTheEndOfBlockNil
|
||||
}
|
||||
}
|
||||
p := &c.varLengthKnownSafeBoundWithIDPool
|
||||
size := len(c.knownSafeBoundsSet)
|
||||
allocated := c.varLengthKnownSafeBoundWithIDPool.Allocate(size)
|
||||
// Sort the known safe bounds by the value ID so that we can use the intersection algorithm in initializeCurrentBlockKnownBounds.
|
||||
sortSSAValueIDs(c.knownSafeBoundsSet)
|
||||
for _, vID := range c.knownSafeBoundsSet {
|
||||
kb := c.knownSafeBounds[vID]
|
||||
allocated = allocated.Append(p, knownSafeBoundWithID{
|
||||
knownSafeBound: kb,
|
||||
id: vID,
|
||||
})
|
||||
}
|
||||
c.knownSafeBoundsAtTheEndOfBlocks[bID] = allocated
|
||||
c.clearSafeBounds()
|
||||
}
|
||||
|
||||
func (c *Compiler) initializeCurrentBlockKnownBounds() {
|
||||
currentBlk := c.ssaBuilder.CurrentBlock()
|
||||
switch preds := currentBlk.Preds(); preds {
|
||||
case 0:
|
||||
case 1:
|
||||
pred := currentBlk.Pred(0).ID()
|
||||
for _, kb := range c.getKnownSafeBoundsAtTheEndOfBlocks(pred).View() {
|
||||
// Unless the block is sealed, we cannot assume the absolute address is valid:
|
||||
// later we might add another predecessor that has no visibility of that value.
|
||||
addr := ssa.ValueInvalid
|
||||
if currentBlk.Sealed() {
|
||||
addr = kb.absoluteAddr
|
||||
}
|
||||
c.recordKnownSafeBound(kb.id, kb.bound, addr)
|
||||
}
|
||||
default:
|
||||
c.pointers = c.pointers[:0]
|
||||
c.bounds = c.bounds[:0]
|
||||
for i := 0; i < preds; i++ {
|
||||
c.bounds = append(c.bounds, c.getKnownSafeBoundsAtTheEndOfBlocks(currentBlk.Pred(i).ID()).View())
|
||||
c.pointers = append(c.pointers, 0)
|
||||
}
|
||||
|
||||
// If there are multiple predecessors, we need to find the intersection of the known safe bounds.
|
||||
|
||||
outer:
|
||||
for {
|
||||
smallestID := ssa.ValueID(math.MaxUint32)
|
||||
for i, ptr := range c.pointers {
|
||||
if ptr >= len(c.bounds[i]) {
|
||||
break outer
|
||||
}
|
||||
cb := &c.bounds[i][ptr]
|
||||
if id := cb.id; id < smallestID {
|
||||
smallestID = cb.id
|
||||
}
|
||||
}
|
||||
|
||||
// Check if current elements are the same across all lists.
|
||||
same := true
|
||||
minBound := uint64(math.MaxUint64)
|
||||
for i := 0; i < preds; i++ {
|
||||
cb := &c.bounds[i][c.pointers[i]]
|
||||
if cb.id != smallestID {
|
||||
same = false
|
||||
break
|
||||
} else {
|
||||
if cb.bound < minBound {
|
||||
minBound = cb.bound
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if same { // All elements are the same.
|
||||
// Absolute address cannot be used in the intersection since the value might be only defined in one of the predecessors.
|
||||
c.recordKnownSafeBound(smallestID, minBound, ssa.ValueInvalid)
|
||||
}
|
||||
|
||||
// Move pointer(s) for the smallest ID forward (if same, move all).
|
||||
for i := 0; i < preds; i++ {
|
||||
cb := &c.bounds[i][c.pointers[i]]
|
||||
if cb.id == smallestID {
|
||||
c.pointers[i]++
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
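// Minimal standalone sketch (not part of the vendored source) of the k-way intersection used in
// the default case above: every per-predecessor list is sorted by ssa.ValueID, so an ID survives
// only when all cursors point at it simultaneously, and the surviving bound is the minimum across
// the lists. exampleIntersectSortedBounds is a hypothetical helper shown purely for illustration.
func exampleIntersectSortedBounds(lists [][]knownSafeBoundWithID) []knownSafeBoundWithID {
	ptrs := make([]int, len(lists))
	var out []knownSafeBoundWithID
	for {
		smallest := ssa.ValueID(math.MaxUint32)
		for i, p := range ptrs {
			if p >= len(lists[i]) {
				return out // Any exhausted list ends the intersection.
			}
			if id := lists[i][p].id; id < smallest {
				smallest = id
			}
		}
		same, minBound := true, uint64(math.MaxUint64)
		for i, p := range ptrs {
			if lists[i][p].id != smallest {
				same = false
			} else if b := lists[i][p].bound; b < minBound {
				minBound = b
			}
		}
		if same {
			// Absolute addresses are dropped, mirroring the use of ssa.ValueInvalid above.
			out = append(out, knownSafeBoundWithID{
				knownSafeBound: knownSafeBound{bound: minBound, absoluteAddr: ssa.ValueInvalid},
				id:             smallest,
			})
		}
		// Advance every cursor currently sitting on the smallest ID.
		for i, p := range ptrs {
			if lists[i][p].id == smallest {
				ptrs[i] = p + 1
			}
		}
	}
}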
|
||||
|
||||
func (c *Compiler) getKnownSafeBoundsAtTheEndOfBlocks(id ssa.BasicBlockID) knownSafeBoundsAtTheEndOfBlock {
|
||||
if int(id) >= len(c.knownSafeBoundsAtTheEndOfBlocks) {
|
||||
return knownSafeBoundsAtTheEndOfBlockNil
|
||||
}
|
||||
return c.knownSafeBoundsAtTheEndOfBlocks[id]
|
||||
}
|
||||
4268
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/lower.go
generated
vendored
Normal file
File diff suppressed because it is too large
10
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/misc.go
generated
vendored
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
package frontend
|
||||
|
||||
import (
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
|
||||
"github.com/tetratelabs/wazero/internal/wasm"
|
||||
)
|
||||
|
||||
func FunctionIndexToFuncRef(idx wasm.Index) ssa.FuncRef {
|
||||
return ssa.FuncRef(idx)
|
||||
}
|
||||
15
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/sort_id.go
generated
vendored
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
//go:build go1.21
|
||||
|
||||
package frontend
|
||||
|
||||
import (
|
||||
"slices"
|
||||
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
|
||||
)
|
||||
|
||||
func sortSSAValueIDs(IDs []ssa.ValueID) {
|
||||
slices.SortFunc(IDs, func(i, j ssa.ValueID) int {
|
||||
return int(i) - int(j)
|
||||
})
|
||||
}
|
||||
17
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/sort_id_old.go
generated
vendored
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
//go:build !go1.21
|
||||
|
||||
// TODO: delete after the floor Go version is 1.21
|
||||
|
||||
package frontend
|
||||
|
||||
import (
|
||||
"sort"
|
||||
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
|
||||
)
|
||||
|
||||
func sortSSAValueIDs(IDs []ssa.ValueID) {
|
||||
sort.SliceStable(IDs, func(i, j int) bool {
|
||||
return int(IDs[i]) < int(IDs[j])
|
||||
})
|
||||
}
|
||||
82
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/hostmodule.go
generated
vendored
Normal file
|
|
@ -0,0 +1,82 @@
|
|||
package wazevo
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"reflect"
|
||||
"unsafe"
|
||||
|
||||
"github.com/tetratelabs/wazero/experimental"
|
||||
"github.com/tetratelabs/wazero/internal/wasm"
|
||||
)
|
||||
|
||||
func buildHostModuleOpaque(m *wasm.Module, listeners []experimental.FunctionListener) moduleContextOpaque {
|
||||
size := len(m.CodeSection)*16 + 32
|
||||
ret := newAlignedOpaque(size)
|
||||
|
||||
binary.LittleEndian.PutUint64(ret[0:], uint64(uintptr(unsafe.Pointer(m))))
|
||||
|
||||
if len(listeners) > 0 {
|
||||
sliceHeader := (*reflect.SliceHeader)(unsafe.Pointer(&listeners))
|
||||
binary.LittleEndian.PutUint64(ret[8:], uint64(sliceHeader.Data))
|
||||
binary.LittleEndian.PutUint64(ret[16:], uint64(sliceHeader.Len))
|
||||
binary.LittleEndian.PutUint64(ret[24:], uint64(sliceHeader.Cap))
|
||||
}
|
||||
|
||||
offset := 32
|
||||
for i := range m.CodeSection {
|
||||
goFn := m.CodeSection[i].GoFunc
|
||||
writeIface(goFn, ret[offset:])
|
||||
offset += 16
|
||||
}
|
||||
return ret
|
||||
}
|
||||
|
||||
func hostModuleFromOpaque(opaqueBegin uintptr) *wasm.Module {
|
||||
var opaqueViewOverSlice []byte
|
||||
sh := (*reflect.SliceHeader)(unsafe.Pointer(&opaqueViewOverSlice))
|
||||
sh.Data = opaqueBegin
|
||||
sh.Len = 32
|
||||
sh.Cap = 32
|
||||
return *(**wasm.Module)(unsafe.Pointer(&opaqueViewOverSlice[0]))
|
||||
}
|
||||
|
||||
func hostModuleListenersSliceFromOpaque(opaqueBegin uintptr) []experimental.FunctionListener {
|
||||
var opaqueViewOverSlice []byte
|
||||
sh := (*reflect.SliceHeader)(unsafe.Pointer(&opaqueViewOverSlice))
|
||||
sh.Data = opaqueBegin
|
||||
sh.Len = 32
|
||||
sh.Cap = 32
|
||||
|
||||
b := binary.LittleEndian.Uint64(opaqueViewOverSlice[8:])
|
||||
l := binary.LittleEndian.Uint64(opaqueViewOverSlice[16:])
|
||||
c := binary.LittleEndian.Uint64(opaqueViewOverSlice[24:])
|
||||
var ret []experimental.FunctionListener
|
||||
sh = (*reflect.SliceHeader)(unsafe.Pointer(&ret))
|
||||
sh.Data = uintptr(b)
|
||||
setSliceLimits(sh, uintptr(l), uintptr(c))
|
||||
return ret
|
||||
}
|
||||
|
||||
func hostModuleGoFuncFromOpaque[T any](index int, opaqueBegin uintptr) T {
|
||||
offset := uintptr(index*16) + 32
|
||||
ptr := opaqueBegin + offset
|
||||
|
||||
var opaqueViewOverFunction []byte
|
||||
sh := (*reflect.SliceHeader)(unsafe.Pointer(&opaqueViewOverFunction))
|
||||
sh.Data = ptr
|
||||
sh.Len = 16
|
||||
sh.Cap = 16
|
||||
return readIface(opaqueViewOverFunction).(T)
|
||||
}
|
||||
|
||||
func writeIface(goFn interface{}, buf []byte) {
|
||||
goFnIface := *(*[2]uint64)(unsafe.Pointer(&goFn))
|
||||
binary.LittleEndian.PutUint64(buf, goFnIface[0])
|
||||
binary.LittleEndian.PutUint64(buf[8:], goFnIface[1])
|
||||
}
|
||||
|
||||
func readIface(buf []byte) interface{} {
|
||||
b := binary.LittleEndian.Uint64(buf)
|
||||
s := binary.LittleEndian.Uint64(buf[8:])
|
||||
return *(*interface{})(unsafe.Pointer(&[2]uint64{b, s}))
|
||||
}
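// Hedged note (not part of the vendored source): writeIface/readIface above round-trip a Go
// interface value by copying its two-word header (type word, data word) into 16 bytes, e.g.:
//
//	buf := make([]byte, 16)
//	var fn interface{} = func(int) int { return 42 }
//	writeIface(fn, buf)
//	restored := readIface(buf).(func(int) int) // same function value as fn
//
// This relies on the current Go runtime representation of interfaces and is only safe because the
// original values are kept reachable via the host module's CodeSection.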
|
||||
30
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/isa_amd64.go
generated
vendored
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
//go:build amd64
|
||||
|
||||
package wazevo
|
||||
|
||||
import (
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64"
|
||||
)
|
||||
|
||||
func newMachine() backend.Machine {
|
||||
return amd64.NewBackend()
|
||||
}
|
||||
|
||||
// unwindStack unwinds the stack, appending return addresses to the `returnAddresses` slice.
|
||||
// The implementation must be aligned with the ABI/Calling convention.
|
||||
func unwindStack(sp, fp, top uintptr, returnAddresses []uintptr) []uintptr {
|
||||
return amd64.UnwindStack(sp, fp, top, returnAddresses)
|
||||
}
|
||||
|
||||
// goCallStackView is a function to get a view of the stack before a Go call, which
|
||||
// is the view of the stack allocated in CompileGoFunctionTrampoline.
|
||||
func goCallStackView(stackPointerBeforeGoCall *uint64) []uint64 {
|
||||
return amd64.GoCallStackView(stackPointerBeforeGoCall)
|
||||
}
|
||||
|
||||
// adjustClonedStack is a function to adjust the stack after it is grown.
|
||||
// More precisely, absolute addresses (frame pointers) in the stack must be adjusted.
|
||||
func adjustClonedStack(oldsp, oldTop, sp, fp, top uintptr) {
|
||||
amd64.AdjustClonedStack(oldsp, oldTop, sp, fp, top)
|
||||
}
|
||||
32
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/isa_arm64.go
generated
vendored
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
//go:build arm64
|
||||
|
||||
package wazevo
|
||||
|
||||
import (
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64"
|
||||
)
|
||||
|
||||
func newMachine() backend.Machine {
|
||||
return arm64.NewBackend()
|
||||
}
|
||||
|
||||
// unwindStack unwinds the stack, appending return addresses to the `returnAddresses` slice.
|
||||
// The implementation must be aligned with the ABI/Calling convention.
|
||||
func unwindStack(sp, fp, top uintptr, returnAddresses []uintptr) []uintptr {
|
||||
return arm64.UnwindStack(sp, fp, top, returnAddresses)
|
||||
}
|
||||
|
||||
// goCallStackView is a function to get a view of the stack before a Go call, which
|
||||
// is the view of the stack allocated in CompileGoFunctionTrampoline.
|
||||
func goCallStackView(stackPointerBeforeGoCall *uint64) []uint64 {
|
||||
return arm64.GoCallStackView(stackPointerBeforeGoCall)
|
||||
}
|
||||
|
||||
// adjustClonedStack is a function to adjust the stack after it is grown.
|
||||
// More precisely, absolute addresses (frame pointers) in the stack must be adjusted.
|
||||
func adjustClonedStack(oldsp, oldTop, sp, fp, top uintptr) {
|
||||
// TODO: currently, the frame pointers are not used, and saved old sps are relative to the current stack pointer,
|
||||
// so no adjustment is needed on arm64. However, when we make it absolute, which in my opinion is better perf-wise
|
||||
// at the expense of slightly costly stack growth, we need to adjust the pushed frame pointers.
|
||||
}
|
||||
29
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/isa_other.go
generated
vendored
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
//go:build !(amd64 || arm64)
|
||||
|
||||
package wazevo
|
||||
|
||||
import (
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
|
||||
)
|
||||
|
||||
func newMachine() backend.Machine {
|
||||
panic("unsupported architecture")
|
||||
}
|
||||
|
||||
// unwindStack unwinds the stack, appending return addresses to the `returnAddresses` slice.
|
||||
// The implementation must be aligned with the ABI/Calling convention.
|
||||
func unwindStack(sp, fp, top uintptr, returnAddresses []uintptr) []uintptr {
|
||||
panic("unsupported architecture")
|
||||
}
|
||||
|
||||
// goCallStackView is a function to get a view of the stack before a Go call, which
|
||||
// is the view of the stack allocated in CompileGoFunctionTrampoline.
|
||||
func goCallStackView(stackPointerBeforeGoCall *uint64) []uint64 {
|
||||
panic("unsupported architecture")
|
||||
}
|
||||
|
||||
// adjustClonedStack is a function to adjust the stack after it is grown.
|
||||
// More precisely, absolute addresses (frame pointers) in the stack must be adjusted.
|
||||
func adjustClonedStack(oldsp, oldTop, sp, fp, top uintptr) {
|
||||
panic("unsupported architecture")
|
||||
}
|
||||
11
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/memmove.go
generated
vendored
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
package wazevo
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
//go:linkname memmove runtime.memmove
|
||||
func memmove(_, _ unsafe.Pointer, _ uintptr)
|
||||
|
||||
var memmovPtr = reflect.ValueOf(memmove).Pointer()
|
||||
344
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/module_engine.go
generated
vendored
Normal file
|
|
@ -0,0 +1,344 @@
|
|||
package wazevo
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"unsafe"
|
||||
|
||||
"github.com/tetratelabs/wazero/api"
|
||||
"github.com/tetratelabs/wazero/experimental"
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
|
||||
"github.com/tetratelabs/wazero/internal/wasm"
|
||||
"github.com/tetratelabs/wazero/internal/wasmruntime"
|
||||
)
|
||||
|
||||
type (
|
||||
// moduleEngine implements wasm.ModuleEngine.
|
||||
moduleEngine struct {
|
||||
// opaquePtr equals &opaque[0].
|
||||
opaquePtr *byte
|
||||
parent *compiledModule
|
||||
module *wasm.ModuleInstance
|
||||
opaque moduleContextOpaque
|
||||
localFunctionInstances []*functionInstance
|
||||
importedFunctions []importedFunction
|
||||
listeners []experimental.FunctionListener
|
||||
}
|
||||
|
||||
functionInstance struct {
|
||||
executable *byte
|
||||
moduleContextOpaquePtr *byte
|
||||
typeID wasm.FunctionTypeID
|
||||
indexInModule wasm.Index
|
||||
}
|
||||
|
||||
importedFunction struct {
|
||||
me *moduleEngine
|
||||
indexInModule wasm.Index
|
||||
}
|
||||
|
||||
// moduleContextOpaque is the opaque byte slice of module-instance-specific contents whose size
|
||||
// is only known at Wasm compile time, hence dynamic. Its contents are basically pointers to
|
||||
// module-instance-specific objects as well as functions. This is sometimes called "VMContext" in other Wasm runtimes.
|
||||
//
|
||||
// Internally, the buffer is structured as follows:
|
||||
//
|
||||
// type moduleContextOpaque struct {
|
||||
// moduleInstance *wasm.ModuleInstance
|
||||
// localMemoryBufferPtr *byte (optional)
|
||||
// localMemoryLength uint64 (optional)
|
||||
// importedMemoryInstance *wasm.MemoryInstance (optional)
|
||||
// importedMemoryOwnerOpaqueCtx *byte (optional)
|
||||
// importedFunctions [# of importedFunctions]functionInstance
|
||||
// importedGlobals []ImportedGlobal (optional)
|
||||
// localGlobals []Global (optional)
|
||||
// typeIDsBegin &wasm.ModuleInstance.TypeIDs[0] (optional)
|
||||
// tables []*wasm.TableInstance (optional)
|
||||
// beforeListenerTrampolines1stElement **byte (optional)
|
||||
// afterListenerTrampolines1stElement **byte (optional)
|
||||
// dataInstances1stElement []wasm.DataInstance (optional)
|
||||
// elementInstances1stElement []wasm.ElementInstance (optional)
|
||||
// }
|
||||
//
|
||||
// type ImportedGlobal struct {
|
||||
// *Global
|
||||
// _ uint64 // padding
|
||||
// }
|
||||
//
|
||||
// type Global struct {
|
||||
// Val, ValHi uint64
|
||||
// }
|
||||
//
|
||||
// See wazevoapi.NewModuleContextOffsetData for the details of the offsets.
|
||||
//
|
||||
// Note that for host modules, the structure is entirely different. See buildHostModuleOpaque.
|
||||
moduleContextOpaque []byte
|
||||
)
|
||||
|
||||
func newAlignedOpaque(size int) moduleContextOpaque {
|
||||
// Check if the size is a multiple of 16.
|
||||
if size%16 != 0 {
|
||||
panic("size must be a multiple of 16")
|
||||
}
|
||||
buf := make([]byte, size+16)
|
||||
// Align the buffer to 16 bytes.
|
||||
rem := uintptr(unsafe.Pointer(&buf[0])) % 16
|
||||
buf = buf[16-rem:]
|
||||
return buf
|
||||
}
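// Illustrative note (not part of the vendored source): the over-allocate-then-reslice trick above
// guarantees 16-byte alignment of &buf[0]. For example, if the backing array happens to start at an
// address with remainder 6 modulo 16, the slice is advanced by 16-6 = 10 bytes, so the returned
// buffer starts on a 16-byte boundary and still has at least `size` usable bytes.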
|
||||
|
||||
func putLocalMemory(opaque []byte, offset wazevoapi.Offset, mem *wasm.MemoryInstance) {
|
||||
s := uint64(len(mem.Buffer))
|
||||
var b uint64
|
||||
if len(mem.Buffer) > 0 {
|
||||
b = uint64(uintptr(unsafe.Pointer(&mem.Buffer[0])))
|
||||
}
|
||||
binary.LittleEndian.PutUint64(opaque[offset:], b)
|
||||
binary.LittleEndian.PutUint64(opaque[offset+8:], s)
|
||||
}
|
||||
|
||||
func (m *moduleEngine) setupOpaque() {
|
||||
inst := m.module
|
||||
offsets := &m.parent.offsets
|
||||
opaque := m.opaque
|
||||
|
||||
binary.LittleEndian.PutUint64(opaque[offsets.ModuleInstanceOffset:],
|
||||
uint64(uintptr(unsafe.Pointer(m.module))),
|
||||
)
|
||||
|
||||
if lm := offsets.LocalMemoryBegin; lm >= 0 {
|
||||
putLocalMemory(opaque, lm, inst.MemoryInstance)
|
||||
}
|
||||
|
||||
// Note: imported memory is resolved in ResolveImportedFunction.
|
||||
|
||||
// Note: imported functions are resolved in ResolveImportedFunction.
|
||||
|
||||
if globalOffset := offsets.GlobalsBegin; globalOffset >= 0 {
|
||||
for i, g := range inst.Globals {
|
||||
if i < int(inst.Source.ImportGlobalCount) {
|
||||
importedME := g.Me.(*moduleEngine)
|
||||
offset := importedME.parent.offsets.GlobalInstanceOffset(g.Index)
|
||||
importedMEOpaque := importedME.opaque
|
||||
binary.LittleEndian.PutUint64(opaque[globalOffset:],
|
||||
uint64(uintptr(unsafe.Pointer(&importedMEOpaque[offset]))))
|
||||
} else {
|
||||
binary.LittleEndian.PutUint64(opaque[globalOffset:], g.Val)
|
||||
binary.LittleEndian.PutUint64(opaque[globalOffset+8:], g.ValHi)
|
||||
}
|
||||
globalOffset += 16
|
||||
}
|
||||
}
|
||||
|
||||
if tableOffset := offsets.TablesBegin; tableOffset >= 0 {
|
||||
// First we write the first element's address of typeIDs.
|
||||
if len(inst.TypeIDs) > 0 {
|
||||
binary.LittleEndian.PutUint64(opaque[offsets.TypeIDs1stElement:], uint64(uintptr(unsafe.Pointer(&inst.TypeIDs[0]))))
|
||||
}
|
||||
|
||||
// Then we write the table addresses.
|
||||
for _, table := range inst.Tables {
|
||||
binary.LittleEndian.PutUint64(opaque[tableOffset:], uint64(uintptr(unsafe.Pointer(table))))
|
||||
tableOffset += 8
|
||||
}
|
||||
}
|
||||
|
||||
if beforeListenerOffset := offsets.BeforeListenerTrampolines1stElement; beforeListenerOffset >= 0 {
|
||||
binary.LittleEndian.PutUint64(opaque[beforeListenerOffset:], uint64(uintptr(unsafe.Pointer(&m.parent.listenerBeforeTrampolines[0]))))
|
||||
}
|
||||
if afterListenerOffset := offsets.AfterListenerTrampolines1stElement; afterListenerOffset >= 0 {
|
||||
binary.LittleEndian.PutUint64(opaque[afterListenerOffset:], uint64(uintptr(unsafe.Pointer(&m.parent.listenerAfterTrampolines[0]))))
|
||||
}
|
||||
if len(inst.DataInstances) > 0 {
|
||||
binary.LittleEndian.PutUint64(opaque[offsets.DataInstances1stElement:], uint64(uintptr(unsafe.Pointer(&inst.DataInstances[0]))))
|
||||
}
|
||||
if len(inst.ElementInstances) > 0 {
|
||||
binary.LittleEndian.PutUint64(opaque[offsets.ElementInstances1stElement:], uint64(uintptr(unsafe.Pointer(&inst.ElementInstances[0]))))
|
||||
}
|
||||
}
|
||||
|
||||
// NewFunction implements wasm.ModuleEngine.
|
||||
func (m *moduleEngine) NewFunction(index wasm.Index) api.Function {
|
||||
if wazevoapi.PrintMachineCodeHexPerFunctionDisassemblable {
|
||||
panic("When PrintMachineCodeHexPerFunctionDisassemblable enabled, functions must not be called")
|
||||
}
|
||||
|
||||
localIndex := index
|
||||
if importedFnCount := m.module.Source.ImportFunctionCount; index < importedFnCount {
|
||||
imported := &m.importedFunctions[index]
|
||||
return imported.me.NewFunction(imported.indexInModule)
|
||||
} else {
|
||||
localIndex -= importedFnCount
|
||||
}
|
||||
|
||||
src := m.module.Source
|
||||
typIndex := src.FunctionSection[localIndex]
|
||||
typ := src.TypeSection[typIndex]
|
||||
sizeOfParamResultSlice := typ.ResultNumInUint64
|
||||
if ps := typ.ParamNumInUint64; ps > sizeOfParamResultSlice {
|
||||
sizeOfParamResultSlice = ps
|
||||
}
|
||||
p := m.parent
|
||||
offset := p.functionOffsets[localIndex]
|
||||
|
||||
ce := &callEngine{
|
||||
indexInModule: index,
|
||||
executable: &p.executable[offset],
|
||||
parent: m,
|
||||
preambleExecutable: &m.parent.entryPreambles[typIndex][0],
|
||||
sizeOfParamResultSlice: sizeOfParamResultSlice,
|
||||
requiredParams: typ.ParamNumInUint64,
|
||||
numberOfResults: typ.ResultNumInUint64,
|
||||
}
|
||||
|
||||
ce.execCtx.memoryGrowTrampolineAddress = &m.parent.sharedFunctions.memoryGrowExecutable[0]
|
||||
ce.execCtx.stackGrowCallTrampolineAddress = &m.parent.sharedFunctions.stackGrowExecutable[0]
|
||||
ce.execCtx.checkModuleExitCodeTrampolineAddress = &m.parent.sharedFunctions.checkModuleExitCode[0]
|
||||
ce.execCtx.tableGrowTrampolineAddress = &m.parent.sharedFunctions.tableGrowExecutable[0]
|
||||
ce.execCtx.refFuncTrampolineAddress = &m.parent.sharedFunctions.refFuncExecutable[0]
|
||||
ce.execCtx.memoryWait32TrampolineAddress = &m.parent.sharedFunctions.memoryWait32Executable[0]
|
||||
ce.execCtx.memoryWait64TrampolineAddress = &m.parent.sharedFunctions.memoryWait64Executable[0]
|
||||
ce.execCtx.memoryNotifyTrampolineAddress = &m.parent.sharedFunctions.memoryNotifyExecutable[0]
|
||||
ce.execCtx.memmoveAddress = memmovPtr
|
||||
ce.init()
|
||||
return ce
|
||||
}
|
||||
|
||||
// GetGlobalValue implements the same method as documented on wasm.ModuleEngine.
|
||||
func (m *moduleEngine) GetGlobalValue(i wasm.Index) (lo, hi uint64) {
|
||||
offset := m.parent.offsets.GlobalInstanceOffset(i)
|
||||
buf := m.opaque[offset:]
|
||||
if i < m.module.Source.ImportGlobalCount {
|
||||
panic("GetGlobalValue should not be called for imported globals")
|
||||
}
|
||||
return binary.LittleEndian.Uint64(buf), binary.LittleEndian.Uint64(buf[8:])
|
||||
}
|
||||
|
||||
// SetGlobalValue implements the same method as documented on wasm.ModuleEngine.
|
||||
func (m *moduleEngine) SetGlobalValue(i wasm.Index, lo, hi uint64) {
|
||||
offset := m.parent.offsets.GlobalInstanceOffset(i)
|
||||
buf := m.opaque[offset:]
|
||||
if i < m.module.Source.ImportGlobalCount {
|
||||
panic("GetGlobalValue should not be called for imported globals")
|
||||
}
|
||||
binary.LittleEndian.PutUint64(buf, lo)
|
||||
binary.LittleEndian.PutUint64(buf[8:], hi)
|
||||
}
|
||||
|
||||
// OwnsGlobals implements the same method as documented on wasm.ModuleEngine.
|
||||
func (m *moduleEngine) OwnsGlobals() bool { return true }
|
||||
|
||||
// ResolveImportedFunction implements wasm.ModuleEngine.
|
||||
func (m *moduleEngine) ResolveImportedFunction(index, indexInImportedModule wasm.Index, importedModuleEngine wasm.ModuleEngine) {
|
||||
executableOffset, moduleCtxOffset, typeIDOffset := m.parent.offsets.ImportedFunctionOffset(index)
|
||||
importedME := importedModuleEngine.(*moduleEngine)
|
||||
|
||||
if int(indexInImportedModule) >= len(importedME.importedFunctions) {
|
||||
indexInImportedModule -= wasm.Index(len(importedME.importedFunctions))
|
||||
} else {
|
||||
imported := &importedME.importedFunctions[indexInImportedModule]
|
||||
m.ResolveImportedFunction(index, imported.indexInModule, imported.me)
|
||||
return // Recursively resolve the imported function.
|
||||
}
|
||||
|
||||
offset := importedME.parent.functionOffsets[indexInImportedModule]
|
||||
typeID := getTypeIDOf(indexInImportedModule, importedME.module)
|
||||
executable := &importedME.parent.executable[offset]
|
||||
// Write functionInstance.
|
||||
binary.LittleEndian.PutUint64(m.opaque[executableOffset:], uint64(uintptr(unsafe.Pointer(executable))))
|
||||
binary.LittleEndian.PutUint64(m.opaque[moduleCtxOffset:], uint64(uintptr(unsafe.Pointer(importedME.opaquePtr))))
|
||||
binary.LittleEndian.PutUint64(m.opaque[typeIDOffset:], uint64(typeID))
|
||||
|
||||
// Write importedFunction so that it can be used by NewFunction.
|
||||
m.importedFunctions[index] = importedFunction{me: importedME, indexInModule: indexInImportedModule}
|
||||
}
|
||||
|
||||
func getTypeIDOf(funcIndex wasm.Index, m *wasm.ModuleInstance) wasm.FunctionTypeID {
|
||||
source := m.Source
|
||||
|
||||
var typeIndex wasm.Index
|
||||
if funcIndex >= source.ImportFunctionCount {
|
||||
funcIndex -= source.ImportFunctionCount
|
||||
typeIndex = source.FunctionSection[funcIndex]
|
||||
} else {
|
||||
var cnt wasm.Index
|
||||
for i := range source.ImportSection {
|
||||
if source.ImportSection[i].Type == wasm.ExternTypeFunc {
|
||||
if cnt == funcIndex {
|
||||
typeIndex = source.ImportSection[i].DescFunc
|
||||
break
|
||||
}
|
||||
cnt++
|
||||
}
|
||||
}
|
||||
}
|
||||
return m.TypeIDs[typeIndex]
|
||||
}
|
||||
|
||||
// ResolveImportedMemory implements wasm.ModuleEngine.
|
||||
func (m *moduleEngine) ResolveImportedMemory(importedModuleEngine wasm.ModuleEngine) {
|
||||
importedME := importedModuleEngine.(*moduleEngine)
|
||||
inst := importedME.module
|
||||
|
||||
var memInstPtr uint64
|
||||
var memOwnerOpaquePtr uint64
|
||||
if offs := importedME.parent.offsets; offs.ImportedMemoryBegin >= 0 {
|
||||
offset := offs.ImportedMemoryBegin
|
||||
memInstPtr = binary.LittleEndian.Uint64(importedME.opaque[offset:])
|
||||
memOwnerOpaquePtr = binary.LittleEndian.Uint64(importedME.opaque[offset+8:])
|
||||
} else {
|
||||
memInstPtr = uint64(uintptr(unsafe.Pointer(inst.MemoryInstance)))
|
||||
memOwnerOpaquePtr = uint64(uintptr(unsafe.Pointer(importedME.opaquePtr)))
|
||||
}
|
||||
offset := m.parent.offsets.ImportedMemoryBegin
|
||||
binary.LittleEndian.PutUint64(m.opaque[offset:], memInstPtr)
|
||||
binary.LittleEndian.PutUint64(m.opaque[offset+8:], memOwnerOpaquePtr)
|
||||
}
|
||||
|
||||
// DoneInstantiation implements wasm.ModuleEngine.
|
||||
func (m *moduleEngine) DoneInstantiation() {
|
||||
if !m.module.Source.IsHostModule {
|
||||
m.setupOpaque()
|
||||
}
|
||||
}
|
||||
|
||||
// FunctionInstanceReference implements wasm.ModuleEngine.
|
||||
func (m *moduleEngine) FunctionInstanceReference(funcIndex wasm.Index) wasm.Reference {
|
||||
if funcIndex < m.module.Source.ImportFunctionCount {
|
||||
begin, _, _ := m.parent.offsets.ImportedFunctionOffset(funcIndex)
|
||||
return uintptr(unsafe.Pointer(&m.opaque[begin]))
|
||||
}
|
||||
localIndex := funcIndex - m.module.Source.ImportFunctionCount
|
||||
p := m.parent
|
||||
executable := &p.executable[p.functionOffsets[localIndex]]
|
||||
typeID := m.module.TypeIDs[m.module.Source.FunctionSection[localIndex]]
|
||||
|
||||
lf := &functionInstance{
|
||||
executable: executable,
|
||||
moduleContextOpaquePtr: m.opaquePtr,
|
||||
typeID: typeID,
|
||||
indexInModule: funcIndex,
|
||||
}
|
||||
m.localFunctionInstances = append(m.localFunctionInstances, lf)
|
||||
return uintptr(unsafe.Pointer(lf))
|
||||
}
|
||||
|
||||
// LookupFunction implements wasm.ModuleEngine.
|
||||
func (m *moduleEngine) LookupFunction(t *wasm.TableInstance, typeId wasm.FunctionTypeID, tableOffset wasm.Index) (*wasm.ModuleInstance, wasm.Index) {
|
||||
if tableOffset >= uint32(len(t.References)) || t.Type != wasm.RefTypeFuncref {
|
||||
panic(wasmruntime.ErrRuntimeInvalidTableAccess)
|
||||
}
|
||||
rawPtr := t.References[tableOffset]
|
||||
if rawPtr == 0 {
|
||||
panic(wasmruntime.ErrRuntimeInvalidTableAccess)
|
||||
}
|
||||
|
||||
tf := wazevoapi.PtrFromUintptr[functionInstance](rawPtr)
|
||||
if tf.typeID != typeId {
|
||||
panic(wasmruntime.ErrRuntimeIndirectCallTypeMismatch)
|
||||
}
|
||||
return moduleInstanceFromOpaquePtr(tf.moduleContextOpaquePtr), tf.indexInModule
|
||||
}
|
||||
|
||||
func moduleInstanceFromOpaquePtr(ptr *byte) *wasm.ModuleInstance {
|
||||
return *(**wasm.ModuleInstance)(unsafe.Pointer(ptr))
|
||||
}
|
||||
11
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/reflect.go
generated
vendored
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
//go:build !tinygo
|
||||
|
||||
package wazevo
|
||||
|
||||
import "reflect"
|
||||
|
||||
// setSliceLimits sets both Cap and Len for the given reflected slice.
|
||||
func setSliceLimits(s *reflect.SliceHeader, l, c uintptr) {
|
||||
s.Len = int(l)
|
||||
s.Cap = int(c)
|
||||
}
|
||||
11
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/reflect_tinygo.go
generated
vendored
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
//go:build tinygo
|
||||
|
||||
package wazevo
|
||||
|
||||
import "reflect"
|
||||
|
||||
// setSliceLimits sets both Cap and Len for the given reflected slice.
|
||||
func setSliceLimits(s *reflect.SliceHeader, l, c uintptr) {
|
||||
s.Len = l
|
||||
s.Cap = c
|
||||
}
|
||||
407
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block.go
generated
vendored
Normal file
|
|
@ -0,0 +1,407 @@
|
|||
package ssa
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
|
||||
)
|
||||
|
||||
// BasicBlock represents the Basic Block of an SSA function.
|
||||
// Each BasicBlock always ends with branching instructions (e.g. Branch, Return, etc.),
|
||||
// and at most two branches are allowed. If there are two branches, they are placed together at the end of the block.
|
||||
// In other words, there's no branching instruction in the middle of the block.
|
||||
//
|
||||
// Note: we use the "block argument" variant of SSA, instead of PHI functions. See the package level doc comments.
|
||||
//
|
||||
// Note: we use "parameter/param" as a placeholder which represents a variant of PHI, and "argument/arg" as an actual
|
||||
// Value passed to that "parameter/param".
|
||||
type BasicBlock interface {
|
||||
// ID returns the unique ID of this block.
|
||||
ID() BasicBlockID
|
||||
|
||||
// Name returns the unique string ID of this block. e.g. blk0, blk1, ...
|
||||
Name() string
|
||||
|
||||
// AddParam adds a parameter to the block whose type is specified by `t`.
|
||||
AddParam(b Builder, t Type) Value
|
||||
|
||||
// Params returns the number of parameters to this block.
|
||||
Params() int
|
||||
|
||||
// Param returns the Value which corresponds to the i-th parameter of this block.
|
||||
// The returned Value is the definition of the param in this block.
|
||||
Param(i int) Value
|
||||
|
||||
// InsertInstruction inserts an instruction that implements Value into the tail of this block.
|
||||
InsertInstruction(raw *Instruction)
|
||||
|
||||
// Root returns the root instruction of this block.
|
||||
Root() *Instruction
|
||||
|
||||
// Tail returns the tail instruction of this block.
|
||||
Tail() *Instruction
|
||||
|
||||
// EntryBlock returns true if this block represents the function entry.
|
||||
EntryBlock() bool
|
||||
|
||||
// ReturnBlock returns true if this block represents the function return.
|
||||
ReturnBlock() bool
|
||||
|
||||
// FormatHeader returns the debug string of this block, not including instruction.
|
||||
FormatHeader(b Builder) string
|
||||
|
||||
// Valid is true if this block is still valid even after optimizations.
|
||||
Valid() bool
|
||||
|
||||
// Sealed is true if this block has been sealed.
|
||||
Sealed() bool
|
||||
|
||||
// BeginPredIterator returns the first predecessor of this block.
|
||||
BeginPredIterator() BasicBlock
|
||||
|
||||
// NextPredIterator returns the next predecessor of this block.
|
||||
NextPredIterator() BasicBlock
|
||||
|
||||
// Preds returns the number of predecessors of this block.
|
||||
Preds() int
|
||||
|
||||
// Pred returns the i-th predecessor of this block.
|
||||
Pred(i int) BasicBlock
|
||||
|
||||
// Succs returns the number of successors of this block.
|
||||
Succs() int
|
||||
|
||||
// Succ returns the i-th successor of this block.
|
||||
Succ(i int) BasicBlock
|
||||
|
||||
// LoopHeader returns true if this block is a loop header.
|
||||
LoopHeader() bool
|
||||
|
||||
// LoopNestingForestChildren returns the children of this block in the loop nesting forest.
|
||||
LoopNestingForestChildren() []BasicBlock
|
||||
}
|
||||
|
||||
type (
|
||||
// basicBlock is a basic block in a SSA-transformed function.
|
||||
basicBlock struct {
|
||||
id BasicBlockID
|
||||
rootInstr, currentInstr *Instruction
|
||||
params []blockParam
|
||||
predIter int
|
||||
preds []basicBlockPredecessorInfo
|
||||
success []*basicBlock
|
||||
// singlePred is the alias to preds[0] for fast lookup, and only set after Seal is called.
|
||||
singlePred *basicBlock
|
||||
// lastDefinitions maps Variable to its last definition in this block.
|
||||
lastDefinitions map[Variable]Value
|
||||
// unknownValues are used in builder.findValue. The usage is well-described in the paper.
|
||||
unknownValues []unknownValue
|
||||
// invalid is true if this block is made invalid during optimizations.
|
||||
invalid bool
|
||||
// sealed is true if this is sealed (all the predecessors are known).
|
||||
sealed bool
|
||||
// loopHeader is true if this block is a loop header:
|
||||
//
|
||||
// > A loop header (sometimes called the entry point of the loop) is a dominator that is the target
|
||||
// > of a loop-forming back edge. The loop header dominates all blocks in the loop body.
|
||||
// > A block may be a loop header for more than one loop. A loop may have multiple entry points,
|
||||
// > in which case it has no "loop header".
|
||||
//
|
||||
// See https://en.wikipedia.org/wiki/Control-flow_graph for more details.
|
||||
//
|
||||
// This is modified during the subPassLoopDetection pass.
|
||||
loopHeader bool
|
||||
|
||||
// loopNestingForestChildren holds the children of this block in the loop nesting forest.
|
||||
// Non-empty if and only if this block is a loop header (i.e. loopHeader=true)
|
||||
loopNestingForestChildren []BasicBlock
|
||||
|
||||
// reversePostOrder is used to sort all the blocks in the function in reverse post order.
|
||||
// This is used in builder.LayoutBlocks.
|
||||
reversePostOrder int
|
||||
|
||||
// child and sibling are the ones in the dominator tree.
|
||||
child, sibling *basicBlock
|
||||
}
|
||||
// BasicBlockID is the unique ID of a basicBlock.
|
||||
BasicBlockID uint32
|
||||
|
||||
// blockParam implements Value and represents a parameter to a basicBlock.
|
||||
blockParam struct {
|
||||
// value is the Value that corresponds to the parameter in this block,
|
||||
// and can be considered as an output of PHI instruction in traditional SSA.
|
||||
value Value
|
||||
// typ is the type of the parameter.
|
||||
typ Type
|
||||
}
|
||||
|
||||
unknownValue struct {
|
||||
// variable is the variable that this unknownValue represents.
|
||||
variable Variable
|
||||
// value is the value that this unknownValue represents.
|
||||
value Value
|
||||
}
|
||||
)
|
||||
|
||||
const basicBlockIDReturnBlock = 0xffffffff
|
||||
|
||||
// Name implements BasicBlock.Name.
|
||||
func (bb *basicBlock) Name() string {
|
||||
if bb.id == basicBlockIDReturnBlock {
|
||||
return "blk_ret"
|
||||
} else {
|
||||
return fmt.Sprintf("blk%d", bb.id)
|
||||
}
|
||||
}
|
||||
|
||||
// String implements fmt.Stringer for debugging.
|
||||
func (bid BasicBlockID) String() string {
|
||||
if bid == basicBlockIDReturnBlock {
|
||||
return "blk_ret"
|
||||
} else {
|
||||
return fmt.Sprintf("blk%d", bid)
|
||||
}
|
||||
}
|
||||
|
||||
// ID implements BasicBlock.ID.
|
||||
func (bb *basicBlock) ID() BasicBlockID {
|
||||
return bb.id
|
||||
}
|
||||
|
||||
// basicBlockPredecessorInfo is the information of a predecessor of a basicBlock.
|
||||
// predecessor is determined by a pair of block and the branch instruction used to jump to the successor.
|
||||
type basicBlockPredecessorInfo struct {
|
||||
blk *basicBlock
|
||||
branch *Instruction
|
||||
}
|
||||
|
||||
// EntryBlock implements BasicBlock.EntryBlock.
|
||||
func (bb *basicBlock) EntryBlock() bool {
|
||||
return bb.id == 0
|
||||
}
|
||||
|
||||
// ReturnBlock implements BasicBlock.ReturnBlock.
|
||||
func (bb *basicBlock) ReturnBlock() bool {
|
||||
return bb.id == basicBlockIDReturnBlock
|
||||
}
|
||||
|
||||
// AddParam implements BasicBlock.AddParam.
|
||||
func (bb *basicBlock) AddParam(b Builder, typ Type) Value {
|
||||
paramValue := b.allocateValue(typ)
|
||||
bb.params = append(bb.params, blockParam{typ: typ, value: paramValue})
|
||||
return paramValue
|
||||
}
|
||||
|
||||
// addParamOn adds a parameter to this block whose value is already allocated.
|
||||
func (bb *basicBlock) addParamOn(typ Type, value Value) {
|
||||
bb.params = append(bb.params, blockParam{typ: typ, value: value})
|
||||
}
|
||||
|
||||
// Params implements BasicBlock.Params.
|
||||
func (bb *basicBlock) Params() int {
|
||||
return len(bb.params)
|
||||
}
|
||||
|
||||
// Param implements BasicBlock.Param.
|
||||
func (bb *basicBlock) Param(i int) Value {
|
||||
p := &bb.params[i]
|
||||
return p.value
|
||||
}
|
||||
|
||||
// Valid implements BasicBlock.Valid.
|
||||
func (bb *basicBlock) Valid() bool {
|
||||
return !bb.invalid
|
||||
}
|
||||
|
||||
// Sealed implements BasicBlock.Sealed.
|
||||
func (bb *basicBlock) Sealed() bool {
|
||||
return bb.sealed
|
||||
}
|
||||
|
||||
// InsertInstruction implements BasicBlock.InsertInstruction.
|
||||
func (bb *basicBlock) InsertInstruction(next *Instruction) {
|
||||
current := bb.currentInstr
|
||||
if current != nil {
|
||||
current.next = next
|
||||
next.prev = current
|
||||
} else {
|
||||
bb.rootInstr = next
|
||||
}
|
||||
bb.currentInstr = next
|
||||
|
||||
switch next.opcode {
|
||||
case OpcodeJump, OpcodeBrz, OpcodeBrnz:
|
||||
target := next.blk.(*basicBlock)
|
||||
target.addPred(bb, next)
|
||||
case OpcodeBrTable:
|
||||
for _, _target := range next.targets {
|
||||
target := _target.(*basicBlock)
|
||||
target.addPred(bb, next)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// NumPreds implements BasicBlock.NumPreds.
|
||||
func (bb *basicBlock) NumPreds() int {
|
||||
return len(bb.preds)
|
||||
}
|
||||
|
||||
// BeginPredIterator implements BasicBlock.BeginPredIterator.
|
||||
func (bb *basicBlock) BeginPredIterator() BasicBlock {
|
||||
bb.predIter = 0
|
||||
return bb.NextPredIterator()
|
||||
}
|
||||
|
||||
// NextPredIterator implements BasicBlock.NextPredIterator.
|
||||
func (bb *basicBlock) NextPredIterator() BasicBlock {
|
||||
if bb.predIter >= len(bb.preds) {
|
||||
return nil
|
||||
}
|
||||
pred := bb.preds[bb.predIter].blk
|
||||
bb.predIter++
|
||||
return pred
|
||||
}
|
||||
|
||||
// Preds implements BasicBlock.Preds.
|
||||
func (bb *basicBlock) Preds() int {
|
||||
return len(bb.preds)
|
||||
}
|
||||
|
||||
// Pred implements BasicBlock.Pred.
|
||||
func (bb *basicBlock) Pred(i int) BasicBlock {
|
||||
return bb.preds[i].blk
|
||||
}
|
||||
|
||||
// Succs implements BasicBlock.Succs.
|
||||
func (bb *basicBlock) Succs() int {
|
||||
return len(bb.success)
|
||||
}
|
||||
|
||||
// Succ implements BasicBlock.Succ.
|
||||
func (bb *basicBlock) Succ(i int) BasicBlock {
|
||||
return bb.success[i]
|
||||
}
|
||||
|
||||
// Root implements BasicBlock.Root.
|
||||
func (bb *basicBlock) Root() *Instruction {
|
||||
return bb.rootInstr
|
||||
}
|
||||
|
||||
// Tail implements BasicBlock.Tail.
|
||||
func (bb *basicBlock) Tail() *Instruction {
|
||||
return bb.currentInstr
|
||||
}
|
||||
|
||||
// resetBasicBlock resets the given basicBlock to its initial state so that it can be reused for another function.
|
||||
func resetBasicBlock(bb *basicBlock) {
|
||||
bb.params = bb.params[:0]
|
||||
bb.rootInstr, bb.currentInstr = nil, nil
|
||||
bb.preds = bb.preds[:0]
|
||||
bb.success = bb.success[:0]
|
||||
bb.invalid, bb.sealed = false, false
|
||||
bb.singlePred = nil
|
||||
bb.unknownValues = bb.unknownValues[:0]
|
||||
bb.lastDefinitions = wazevoapi.ResetMap(bb.lastDefinitions)
|
||||
bb.reversePostOrder = -1
|
||||
bb.loopNestingForestChildren = bb.loopNestingForestChildren[:0]
|
||||
bb.loopHeader = false
|
||||
bb.sibling = nil
|
||||
bb.child = nil
|
||||
}
|
||||
|
||||
// addPred adds a predecessor to this block specified by the branch instruction.
|
||||
func (bb *basicBlock) addPred(blk BasicBlock, branch *Instruction) {
|
||||
if bb.sealed {
|
||||
panic("BUG: trying to add predecessor to a sealed block: " + bb.Name())
|
||||
}
|
||||
|
||||
pred := blk.(*basicBlock)
|
||||
for i := range bb.preds {
|
||||
existingPred := &bb.preds[i]
|
||||
if existingPred.blk == pred && existingPred.branch != branch {
// If the target is already added, then this must come from the same BrTable;
// otherwise such a redundant branch should have been eliminated by the frontend (where doing so is simpler).
panic(fmt.Sprintf("BUG: redundant non BrTable jumps in %s whose targets are the same", bb.Name()))
|
||||
}
|
||||
}
|
||||
|
||||
bb.preds = append(bb.preds, basicBlockPredecessorInfo{
|
||||
blk: pred,
|
||||
branch: branch,
|
||||
})
|
||||
|
||||
pred.success = append(pred.success, bb)
|
||||
}
|
||||
|
||||
// FormatHeader implements BasicBlock.FormatHeader.
|
||||
func (bb *basicBlock) FormatHeader(b Builder) string {
|
||||
ps := make([]string, len(bb.params))
|
||||
for i, p := range bb.params {
|
||||
ps[i] = p.value.formatWithType(b)
|
||||
}
|
||||
|
||||
if len(bb.preds) > 0 {
|
||||
preds := make([]string, 0, len(bb.preds))
|
||||
for _, pred := range bb.preds {
|
||||
if pred.blk.invalid {
|
||||
continue
|
||||
}
|
||||
preds = append(preds, fmt.Sprintf("blk%d", pred.blk.id))
|
||||
|
||||
}
|
||||
return fmt.Sprintf("blk%d: (%s) <-- (%s)",
|
||||
bb.id, strings.Join(ps, ","), strings.Join(preds, ","))
|
||||
} else {
|
||||
return fmt.Sprintf("blk%d: (%s)", bb.id, strings.Join(ps, ", "))
|
||||
}
|
||||
}
|
||||
|
||||
// validate validates the basicBlock for debugging purposes.
|
||||
func (bb *basicBlock) validate(b *builder) {
|
||||
if bb.invalid {
|
||||
panic("BUG: trying to validate an invalid block: " + bb.Name())
|
||||
}
|
||||
if len(bb.preds) > 0 {
|
||||
for _, pred := range bb.preds {
|
||||
if pred.branch.opcode != OpcodeBrTable {
|
||||
if target := pred.branch.blk; target != bb {
|
||||
panic(fmt.Sprintf("BUG: '%s' is not branch to %s, but to %s",
|
||||
pred.branch.Format(b), bb.Name(), target.Name()))
|
||||
}
|
||||
}
|
||||
|
||||
var exp int
|
||||
if bb.ReturnBlock() {
|
||||
exp = len(b.currentSignature.Results)
|
||||
} else {
|
||||
exp = len(bb.params)
|
||||
}
|
||||
|
||||
if len(pred.branch.vs.View()) != exp {
|
||||
panic(fmt.Sprintf(
|
||||
"BUG: len(argument at %s) != len(params at %s): %d != %d: %s",
|
||||
pred.blk.Name(), bb.Name(),
|
||||
len(pred.branch.vs.View()), len(bb.params), pred.branch.Format(b),
|
||||
))
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// String implements fmt.Stringer for debugging purpose only.
|
||||
func (bb *basicBlock) String() string {
|
||||
return strconv.Itoa(int(bb.id))
|
||||
}
|
||||
|
||||
// LoopNestingForestChildren implements BasicBlock.LoopNestingForestChildren.
|
||||
func (bb *basicBlock) LoopNestingForestChildren() []BasicBlock {
|
||||
return bb.loopNestingForestChildren
|
||||
}
|
||||
|
||||
// LoopHeader implements BasicBlock.LoopHeader.
|
||||
func (bb *basicBlock) LoopHeader() bool {
|
||||
return bb.loopHeader
|
||||
}
|
||||
34
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block_sort.go
generated
vendored
Normal file
34
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block_sort.go
generated
vendored
Normal file
|
|
@@ -0,0 +1,34 @@
|
|||
//go:build go1.21

package ssa

import (
"slices"
)

func sortBlocks(blocks []*basicBlock) {
slices.SortFunc(blocks, func(i, j *basicBlock) int {
jIsReturn := j.ReturnBlock()
iIsReturn := i.ReturnBlock()
if iIsReturn && jIsReturn {
return 0
}
if jIsReturn {
return 1
}
if iIsReturn {
return -1
}
iRoot, jRoot := i.rootInstr, j.rootInstr
if iRoot == nil && jRoot == nil { // For testing.
return 0
}
if jRoot == nil {
return 1
}
if iRoot == nil {
return -1
}
return i.rootInstr.id - j.rootInstr.id
})
}
|
||||
24
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block_sort_old.go
generated
vendored
Normal file
24
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block_sort_old.go
generated
vendored
Normal file
|
|
@@ -0,0 +1,24 @@
|
|||
//go:build !go1.21

// TODO: delete after the floor Go version is 1.21

package ssa

import "sort"

func sortBlocks(blocks []*basicBlock) {
sort.SliceStable(blocks, func(i, j int) bool {
iBlk, jBlk := blocks[i], blocks[j]
if jBlk.ReturnBlock() {
return true
}
if iBlk.ReturnBlock() {
return false
}
iRoot, jRoot := iBlk.rootInstr, jBlk.rootInstr
if iRoot == nil || jRoot == nil { // For testing.
return true
}
return iBlk.rootInstr.id < jBlk.rootInstr.id
})
}
|
||||
731
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/builder.go
generated
vendored
Normal file
731
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/builder.go
generated
vendored
Normal file
|
|
@@ -0,0 +1,731 @@
|
|||
package ssa
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
|
||||
)
|
||||
|
||||
// Builder is used to build SSA consisting of Basic Blocks per function.
|
||||
type Builder interface {
|
||||
// Init must be called to reuse this builder for the next function.
|
||||
Init(typ *Signature)
|
||||
|
||||
// Signature returns the Signature of the currently-compiled function.
|
||||
Signature() *Signature
|
||||
|
||||
// BlockIDMax returns the maximum value of BasicBlockID existing in the currently-compiled function.
|
||||
BlockIDMax() BasicBlockID
|
||||
|
||||
// AllocateBasicBlock creates a basic block in SSA function.
|
||||
AllocateBasicBlock() BasicBlock
|
||||
|
||||
// CurrentBlock returns the currently handled BasicBlock which is set by the latest call to SetCurrentBlock.
|
||||
CurrentBlock() BasicBlock
|
||||
|
||||
// EntryBlock returns the entry BasicBlock of the currently-compiled function.
|
||||
EntryBlock() BasicBlock
|
||||
|
||||
// SetCurrentBlock sets the instruction insertion target to the BasicBlock `b`.
|
||||
SetCurrentBlock(b BasicBlock)
|
||||
|
||||
// DeclareVariable declares a Variable of the given Type.
|
||||
DeclareVariable(Type) Variable
|
||||
|
||||
// DefineVariable defines a variable in the `block` with value.
|
||||
// The defining instruction will be inserted into the `block`.
|
||||
DefineVariable(variable Variable, value Value, block BasicBlock)
|
||||
|
||||
// DefineVariableInCurrentBB is the same as DefineVariable except the definition is
|
||||
// inserted into the current BasicBlock. Alias to DefineVariable(x, y, CurrentBlock()).
|
||||
DefineVariableInCurrentBB(variable Variable, value Value)
|
||||
|
||||
// AllocateInstruction returns a new Instruction.
|
||||
AllocateInstruction() *Instruction
|
||||
|
||||
// InsertInstruction executes BasicBlock.InsertInstruction for the currently handled basic block.
|
||||
InsertInstruction(raw *Instruction)
|
||||
|
||||
// allocateValue allocates an unused Value.
|
||||
allocateValue(typ Type) Value
|
||||
|
||||
// MustFindValue searches the latest definition of the given Variable and returns the result.
|
||||
MustFindValue(variable Variable) Value
|
||||
|
||||
// MustFindValueInBlk is the same as MustFindValue except it searches the latest definition from the given BasicBlock.
|
||||
MustFindValueInBlk(variable Variable, blk BasicBlock) Value
|
||||
|
||||
// FindValueInLinearPath tries to find the latest definition of the given Variable in the linear path to the current BasicBlock.
|
||||
// If it cannot find the definition, or it's not sealed yet, it returns ValueInvalid.
|
||||
FindValueInLinearPath(variable Variable) Value
|
||||
|
||||
// Seal declares that all the predecessors of this block are known and have been added via AddPred.
// After calling this, AddPred will be forbidden.
|
||||
Seal(blk BasicBlock)
|
||||
|
||||
// AnnotateValue is for debugging purpose.
|
||||
AnnotateValue(value Value, annotation string)
|
||||
|
||||
// DeclareSignature appends the *Signature to be referenced by various instructions (e.g. OpcodeCall).
|
||||
DeclareSignature(signature *Signature)
|
||||
|
||||
// Signatures returns the slice of declared Signatures.
|
||||
Signatures() []*Signature
|
||||
|
||||
// ResolveSignature returns the Signature which corresponds to SignatureID.
|
||||
ResolveSignature(id SignatureID) *Signature
|
||||
|
||||
// RunPasses runs various passes on the constructed SSA function.
|
||||
RunPasses()
|
||||
|
||||
// Format returns the debugging string of the SSA function.
|
||||
Format() string
|
||||
|
||||
// BlockIteratorBegin initializes the state to iterate over all the valid BasicBlock(s) compiled.
|
||||
// Combined with BlockIteratorNext, we can use this like:
|
||||
//
|
||||
// for blk := builder.BlockIteratorBegin(); blk != nil; blk = builder.BlockIteratorNext() {
|
||||
// // ...
|
||||
// }
|
||||
//
|
||||
// The returned blocks are ordered in the order of AllocateBasicBlock being called.
|
||||
BlockIteratorBegin() BasicBlock
|
||||
|
||||
// BlockIteratorNext advances the state for iteration initialized by BlockIteratorBegin.
|
||||
// Returns nil if there's no unseen BasicBlock.
|
||||
BlockIteratorNext() BasicBlock
|
||||
|
||||
// ValueRefCounts returns the reference count of each Value, indexed by ValueID.
|
||||
// The returned slice must not be modified.
|
||||
ValueRefCounts() []int
|
||||
|
||||
// BlockIteratorReversePostOrderBegin is almost the same as BlockIteratorBegin except it returns the BasicBlock in the reverse post-order.
|
||||
// This is available after RunPasses is run.
|
||||
BlockIteratorReversePostOrderBegin() BasicBlock
|
||||
|
||||
// BlockIteratorReversePostOrderNext is almost the same as BlockIteratorNext except it returns the BasicBlock in the reverse post-order.
|
||||
// This is available after RunPasses is run.
|
||||
BlockIteratorReversePostOrderNext() BasicBlock
|
||||
|
||||
// ReturnBlock returns the BasicBlock which is used to return from the function.
|
||||
ReturnBlock() BasicBlock
|
||||
|
||||
// InsertUndefined inserts an undefined instruction at the current position.
|
||||
InsertUndefined()
|
||||
|
||||
// SetCurrentSourceOffset sets the current source offset. The incoming instruction will be annotated with this offset.
|
||||
SetCurrentSourceOffset(line SourceOffset)
|
||||
|
||||
// LoopNestingForestRoots returns the roots of the loop nesting forest.
|
||||
LoopNestingForestRoots() []BasicBlock
|
||||
|
||||
// LowestCommonAncestor returns the lowest common ancestor in the dominator tree of the given BasicBlock(s).
|
||||
LowestCommonAncestor(blk1, blk2 BasicBlock) BasicBlock
|
||||
|
||||
// Idom returns the immediate dominator of the given BasicBlock.
|
||||
Idom(blk BasicBlock) BasicBlock
|
||||
|
||||
VarLengthPool() *wazevoapi.VarLengthPool[Value]
|
||||
}
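
// A minimal sketch of the intended call sequence for the Builder above. The Signature and the
// variable's Type are assumed to be supplied by the frontend, and the instruction bodies are
// elided because they rely on the opcode-specific helpers defined on Instruction elsewhere in
// this package.
func buildFunctionSketch(b Builder, sig *Signature, varType Type) {
	b.Init(sig)                     // reset the builder for this function
	entry := b.AllocateBasicBlock() // the first allocated block becomes the entry block
	b.SetCurrentBlock(entry)
	v := b.DeclareVariable(varType) // declare an SSA variable of the given type
	_ = v
	// Allocate instructions via AllocateInstruction and insert them with InsertInstruction,
	// defining v with DefineVariableInCurrentBB. Once all predecessors of a block have been
	// added, call Seal(blk); after every block is terminated, RunPasses optimizes and lays out
	// the blocks and Format dumps the result for debugging.
}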
|
||||
|
||||
// NewBuilder returns a new Builder implementation.
|
||||
func NewBuilder() Builder {
|
||||
return &builder{
|
||||
instructionsPool: wazevoapi.NewPool[Instruction](resetInstruction),
|
||||
basicBlocksPool: wazevoapi.NewPool[basicBlock](resetBasicBlock),
|
||||
varLengthPool: wazevoapi.NewVarLengthPool[Value](),
|
||||
valueAnnotations: make(map[ValueID]string),
|
||||
signatures: make(map[SignatureID]*Signature),
|
||||
blkVisited: make(map[*basicBlock]int),
|
||||
valueIDAliases: make(map[ValueID]Value),
|
||||
redundantParameterIndexToValue: make(map[int]Value),
|
||||
returnBlk: &basicBlock{id: basicBlockIDReturnBlock},
|
||||
}
|
||||
}
|
||||
|
||||
// builder implements Builder interface.
|
||||
type builder struct {
|
||||
basicBlocksPool wazevoapi.Pool[basicBlock]
|
||||
instructionsPool wazevoapi.Pool[Instruction]
|
||||
varLengthPool wazevoapi.VarLengthPool[Value]
|
||||
signatures map[SignatureID]*Signature
|
||||
currentSignature *Signature
|
||||
|
||||
// reversePostOrderedBasicBlocks are the BasicBlock(s) ordered in the reverse post-order after passCalculateImmediateDominators.
|
||||
reversePostOrderedBasicBlocks []*basicBlock
|
||||
currentBB *basicBlock
|
||||
returnBlk *basicBlock
|
||||
|
||||
// variables tracks the Type of each Variable, indexed by the Variable itself.
|
||||
variables []Type
|
||||
// nextValueID is used by builder.AllocateValue.
|
||||
nextValueID ValueID
|
||||
// nextVariable is used by builder.AllocateVariable.
|
||||
nextVariable Variable
|
||||
|
||||
valueIDAliases map[ValueID]Value
|
||||
valueAnnotations map[ValueID]string
|
||||
|
||||
// valueRefCounts is used to lower the SSA in backend, and will be calculated
|
||||
// by the last SSA-level optimization pass.
|
||||
valueRefCounts []int
|
||||
|
||||
// dominators stores the immediate dominator of each BasicBlock.
|
||||
// The index is blockID of the BasicBlock.
|
||||
dominators []*basicBlock
|
||||
sparseTree dominatorSparseTree
|
||||
|
||||
// loopNestingForestRoots are the roots of the loop nesting forest.
|
||||
loopNestingForestRoots []BasicBlock
|
||||
|
||||
// The following fields are used for optimization passes/deterministic compilation.
|
||||
instStack []*Instruction
|
||||
blkVisited map[*basicBlock]int
|
||||
valueIDToInstruction []*Instruction
|
||||
blkStack []*basicBlock
|
||||
blkStack2 []*basicBlock
|
||||
ints []int
|
||||
redundantParameterIndexToValue map[int]Value
|
||||
|
||||
// blockIterCur is used to implement blockIteratorBegin and blockIteratorNext.
|
||||
blockIterCur int
|
||||
|
||||
// donePreBlockLayoutPasses is true if all the passes before LayoutBlocks are called.
|
||||
donePreBlockLayoutPasses bool
|
||||
// doneBlockLayout is true if LayoutBlocks is called.
|
||||
doneBlockLayout bool
|
||||
// donePostBlockLayoutPasses is true if all the passes after LayoutBlocks are called.
|
||||
donePostBlockLayoutPasses bool
|
||||
|
||||
currentSourceOffset SourceOffset
|
||||
}
|
||||
|
||||
func (b *builder) VarLengthPool() *wazevoapi.VarLengthPool[Value] {
|
||||
return &b.varLengthPool
|
||||
}
|
||||
|
||||
// ReturnBlock implements Builder.ReturnBlock.
|
||||
func (b *builder) ReturnBlock() BasicBlock {
|
||||
return b.returnBlk
|
||||
}
|
||||
|
||||
// Init implements Builder.Init.
|
||||
func (b *builder) Init(s *Signature) {
|
||||
b.nextVariable = 0
|
||||
b.currentSignature = s
|
||||
resetBasicBlock(b.returnBlk)
|
||||
b.instructionsPool.Reset()
|
||||
b.basicBlocksPool.Reset()
|
||||
b.varLengthPool.Reset()
|
||||
b.donePreBlockLayoutPasses = false
|
||||
b.doneBlockLayout = false
|
||||
b.donePostBlockLayoutPasses = false
|
||||
for _, sig := range b.signatures {
|
||||
sig.used = false
|
||||
}
|
||||
|
||||
b.ints = b.ints[:0]
|
||||
b.blkStack = b.blkStack[:0]
|
||||
b.blkStack2 = b.blkStack2[:0]
|
||||
b.dominators = b.dominators[:0]
|
||||
b.loopNestingForestRoots = b.loopNestingForestRoots[:0]
|
||||
|
||||
for i := 0; i < b.basicBlocksPool.Allocated(); i++ {
|
||||
blk := b.basicBlocksPool.View(i)
|
||||
delete(b.blkVisited, blk)
|
||||
}
|
||||
b.basicBlocksPool.Reset()
|
||||
|
||||
for v := ValueID(0); v < b.nextValueID; v++ {
|
||||
delete(b.valueAnnotations, v)
|
||||
delete(b.valueIDAliases, v)
|
||||
b.valueRefCounts[v] = 0
|
||||
b.valueIDToInstruction[v] = nil
|
||||
}
|
||||
b.nextValueID = 0
|
||||
b.reversePostOrderedBasicBlocks = b.reversePostOrderedBasicBlocks[:0]
|
||||
b.doneBlockLayout = false
|
||||
for i := range b.valueRefCounts {
|
||||
b.valueRefCounts[i] = 0
|
||||
}
|
||||
|
||||
b.currentSourceOffset = sourceOffsetUnknown
|
||||
}
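
// A short sketch of the reuse pattern that Init enables: a single Builder is allocated up front
// and re-initialized per function, so the pools and maps above are recycled instead of being
// reallocated. The sigs slice and compileFn callback are assumptions for illustration only.
func reuseSketch(sigs []*Signature, compileFn func(Builder)) {
	b := NewBuilder()
	for _, sig := range sigs {
		b.Init(sig) // resets pools, visited maps, and value IDs for the next function
		compileFn(b)
	}
}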
|
||||
|
||||
// Signature implements Builder.Signature.
|
||||
func (b *builder) Signature() *Signature {
|
||||
return b.currentSignature
|
||||
}
|
||||
|
||||
// AnnotateValue implements Builder.AnnotateValue.
|
||||
func (b *builder) AnnotateValue(value Value, a string) {
|
||||
b.valueAnnotations[value.ID()] = a
|
||||
}
|
||||
|
||||
// AllocateInstruction implements Builder.AllocateInstruction.
|
||||
func (b *builder) AllocateInstruction() *Instruction {
|
||||
instr := b.instructionsPool.Allocate()
|
||||
instr.id = b.instructionsPool.Allocated()
|
||||
return instr
|
||||
}
|
||||
|
||||
// DeclareSignature implements Builder.DeclareSignature.
|
||||
func (b *builder) DeclareSignature(s *Signature) {
|
||||
b.signatures[s.ID] = s
|
||||
s.used = false
|
||||
}
|
||||
|
||||
// Signatures implements Builder.Signatures.
|
||||
func (b *builder) Signatures() (ret []*Signature) {
|
||||
for _, sig := range b.signatures {
|
||||
ret = append(ret, sig)
|
||||
}
|
||||
sort.Slice(ret, func(i, j int) bool {
|
||||
return ret[i].ID < ret[j].ID
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
// SetCurrentSourceOffset implements Builder.SetCurrentSourceOffset.
|
||||
func (b *builder) SetCurrentSourceOffset(l SourceOffset) {
|
||||
b.currentSourceOffset = l
|
||||
}
|
||||
|
||||
func (b *builder) usedSignatures() (ret []*Signature) {
|
||||
for _, sig := range b.signatures {
|
||||
if sig.used {
|
||||
ret = append(ret, sig)
|
||||
}
|
||||
}
|
||||
sort.Slice(ret, func(i, j int) bool {
|
||||
return ret[i].ID < ret[j].ID
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
// ResolveSignature implements Builder.ResolveSignature.
|
||||
func (b *builder) ResolveSignature(id SignatureID) *Signature {
|
||||
return b.signatures[id]
|
||||
}
|
||||
|
||||
// AllocateBasicBlock implements Builder.AllocateBasicBlock.
|
||||
func (b *builder) AllocateBasicBlock() BasicBlock {
|
||||
return b.allocateBasicBlock()
|
||||
}
|
||||
|
||||
// allocateBasicBlock allocates a new basicBlock.
|
||||
func (b *builder) allocateBasicBlock() *basicBlock {
|
||||
id := BasicBlockID(b.basicBlocksPool.Allocated())
|
||||
blk := b.basicBlocksPool.Allocate()
|
||||
blk.id = id
|
||||
return blk
|
||||
}
|
||||
|
||||
// Idom implements Builder.Idom.
|
||||
func (b *builder) Idom(blk BasicBlock) BasicBlock {
|
||||
return b.dominators[blk.ID()]
|
||||
}
|
||||
|
||||
// InsertInstruction implements Builder.InsertInstruction.
|
||||
func (b *builder) InsertInstruction(instr *Instruction) {
|
||||
b.currentBB.InsertInstruction(instr)
|
||||
|
||||
if l := b.currentSourceOffset; l.Valid() {
|
||||
// Emit the source offset info only when the instruction has a side effect, because
// these are the only instructions that are accessed by stack unwinding.
// This significantly reduces the amount of offset info in the binary.
|
||||
if instr.sideEffect() != sideEffectNone {
|
||||
instr.annotateSourceOffset(l)
|
||||
}
|
||||
}
|
||||
|
||||
resultTypesFn := instructionReturnTypes[instr.opcode]
|
||||
if resultTypesFn == nil {
|
||||
panic("TODO: " + instr.Format(b))
|
||||
}
|
||||
|
||||
t1, ts := resultTypesFn(b, instr)
|
||||
if t1.invalid() {
|
||||
return
|
||||
}
|
||||
|
||||
r1 := b.allocateValue(t1)
|
||||
instr.rValue = r1
|
||||
|
||||
tsl := len(ts)
|
||||
if tsl == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
rValues := b.varLengthPool.Allocate(tsl)
|
||||
for i := 0; i < tsl; i++ {
|
||||
rValues = rValues.Append(&b.varLengthPool, b.allocateValue(ts[i]))
|
||||
}
|
||||
instr.rValues = rValues
|
||||
}
|
||||
|
||||
// DefineVariable implements Builder.DefineVariable.
|
||||
func (b *builder) DefineVariable(variable Variable, value Value, block BasicBlock) {
|
||||
if b.variables[variable].invalid() {
|
||||
panic("BUG: trying to define variable " + variable.String() + " but is not declared yet")
|
||||
}
|
||||
|
||||
if b.variables[variable] != value.Type() {
|
||||
panic(fmt.Sprintf("BUG: inconsistent type for variable %d: expected %s but got %s", variable, b.variables[variable], value.Type()))
|
||||
}
|
||||
bb := block.(*basicBlock)
|
||||
bb.lastDefinitions[variable] = value
|
||||
}
|
||||
|
||||
// DefineVariableInCurrentBB implements Builder.DefineVariableInCurrentBB.
|
||||
func (b *builder) DefineVariableInCurrentBB(variable Variable, value Value) {
|
||||
b.DefineVariable(variable, value, b.currentBB)
|
||||
}
|
||||
|
||||
// SetCurrentBlock implements Builder.SetCurrentBlock.
|
||||
func (b *builder) SetCurrentBlock(bb BasicBlock) {
|
||||
b.currentBB = bb.(*basicBlock)
|
||||
}
|
||||
|
||||
// CurrentBlock implements Builder.CurrentBlock.
|
||||
func (b *builder) CurrentBlock() BasicBlock {
|
||||
return b.currentBB
|
||||
}
|
||||
|
||||
// EntryBlock implements Builder.EntryBlock.
|
||||
func (b *builder) EntryBlock() BasicBlock {
|
||||
return b.entryBlk()
|
||||
}
|
||||
|
||||
// DeclareVariable implements Builder.DeclareVariable.
|
||||
func (b *builder) DeclareVariable(typ Type) Variable {
|
||||
v := b.allocateVariable()
|
||||
iv := int(v)
|
||||
if l := len(b.variables); l <= iv {
|
||||
b.variables = append(b.variables, make([]Type, 2*(l+1))...)
|
||||
}
|
||||
b.variables[v] = typ
|
||||
return v
|
||||
}
|
||||
|
||||
// allocateVariable allocates a new variable.
|
||||
func (b *builder) allocateVariable() (ret Variable) {
|
||||
ret = b.nextVariable
|
||||
b.nextVariable++
|
||||
return
|
||||
}
|
||||
|
||||
// allocateValue implements Builder.AllocateValue.
|
||||
func (b *builder) allocateValue(typ Type) (v Value) {
|
||||
v = Value(b.nextValueID)
|
||||
v = v.setType(typ)
|
||||
b.nextValueID++
|
||||
return
|
||||
}
|
||||
|
||||
// FindValueInLinearPath implements Builder.FindValueInLinearPath.
|
||||
func (b *builder) FindValueInLinearPath(variable Variable) Value {
|
||||
return b.findValueInLinearPath(variable, b.currentBB)
|
||||
}
|
||||
|
||||
func (b *builder) findValueInLinearPath(variable Variable, blk *basicBlock) Value {
|
||||
if val, ok := blk.lastDefinitions[variable]; ok {
|
||||
return val
|
||||
} else if !blk.sealed {
|
||||
return ValueInvalid
|
||||
}
|
||||
|
||||
if pred := blk.singlePred; pred != nil {
|
||||
// If this block is sealed and has only one predecessor,
// we can use the value in that block without ambiguity about its definition.
|
||||
return b.findValueInLinearPath(variable, pred)
|
||||
}
|
||||
if len(blk.preds) == 1 {
|
||||
panic("BUG")
|
||||
}
|
||||
return ValueInvalid
|
||||
}
|
||||
|
||||
func (b *builder) MustFindValueInBlk(variable Variable, blk BasicBlock) Value {
|
||||
typ := b.definedVariableType(variable)
|
||||
return b.findValue(typ, variable, blk.(*basicBlock))
|
||||
}
|
||||
|
||||
// MustFindValue implements Builder.MustFindValue.
|
||||
func (b *builder) MustFindValue(variable Variable) Value {
|
||||
typ := b.definedVariableType(variable)
|
||||
return b.findValue(typ, variable, b.currentBB)
|
||||
}
|
||||
|
||||
// findValue recursively tries to find the latest definition of a `variable`. The algorithm is described in
|
||||
// the section 2 of the paper https://link.springer.com/content/pdf/10.1007/978-3-642-37051-9_6.pdf.
|
||||
//
|
||||
// TODO: reimplement this iteratively, not recursively, to avoid stack overflow.
|
||||
func (b *builder) findValue(typ Type, variable Variable, blk *basicBlock) Value {
|
||||
if val, ok := blk.lastDefinitions[variable]; ok {
|
||||
// The value is already defined in this block!
|
||||
return val
|
||||
} else if !blk.sealed { // Incomplete CFG as in the paper.
|
||||
// If this block is not sealed yet, it might gain additional unknown predecessors later on.
// So we temporarily define a placeholder value here (not yet added as a parameter),
// and record it as unknown.
// The unknown values are resolved when this block is sealed via Seal().
|
||||
value := b.allocateValue(typ)
|
||||
if wazevoapi.SSALoggingEnabled {
|
||||
fmt.Printf("adding unknown value placeholder for %s at %d\n", variable, blk.id)
|
||||
}
|
||||
blk.lastDefinitions[variable] = value
|
||||
blk.unknownValues = append(blk.unknownValues, unknownValue{
|
||||
variable: variable,
|
||||
value: value,
|
||||
})
|
||||
return value
|
||||
}
|
||||
|
||||
if pred := blk.singlePred; pred != nil {
|
||||
// If this block is sealed and has only one predecessor,
// we can use the value in that block without ambiguity about its definition.
|
||||
return b.findValue(typ, variable, pred)
|
||||
} else if len(blk.preds) == 0 {
|
||||
panic("BUG: value is not defined for " + variable.String())
|
||||
}
|
||||
|
||||
// If this block has multiple predecessors, we have to gather the definitions,
|
||||
// and treat them as an argument to this block.
|
||||
//
|
||||
// The first thing is to define a new parameter to this block which may or may not be redundant, but
|
||||
// later we eliminate trivial params in an optimization pass. This must be done before finding the
|
||||
// definitions in the predecessors so that we can break the cycle.
|
||||
paramValue := blk.AddParam(b, typ)
|
||||
b.DefineVariable(variable, paramValue, blk)
|
||||
|
||||
// After the new param is added, we have to manipulate the original branching instructions
|
||||
// in predecessors so that they would pass the definition of `variable` as the argument to
|
||||
// the newly added PHI.
|
||||
for i := range blk.preds {
|
||||
pred := &blk.preds[i]
|
||||
value := b.findValue(typ, variable, pred.blk)
|
||||
pred.branch.addArgumentBranchInst(b, value)
|
||||
}
|
||||
return paramValue
|
||||
}
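
// An illustrative sketch of the multi-predecessor case above. Assumptions: x has been defined
// with a different Value in each of join's predecessors, both predecessors branch to join, and
// join is already sealed. Looking x up at the join then makes findValue add a block parameter
// (the PHI) and pass each predecessor's definition as a branch argument, as described in the
// paper referenced above.
func joinLookupSketch(b Builder, x Variable, join BasicBlock) Value {
	b.SetCurrentBlock(join)
	return b.MustFindValue(x) // returns join's newly added parameter Value
}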
|
||||
|
||||
// Seal implements Builder.Seal.
|
||||
func (b *builder) Seal(raw BasicBlock) {
|
||||
blk := raw.(*basicBlock)
|
||||
if len(blk.preds) == 1 {
|
||||
blk.singlePred = blk.preds[0].blk
|
||||
}
|
||||
blk.sealed = true
|
||||
|
||||
for _, v := range blk.unknownValues {
|
||||
variable, phiValue := v.variable, v.value
|
||||
typ := b.definedVariableType(variable)
|
||||
blk.addParamOn(typ, phiValue)
|
||||
for i := range blk.preds {
|
||||
pred := &blk.preds[i]
|
||||
predValue := b.findValue(typ, variable, pred.blk)
|
||||
if !predValue.Valid() {
|
||||
panic("BUG: value is not defined anywhere in the predecessors in the CFG")
|
||||
}
|
||||
pred.branch.addArgumentBranchInst(b, predValue)
|
||||
}
|
||||
}
|
||||
}
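
// For example (a sketch of the sealing discipline, not a statement about any particular frontend):
// a loop header cannot be sealed while its back-edge predecessor is still being built. Looking a
// variable up in the unsealed header creates a placeholder recorded in unknownValues (see findValue
// above); once the back edge has been added and Seal is called, each placeholder becomes a real
// block parameter and every predecessor's branch receives the matching argument.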
|
||||
|
||||
// definedVariableType returns the type of the given variable. If the variable is not defined yet, it panics.
|
||||
func (b *builder) definedVariableType(variable Variable) Type {
|
||||
typ := b.variables[variable]
|
||||
if typ.invalid() {
|
||||
panic(fmt.Sprintf("%s is not defined yet", variable))
|
||||
}
|
||||
return typ
|
||||
}
|
||||
|
||||
// Format implements Builder.Format.
|
||||
func (b *builder) Format() string {
|
||||
str := strings.Builder{}
|
||||
usedSigs := b.usedSignatures()
|
||||
if len(usedSigs) > 0 {
|
||||
str.WriteByte('\n')
|
||||
str.WriteString("signatures:\n")
|
||||
for _, sig := range usedSigs {
|
||||
str.WriteByte('\t')
|
||||
str.WriteString(sig.String())
|
||||
str.WriteByte('\n')
|
||||
}
|
||||
}
|
||||
|
||||
var iterBegin, iterNext func() *basicBlock
|
||||
if b.doneBlockLayout {
|
||||
iterBegin, iterNext = b.blockIteratorReversePostOrderBegin, b.blockIteratorReversePostOrderNext
|
||||
} else {
|
||||
iterBegin, iterNext = b.blockIteratorBegin, b.blockIteratorNext
|
||||
}
|
||||
for bb := iterBegin(); bb != nil; bb = iterNext() {
|
||||
str.WriteByte('\n')
|
||||
str.WriteString(bb.FormatHeader(b))
|
||||
str.WriteByte('\n')
|
||||
|
||||
for cur := bb.Root(); cur != nil; cur = cur.Next() {
|
||||
str.WriteByte('\t')
|
||||
str.WriteString(cur.Format(b))
|
||||
str.WriteByte('\n')
|
||||
}
|
||||
}
|
||||
return str.String()
|
||||
}
|
||||
|
||||
// BlockIteratorNext implements Builder.BlockIteratorNext.
|
||||
func (b *builder) BlockIteratorNext() BasicBlock {
|
||||
if blk := b.blockIteratorNext(); blk == nil {
|
||||
return nil // BasicBlock((*basicBlock)(nil)) != BasicBlock(nil)
|
||||
} else {
|
||||
return blk
|
||||
}
|
||||
}
|
||||
|
||||
// BlockIteratorNext implements Builder.BlockIteratorNext.
|
||||
func (b *builder) blockIteratorNext() *basicBlock {
|
||||
index := b.blockIterCur
|
||||
for {
|
||||
if index == b.basicBlocksPool.Allocated() {
|
||||
return nil
|
||||
}
|
||||
ret := b.basicBlocksPool.View(index)
|
||||
index++
|
||||
if !ret.invalid {
|
||||
b.blockIterCur = index
|
||||
return ret
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// BlockIteratorBegin implements Builder.BlockIteratorBegin.
|
||||
func (b *builder) BlockIteratorBegin() BasicBlock {
|
||||
return b.blockIteratorBegin()
|
||||
}
|
||||
|
||||
// BlockIteratorBegin implements Builder.BlockIteratorBegin.
|
||||
func (b *builder) blockIteratorBegin() *basicBlock {
|
||||
b.blockIterCur = 0
|
||||
return b.blockIteratorNext()
|
||||
}
|
||||
|
||||
// BlockIteratorReversePostOrderBegin implements Builder.BlockIteratorReversePostOrderBegin.
|
||||
func (b *builder) BlockIteratorReversePostOrderBegin() BasicBlock {
|
||||
return b.blockIteratorReversePostOrderBegin()
|
||||
}
|
||||
|
||||
// blockIteratorReversePostOrderBegin implements Builder.BlockIteratorReversePostOrderBegin.
|
||||
func (b *builder) blockIteratorReversePostOrderBegin() *basicBlock {
|
||||
b.blockIterCur = 0
|
||||
return b.blockIteratorReversePostOrderNext()
|
||||
}
|
||||
|
||||
// BlockIteratorReversePostOrderNext implements Builder.BlockIteratorReversePostOrderNext.
|
||||
func (b *builder) BlockIteratorReversePostOrderNext() BasicBlock {
|
||||
if blk := b.blockIteratorReversePostOrderNext(); blk == nil {
|
||||
return nil // BasicBlock((*basicBlock)(nil)) != BasicBlock(nil)
|
||||
} else {
|
||||
return blk
|
||||
}
|
||||
}
|
||||
|
||||
// blockIteratorReversePostOrderNext implements Builder.BlockIteratorReversePostOrderNext.
|
||||
func (b *builder) blockIteratorReversePostOrderNext() *basicBlock {
|
||||
if b.blockIterCur >= len(b.reversePostOrderedBasicBlocks) {
|
||||
return nil
|
||||
} else {
|
||||
ret := b.reversePostOrderedBasicBlocks[b.blockIterCur]
|
||||
b.blockIterCur++
|
||||
return ret
|
||||
}
|
||||
}
|
||||
|
||||
// ValueRefCounts implements Builder.ValueRefCounts.
|
||||
func (b *builder) ValueRefCounts() []int {
|
||||
return b.valueRefCounts
|
||||
}
|
||||
|
||||
// alias records that dst is an alias of src. The alias(es) will be
// eliminated in the optimization pass via resolveArgumentAlias.
|
||||
func (b *builder) alias(dst, src Value) {
|
||||
b.valueIDAliases[dst.ID()] = src
|
||||
}
|
||||
|
||||
// resolveArgumentAlias resolves the alias of the arguments of the given instruction.
|
||||
func (b *builder) resolveArgumentAlias(instr *Instruction) {
|
||||
if instr.v.Valid() {
|
||||
instr.v = b.resolveAlias(instr.v)
|
||||
}
|
||||
|
||||
if instr.v2.Valid() {
|
||||
instr.v2 = b.resolveAlias(instr.v2)
|
||||
}
|
||||
|
||||
if instr.v3.Valid() {
|
||||
instr.v3 = b.resolveAlias(instr.v3)
|
||||
}
|
||||
|
||||
view := instr.vs.View()
|
||||
for i, v := range view {
|
||||
view[i] = b.resolveAlias(v)
|
||||
}
|
||||
}
|
||||
|
||||
// resolveAlias resolves the alias of the given value.
|
||||
func (b *builder) resolveAlias(v Value) Value {
|
||||
// Some aliases are chained, so we need to follow the chain until it ends.
|
||||
for {
|
||||
if src, ok := b.valueIDAliases[v.ID()]; ok {
|
||||
v = src
|
||||
} else {
|
||||
break
|
||||
}
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
// entryBlk returns the entry block of the function.
|
||||
func (b *builder) entryBlk() *basicBlock {
|
||||
return b.basicBlocksPool.View(0)
|
||||
}
|
||||
|
||||
// isDominatedBy returns true if the given block `n` is dominated by the given block `d`.
|
||||
// Before calling this, the builder must have run passCalculateImmediateDominators.
|
||||
func (b *builder) isDominatedBy(n *basicBlock, d *basicBlock) bool {
|
||||
if len(b.dominators) == 0 {
|
||||
panic("BUG: passCalculateImmediateDominators must be called before calling isDominatedBy")
|
||||
}
|
||||
ent := b.entryBlk()
|
||||
doms := b.dominators
|
||||
for n != d && n != ent {
|
||||
n = doms[n.id]
|
||||
}
|
||||
return n == d
|
||||
}
|
||||
|
||||
// BlockIDMax implements Builder.BlockIDMax.
|
||||
func (b *builder) BlockIDMax() BasicBlockID {
|
||||
return BasicBlockID(b.basicBlocksPool.Allocated())
|
||||
}
|
||||
|
||||
// InsertUndefined implements Builder.InsertUndefined.
|
||||
func (b *builder) InsertUndefined() {
|
||||
instr := b.AllocateInstruction()
|
||||
instr.opcode = OpcodeUndefined
|
||||
b.InsertInstruction(instr)
|
||||
}
|
||||
|
||||
// LoopNestingForestRoots implements Builder.LoopNestingForestRoots.
|
||||
func (b *builder) LoopNestingForestRoots() []BasicBlock {
|
||||
return b.loopNestingForestRoots
|
||||
}
|
||||
|
||||
// LowestCommonAncestor implements Builder.LowestCommonAncestor.
|
||||
func (b *builder) LowestCommonAncestor(blk1, blk2 BasicBlock) BasicBlock {
|
||||
return b.sparseTree.findLCA(blk1.ID(), blk2.ID())
|
||||
}
|
||||
107
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/cmp.go
generated
vendored
Normal file
107
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/cmp.go
generated
vendored
Normal file
|
|
@@ -0,0 +1,107 @@
|
|||
package ssa
|
||||
|
||||
// IntegerCmpCond represents a condition for integer comparison.
|
||||
type IntegerCmpCond byte
|
||||
|
||||
const (
|
||||
// IntegerCmpCondInvalid represents an invalid condition.
|
||||
IntegerCmpCondInvalid IntegerCmpCond = iota
|
||||
// IntegerCmpCondEqual represents "==".
|
||||
IntegerCmpCondEqual
|
||||
// IntegerCmpCondNotEqual represents "!=".
|
||||
IntegerCmpCondNotEqual
|
||||
// IntegerCmpCondSignedLessThan represents Signed "<".
|
||||
IntegerCmpCondSignedLessThan
|
||||
// IntegerCmpCondSignedGreaterThanOrEqual represents Signed ">=".
|
||||
IntegerCmpCondSignedGreaterThanOrEqual
|
||||
// IntegerCmpCondSignedGreaterThan represents Signed ">".
|
||||
IntegerCmpCondSignedGreaterThan
|
||||
// IntegerCmpCondSignedLessThanOrEqual represents Signed "<=".
|
||||
IntegerCmpCondSignedLessThanOrEqual
|
||||
// IntegerCmpCondUnsignedLessThan represents Unsigned "<".
|
||||
IntegerCmpCondUnsignedLessThan
|
||||
// IntegerCmpCondUnsignedGreaterThanOrEqual represents Unsigned ">=".
|
||||
IntegerCmpCondUnsignedGreaterThanOrEqual
|
||||
// IntegerCmpCondUnsignedGreaterThan represents Unsigned ">".
|
||||
IntegerCmpCondUnsignedGreaterThan
|
||||
// IntegerCmpCondUnsignedLessThanOrEqual represents Unsigned "<=".
|
||||
IntegerCmpCondUnsignedLessThanOrEqual
|
||||
)
|
||||
|
||||
// String implements fmt.Stringer.
|
||||
func (i IntegerCmpCond) String() string {
|
||||
switch i {
|
||||
case IntegerCmpCondEqual:
|
||||
return "eq"
|
||||
case IntegerCmpCondNotEqual:
|
||||
return "neq"
|
||||
case IntegerCmpCondSignedLessThan:
|
||||
return "lt_s"
|
||||
case IntegerCmpCondSignedGreaterThanOrEqual:
|
||||
return "ge_s"
|
||||
case IntegerCmpCondSignedGreaterThan:
|
||||
return "gt_s"
|
||||
case IntegerCmpCondSignedLessThanOrEqual:
|
||||
return "le_s"
|
||||
case IntegerCmpCondUnsignedLessThan:
|
||||
return "lt_u"
|
||||
case IntegerCmpCondUnsignedGreaterThanOrEqual:
|
||||
return "ge_u"
|
||||
case IntegerCmpCondUnsignedGreaterThan:
|
||||
return "gt_u"
|
||||
case IntegerCmpCondUnsignedLessThanOrEqual:
|
||||
return "le_u"
|
||||
default:
|
||||
panic("invalid integer comparison condition")
|
||||
}
|
||||
}
|
||||
|
||||
// Signed returns true if the condition is signed integer comparison.
|
||||
func (i IntegerCmpCond) Signed() bool {
|
||||
switch i {
|
||||
case IntegerCmpCondSignedLessThan, IntegerCmpCondSignedGreaterThanOrEqual,
|
||||
IntegerCmpCondSignedGreaterThan, IntegerCmpCondSignedLessThanOrEqual:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
type FloatCmpCond byte
|
||||
|
||||
const (
|
||||
// FloatCmpCondInvalid represents an invalid condition.
|
||||
FloatCmpCondInvalid FloatCmpCond = iota
|
||||
// FloatCmpCondEqual represents "==".
|
||||
FloatCmpCondEqual
|
||||
// FloatCmpCondNotEqual represents "!=".
|
||||
FloatCmpCondNotEqual
|
||||
// FloatCmpCondLessThan represents "<".
|
||||
FloatCmpCondLessThan
|
||||
// FloatCmpCondLessThanOrEqual represents "<=".
|
||||
FloatCmpCondLessThanOrEqual
|
||||
// FloatCmpCondGreaterThan represents ">".
|
||||
FloatCmpCondGreaterThan
|
||||
// FloatCmpCondGreaterThanOrEqual represents ">=".
|
||||
FloatCmpCondGreaterThanOrEqual
|
||||
)
|
||||
|
||||
// String implements fmt.Stringer.
|
||||
func (f FloatCmpCond) String() string {
|
||||
switch f {
|
||||
case FloatCmpCondEqual:
|
||||
return "eq"
|
||||
case FloatCmpCondNotEqual:
|
||||
return "neq"
|
||||
case FloatCmpCondLessThan:
|
||||
return "lt"
|
||||
case FloatCmpCondLessThanOrEqual:
|
||||
return "le"
|
||||
case FloatCmpCondGreaterThan:
|
||||
return "gt"
|
||||
case FloatCmpCondGreaterThanOrEqual:
|
||||
return "ge"
|
||||
default:
|
||||
panic("invalid float comparison condition")
|
||||
}
|
||||
}
|
||||
12
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/funcref.go
generated
vendored
Normal file
12
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/funcref.go
generated
vendored
Normal file
|
|
@@ -0,0 +1,12 @@
|
|||
package ssa

import "fmt"

// FuncRef is a unique identifier for a function of the frontend,
// and is used to reference the function in a function call.
type FuncRef uint32

// String implements fmt.Stringer.
func (r FuncRef) String() string {
return fmt.Sprintf("f%d", r)
}
|
||||
2967
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/instructions.go
generated
vendored
Normal file
2967
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/instructions.go
generated
vendored
Normal file
File diff suppressed because it is too large
417
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass.go
generated
vendored
Normal file
417
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass.go
generated
vendored
Normal file
|
|
@@ -0,0 +1,417 @@
|
|||
package ssa
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
|
||||
)
|
||||
|
||||
// RunPasses implements Builder.RunPasses.
|
||||
//
|
||||
// The order here matters; some passes depend on the previous ones.
|
||||
//
|
||||
// Note that passes suffixed with "Opt" are the optimization passes, meaning that they edit the instructions and blocks,
// while the other passes do not; e.g. passEstimateBranchProbabilities does not edit them, but only calculates additional information.
|
||||
func (b *builder) RunPasses() {
|
||||
b.runPreBlockLayoutPasses()
|
||||
b.runBlockLayoutPass()
|
||||
b.runPostBlockLayoutPasses()
|
||||
b.runFinalizingPasses()
|
||||
}
|
||||
|
||||
func (b *builder) runPreBlockLayoutPasses() {
|
||||
passSortSuccessors(b)
|
||||
passDeadBlockEliminationOpt(b)
|
||||
passRedundantPhiEliminationOpt(b)
|
||||
// The result of passCalculateImmediateDominators will be used by various passes below.
|
||||
passCalculateImmediateDominators(b)
|
||||
passNopInstElimination(b)
|
||||
|
||||
// TODO: implement either conversion of irreducible CFG into reducible one, or irreducible CFG detection where we panic.
|
||||
// A WebAssembly program shouldn't result in an irreducible CFG, but we should handle it properly just in case.
|
||||
// See FixIrreducible pass in LLVM: https://llvm.org/doxygen/FixIrreducible_8cpp_source.html
|
||||
|
||||
// TODO: implement more optimization passes like:
|
||||
// block coalescing.
|
||||
// Copy-propagation.
|
||||
// Constant folding.
|
||||
// Common subexpression elimination.
|
||||
// Arithmetic simplifications.
|
||||
// and more!
|
||||
|
||||
// passDeadCodeEliminationOpt could be more accurate if we do this after other optimizations.
|
||||
passDeadCodeEliminationOpt(b)
|
||||
b.donePreBlockLayoutPasses = true
|
||||
}
|
||||
|
||||
func (b *builder) runBlockLayoutPass() {
|
||||
if !b.donePreBlockLayoutPasses {
|
||||
panic("runBlockLayoutPass must be called after all pre passes are done")
|
||||
}
|
||||
passLayoutBlocks(b)
|
||||
b.doneBlockLayout = true
|
||||
}
|
||||
|
||||
// runPostBlockLayoutPasses runs the post block layout passes. After this point, CFG is somewhat stable,
|
||||
// but still can be modified before finalizing passes. At this point, critical edges are split by passLayoutBlocks.
|
||||
func (b *builder) runPostBlockLayoutPasses() {
|
||||
if !b.doneBlockLayout {
|
||||
panic("runPostBlockLayoutPasses must be called after block layout pass is done")
|
||||
}
|
||||
// TODO: Do more. e.g. tail duplication, loop unrolling, etc.
|
||||
|
||||
b.donePostBlockLayoutPasses = true
|
||||
}
|
||||
|
||||
// runFinalizingPasses runs the finalizing passes. After this point, CFG should not be modified.
|
||||
func (b *builder) runFinalizingPasses() {
|
||||
if !b.donePostBlockLayoutPasses {
|
||||
panic("runFinalizingPasses must be called after post block layout passes are done")
|
||||
}
|
||||
// Critical edges are split, so we fix the loop nesting forest.
|
||||
passBuildLoopNestingForest(b)
|
||||
passBuildDominatorTree(b)
|
||||
// Now that we know the final placement of the blocks, we can explicitly mark the fallthrough jumps.
|
||||
b.markFallthroughJumps()
|
||||
}
|
||||
|
||||
// passDeadBlockEliminationOpt searches the unreachable blocks, and sets the basicBlock.invalid flag true if so.
|
||||
func passDeadBlockEliminationOpt(b *builder) {
|
||||
entryBlk := b.entryBlk()
|
||||
b.clearBlkVisited()
|
||||
b.blkStack = append(b.blkStack, entryBlk)
|
||||
for len(b.blkStack) > 0 {
|
||||
reachableBlk := b.blkStack[len(b.blkStack)-1]
|
||||
b.blkStack = b.blkStack[:len(b.blkStack)-1]
|
||||
b.blkVisited[reachableBlk] = 0 // the value won't be used in this pass.
|
||||
|
||||
if !reachableBlk.sealed && !reachableBlk.ReturnBlock() {
|
||||
panic(fmt.Sprintf("%s is not sealed", reachableBlk))
|
||||
}
|
||||
|
||||
if wazevoapi.SSAValidationEnabled {
|
||||
reachableBlk.validate(b)
|
||||
}
|
||||
|
||||
for _, succ := range reachableBlk.success {
|
||||
if _, ok := b.blkVisited[succ]; ok {
|
||||
continue
|
||||
}
|
||||
b.blkStack = append(b.blkStack, succ)
|
||||
}
|
||||
}
|
||||
|
||||
for blk := b.blockIteratorBegin(); blk != nil; blk = b.blockIteratorNext() {
|
||||
if _, ok := b.blkVisited[blk]; !ok {
|
||||
blk.invalid = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// passRedundantPhiEliminationOpt eliminates the redundant PHIs (in our terminology, parameters of a block).
|
||||
func passRedundantPhiEliminationOpt(b *builder) {
|
||||
redundantParameterIndexes := b.ints[:0] // reuse the slice from previous iterations.
|
||||
|
||||
// TODO: this might be costly for large programs, but at least in the experiments so far, it is almost the
// same as the single-iteration version in terms of overall compilation time. That *might be* mostly because
// removing many PHIs reduces the total number of instructions, rather than because the number of iterations
// is small. For example, the sqlite speedtest binary produces a large number of redundant PHIs, and the
// maximum number of iterations observed was 22, which seems acceptable but not that small either, since the
// complexity here is O(BlockNum * Iterations) in the worst case, where BlockNum can be on the order of thousands.
|
||||
for {
|
||||
changed := false
|
||||
_ = b.blockIteratorBegin() // skip entry block!
|
||||
// Below, we intentionally name the iteration variables, as this comes with inevitably nested for loops.
|
||||
for blk := b.blockIteratorNext(); blk != nil; blk = b.blockIteratorNext() {
|
||||
paramNum := len(blk.params)
|
||||
|
||||
for paramIndex := 0; paramIndex < paramNum; paramIndex++ {
|
||||
phiValue := blk.params[paramIndex].value
|
||||
redundant := true
|
||||
|
||||
nonSelfReferencingValue := ValueInvalid
|
||||
for predIndex := range blk.preds {
|
||||
br := blk.preds[predIndex].branch
|
||||
// Resolve the alias in the arguments so that we could use the previous iteration's result.
|
||||
b.resolveArgumentAlias(br)
|
||||
pred := br.vs.View()[paramIndex]
|
||||
if pred == phiValue {
|
||||
// This is self-referencing: PHI from the same PHI.
|
||||
continue
|
||||
}
|
||||
|
||||
if !nonSelfReferencingValue.Valid() {
|
||||
nonSelfReferencingValue = pred
|
||||
continue
|
||||
}
|
||||
|
||||
if nonSelfReferencingValue != pred {
|
||||
redundant = false
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if !nonSelfReferencingValue.Valid() {
|
||||
// This shouldn't happen, and must be a bug in builder.go.
|
||||
panic("BUG: params added but only self-referencing")
|
||||
}
|
||||
|
||||
if redundant {
|
||||
b.redundantParameterIndexToValue[paramIndex] = nonSelfReferencingValue
|
||||
redundantParameterIndexes = append(redundantParameterIndexes, paramIndex)
|
||||
}
|
||||
}
|
||||
|
||||
if len(b.redundantParameterIndexToValue) == 0 {
|
||||
continue
|
||||
}
|
||||
changed = true
|
||||
|
||||
// Remove the redundant PHIs from the argument list of branching instructions.
|
||||
for predIndex := range blk.preds {
|
||||
var cur int
|
||||
predBlk := blk.preds[predIndex]
|
||||
branchInst := predBlk.branch
|
||||
view := branchInst.vs.View()
|
||||
for argIndex, value := range view {
|
||||
if _, ok := b.redundantParameterIndexToValue[argIndex]; !ok {
|
||||
view[cur] = value
|
||||
cur++
|
||||
}
|
||||
}
|
||||
branchInst.vs.Cut(cur)
|
||||
}
|
||||
|
||||
// We still need a definition of the PHI value (previously the parameter).
|
||||
for _, redundantParamIndex := range redundantParameterIndexes {
|
||||
phiValue := blk.params[redundantParamIndex].value
|
||||
onlyValue := b.redundantParameterIndexToValue[redundantParamIndex]
|
||||
// Create an alias in this block from the only phi argument to the phi value.
|
||||
b.alias(phiValue, onlyValue)
|
||||
}
|
||||
|
||||
// Finally, remove the param from the blk.
|
||||
var cur int
|
||||
for paramIndex := 0; paramIndex < paramNum; paramIndex++ {
|
||||
param := blk.params[paramIndex]
|
||||
if _, ok := b.redundantParameterIndexToValue[paramIndex]; !ok {
|
||||
blk.params[cur] = param
|
||||
cur++
|
||||
}
|
||||
}
|
||||
blk.params = blk.params[:cur]
|
||||
|
||||
// Clears the map for the next iteration.
|
||||
for _, paramIndex := range redundantParameterIndexes {
|
||||
delete(b.redundantParameterIndexToValue, paramIndex)
|
||||
}
|
||||
redundantParameterIndexes = redundantParameterIndexes[:0]
|
||||
}
|
||||
|
||||
if !changed {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// Reuse the slice for the future passes.
|
||||
b.ints = redundantParameterIndexes
|
||||
}
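
// A small worked example (sketch): if a block has a parameter whose incoming arguments are the
// same value v from every predecessor (or v plus the parameter itself on a back edge), that
// parameter is redundant. The pass drops the corresponding argument from each predecessor's
// branch, removes the parameter from the block, and records alias(param, v), so later passes
// see v directly.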
|
||||
|
||||
// passDeadCodeEliminationOpt traverses all the instructions, calculates the reference count of each Value,
// and eliminates all the unnecessary instructions whose ref count is zero.
// The results are stored in builder.valueRefCounts. This also assigns an InstructionGroupID to each Instruction
// during the process. This is the last SSA-level optimization pass; after this,
// the SSA function is ready to be used by backends.
|
||||
//
|
||||
// TODO: the algorithm here might not be efficient. Get back to this later.
|
||||
func passDeadCodeEliminationOpt(b *builder) {
|
||||
nvid := int(b.nextValueID)
|
||||
if nvid >= len(b.valueRefCounts) {
|
||||
b.valueRefCounts = append(b.valueRefCounts, make([]int, b.nextValueID)...)
|
||||
}
|
||||
if nvid >= len(b.valueIDToInstruction) {
|
||||
b.valueIDToInstruction = append(b.valueIDToInstruction, make([]*Instruction, b.nextValueID)...)
|
||||
}
|
||||
|
||||
// First, we gather all the instructions with side effects.
|
||||
liveInstructions := b.instStack[:0]
|
||||
// During the process, we will assign InstructionGroupID to each instruction, which is not
|
||||
// relevant to dead code elimination, but we need in the backend.
|
||||
var gid InstructionGroupID
|
||||
for blk := b.blockIteratorBegin(); blk != nil; blk = b.blockIteratorNext() {
|
||||
for cur := blk.rootInstr; cur != nil; cur = cur.next {
|
||||
cur.gid = gid
|
||||
switch cur.sideEffect() {
|
||||
case sideEffectTraps:
|
||||
// Trappable instructions should always be alive.
|
||||
liveInstructions = append(liveInstructions, cur)
|
||||
case sideEffectStrict:
|
||||
liveInstructions = append(liveInstructions, cur)
|
||||
// The strict side effect should create different instruction groups.
|
||||
gid++
|
||||
}
|
||||
|
||||
r1, rs := cur.Returns()
|
||||
if r1.Valid() {
|
||||
b.valueIDToInstruction[r1.ID()] = cur
|
||||
}
|
||||
for _, r := range rs {
|
||||
b.valueIDToInstruction[r.ID()] = cur
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Find all the instructions referenced by live instructions transitively.
|
||||
for len(liveInstructions) > 0 {
|
||||
tail := len(liveInstructions) - 1
|
||||
live := liveInstructions[tail]
|
||||
liveInstructions = liveInstructions[:tail]
|
||||
if live.live {
|
||||
// If it's already marked alive, this is referenced multiple times,
|
||||
// so we can skip it.
|
||||
continue
|
||||
}
|
||||
live.live = true
|
||||
|
||||
// Before we walk, we need to resolve the alias first.
|
||||
b.resolveArgumentAlias(live)
|
||||
|
||||
v1, v2, v3, vs := live.Args()
|
||||
if v1.Valid() {
|
||||
producingInst := b.valueIDToInstruction[v1.ID()]
|
||||
if producingInst != nil {
|
||||
liveInstructions = append(liveInstructions, producingInst)
|
||||
}
|
||||
}
|
||||
|
||||
if v2.Valid() {
|
||||
producingInst := b.valueIDToInstruction[v2.ID()]
|
||||
if producingInst != nil {
|
||||
liveInstructions = append(liveInstructions, producingInst)
|
||||
}
|
||||
}
|
||||
|
||||
if v3.Valid() {
|
||||
producingInst := b.valueIDToInstruction[v3.ID()]
|
||||
if producingInst != nil {
|
||||
liveInstructions = append(liveInstructions, producingInst)
|
||||
}
|
||||
}
|
||||
|
||||
for _, v := range vs {
|
||||
producingInst := b.valueIDToInstruction[v.ID()]
|
||||
if producingInst != nil {
|
||||
liveInstructions = append(liveInstructions, producingInst)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Now that all the live instructions are flagged as live=true, we eliminate all dead instructions.
|
||||
for blk := b.blockIteratorBegin(); blk != nil; blk = b.blockIteratorNext() {
|
||||
for cur := blk.rootInstr; cur != nil; cur = cur.next {
|
||||
if !cur.live {
|
||||
// Remove the instruction from the list.
|
||||
if prev := cur.prev; prev != nil {
|
||||
prev.next = cur.next
|
||||
} else {
|
||||
blk.rootInstr = cur.next
|
||||
}
|
||||
if next := cur.next; next != nil {
|
||||
next.prev = cur.prev
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// If the value is alive, we can be sure that its arguments are definitely used.
// Hence, we can increment the value reference counts.
|
||||
v1, v2, v3, vs := cur.Args()
|
||||
if v1.Valid() {
|
||||
b.incRefCount(v1.ID(), cur)
|
||||
}
|
||||
if v2.Valid() {
|
||||
b.incRefCount(v2.ID(), cur)
|
||||
}
|
||||
if v3.Valid() {
|
||||
b.incRefCount(v3.ID(), cur)
|
||||
}
|
||||
for _, v := range vs {
|
||||
b.incRefCount(v.ID(), cur)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
b.instStack = liveInstructions // we reuse the stack for the next iteration.
|
||||
}
|
||||
|
||||
func (b *builder) incRefCount(id ValueID, from *Instruction) {
|
||||
if wazevoapi.SSALoggingEnabled {
|
||||
fmt.Printf("v%d referenced from %v\n", id, from.Format(b))
|
||||
}
|
||||
b.valueRefCounts[id]++
|
||||
}
|
||||
|
||||
// clearBlkVisited clears the b.blkVisited map so that we can reuse it in multiple places.
|
||||
func (b *builder) clearBlkVisited() {
|
||||
b.blkStack2 = b.blkStack2[:0]
|
||||
for key := range b.blkVisited {
|
||||
b.blkStack2 = append(b.blkStack2, key)
|
||||
}
|
||||
for _, blk := range b.blkStack2 {
|
||||
delete(b.blkVisited, blk)
|
||||
}
|
||||
b.blkStack2 = b.blkStack2[:0]
|
||||
}
|
||||
|
||||
// passNopInstElimination eliminates the instructions which are essentially no-ops.
|
||||
func passNopInstElimination(b *builder) {
|
||||
if int(b.nextValueID) >= len(b.valueIDToInstruction) {
|
||||
b.valueIDToInstruction = append(b.valueIDToInstruction, make([]*Instruction, b.nextValueID)...)
|
||||
}
|
||||
|
||||
for blk := b.blockIteratorBegin(); blk != nil; blk = b.blockIteratorNext() {
|
||||
for cur := blk.rootInstr; cur != nil; cur = cur.next {
|
||||
r1, rs := cur.Returns()
|
||||
if r1.Valid() {
|
||||
b.valueIDToInstruction[r1.ID()] = cur
|
||||
}
|
||||
for _, r := range rs {
|
||||
b.valueIDToInstruction[r.ID()] = cur
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for blk := b.blockIteratorBegin(); blk != nil; blk = b.blockIteratorNext() {
|
||||
for cur := blk.rootInstr; cur != nil; cur = cur.next {
|
||||
switch cur.Opcode() {
|
||||
// TODO: add more logics here.
|
||||
case OpcodeIshl, OpcodeSshr, OpcodeUshr:
|
||||
x, amount := cur.Arg2()
|
||||
definingInst := b.valueIDToInstruction[amount.ID()]
|
||||
if definingInst == nil {
|
||||
// If there's no defining instruction, that means the amount is coming from the parameter.
|
||||
continue
|
||||
}
|
||||
if definingInst.Constant() {
|
||||
v := definingInst.ConstantVal()
|
||||
|
||||
if x.Type().Bits() == 64 {
|
||||
v = v % 64
|
||||
} else {
|
||||
v = v % 32
|
||||
}
|
||||
if v == 0 {
|
||||
b.alias(cur.Return(), x)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
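
// For example (sketch): given y = Ishl(x, c) where c is a constant that is 0 modulo the bit width
// of x, the shift is a no-op, so the pass records alias(y, x); uses of y then resolve to x and the
// now-unreferenced shift is removed later by passDeadCodeEliminationOpt.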
|
||||
|
||||
// passSortSuccessors sorts the successors of each block in the natural program order.
|
||||
func passSortSuccessors(b *builder) {
|
||||
for i := 0; i < b.basicBlocksPool.Allocated(); i++ {
|
||||
blk := b.basicBlocksPool.View(i)
|
||||
sortBlocks(blk.success)
|
||||
}
|
||||
}
|
||||
335
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_blk_layouts.go
generated
vendored
Normal file
335
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_blk_layouts.go
generated
vendored
Normal file
|
|
@@ -0,0 +1,335 @@
|
|||
package ssa
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
|
||||
)
|
||||
|
||||
// passLayoutBlocks implements Builder.LayoutBlocks. This re-organizes builder.reversePostOrderedBasicBlocks.
|
||||
//
|
||||
// TODO: there are tons of room for improvement here. e.g. LLVM has BlockPlacementPass using BlockFrequencyInfo,
|
||||
// BranchProbabilityInfo, and LoopInfo to do a much better job. Also, if we have the profiling instrumentation
|
||||
// like ball-larus algorithm, then we could do profile-guided optimization. Basically all of them are trying
|
||||
// to maximize the fall-through opportunities which is most efficient.
|
||||
//
|
||||
// Here, a fallthrough happens when a block ends with a jump instruction whose target is the very next block in
// builder.reversePostOrderedBasicBlocks.
|
||||
//
|
||||
// Currently, we just place blocks using the DFS reverse post-order of the dominator tree with the heuristics:
|
||||
// 1. a split edge trampoline towards a loop header will be placed as a fallthrough.
|
||||
// 2. we invert the brz and brnz if it makes the fallthrough more likely.
|
||||
//
|
||||
// This heuristic is implemented in the maybeInvertBranches function.
|
||||
func passLayoutBlocks(b *builder) {
|
||||
b.clearBlkVisited()
|
||||
|
||||
// We might end up splitting critical edges which adds more basic blocks,
|
||||
// so we store the currently existing basic blocks in nonSplitBlocks temporarily.
|
||||
// That way we can iterate over the original basic blocks while appending new ones into reversePostOrderedBasicBlocks.
|
||||
nonSplitBlocks := b.blkStack[:0]
|
||||
for i, blk := range b.reversePostOrderedBasicBlocks {
|
||||
if !blk.Valid() {
|
||||
continue
|
||||
}
|
||||
nonSplitBlocks = append(nonSplitBlocks, blk)
|
||||
if i != len(b.reversePostOrderedBasicBlocks)-1 {
|
||||
_ = maybeInvertBranches(blk, b.reversePostOrderedBasicBlocks[i+1])
|
||||
}
|
||||
}
|
||||
|
||||
var trampolines []*basicBlock
|
||||
|
||||
// Reset the order slice since we update on the fly by splitting critical edges.
|
||||
b.reversePostOrderedBasicBlocks = b.reversePostOrderedBasicBlocks[:0]
|
||||
uninsertedTrampolines := b.blkStack2[:0]
|
||||
for _, blk := range nonSplitBlocks {
|
||||
for i := range blk.preds {
|
||||
pred := blk.preds[i].blk
|
||||
if _, ok := b.blkVisited[pred]; ok || !pred.Valid() {
|
||||
continue
|
||||
} else if pred.reversePostOrder < blk.reversePostOrder {
|
||||
// This means the edge is critical, and this pred is the trampoline and yet to be inserted.
|
||||
// Split edge trampolines must come before the destination in reverse post-order.
|
||||
b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, pred)
|
||||
b.blkVisited[pred] = 0 // mark as inserted, the value is not used.
|
||||
}
|
||||
}
|
||||
|
||||
// Now that we've already added all the potential trampoline blocks incoming to this block,
|
||||
// we can add this block itself.
|
||||
b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, blk)
|
||||
b.blkVisited[blk] = 0 // mark as inserted, the value is not used.
|
||||
|
||||
if len(blk.success) < 2 {
|
||||
// There won't be critical edge originating from this block.
|
||||
continue
|
||||
} else if blk.currentInstr.opcode == OpcodeBrTable {
|
||||
// We don't split critical edges here, because at the construction site of BrTable, we already split the edges.
|
||||
continue
|
||||
}
|
||||
|
||||
for sidx, succ := range blk.success {
|
||||
if !succ.ReturnBlock() && // If the successor is a return block, we need to split the edge any way because we need "epilogue" to be inserted.
|
||||
// Plus if there's no multiple incoming edges to this successor, (pred, succ) is not critical.
|
||||
len(succ.preds) < 2 {
|
||||
continue
|
||||
}
|
||||
|
||||
// Otherwise, we are sure this is a critical edge. To modify the CFG, we need to find the predecessor info
|
||||
// from the successor.
|
||||
var predInfo *basicBlockPredecessorInfo
|
||||
for i := range succ.preds { // This linear search should not be a problem since the number of predecessors should almost always small.
|
||||
pred := &succ.preds[i]
|
||||
if pred.blk == blk {
|
||||
predInfo = pred
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if predInfo == nil {
|
||||
// This must be a bug in somewhere around branch manipulation.
|
||||
panic("BUG: predecessor info not found while the successor exists in successors list")
|
||||
}
|
||||
|
||||
if wazevoapi.SSALoggingEnabled {
|
||||
fmt.Printf("trying to split edge from %d->%d at %s\n",
|
||||
blk.ID(), succ.ID(), predInfo.branch.Format(b))
|
||||
}
|
||||
|
||||
trampoline := b.splitCriticalEdge(blk, succ, predInfo)
|
||||
// Update the successors slice because the target is no longer the original `succ`.
|
||||
blk.success[sidx] = trampoline
|
||||
|
||||
if wazevoapi.SSAValidationEnabled {
|
||||
trampolines = append(trampolines, trampoline)
|
||||
}
|
||||
|
||||
if wazevoapi.SSALoggingEnabled {
|
||||
fmt.Printf("edge split from %d->%d at %s as %d->%d->%d \n",
|
||||
blk.ID(), succ.ID(), predInfo.branch.Format(b),
|
||||
blk.ID(), trampoline.ID(), succ.ID())
|
||||
}
|
||||
|
||||
fallthroughBranch := blk.currentInstr
|
||||
if fallthroughBranch.opcode == OpcodeJump && fallthroughBranch.blk == trampoline {
|
||||
// This can be lowered as fallthrough at the end of the block.
|
||||
b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, trampoline)
|
||||
b.blkVisited[trampoline] = 0 // mark as inserted, the value is not used.
|
||||
} else {
|
||||
uninsertedTrampolines = append(uninsertedTrampolines, trampoline)
|
||||
}
|
||||
}
|
||||
|
||||
for _, trampoline := range uninsertedTrampolines {
|
||||
if trampoline.success[0].reversePostOrder <= trampoline.reversePostOrder { // "<=", not "<" because the target might be itself.
|
||||
// This means the critical edge was backward, so we insert after the current block immediately.
|
||||
b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, trampoline)
|
||||
b.blkVisited[trampoline] = 0 // mark as inserted, the value is not used.
|
||||
} // If the target is forward, we can wait to insert until the target is inserted.
|
||||
}
|
||||
uninsertedTrampolines = uninsertedTrampolines[:0] // Reuse the stack for the next block.
|
||||
}
|
||||
|
||||
if wazevoapi.SSALoggingEnabled {
|
||||
var bs []string
|
||||
for _, blk := range b.reversePostOrderedBasicBlocks {
|
||||
bs = append(bs, blk.Name())
|
||||
}
|
||||
fmt.Println("ordered blocks: ", strings.Join(bs, ", "))
|
||||
}
|
||||
|
||||
if wazevoapi.SSAValidationEnabled {
|
||||
for _, trampoline := range trampolines {
|
||||
if _, ok := b.blkVisited[trampoline]; !ok {
|
||||
panic("BUG: trampoline block not inserted: " + trampoline.FormatHeader(b))
|
||||
}
|
||||
trampoline.validate(b)
|
||||
}
|
||||
}
|
||||
|
||||
// Reuse the stack for the next iteration.
|
||||
b.blkStack2 = uninsertedTrampolines[:0]
|
||||
}
|
||||
|
||||
// markFallthroughJumps finds the fallthrough jumps and marks them as such.
|
||||
func (b *builder) markFallthroughJumps() {
|
||||
l := len(b.reversePostOrderedBasicBlocks) - 1
|
||||
for i, blk := range b.reversePostOrderedBasicBlocks {
|
||||
if i < l {
|
||||
cur := blk.currentInstr
|
||||
if cur.opcode == OpcodeJump && cur.blk == b.reversePostOrderedBasicBlocks[i+1] {
|
||||
cur.AsFallthroughJump()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// maybeInvertBranches inverts the branch instructions if it is likely possible to the fallthrough more likely with simple heuristics.
|
||||
// nextInRPO is the next block in the reverse post-order.
|
||||
//
|
||||
// Returns true if the branch is inverted for testing purpose.
|
||||
func maybeInvertBranches(now *basicBlock, nextInRPO *basicBlock) bool {
|
||||
fallthroughBranch := now.currentInstr
|
||||
if fallthroughBranch.opcode == OpcodeBrTable {
|
||||
return false
|
||||
}
|
||||
|
||||
condBranch := fallthroughBranch.prev
|
||||
if condBranch == nil || (condBranch.opcode != OpcodeBrnz && condBranch.opcode != OpcodeBrz) {
|
||||
return false
|
||||
}
|
||||
|
||||
if len(fallthroughBranch.vs.View()) != 0 || len(condBranch.vs.View()) != 0 {
|
||||
// If either one of them has arguments, we don't invert the branches.
|
||||
return false
|
||||
}
|
||||
|
||||
// So this block has two branches (a conditional branch followed by an unconditional branch) at the end.
|
||||
// We can invert the condition of the branch if it makes the fallthrough more likely.
|
||||
|
||||
fallthroughTarget, condTarget := fallthroughBranch.blk.(*basicBlock), condBranch.blk.(*basicBlock)
|
||||
|
||||
if fallthroughTarget.loopHeader {
|
||||
// First, if the tail's target is loopHeader, we don't need to do anything here,
|
||||
// because the edge is likely to be critical edge for complex loops (e.g. loop with branches inside it).
|
||||
// That means, we will split the edge in the end of LayoutBlocks function, and insert the trampoline block
|
||||
// right after this block, which will be fallthrough in any way.
|
||||
return false
|
||||
} else if condTarget.loopHeader {
|
||||
// On the other hand, if the condBranch's target is loopHeader, we invert the condition of the branch
|
||||
// so that we could get the fallthrough to the trampoline block.
|
||||
goto invert
|
||||
}
|
||||
|
||||
if fallthroughTarget == nextInRPO {
|
||||
// Also, if the tail's target is the next block in the reverse post-order, we don't need to do anything here,
|
||||
// because if this is not critical edge, we would end up placing these two blocks adjacent to each other.
|
||||
// Even if it is the critical edge, we place the trampoline block right after this block, which will be fallthrough in any way.
|
||||
return false
|
||||
} else if condTarget == nextInRPO {
|
||||
// If the condBranch's target is the next block in the reverse post-order, we invert the condition of the branch
|
||||
// so that we could get the fallthrough to the block.
|
||||
goto invert
|
||||
} else {
|
||||
return false
|
||||
}
|
||||
|
||||
invert:
|
||||
for i := range fallthroughTarget.preds {
|
||||
pred := &fallthroughTarget.preds[i]
|
||||
if pred.branch == fallthroughBranch {
|
||||
pred.branch = condBranch
|
||||
break
|
||||
}
|
||||
}
|
||||
for i := range condTarget.preds {
|
||||
pred := &condTarget.preds[i]
|
||||
if pred.branch == condBranch {
|
||||
pred.branch = fallthroughBranch
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
condBranch.InvertBrx()
|
||||
condBranch.blk = fallthroughTarget
|
||||
fallthroughBranch.blk = condTarget
|
||||
if wazevoapi.SSALoggingEnabled {
|
||||
fmt.Printf("inverting branches at %d->%d and %d->%d\n",
|
||||
now.ID(), fallthroughTarget.ID(), now.ID(), condTarget.ID())
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
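The inversion heuristic above only fires when the conditional target is the block that will be laid out next. A minimal standalone sketch of that decision on toy data (plain ints for block IDs and a bool for brz/brnz, not this package's types):

```go
package main

import "fmt"

// A block tail of the form "brz cond -> condTarget; jump -> fallthroughTarget".
// If condTarget is the next block in layout order, swapping targets and
// inverting the condition turns the unconditional jump into a fallthrough.
type tailBranches struct {
	condIsZero        bool // brz vs. brnz
	condTarget        int
	fallthroughTarget int
}

func maybeInvert(t *tailBranches, nextInLayout int) bool {
	if t.fallthroughTarget == nextInLayout {
		return false // already a fallthrough, nothing to do
	}
	if t.condTarget != nextInLayout {
		return false // inverting would not help either
	}
	t.condIsZero = !t.condIsZero // brz <-> brnz
	t.condTarget, t.fallthroughTarget = t.fallthroughTarget, t.condTarget
	return true
}

func main() {
	t := &tailBranches{condIsZero: true, condTarget: 7, fallthroughTarget: 3}
	fmt.Println(maybeInvert(t, 7), t.fallthroughTarget) // true 7: block 7 now falls through
}
```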
// splitCriticalEdge splits the critical edge between the given predecessor (`pred`) and successor (owning `predInfo`).
//
//   - `pred` is the source of the critical edge,
//   - `succ` is the destination of the critical edge,
//   - `predInfo` is the predecessor info in the succ.preds slice which represents the critical edge.
//
// Why is splitting critical edges important? See the following links:
//
//   - https://en.wikipedia.org/wiki/Control-flow_graph
//   - https://nickdesaulniers.github.io/blog/2023/01/27/critical-edge-splitting/
//
// The returned basic block is the trampoline block which is inserted to split the critical edge.
func (b *builder) splitCriticalEdge(pred, succ *basicBlock, predInfo *basicBlockPredecessorInfo) *basicBlock {
	// In the following, we convert the following CFG:
	//
	//	pred --(originalBranch)--> succ
	//
	// to the following CFG:
	//
	//	pred --(newBranch)--> trampoline --(originalBranch)-> succ
	//
	// where trampoline is a new basic block which is created to split the critical edge.

	trampoline := b.allocateBasicBlock()
	if int(trampoline.id) >= len(b.dominators) {
		b.dominators = append(b.dominators, make([]*basicBlock, trampoline.id+1)...)
	}
	b.dominators[trampoline.id] = pred

	originalBranch := predInfo.branch

	// Replace originalBranch with the newBranch.
	newBranch := b.AllocateInstruction()
	newBranch.opcode = originalBranch.opcode
	newBranch.blk = trampoline
	switch originalBranch.opcode {
	case OpcodeJump:
	case OpcodeBrz, OpcodeBrnz:
		originalBranch.opcode = OpcodeJump // Trampoline consists of one unconditional branch.
		newBranch.v = originalBranch.v
		originalBranch.v = ValueInvalid
	default:
		panic("BUG: critical edge shouldn't originate from br_table")
	}
	swapInstruction(pred, originalBranch, newBranch)

	// Replace the original branch with the new branch.
	trampoline.rootInstr = originalBranch
	trampoline.currentInstr = originalBranch
	trampoline.success = append(trampoline.success, succ) // Do not use []*basicBlock{pred} because we might have already allocated the slice.
	trampoline.preds = append(trampoline.preds, // same as ^.
		basicBlockPredecessorInfo{blk: pred, branch: newBranch})
	b.Seal(trampoline)

	// Update the original branch to point to the trampoline.
	predInfo.blk = trampoline
	predInfo.branch = originalBranch

	if wazevoapi.SSAValidationEnabled {
		trampoline.validate(b)
	}

	if len(trampoline.params) > 0 {
		panic("trampoline should not have params")
	}

	// Assign the same order as the original block so that this will be placed before the actual destination.
	trampoline.reversePostOrder = pred.reversePostOrder
	return trampoline
}

// swapInstruction replaces `old` in the block `blk` with `New`.
func swapInstruction(blk *basicBlock, old, New *Instruction) {
	if blk.rootInstr == old {
		blk.rootInstr = New
		next := old.next
		New.next = next
		next.prev = New
	} else {
		if blk.currentInstr == old {
			blk.currentInstr = New
		}
		prev := old.prev
		prev.next, New.prev = New, prev
		if next := old.next; next != nil {
			New.next, next.prev = next, New
		}
	}
	old.prev, old.next = nil, nil
}
312
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_cfg.go
generated
vendored
Normal file
@@ -0,0 +1,312 @@
package ssa

import (
	"fmt"
	"math"
	"strings"

	"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)

// passCalculateImmediateDominators calculates immediate dominators for each basic block.
// The result is stored in b.dominators. This makes it possible for the following passes to
// use builder.isDominatedBy to check if a block is dominated by another block.
//
// At the end of the pass, this function also does the loop detection and sets the basicBlock.loop flag.
func passCalculateImmediateDominators(b *builder) {
	reversePostOrder := b.reversePostOrderedBasicBlocks[:0]
	exploreStack := b.blkStack[:0]
	b.clearBlkVisited()

	entryBlk := b.entryBlk()

	// Store the reverse postorder from the entrypoint into the reversePostOrder slice.
	// This calculation of reverse postorder is not described in the paper,
	// so we use a heuristic to calculate it so that we could potentially handle arbitrarily
	// complex CFGs under the assumption that success is sorted in the program's natural order.
	// That means blk.success[i] always appears before blk.success[i+1] in the source program,
	// which is a reasonable assumption as long as SSA Builder is properly used.
	//
	// First we push blocks in postorder by iteratively visiting successors of the entry block.
	exploreStack = append(exploreStack, entryBlk)
	const visitStateUnseen, visitStateSeen, visitStateDone = 0, 1, 2
	b.blkVisited[entryBlk] = visitStateSeen
	for len(exploreStack) > 0 {
		tail := len(exploreStack) - 1
		blk := exploreStack[tail]
		exploreStack = exploreStack[:tail]
		switch b.blkVisited[blk] {
		case visitStateUnseen:
			// This is likely a bug in the frontend.
			panic("BUG: unsupported CFG")
		case visitStateSeen:
			// This is the first time we pop this block, and we have to see the successors first.
			// So push this block again to the stack.
			exploreStack = append(exploreStack, blk)
			// And push the successors to the stack if necessary.
			for _, succ := range blk.success {
				if succ.ReturnBlock() || succ.invalid {
					continue
				}
				if b.blkVisited[succ] == visitStateUnseen {
					b.blkVisited[succ] = visitStateSeen
					exploreStack = append(exploreStack, succ)
				}
			}
			// Finally, we could pop this block once we pop all of its successors.
			b.blkVisited[blk] = visitStateDone
		case visitStateDone:
			// Note: at this point we push blk in postorder despite its name.
			reversePostOrder = append(reversePostOrder, blk)
		}
	}
	// At this point, reversePostOrder actually holds the postorder, so we reverse it.
	for i := len(reversePostOrder)/2 - 1; i >= 0; i-- {
		j := len(reversePostOrder) - 1 - i
		reversePostOrder[i], reversePostOrder[j] = reversePostOrder[j], reversePostOrder[i]
	}

	for i, blk := range reversePostOrder {
		blk.reversePostOrder = i
	}

	// Reuse the dominators slice if possible from the previous function's computation.
	b.dominators = b.dominators[:cap(b.dominators)]
	if len(b.dominators) < b.basicBlocksPool.Allocated() {
		// Generously reserve space in the slice because the slice will be reused for future allocations.
		b.dominators = append(b.dominators, make([]*basicBlock, b.basicBlocksPool.Allocated())...)
	}
	calculateDominators(reversePostOrder, b.dominators)

	// Reuse the slices for future use.
	b.blkStack = exploreStack

	// For the following passes.
	b.reversePostOrderedBasicBlocks = reversePostOrder

	// Ready to detect loops!
	subPassLoopDetection(b)
}

// calculateDominators calculates the immediate dominator of each node in the CFG, and stores the result in `doms`.
// The algorithm is based on the one described in the paper "A Simple, Fast Dominance Algorithm"
// https://www.cs.rice.edu/~keith/EMBED/dom.pdf which is a faster/simpler alternative to the well known Lengauer-Tarjan algorithm.
//
// The following code almost matches the pseudocode in the paper with one exception (see the code comment below).
//
// The result slice `doms` must be pre-allocated with a size larger than the size of dfsBlocks.
func calculateDominators(reversePostOrderedBlks []*basicBlock, doms []*basicBlock) {
	entry, reversePostOrderedBlks := reversePostOrderedBlks[0], reversePostOrderedBlks[1: /* skips entry point */]
	for _, blk := range reversePostOrderedBlks {
		doms[blk.id] = nil
	}
	doms[entry.id] = entry

	changed := true
	for changed {
		changed = false
		for _, blk := range reversePostOrderedBlks {
			var u *basicBlock
			for i := range blk.preds {
				pred := blk.preds[i].blk
				// Skip if this pred is not reachable yet. Note that this is not described in the paper,
				// but it is necessary to handle nested loops etc.
				if doms[pred.id] == nil {
					continue
				}

				if u == nil {
					u = pred
					continue
				} else {
					u = intersect(doms, u, pred)
				}
			}
			if doms[blk.id] != u {
				doms[blk.id] = u
				changed = true
			}
		}
	}
}

// intersect returns the common dominator of blk1 and blk2.
//
// This is the `intersect` function in the paper.
func intersect(doms []*basicBlock, blk1 *basicBlock, blk2 *basicBlock) *basicBlock {
	finger1, finger2 := blk1, blk2
	for finger1 != finger2 {
		// Move the 'finger1' upwards to its immediate dominator.
		for finger1.reversePostOrder > finger2.reversePostOrder {
			finger1 = doms[finger1.id]
		}
		// Move the 'finger2' upwards to its immediate dominator.
		for finger2.reversePostOrder > finger1.reversePostOrder {
			finger2 = doms[finger2.id]
		}
	}
	return finger1
}
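calculateDominators and intersect follow the Cooper, Harvey and Kennedy iterative scheme from the cited paper. A minimal standalone sketch of the same loop on a toy diamond-shaped CFG, using ints numbered in reverse post-order instead of this package's *basicBlock:

```go
package main

import "fmt"

// Toy CFG in reverse post-order: 0 is the entry, 1 and 2 are branch arms, 3 is the merge.
// preds[b] lists the predecessors of block b.
func main() {
	preds := [][]int{0: {}, 1: {0}, 2: {0}, 3: {1, 2}}
	idom := []int{0, -1, -1, -1} // -1 means "not computed yet"; the entry dominates itself.

	intersect := func(a, b int) int {
		for a != b {
			for a > b {
				a = idom[a]
			}
			for b > a {
				b = idom[b]
			}
		}
		return a
	}

	for changed := true; changed; {
		changed = false
		for b := 1; b < len(preds); b++ {
			u := -1
			for _, p := range preds[b] {
				if idom[p] == -1 {
					continue // unreachable so far, as in the pass above
				}
				if u == -1 {
					u = p
				} else {
					u = intersect(u, p)
				}
			}
			if idom[b] != u {
				idom[b], changed = u, true
			}
		}
	}
	fmt.Println(idom) // [0 0 0 0]: the entry immediately dominates every block.
}
```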
// subPassLoopDetection detects loops in the function using the immediate dominators.
//
// This is run at the end of passCalculateImmediateDominators.
func subPassLoopDetection(b *builder) {
	for blk := b.blockIteratorBegin(); blk != nil; blk = b.blockIteratorNext() {
		for i := range blk.preds {
			pred := blk.preds[i].blk
			if pred.invalid {
				continue
			}
			if b.isDominatedBy(pred, blk) {
				blk.loopHeader = true
			}
		}
	}
}

// passBuildLoopNestingForest builds the loop nesting forest for the function.
// This must be called after branch splitting since it relies on the CFG.
func passBuildLoopNestingForest(b *builder) {
	ent := b.entryBlk()
	doms := b.dominators
	for _, blk := range b.reversePostOrderedBasicBlocks {
		n := doms[blk.id]
		for !n.loopHeader && n != ent {
			n = doms[n.id]
		}

		if n == ent && blk.loopHeader {
			b.loopNestingForestRoots = append(b.loopNestingForestRoots, blk)
		} else if n == ent {
		} else if n.loopHeader {
			n.loopNestingForestChildren = append(n.loopNestingForestChildren, blk)
		}
	}

	if wazevoapi.SSALoggingEnabled {
		for _, root := range b.loopNestingForestRoots {
			printLoopNestingForest(root.(*basicBlock), 0)
		}
	}
}

func printLoopNestingForest(root *basicBlock, depth int) {
	fmt.Println(strings.Repeat("\t", depth), "loop nesting forest root:", root.ID())
	for _, child := range root.loopNestingForestChildren {
		fmt.Println(strings.Repeat("\t", depth+1), "child:", child.ID())
		if child.LoopHeader() {
			printLoopNestingForest(child.(*basicBlock), depth+2)
		}
	}
}

type dominatorSparseTree struct {
	time         int
	euler        []*basicBlock
	first, depth []int
	table        [][]int
}

// passBuildDominatorTree builds the dominator tree for the function, and constructs builder.sparseTree.
func passBuildDominatorTree(b *builder) {
	// First we materialize the children of each node in the dominator tree.
	idoms := b.dominators
	for _, blk := range b.reversePostOrderedBasicBlocks {
		parent := idoms[blk.id]
		if parent == nil {
			panic("BUG")
		} else if parent == blk {
			// This is the entry block.
			continue
		}
		if prev := parent.child; prev == nil {
			parent.child = blk
		} else {
			parent.child = blk
			blk.sibling = prev
		}
	}

	// Reset the state from the previous computation.
	n := b.basicBlocksPool.Allocated()
	st := &b.sparseTree
	st.euler = append(st.euler[:0], make([]*basicBlock, 2*n-1)...)
	st.first = append(st.first[:0], make([]int, n)...)
	for i := range st.first {
		st.first[i] = -1
	}
	st.depth = append(st.depth[:0], make([]int, 2*n-1)...)
	st.time = 0

	// Start building the sparse tree.
	st.eulerTour(b.entryBlk(), 0)
	st.buildSparseTable()
}

func (dt *dominatorSparseTree) eulerTour(node *basicBlock, height int) {
	if wazevoapi.SSALoggingEnabled {
		fmt.Println(strings.Repeat("\t", height), "euler tour:", node.ID())
	}
	dt.euler[dt.time] = node
	dt.depth[dt.time] = height
	if dt.first[node.id] == -1 {
		dt.first[node.id] = dt.time
	}
	dt.time++

	for child := node.child; child != nil; child = child.sibling {
		dt.eulerTour(child, height+1)
		dt.euler[dt.time] = node // add the current node again after visiting a child
		dt.depth[dt.time] = height
		dt.time++
	}
}

// buildSparseTable builds a sparse table for RMQ queries.
func (dt *dominatorSparseTree) buildSparseTable() {
	n := len(dt.depth)
	k := int(math.Log2(float64(n))) + 1
	table := dt.table

	if n >= len(table) {
		table = append(table, make([][]int, n+1)...)
	}
	for i := range table {
		if len(table[i]) < k {
			table[i] = append(table[i], make([]int, k)...)
		}
		table[i][0] = i
	}

	for j := 1; 1<<j <= n; j++ {
		for i := 0; i+(1<<j)-1 < n; i++ {
			if dt.depth[table[i][j-1]] < dt.depth[table[i+(1<<(j-1))][j-1]] {
				table[i][j] = table[i][j-1]
			} else {
				table[i][j] = table[i+(1<<(j-1))][j-1]
			}
		}
	}
	dt.table = table
}

// rmq performs a range minimum query on the sparse table.
func (dt *dominatorSparseTree) rmq(l, r int) int {
	table := dt.table
	depth := dt.depth
	j := int(math.Log2(float64(r - l + 1)))
	if depth[table[l][j]] <= depth[table[r-(1<<j)+1][j]] {
		return table[l][j]
	}
	return table[r-(1<<j)+1][j]
}

// findLCA finds the LCA using the Euler tour and RMQ.
func (dt *dominatorSparseTree) findLCA(u, v BasicBlockID) *basicBlock {
	first := dt.first
	if first[u] > first[v] {
		u, v = v, u
	}
	return dt.euler[dt.rmq(first[u], first[v])]
}
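The sparse tree above answers lowest-common-ancestor queries by taking a range-minimum over the Euler tour of the dominator tree. A standalone sketch of the same idea on a toy tree with int node IDs; for brevity it scans the range linearly where the code above consults the precomputed sparse table:

```go
package main

import "fmt"

// Toy tree: 0 is the root, children[p] lists the children of p.
var children = [][]int{0: {1, 2}, 1: {3}, 2: {}, 3: {}}

var (
	euler []int // nodes in visit order, re-appending the parent after each child
	depth []int // depth of each euler entry
	first = map[int]int{}
)

func tour(n, h int) {
	if _, seen := first[n]; !seen {
		first[n] = len(euler)
	}
	euler, depth = append(euler, n), append(depth, h)
	for _, c := range children[n] {
		tour(c, h+1)
		euler, depth = append(euler, n), append(depth, h)
	}
}

// lca scans the euler range between the first occurrences of u and v and
// returns the shallowest node; a sparse table answers the same query in O(1).
func lca(u, v int) int {
	l, r := first[u], first[v]
	if l > r {
		l, r = r, l
	}
	best := l
	for i := l + 1; i <= r; i++ {
		if depth[i] < depth[best] {
			best = i
		}
	}
	return euler[best]
}

func main() {
	tour(0, 0)
	fmt.Println(lca(3, 2)) // 0: the root is the lowest common ancestor of 3 and 2.
}
```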
49
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/signature.go
generated
vendored
Normal file
@@ -0,0 +1,49 @@
package ssa

import (
	"fmt"
	"strings"
)

// Signature is a function prototype.
type Signature struct {
	// ID is a unique identifier for this signature used for lookup.
	ID SignatureID
	// Params and Results are the types of the parameters and results of the function.
	Params, Results []Type

	// used is true if this is used by the currently-compiled function.
	// Debugging only.
	used bool
}

// String implements fmt.Stringer.
func (s *Signature) String() string {
	str := strings.Builder{}
	str.WriteString(s.ID.String())
	str.WriteString(": ")
	if len(s.Params) > 0 {
		for _, typ := range s.Params {
			str.WriteString(typ.String())
		}
	} else {
		str.WriteByte('v')
	}
	str.WriteByte('_')
	if len(s.Results) > 0 {
		for _, typ := range s.Results {
			str.WriteString(typ.String())
		}
	} else {
		str.WriteByte('v')
	}
	return str.String()
}

// SignatureID is a unique identifier used for lookup.
type SignatureID int

// String implements fmt.Stringer.
func (s SignatureID) String() string {
	return fmt.Sprintf("sig%d", s)
}
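For reference, the String format above concatenates parameter and result types around an underscore, with 'v' standing in for an empty list. A hypothetical in-package example; the internal package cannot be imported from outside wazero, so this only sketches the expected output:

```go
package ssa

import "fmt"

// Sketch of an in-package testable example for Signature.String.
func ExampleSignature_String() {
	sig := &Signature{ID: 7, Params: []Type{TypeI32, TypeF64}, Results: []Type{TypeI64}}
	void := &Signature{ID: 8}
	fmt.Println(sig, void)
	// Output: sig7: i32f64_i64 sig8: v_v
}
```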
14
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/ssa.go
generated
vendored
Normal file
@@ -0,0 +1,14 @@
// Package ssa is used to construct SSA functions. By nature this is free of Wasm-specific things
// and ISA.
//
// We use the "block argument" variant of SSA: https://en.wikipedia.org/wiki/Static_single-assignment_form#Block_arguments
// which is equivalent to the traditional PHI function based one, but more convenient during optimizations.
// However, in this package's source code comments, we might use PHI whenever it seems necessary in order to be aligned with
// existing literature, e.g. SSA level optimization algorithms are often described using PHI nodes.
//
// The rationale doc for the choice of "block argument" by LLVM's MLIR is worth a read:
// https://mlir.llvm.org/docs/Rationale/Rationale/#block-arguments-vs-phi-nodes
//
// The algorithm to resolve variable definitions used here is based on the paper
// "Simple and Efficient Construction of Static Single Assignment Form": https://link.springer.com/content/pdf/10.1007/978-3-642-37051-9_6.pdf.
package ssa
112
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/type.go
generated
vendored
Normal file
@@ -0,0 +1,112 @@
package ssa

type Type byte

const (
	typeInvalid Type = iota

	// TODO: add 8, 16 bit types when it's needed for optimizations.

	// TypeI32 represents an integer type with 32 bits.
	TypeI32

	// TypeI64 represents an integer type with 64 bits.
	TypeI64

	// TypeF32 represents 32-bit floats in the IEEE 754.
	TypeF32

	// TypeF64 represents 64-bit floats in the IEEE 754.
	TypeF64

	// TypeV128 represents 128-bit SIMD vectors.
	TypeV128
)

// String implements fmt.Stringer.
func (t Type) String() (ret string) {
	switch t {
	case typeInvalid:
		return "invalid"
	case TypeI32:
		return "i32"
	case TypeI64:
		return "i64"
	case TypeF32:
		return "f32"
	case TypeF64:
		return "f64"
	case TypeV128:
		return "v128"
	default:
		panic(int(t))
	}
}

// IsInt returns true if the type is an integer type.
func (t Type) IsInt() bool {
	return t == TypeI32 || t == TypeI64
}

// IsFloat returns true if the type is a floating point type.
func (t Type) IsFloat() bool {
	return t == TypeF32 || t == TypeF64
}

// Bits returns the number of bits required to represent the type.
func (t Type) Bits() byte {
	switch t {
	case TypeI32, TypeF32:
		return 32
	case TypeI64, TypeF64:
		return 64
	case TypeV128:
		return 128
	default:
		panic(int(t))
	}
}

// Size returns the number of bytes required to represent the type.
func (t Type) Size() byte {
	return t.Bits() / 8
}

func (t Type) invalid() bool {
	return t == typeInvalid
}

// VecLane represents a lane in a SIMD vector.
type VecLane byte

const (
	VecLaneInvalid VecLane = 1 + iota
	VecLaneI8x16
	VecLaneI16x8
	VecLaneI32x4
	VecLaneI64x2
	VecLaneF32x4
	VecLaneF64x2
)

// String implements fmt.Stringer.
func (vl VecLane) String() (ret string) {
	switch vl {
	case VecLaneInvalid:
		return "invalid"
	case VecLaneI8x16:
		return "i8x16"
	case VecLaneI16x8:
		return "i16x8"
	case VecLaneI32x4:
		return "i32x4"
	case VecLaneI64x2:
		return "i64x2"
	case VecLaneF32x4:
		return "f32x4"
	case VecLaneF64x2:
		return "f64x2"
	default:
		panic(int(vl))
	}
}
87
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/vs.go
generated
vendored
Normal file
@@ -0,0 +1,87 @@
package ssa

import (
	"fmt"
	"math"

	"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)

// Variable is a unique identifier for a source program's variable and will correspond to
// multiple ssa Value(s).
//
// For example, `Local 1` is a Variable in WebAssembly, and Value(s) will be created for it
// whenever it executes `local.set 1`.
//
// Variable is useful to track the SSA Values of a variable in the source program, and
// can be used to find the corresponding latest SSA Value via Builder.FindValue.
type Variable uint32

// String implements fmt.Stringer.
func (v Variable) String() string {
	return fmt.Sprintf("var%d", v)
}

// Value represents an SSA value with type information. The relationship with Variable is 1: N (including 0),
// which means there might be multiple Variable(s) for a Value.
//
// The higher 32 bits are used to store the Type for this value.
type Value uint64

// ValueID is the lower 32 bits of Value, which is the pure identifier of Value without type info.
type ValueID uint32

const (
	valueIDInvalid ValueID = math.MaxUint32
	ValueInvalid   Value   = Value(valueIDInvalid)
)

// Format creates a debug string for this Value using the data stored in Builder.
func (v Value) Format(b Builder) string {
	if annotation, ok := b.(*builder).valueAnnotations[v.ID()]; ok {
		return annotation
	}
	return fmt.Sprintf("v%d", v.ID())
}

func (v Value) formatWithType(b Builder) (ret string) {
	if annotation, ok := b.(*builder).valueAnnotations[v.ID()]; ok {
		ret = annotation + ":" + v.Type().String()
	} else {
		ret = fmt.Sprintf("v%d:%s", v.ID(), v.Type())
	}

	if wazevoapi.SSALoggingEnabled { // This is useful to check live value analysis bugs.
		if bd := b.(*builder); bd.donePostBlockLayoutPasses {
			id := v.ID()
			ret += fmt.Sprintf("(ref=%d)", bd.valueRefCounts[id])
		}
	}
	return ret
}

// Valid returns true if this value is valid.
func (v Value) Valid() bool {
	return v.ID() != valueIDInvalid
}

// Type returns the Type of this value.
func (v Value) Type() Type {
	return Type(v >> 32)
}

// ID returns the valueID of this value.
func (v Value) ID() ValueID {
	return ValueID(v)
}

// setType sets a type to this Value and returns the updated Value.
func (v Value) setType(typ Type) Value {
	return v | Value(typ)<<32
}

// Values is a slice of Value. Use this instead of []Value to reuse the underlying memory.
type Values = wazevoapi.VarLength[Value]

// ValuesNil is a nil Values.
var ValuesNil = wazevoapi.NewNilVarLength[Value]()
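Value packs a 32-bit ID and a type tag into one uint64, as setType, Type and ID above show. A standalone sketch of the same packing with a made-up type tag:

```go
package main

import "fmt"

// The low 32 bits carry the ID and the high 32 bits carry the type tag,
// mirroring setType/Type/ID above. typeI64 is a stand-in value; the real
// constants live in the ssa package.
func main() {
	const typeI64 = 3
	var v uint64 = 42 // the ValueID

	v |= uint64(typeI64) << 32 // setType
	fmt.Println(uint32(v))     // 42: ID() truncates to the low 32 bits
	fmt.Println(v >> 32)       // 3: Type() reads the high 32 bits
}
```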
196
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/debug_options.go
generated
vendored
Normal file
@@ -0,0 +1,196 @@
package wazevoapi

import (
	"context"
	"encoding/hex"
	"fmt"
	"math/rand"
	"os"
	"time"
)

// These consts are used in various places in the wazevo implementations.
// Instead of defining them in each file, we define them here so that we can quickly iterate on
// debugging without spending "where do we have debug logging?" time.

// ----- Debug logging -----
// These consts must be disabled by default. Enable them only when debugging.

const (
	FrontEndLoggingEnabled = false
	SSALoggingEnabled      = false
	RegAllocLoggingEnabled = false
)

// ----- Output prints -----
// These consts must be disabled by default. Enable them only when debugging.

const (
	PrintSSA                                 = false
	PrintOptimizedSSA                        = false
	PrintSSAToBackendIRLowering              = false
	PrintRegisterAllocated                   = false
	PrintFinalizedMachineCode                = false
	PrintMachineCodeHexPerFunction           = printMachineCodeHexPerFunctionUnmodified || PrintMachineCodeHexPerFunctionDisassemblable //nolint
	printMachineCodeHexPerFunctionUnmodified = false
	// PrintMachineCodeHexPerFunctionDisassemblable prints the machine code while modifying the actual result
	// to make it disassemblable. This is useful when debugging the final machine code. See the places where this is used for detail.
	// When this is enabled, functions must not be called.
	PrintMachineCodeHexPerFunctionDisassemblable = false
)

// printTarget is the function index to print the machine code. This is used for debugging to print the machine code
// of a specific function.
const printTarget = -1

// PrintEnabledIndex returns true if the current function index is the print target.
func PrintEnabledIndex(ctx context.Context) bool {
	if printTarget == -1 {
		return true
	}
	return GetCurrentFunctionIndex(ctx) == printTarget
}

// ----- Validations -----
const (
	// SSAValidationEnabled enables the SSA validation. This is disabled by default since the operation is expensive.
	SSAValidationEnabled = false
)

// ----- Stack Guard Check -----
const (
	// StackGuardCheckEnabled enables the stack guard check to ensure that our stack bounds check works correctly.
	StackGuardCheckEnabled       = false
	StackGuardCheckGuardPageSize = 8096
)

// CheckStackGuardPage checks the given stack guard page is not corrupted.
func CheckStackGuardPage(s []byte) {
	for i := 0; i < StackGuardCheckGuardPageSize; i++ {
		if s[i] != 0 {
			panic(
				fmt.Sprintf("BUG: stack guard page is corrupted:\n\tguard_page=%s\n\tstack=%s",
					hex.EncodeToString(s[:StackGuardCheckGuardPageSize]),
					hex.EncodeToString(s[StackGuardCheckGuardPageSize:]),
				))
		}
	}
}
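CheckStackGuardPage relies on a simple invariant: a zero-filled guard region must stay zero, so any non-zero byte means the bounds check failed to stop an overflow. A standalone sketch of that idea with a deliberately corrupted guard byte:

```go
package main

import "fmt"

// Keep a zeroed guard region below the area a function may legitimately
// write, then verify it is still all zero afterwards.
func main() {
	const guardSize = 16
	buf := make([]byte, guardSize+64) // [0,guardSize) is the guard page, the rest is usable stack.

	// Simulate a buggy write that runs past the usable area into the guard page.
	buf[guardSize-1] = 0xAA

	for i := 0; i < guardSize; i++ {
		if buf[i] != 0 {
			fmt.Printf("guard page corrupted at byte %d\n", i)
			return
		}
	}
	fmt.Println("guard page intact")
}
```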
// ----- Deterministic compilation verifier -----

const (
	// DeterministicCompilationVerifierEnabled enables the deterministic compilation verifier. This is disabled by default
	// since the operation is expensive. But when in doubt, enable this to make sure the compilation is deterministic.
	DeterministicCompilationVerifierEnabled = false
	DeterministicCompilationVerifyingIter   = 5
)

type (
	verifierState struct {
		initialCompilationDone bool
		maybeRandomizedIndexes []int
		r                      *rand.Rand
		values                 map[string]string
	}
	verifierStateContextKey struct{}
	currentFunctionNameKey  struct{}
	currentFunctionIndexKey struct{}
)

// NewDeterministicCompilationVerifierContext creates a new context with the deterministic compilation verifier used per wasm.Module.
func NewDeterministicCompilationVerifierContext(ctx context.Context, localFunctions int) context.Context {
	maybeRandomizedIndexes := make([]int, localFunctions)
	for i := range maybeRandomizedIndexes {
		maybeRandomizedIndexes[i] = i
	}
	r := rand.New(rand.NewSource(time.Now().UnixNano()))
	return context.WithValue(ctx, verifierStateContextKey{}, &verifierState{
		r: r, maybeRandomizedIndexes: maybeRandomizedIndexes, values: map[string]string{},
	})
}

// DeterministicCompilationVerifierRandomizeIndexes randomizes the indexes for the deterministic compilation verifier.
// To get the randomized index, use DeterministicCompilationVerifierGetRandomizedLocalFunctionIndex.
func DeterministicCompilationVerifierRandomizeIndexes(ctx context.Context) {
	state := ctx.Value(verifierStateContextKey{}).(*verifierState)
	if !state.initialCompilationDone {
		// If this is the first attempt, we use the indexes in their original order.
		state.initialCompilationDone = true
		return
	}
	r := state.r
	r.Shuffle(len(state.maybeRandomizedIndexes), func(i, j int) {
		state.maybeRandomizedIndexes[i], state.maybeRandomizedIndexes[j] = state.maybeRandomizedIndexes[j], state.maybeRandomizedIndexes[i]
	})
}

// DeterministicCompilationVerifierGetRandomizedLocalFunctionIndex returns the randomized index for the given `index`
// which is assigned by DeterministicCompilationVerifierRandomizeIndexes.
func DeterministicCompilationVerifierGetRandomizedLocalFunctionIndex(ctx context.Context, index int) int {
	state := ctx.Value(verifierStateContextKey{}).(*verifierState)
	ret := state.maybeRandomizedIndexes[index]
	return ret
}

// VerifyOrSetDeterministicCompilationContextValue verifies that the `newValue` is the same as the previous value for the given `scope`
// and the current function name. If the previous value doesn't exist, it sets the value to the given `newValue`.
//
// If the verification fails, this prints the diff and exits the process.
func VerifyOrSetDeterministicCompilationContextValue(ctx context.Context, scope string, newValue string) {
	fn := ctx.Value(currentFunctionNameKey{}).(string)
	key := fn + ": " + scope
	verifierCtx := ctx.Value(verifierStateContextKey{}).(*verifierState)
	oldValue, ok := verifierCtx.values[key]
	if !ok {
		verifierCtx.values[key] = newValue
		return
	}
	if oldValue != newValue {
		fmt.Printf(
			`BUG: Deterministic compilation failed for function%s at scope="%s".

This is mostly due to (but might not be limited to):
	* Resetting ssa.Builder, backend.Compiler or frontend.Compiler, etc doesn't work as expected, and the compilation has been affected by the previous iterations.
	* Using a map with non-deterministic iteration order.

---------- [old] ----------
%s

---------- [new] ----------
%s
`,
			fn, scope, oldValue, newValue,
		)
		os.Exit(1)
	}
}

// nolint
const NeedFunctionNameInContext = PrintSSA ||
	PrintOptimizedSSA ||
	PrintSSAToBackendIRLowering ||
	PrintRegisterAllocated ||
	PrintFinalizedMachineCode ||
	PrintMachineCodeHexPerFunction ||
	DeterministicCompilationVerifierEnabled ||
	PerfMapEnabled

// SetCurrentFunctionName sets the current function name to the given `functionName`.
func SetCurrentFunctionName(ctx context.Context, index int, functionName string) context.Context {
	ctx = context.WithValue(ctx, currentFunctionNameKey{}, functionName)
	ctx = context.WithValue(ctx, currentFunctionIndexKey{}, index)
	return ctx
}

// GetCurrentFunctionName returns the current function name.
func GetCurrentFunctionName(ctx context.Context) string {
	ret, _ := ctx.Value(currentFunctionNameKey{}).(string)
	return ret
}

// GetCurrentFunctionIndex returns the current function index.
func GetCurrentFunctionIndex(ctx context.Context) int {
	ret, _ := ctx.Value(currentFunctionIndexKey{}).(int)
	return ret
}
109
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/exitcode.go
generated
vendored
Normal file
@@ -0,0 +1,109 @@
package wazevoapi

// ExitCode is an exit code of an execution of a function.
type ExitCode uint32

const (
	ExitCodeOK ExitCode = iota
	ExitCodeGrowStack
	ExitCodeGrowMemory
	ExitCodeUnreachable
	ExitCodeMemoryOutOfBounds
	// ExitCodeCallGoModuleFunction is an exit code for a call to an api.GoModuleFunction.
	ExitCodeCallGoModuleFunction
	// ExitCodeCallGoFunction is an exit code for a call to an api.GoFunction.
	ExitCodeCallGoFunction
	ExitCodeTableOutOfBounds
	ExitCodeIndirectCallNullPointer
	ExitCodeIndirectCallTypeMismatch
	ExitCodeIntegerDivisionByZero
	ExitCodeIntegerOverflow
	ExitCodeInvalidConversionToInteger
	ExitCodeCheckModuleExitCode
	ExitCodeCallListenerBefore
	ExitCodeCallListenerAfter
	ExitCodeCallGoModuleFunctionWithListener
	ExitCodeCallGoFunctionWithListener
	ExitCodeTableGrow
	ExitCodeRefFunc
	ExitCodeMemoryWait32
	ExitCodeMemoryWait64
	ExitCodeMemoryNotify
	ExitCodeUnalignedAtomic
	exitCodeMax
)

const ExitCodeMask = 0xff

// String implements fmt.Stringer.
func (e ExitCode) String() string {
	switch e {
	case ExitCodeOK:
		return "ok"
	case ExitCodeGrowStack:
		return "grow_stack"
	case ExitCodeCallGoModuleFunction:
		return "call_go_module_function"
	case ExitCodeCallGoFunction:
		return "call_go_function"
	case ExitCodeUnreachable:
		return "unreachable"
	case ExitCodeMemoryOutOfBounds:
		return "memory_out_of_bounds"
	case ExitCodeUnalignedAtomic:
		return "unaligned_atomic"
	case ExitCodeTableOutOfBounds:
		return "table_out_of_bounds"
	case ExitCodeIndirectCallNullPointer:
		return "indirect_call_null_pointer"
	case ExitCodeIndirectCallTypeMismatch:
		return "indirect_call_type_mismatch"
	case ExitCodeIntegerDivisionByZero:
		return "integer_division_by_zero"
	case ExitCodeIntegerOverflow:
		return "integer_overflow"
	case ExitCodeInvalidConversionToInteger:
		return "invalid_conversion_to_integer"
	case ExitCodeCheckModuleExitCode:
		return "check_module_exit_code"
	case ExitCodeCallListenerBefore:
		return "call_listener_before"
	case ExitCodeCallListenerAfter:
		return "call_listener_after"
	case ExitCodeCallGoModuleFunctionWithListener:
		return "call_go_module_function_with_listener"
	case ExitCodeCallGoFunctionWithListener:
		return "call_go_function_with_listener"
	case ExitCodeGrowMemory:
		return "grow_memory"
	case ExitCodeTableGrow:
		return "table_grow"
	case ExitCodeRefFunc:
		return "ref_func"
	case ExitCodeMemoryWait32:
		return "memory_wait32"
	case ExitCodeMemoryWait64:
		return "memory_wait64"
	case ExitCodeMemoryNotify:
		return "memory_notify"
	}
	panic("TODO")
}

func ExitCodeCallGoModuleFunctionWithIndex(index int, withListener bool) ExitCode {
	if withListener {
		return ExitCodeCallGoModuleFunctionWithListener | ExitCode(index<<8)
	}
	return ExitCodeCallGoModuleFunction | ExitCode(index<<8)
}

func ExitCodeCallGoFunctionWithIndex(index int, withListener bool) ExitCode {
	if withListener {
		return ExitCodeCallGoFunctionWithListener | ExitCode(index<<8)
	}
	return ExitCodeCallGoFunction | ExitCode(index<<8)
}

func GoFunctionIndexFromExitCode(exitCode ExitCode) int {
	return int(exitCode >> 8)
}
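The WithIndex helpers above multiplex a function index into the bits above ExitCodeMask, and GoFunctionIndexFromExitCode recovers it by shifting. A standalone sketch of that encoding with stand-in constant values:

```go
package main

import "fmt"

// The low 8 bits hold the code kind (compare against a 0xff mask) and the
// bits above bit 8 carry a function index. The constants here are stand-ins
// for ExitCodeCallGoFunction and ExitCodeMask.
func main() {
	const (
		callGoFunction = 6
		mask           = 0xff
	)
	index := 42

	packed := uint32(callGoFunction) | uint32(index)<<8
	fmt.Println(packed&mask == callGoFunction) // true: the kind survives in the low byte
	fmt.Println(packed >> 8)                   // 42: the index is recovered by shifting
}
```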
216
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/offsetdata.go
generated
vendored
Normal file
@@ -0,0 +1,216 @@
package wazevoapi

import (
	"github.com/tetratelabs/wazero/internal/wasm"
)

const (
	// FunctionInstanceSize is the size of wazevo.functionInstance.
	FunctionInstanceSize = 24
	// FunctionInstanceExecutableOffset is an offset of `executable` field in wazevo.functionInstance
	FunctionInstanceExecutableOffset = 0
	// FunctionInstanceModuleContextOpaquePtrOffset is an offset of `moduleContextOpaquePtr` field in wazevo.functionInstance
	FunctionInstanceModuleContextOpaquePtrOffset = 8
	// FunctionInstanceTypeIDOffset is an offset of `typeID` field in wazevo.functionInstance
	FunctionInstanceTypeIDOffset = 16
)

const (
	// ExecutionContextOffsetExitCodeOffset is an offset of `exitCode` field in wazevo.executionContext
	ExecutionContextOffsetExitCodeOffset Offset = 0
	// ExecutionContextOffsetCallerModuleContextPtr is an offset of `callerModuleContextPtr` field in wazevo.executionContext
	ExecutionContextOffsetCallerModuleContextPtr Offset = 8
	// ExecutionContextOffsetOriginalFramePointer is an offset of `originalFramePointer` field in wazevo.executionContext
	ExecutionContextOffsetOriginalFramePointer Offset = 16
	// ExecutionContextOffsetOriginalStackPointer is an offset of `originalStackPointer` field in wazevo.executionContext
	ExecutionContextOffsetOriginalStackPointer Offset = 24
	// ExecutionContextOffsetGoReturnAddress is an offset of `goReturnAddress` field in wazevo.executionContext
	ExecutionContextOffsetGoReturnAddress Offset = 32
	// ExecutionContextOffsetStackBottomPtr is an offset of `stackBottomPtr` field in wazevo.executionContext
	ExecutionContextOffsetStackBottomPtr Offset = 40
	// ExecutionContextOffsetGoCallReturnAddress is an offset of `goCallReturnAddress` field in wazevo.executionContext
	ExecutionContextOffsetGoCallReturnAddress Offset = 48
	// ExecutionContextOffsetStackPointerBeforeGoCall is an offset of `StackPointerBeforeGoCall` field in wazevo.executionContext
	ExecutionContextOffsetStackPointerBeforeGoCall Offset = 56
	// ExecutionContextOffsetStackGrowRequiredSize is an offset of `stackGrowRequiredSize` field in wazevo.executionContext
	ExecutionContextOffsetStackGrowRequiredSize Offset = 64
	// ExecutionContextOffsetMemoryGrowTrampolineAddress is an offset of `memoryGrowTrampolineAddress` field in wazevo.executionContext
	ExecutionContextOffsetMemoryGrowTrampolineAddress Offset = 72
	// ExecutionContextOffsetStackGrowCallTrampolineAddress is an offset of `stackGrowCallTrampolineAddress` field in wazevo.executionContext.
	ExecutionContextOffsetStackGrowCallTrampolineAddress Offset = 80
	// ExecutionContextOffsetCheckModuleExitCodeTrampolineAddress is an offset of `checkModuleExitCodeTrampolineAddress` field in wazevo.executionContext.
	ExecutionContextOffsetCheckModuleExitCodeTrampolineAddress Offset = 88
	// ExecutionContextOffsetSavedRegistersBegin is an offset of the first element of `savedRegisters` field in wazevo.executionContext
	ExecutionContextOffsetSavedRegistersBegin Offset = 96
	// ExecutionContextOffsetGoFunctionCallCalleeModuleContextOpaque is an offset of `goFunctionCallCalleeModuleContextOpaque` field in wazevo.executionContext
	ExecutionContextOffsetGoFunctionCallCalleeModuleContextOpaque Offset = 1120
	// ExecutionContextOffsetTableGrowTrampolineAddress is an offset of `tableGrowTrampolineAddress` field in wazevo.executionContext
	ExecutionContextOffsetTableGrowTrampolineAddress Offset = 1128
	// ExecutionContextOffsetRefFuncTrampolineAddress is an offset of `refFuncTrampolineAddress` field in wazevo.executionContext
	ExecutionContextOffsetRefFuncTrampolineAddress      Offset = 1136
	ExecutionContextOffsetMemmoveAddress                Offset = 1144
	ExecutionContextOffsetFramePointerBeforeGoCall      Offset = 1152
	ExecutionContextOffsetMemoryWait32TrampolineAddress Offset = 1160
	ExecutionContextOffsetMemoryWait64TrampolineAddress Offset = 1168
	ExecutionContextOffsetMemoryNotifyTrampolineAddress Offset = 1176
)

// ModuleContextOffsetData allows the compilers to get the information about offsets to the fields of wazevo.moduleContextOpaque.
// This is unique per module.
type ModuleContextOffsetData struct {
	TotalSize int
	ModuleInstanceOffset,
	LocalMemoryBegin,
	ImportedMemoryBegin,
	ImportedFunctionsBegin,
	GlobalsBegin,
	TypeIDs1stElement,
	TablesBegin,
	BeforeListenerTrampolines1stElement,
	AfterListenerTrampolines1stElement,
	DataInstances1stElement,
	ElementInstances1stElement Offset
}

// ImportedFunctionOffset returns an offset of the i-th imported function.
// Each item is stored as wazevo.functionInstance whose size matches FunctionInstanceSize.
func (m *ModuleContextOffsetData) ImportedFunctionOffset(i wasm.Index) (
	executableOffset, moduleCtxOffset, typeIDOffset Offset,
) {
	base := m.ImportedFunctionsBegin + Offset(i)*FunctionInstanceSize
	return base, base + 8, base + 16
}

// GlobalInstanceOffset returns an offset of the i-th global instance.
func (m *ModuleContextOffsetData) GlobalInstanceOffset(i wasm.Index) Offset {
	return m.GlobalsBegin + Offset(i)*16
}

// Offset represents an offset of a field of a struct.
type Offset int32

// U32 encodes an Offset as uint32 for convenience.
func (o Offset) U32() uint32 {
	return uint32(o)
}

// I64 encodes an Offset as int64 for convenience.
func (o Offset) I64() int64 {
	return int64(o)
}

// U64 encodes an Offset as uint64 for convenience.
func (o Offset) U64() uint64 {
	return uint64(o)
}

// LocalMemoryBase returns an offset of the first byte of the local memory.
func (m *ModuleContextOffsetData) LocalMemoryBase() Offset {
	return m.LocalMemoryBegin
}

// LocalMemoryLen returns an offset of the length of the local memory buffer.
func (m *ModuleContextOffsetData) LocalMemoryLen() Offset {
	if l := m.LocalMemoryBegin; l >= 0 {
		return l + 8
	}
	return -1
}

// TableOffset returns an offset of the i-th table instance.
func (m *ModuleContextOffsetData) TableOffset(tableIndex int) Offset {
	return m.TablesBegin + Offset(tableIndex)*8
}

// NewModuleContextOffsetData creates a ModuleContextOffsetData determining the structure of moduleContextOpaque for the given Module.
// The structure is described in the comment of wazevo.moduleContextOpaque.
func NewModuleContextOffsetData(m *wasm.Module, withListener bool) ModuleContextOffsetData {
	ret := ModuleContextOffsetData{}
	var offset Offset

	ret.ModuleInstanceOffset = 0
	offset += 8

	if m.MemorySection != nil {
		ret.LocalMemoryBegin = offset
		// buffer base + memory size.
		const localMemorySizeInOpaqueModuleContext = 16
		offset += localMemorySizeInOpaqueModuleContext
	} else {
		// Indicates that there's no local memory
		ret.LocalMemoryBegin = -1
	}

	if m.ImportMemoryCount > 0 {
		offset = align8(offset)
		// *wasm.MemoryInstance + imported memory's owner (moduleContextOpaque)
		const importedMemorySizeInOpaqueModuleContext = 16
		ret.ImportedMemoryBegin = offset
		offset += importedMemorySizeInOpaqueModuleContext
	} else {
		// Indicates that there's no imported memory
		ret.ImportedMemoryBegin = -1
	}

	if m.ImportFunctionCount > 0 {
		offset = align8(offset)
		ret.ImportedFunctionsBegin = offset
		// Each function is stored as a wazevo.functionInstance.
		size := int(m.ImportFunctionCount) * FunctionInstanceSize
		offset += Offset(size)
	} else {
		ret.ImportedFunctionsBegin = -1
	}

	if globals := int(m.ImportGlobalCount) + len(m.GlobalSection); globals > 0 {
		// Align to 16 bytes for globals, as f32/f64/v128 might be loaded via SIMD instructions.
		offset = align16(offset)
		ret.GlobalsBegin = offset
		// Pointers to *wasm.GlobalInstance.
		offset += Offset(globals) * 16
	} else {
		ret.GlobalsBegin = -1
	}

	if tables := len(m.TableSection) + int(m.ImportTableCount); tables > 0 {
		offset = align8(offset)
		ret.TypeIDs1stElement = offset
		offset += 8 // First element of TypeIDs.

		ret.TablesBegin = offset
		// Pointers to *wasm.TableInstance.
		offset += Offset(tables) * 8
	} else {
		ret.TypeIDs1stElement = -1
		ret.TablesBegin = -1
	}

	if withListener {
		offset = align8(offset)
		ret.BeforeListenerTrampolines1stElement = offset
		offset += 8 // First element of BeforeListenerTrampolines.

		ret.AfterListenerTrampolines1stElement = offset
		offset += 8 // First element of AfterListenerTrampolines.
	} else {
		ret.BeforeListenerTrampolines1stElement = -1
		ret.AfterListenerTrampolines1stElement = -1
	}

	ret.DataInstances1stElement = offset
	offset += 8 // First element of DataInstances.

	ret.ElementInstances1stElement = offset
	offset += 8 // First element of ElementInstances.

	ret.TotalSize = int(align16(offset))
	return ret
}

func align16(o Offset) Offset {
	return (o + 15) &^ 15
}

func align8(o Offset) Offset {
	return (o + 7) &^ 7
}
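align8 and align16 round an offset up to the next multiple of 8 or 16 with an add-and-mask trick. A standalone sketch of the general power-of-two form:

```go
package main

import "fmt"

// Adding (n-1) and then clearing the low bits with &^ rounds up to the next
// multiple of n, for any power-of-two n, which is exactly what align8 and
// align16 above do with fixed constants.
func main() {
	alignUp := func(o, n int32) int32 { return (o + n - 1) &^ (n - 1) }

	fmt.Println(alignUp(21, 8))  // 24
	fmt.Println(alignUp(24, 8))  // 24: already-aligned values are unchanged
	fmt.Println(alignUp(33, 16)) // 48
}
```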
96
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/perfmap.go
generated
vendored
Normal file
@@ -0,0 +1,96 @@
package wazevoapi

import (
	"fmt"
	"os"
	"strconv"
	"sync"
)

var PerfMap *Perfmap

func init() {
	if PerfMapEnabled {
		pid := os.Getpid()
		filename := "/tmp/perf-" + strconv.Itoa(pid) + ".map"

		fh, err := os.OpenFile(filename, os.O_APPEND|os.O_RDWR|os.O_CREATE, 0o644)
		if err != nil {
			panic(err)
		}

		PerfMap = &Perfmap{fh: fh}
	}
}

// Perfmap holds perfmap entries to be flushed into a perfmap file.
type Perfmap struct {
	entries []entry
	mux     sync.Mutex
	fh      *os.File
}

type entry struct {
	index  int
	offset int64
	size   uint64
	name   string
}

func (f *Perfmap) Lock() {
	f.mux.Lock()
}

func (f *Perfmap) Unlock() {
	f.mux.Unlock()
}

// AddModuleEntry buffers a perfmap entry for one function: index is the index of the
// function in the module, offset is the offset of the function in the module,
// size is the size of the function, and name is the name of the function.
//
// Note that the entries are not flushed into the perfmap file until Flush is called,
// and the entries are module-scoped; Perfmap must be locked until Flush is called.
func (f *Perfmap) AddModuleEntry(index int, offset int64, size uint64, name string) {
	e := entry{index: index, offset: offset, size: size, name: name}
	if f.entries == nil {
		f.entries = []entry{e}
		return
	}
	f.entries = append(f.entries, e)
}

// Flush writes the buffered perfmap entries into the perfmap file, adjusting each
// entry's address by the given `addr` and `functionOffsets`.
func (f *Perfmap) Flush(addr uintptr, functionOffsets []int) {
	defer func() {
		_ = f.fh.Sync()
	}()

	for _, e := range f.entries {
		if _, err := f.fh.WriteString(fmt.Sprintf("%x %s %s\n",
			uintptr(e.offset)+addr+uintptr(functionOffsets[e.index]),
			strconv.FormatUint(e.size, 16),
			e.name,
		)); err != nil {
			panic(err)
		}
	}
	f.entries = f.entries[:0]
}

// Clear clears the perfmap entries not yet flushed.
func (f *Perfmap) Clear() {
	f.entries = f.entries[:0]
}

// AddEntry writes a perfmap entry directly into the perfmap file, not using the entries.
func (f *Perfmap) AddEntry(addr uintptr, size uint64, name string) {
	_, err := f.fh.WriteString(fmt.Sprintf("%x %s %s\n",
		addr,
		strconv.FormatUint(size, 16),
		name,
	))
	if err != nil {
		panic(err)
	}
}
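Perfmap emits lines in the Linux perf map format (`ADDR SIZE NAME`, with address and size in hex). The following is a rough usage sketch, assuming a build with the `perfmap` tag so PerfMap is non-nil; `recordModuleForPerf` and its parameters are hypothetical names, not part of wazero.

```go
package wazevoapi

// recordModuleForPerf sketches how a caller might publish one compiled module's
// functions: buffer an entry per function under the lock, then Flush once with
// the executable's base address and the per-function offsets.
// sizes, names and functionOffsets are assumed to have the same length.
func recordModuleForPerf(executableAddr uintptr, functionOffsets []int, sizes []uint64, names []string) {
	if !PerfMapEnabled || PerfMap == nil {
		return
	}
	PerfMap.Lock()
	defer PerfMap.Unlock()
	for i, name := range names {
		// The per-function displacement is supplied via functionOffsets at Flush time.
		PerfMap.AddModuleEntry(i, 0, sizes[i], name)
	}
	PerfMap.Flush(executableAddr, functionOffsets)
}
```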
5
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/perfmap_disabled.go
generated
vendored
Normal file

@@ -0,0 +1,5 @@
//go:build !perfmap

package wazevoapi

const PerfMapEnabled = false
5
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/perfmap_enabled.go
generated
vendored
Normal file

@@ -0,0 +1,5 @@
//go:build perfmap

package wazevoapi

const PerfMapEnabled = true
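These two files select the value of PerfMapEnabled at compile time, so perfmap support is off unless the binary is built with `go build -tags perfmap`. Because the flag is an untyped constant, guarded branches compile away entirely in the default build. A small sketch of a caller-side guard follows; `maybeRecord` is a hypothetical helper, not part of wazero.

```go
package wazevoapi

// maybeRecord writes a single perfmap line when perfmap support is compiled in;
// in a default build the whole body is dead code and is eliminated.
func maybeRecord(addr uintptr, size uint64, name string) {
	if PerfMapEnabled {
		PerfMap.AddEntry(addr, size, name)
	}
}
```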
215
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/pool.go
generated
vendored
Normal file

@@ -0,0 +1,215 @@
package wazevoapi

const poolPageSize = 128

// Pool is a pool of T that can be allocated and reset.
// This is useful to avoid unnecessary allocations.
type Pool[T any] struct {
	pages            []*[poolPageSize]T
	resetFn          func(*T)
	allocated, index int
}

// NewPool returns a new Pool.
// resetFn is called when a new T is allocated in Pool.Allocate.
func NewPool[T any](resetFn func(*T)) Pool[T] {
	var ret Pool[T]
	ret.resetFn = resetFn
	ret.Reset()
	return ret
}

// Allocated returns the number of allocated T currently in the pool.
func (p *Pool[T]) Allocated() int {
	return p.allocated
}

// Allocate allocates a new T from the pool.
func (p *Pool[T]) Allocate() *T {
	if p.index == poolPageSize {
		if len(p.pages) == cap(p.pages) {
			p.pages = append(p.pages, new([poolPageSize]T))
		} else {
			i := len(p.pages)
			p.pages = p.pages[:i+1]
			if p.pages[i] == nil {
				p.pages[i] = new([poolPageSize]T)
			}
		}
		p.index = 0
	}
	ret := &p.pages[len(p.pages)-1][p.index]
	if p.resetFn != nil {
		p.resetFn(ret)
	}
	p.index++
	p.allocated++
	return ret
}

// View returns the pointer to i-th item from the pool.
func (p *Pool[T]) View(i int) *T {
	page, index := i/poolPageSize, i%poolPageSize
	return &p.pages[page][index]
}

// Reset resets the pool.
func (p *Pool[T]) Reset() {
	p.pages = p.pages[:0]
	p.index = poolPageSize
	p.allocated = 0
}
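A rough usage sketch for Pool, assuming a hypothetical `instruction` element type: allocate heavily inside a hot path, then Reset once per compilation so the 128-element pages are reused.

```go
package wazevoapi

// instruction is a hypothetical element type used only for illustration.
type instruction struct {
	opcode int
	args   []int
}

func buildInstructions(n int) {
	pool := NewPool[instruction](func(i *instruction) {
		i.opcode = 0
		i.args = i.args[:0] // clear contents but keep capacity
	})
	for k := 0; k < n; k++ {
		ins := pool.Allocate()
		ins.opcode = k
		ins.args = append(ins.args, k)
	}
	_ = pool.Allocated() // == n
	pool.Reset()         // previously returned pointers may now be handed out again
}
```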
// IDedPool is a pool of T that can be allocated and reset, with a way to get T by an ID.
type IDedPool[T any] struct {
	pool             Pool[T]
	idToItems        []*T
	maxIDEncountered int
}

// NewIDedPool returns a new IDedPool.
func NewIDedPool[T any](resetFn func(*T)) IDedPool[T] {
	return IDedPool[T]{pool: NewPool[T](resetFn)}
}

// GetOrAllocate returns the T with the given id.
func (p *IDedPool[T]) GetOrAllocate(id int) *T {
	if p.maxIDEncountered < id {
		p.maxIDEncountered = id
	}
	if id >= len(p.idToItems) {
		p.idToItems = append(p.idToItems, make([]*T, id-len(p.idToItems)+1)...)
	}
	if p.idToItems[id] == nil {
		p.idToItems[id] = p.pool.Allocate()
	}
	return p.idToItems[id]
}

// Get returns the T with the given id, or nil if it's not allocated.
func (p *IDedPool[T]) Get(id int) *T {
	if id >= len(p.idToItems) {
		return nil
	}
	return p.idToItems[id]
}

// Reset resets the pool.
func (p *IDedPool[T]) Reset() {
	p.pool.Reset()
	for i := range p.idToItems {
		p.idToItems[i] = nil
	}
	p.maxIDEncountered = -1
}

// MaxIDEncountered returns the maximum id encountered so far.
func (p *IDedPool[T]) MaxIDEncountered() int {
	return p.maxIDEncountered
}
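IDedPool layers a stable id-to-item index on top of Pool: repeated lookups of the same id return the same pointer, allocating lazily on first use. A sketch of that pattern follows, with a hypothetical `labelInfo` record and `countLabelRefs` helper.

```go
package wazevoapi

// labelInfo is a hypothetical per-ID record used only for illustration.
type labelInfo struct{ refCount int }

func countLabelRefs(labelIDs []int) {
	pool := NewIDedPool[labelInfo](func(l *labelInfo) { l.refCount = 0 })
	for _, id := range labelIDs {
		pool.GetOrAllocate(id).refCount++ // same id -> same *labelInfo
	}
	for id := 0; id <= pool.MaxIDEncountered(); id++ {
		if info := pool.Get(id); info != nil { // ids never seen stay nil
			_ = info.refCount
		}
	}
	pool.Reset()
}
```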
// arraySize is the size of the array used in VarLengthPool's arrayPool.
// This is chosen to be 8, which is empirically a good number among 8, 12, 16 and 20.
const arraySize = 8

// VarLengthPool is a pool of VarLength[T] that can be allocated and reset.
type (
	VarLengthPool[T any] struct {
		arrayPool Pool[varLengthPoolArray[T]]
		slicePool Pool[[]T]
	}
	// varLengthPoolArray wraps an array and keeps track of the next index to be used to avoid the heap allocation.
	varLengthPoolArray[T any] struct {
		arr  [arraySize]T
		next int
	}
)

// VarLength is a variable length array that can be reused via a pool.
type VarLength[T any] struct {
	arr *varLengthPoolArray[T]
	slc *[]T
}

// NewVarLengthPool returns a new VarLengthPool.
func NewVarLengthPool[T any]() VarLengthPool[T] {
	return VarLengthPool[T]{
		arrayPool: NewPool[varLengthPoolArray[T]](func(v *varLengthPoolArray[T]) {
			v.next = 0
		}),
		slicePool: NewPool[[]T](func(i *[]T) {
			*i = (*i)[:0]
		}),
	}
}

// NewNilVarLength returns a new VarLength[T] with a nil backing.
func NewNilVarLength[T any]() VarLength[T] {
	return VarLength[T]{}
}

// Allocate allocates a new VarLength[T] from the pool.
func (p *VarLengthPool[T]) Allocate(knownMin int) VarLength[T] {
	if knownMin <= arraySize {
		arr := p.arrayPool.Allocate()
		return VarLength[T]{arr: arr}
	}
	slc := p.slicePool.Allocate()
	return VarLength[T]{slc: slc}
}

// Reset resets the pool.
func (p *VarLengthPool[T]) Reset() {
	p.arrayPool.Reset()
	p.slicePool.Reset()
}

// Append appends items to the backing slice just like the `append` builtin function in Go.
func (i VarLength[T]) Append(p *VarLengthPool[T], items ...T) VarLength[T] {
	if i.slc != nil {
		*i.slc = append(*i.slc, items...)
		return i
	}

	if i.arr == nil {
		i.arr = p.arrayPool.Allocate()
	}

	arr := i.arr
	if arr.next+len(items) <= arraySize {
		for _, item := range items {
			arr.arr[arr.next] = item
			arr.next++
		}
	} else {
		slc := p.slicePool.Allocate()
		// Copy the array to the slice.
		for ptr := 0; ptr < arr.next; ptr++ {
			*slc = append(*slc, arr.arr[ptr])
		}
		i.slc = slc
		*i.slc = append(*i.slc, items...)
	}
	return i
}

// View returns the backing slice.
func (i VarLength[T]) View() []T {
	if i.slc != nil {
		return *i.slc
	} else if i.arr != nil {
		arr := i.arr
		return arr.arr[:arr.next]
	}
	return nil
}

// Cut cuts the backing slice to the given length.
// Precondition: n <= len(i.backing).
func (i VarLength[T]) Cut(n int) {
	if i.slc != nil {
		*i.slc = (*i.slc)[:n]
	} else if i.arr != nil {
		i.arr.next = n
	}
}
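VarLength keeps short lists (up to arraySize elements) in a pooled fixed array and only spills into a pooled slice once they grow past that, so typical short operand lists avoid individual heap allocations. A usage sketch follows; `collectOperands` is a hypothetical helper, not part of wazero.

```go
package wazevoapi

// collectOperands shows the intended pattern: start from a nil-backed value,
// Append through the pool, and read the result back with View. The pool owns
// the storage, so a later pool.Reset() reclaims everything at once.
func collectOperands(pool *VarLengthPool[int], values []int) []int {
	operands := NewNilVarLength[int]()
	for _, v := range values {
		operands = operands.Append(pool, v)
	}
	return operands.View()
}
```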
15
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/ptr.go
generated
vendored
Normal file

@@ -0,0 +1,15 @@
package wazevoapi

import "unsafe"

// PtrFromUintptr resurrects the original *T from the given uintptr.
// The caller of this function MUST be sure that ptr is valid.
func PtrFromUintptr[T any](ptr uintptr) *T {
	// Wrap ptr in a double pointer in order to avoid the unsafe access being flagged by the race detector.
	//
	// For example, if we have (*function)(unsafe.Pointer(ptr)) instead, then the race detector's "checkptr"
	// subroutine warns as "checkptr: pointer arithmetic result points to invalid allocation"
	// https://github.com/golang/go/blob/1ce7fcf139417d618c2730010ede2afb41664211/src/runtime/checkptr.go#L69
	var wrapped *uintptr = &ptr
	return *(**T)(unsafe.Pointer(wrapped))
}
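PtrFromUintptr is only safe when something else keeps the target allocation alive; the uintptr itself is invisible to the garbage collector. A round-trip sketch follows, with a hypothetical `moduleContext` type and `roundTrip` helper used only for illustration.

```go
package wazevoapi

import "unsafe"

// moduleContext is a hypothetical type used only for illustration.
type moduleContext struct{ id int }

func roundTrip(m *moduleContext) *moduleContext {
	// Valid only because the caller still holds m; the uintptr does not keep it alive.
	raw := uintptr(unsafe.Pointer(m))
	return PtrFromUintptr[moduleContext](raw)
}
```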
26
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/queue.go
generated
vendored
Normal file

@@ -0,0 +1,26 @@
package wazevoapi

// Queue is the resettable queue where the underlying slice is reused.
type Queue[T any] struct {
	index int
	Data  []T
}

func (q *Queue[T]) Enqueue(v T) {
	q.Data = append(q.Data, v)
}

func (q *Queue[T]) Dequeue() (ret T) {
	ret = q.Data[q.index]
	q.index++
	return
}

func (q *Queue[T]) Empty() bool {
	return q.index >= len(q.Data)
}

func (q *Queue[T]) Reset() {
	q.index = 0
	q.Data = q.Data[:0]
}
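Queue avoids reallocating its backing slice by advancing an index on Dequeue and truncating on Reset. A sketch of the typical worklist pattern follows; `drain` is a hypothetical helper, not part of wazero.

```go
package wazevoapi

// drain enqueues the inputs, dequeues until the queue reports Empty, then
// resets so the same backing slice can be reused for the next pass.
func drain(q *Queue[int], inputs []int) (sum int) {
	for _, v := range inputs {
		q.Enqueue(v)
	}
	for !q.Empty() {
		sum += q.Dequeue()
	}
	q.Reset()
	return sum
}
```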
13
vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/resetmap.go
generated
vendored
Normal file

@@ -0,0 +1,13 @@
package wazevoapi

// ResetMap resets the map to an empty state, or creates a new map if it is nil.
func ResetMap[K comparable, V any](m map[K]V) map[K]V {
	if m == nil {
		m = make(map[K]V)
	} else {
		for v := range m {
			delete(m, v)
		}
	}
	return m
}
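ResetMap lets a caller keep a map field across runs without reallocating it each time. A brief sketch follows, with a hypothetical `analyzer` type used only for illustration.

```go
package wazevoapi

// analyzer is a hypothetical type holding a per-run cache keyed by block ID.
type analyzer struct {
	blockState map[int]string
}

// beginPass clears the cache in place, allocating it only on first use.
func (a *analyzer) beginPass() {
	a.blockState = ResetMap(a.blockState)
}
```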