| # |
| # IPInt: the WASM in-place interpreter |
| # DISCLAIMER: not tested on x86 yet (as of 05 Jul 2023); IPInt may break *very* badly. |
| # |
| # docs by Daniel Liu <daniel_liu4@apple.com / danlliu@umich.edu>; 2023 intern project |
| # |
| # 0. OfflineASM: |
| # -------------- |
| # |
| # For a crash course on OfflineASM, check out LowLevelInterpreter.asm. |
| # |
| # 1. Code Structure |
| # ----------------- |
| # |
| # IPInt is designed to start up quickly and interpret WASM code efficiently. To optimize for speed, we utilize a jump |
| # table, using the instruction's opcode (its first byte) as the index. This jump table is set up in _ipint_setup. |
| # For more complex opcodes (ex. SIMD), we define additional jump tables that use further bytes as indices. |
| # |
| # 2. Setting Up |
| # ------------- |
| # |
| # Before we can execute WebAssembly, we have to handle the call frame that is given to us. This is handled in _ipint_entry. |
| # We start by saving registers to the stack as per the system calling convention. Then, we have IPInt specific logic: |
| # |
| # 2.1. Locals |
| # ----------- |
| # |
| # To ensure that we are able to access local variables quickly, we allocate a section of the stack to store local variables. |
| # We allocate 8 bytes for each local variable on the stack. |
| # |
| # Additionally, we need to load the parameters to the function into local variables. As per the calling convention, arguments |
| # are passed via registers, and then on the stack if all argument registers have been exhausted. Thus, we need to handle those |
| # cases. We keep track of the number of arguments in IPIntCallee, allowing us to know exactly where to load arguments from. |
| # |
| # Finally, we set the value of the `PL` (pointer to locals) register to the position of the first local. This allows us to quickly |
| # index into locals. |
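| # |
| # As a rough sketch, going from high to low addresses, the frame then contains: the caller's |
| # frame, the saved cfr / return PC (where cfr points), the IPInt callee saves, and the locals |
| # region. PL points at local 0, so local i lives at PL + i * LocalSize, and the wasm operand |
| # stack grows downward below the locals. |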
| # |
| # 2.2. Bytecode and Metadata |
| # -------------------------- |
| # |
| # The final step before executing is to load the bytecode to execute, as well as the metadata. For an explanation of why we use |
| # metadata in IPInt, check out WasmIPIntGenerator.cpp. We load these into registers `PB` (pointer to bytecode) and `PM` (pointer |
| # to metadata). Additionally, registers `PC` (program counter) and `MC` (metadata counter) are set to 0. |
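| # |
| # Concretely, the byte at PB[PC] is always the opcode currently being executed, and PM[MC] is |
| # that instruction's metadata entry (instruction lengths, pre-decoded LEB128 immediates, branch |
| # targets), so the two cursors advance in lockstep. |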
| # |
| # 3. Executing WebAssembly |
| # ------------------------ |
| # |
| # WebAssembly execution revolves around a stack machine, which we run on the program stack. We work with the constraint |
| # that the stack must be 16B aligned by ensuring that pushes and pops are always 16B. This makes certain opcodes (ex. drop) |
| # much easier as well. |
| # |
| # For each instruction, we align its assembly to a 256B boundary. Thus, we can take (address of instruction 0) + opcode * 256 |
| # to find the exact point where we need to jump for each opcode without any dependent loads. |
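| # |
| # For example, the ARM64 dispatch in nextIPIntInstruction() is just |
| # |
| #     add x0, x7, x0, lsl #8   # x0 = IB + opcode * 256 |
| #     br x0 |
| # |
| # so opcode 0x41 (i32.const) lands at _ipint_unreachable + 0x41 * 256, _ipint_unreachable being |
| # the handler for opcode 0x00. |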
| # |
| # 4. Returning |
| # ------------ |
| # |
| # To return values to the caller, IPInt uses the standard WebAssembly calling convention. Return values are passed in the |
| # system return registers, and on the stack once the return registers are exhausted. After this, we perform cleanup logic |
| # to reset the stack to its original state, and return to the caller. |
| # |
| |
| ################################# |
| # Register and Size Definitions # |
| ################################# |
| |
| # PC = t4 |
| const MC = t5 # Metadata counter (index into metadata) |
| const PL = t6 # Pointer to locals (index into locals) |
| const PM = metadataTable |
| |
| if ARM64 or ARM64E |
| const IB = t7 # instruction base |
| end |
| |
| # TODO: SIMD support, since locals will need double the space. Can we do it only sometimes? |
| # May just need to write metadata that rewrites offsets. May be worth the space savings. |
| # Actually, what if we just use the same thing, but have a separately allocated SIMD section that |
| # is "pointed" to by the 8B entries on the stack? Easier, and we only need to allocate SIMD storage |
| # when we need it instead of blowing up the stack. Argument copying is a little trickier, though. |
| |
| const PtrSize = constexpr (sizeof(void*)) |
| const MachineRegisterSize = constexpr (sizeof(CPURegister)) |
| const SlotSize = constexpr (sizeof(Register)) |
| const LocalSize = SlotSize |
| const StackValueSize = 16 |
| |
| if X86_64 or ARM64 or ARM64E or RISCV64 |
| const wasmInstance = csr0 |
| const memoryBase = csr3 |
| const boundsCheckingSize = csr4 |
| elsif ARMv7 |
| const wasmInstance = csr0 |
| const memoryBase = invalidGPR |
| const boundsCheckingSize = invalidGPR |
| else |
| end |
| |
| const UnboxedWasmCalleeStackSlot = CallerFrame - constexpr Wasm::numberOfIPIntCalleeSaveRegisters * SlotSize - MachineRegisterSize |
| |
| ########## |
| # Macros # |
| ########## |
| |
| # Callee Save |
| |
| # FIXME: This happens to work because UnboxedWasmCalleeStackSlot sits in the extra space; we should be more precise in case we want to use an even number of callee saves in the future. |
| const IPIntCalleeSaveSpaceStackAligned = 2*CalleeSaveSpaceStackAligned |
| |
| macro saveIPIntRegisters() |
| subp IPIntCalleeSaveSpaceStackAligned, sp |
| if ARM64 or ARM64E |
| storepairq PM, PB, -16[cfr] |
| storeq wasmInstance, -24[cfr] |
| elsif X86_64 or RISCV64 |
| storep PB, -0x8[cfr] |
| storep PM, -0x10[cfr] |
| storep wasmInstance, -0x18[cfr] |
| else |
| end |
| end |
| |
| macro restoreIPIntRegisters() |
| if ARM64 or ARM64E |
| loadpairq -16[cfr], PM, PB |
| loadq -24[cfr], wasmInstance |
| elsif X86_64 or RISCV64 |
| loadp -0x8[cfr], PB |
| loadp -0x10[cfr], PM |
| loadp -0x18[cfr], wasmInstance |
| else |
| end |
| addp IPIntCalleeSaveSpaceStackAligned, sp |
| end |
| |
| # Get IPIntCallee object at startup |
| |
| macro getIPIntCallee() |
| loadp Callee[cfr], ws0 |
| if JSVALUE64 |
| andp ~(constexpr JSValue::NativeCalleeTag), ws0 |
| end |
| leap WTFConfig + constexpr WTF::offsetOfWTFConfigLowestAccessibleAddress, ws1 |
| loadp [ws1], ws1 |
| addp ws1, ws0 |
| storep ws0, UnboxedWasmCalleeStackSlot[cfr] |
| end |
| |
| # Tail-call dispatch |
| |
| macro advancePC(amount) |
| addq amount, PC |
| end |
| |
| macro advancePCByReg(amount) |
| addq amount, PC |
| end |
| |
| macro advanceMC(amount) |
| addq amount, MC |
| end |
| |
| macro advanceMCByReg(amount) |
| addq amount, MC |
| end |
| |
| macro nextIPIntInstruction() |
| # Consistency check |
| # move MC, t0 |
| # andq 7, t0 |
| # bqeq t0, 0, .fine |
| # break |
| # .fine: |
| loadb [PB, PC, 1], t0 |
| if ARM64 or ARM64E |
| # x7 = IB |
| # x0 = opcode |
| emit "add x0, x7, x0, lsl #8" |
| emit "br x0" |
| elsif X86_64 |
| lshiftq 8, t0 |
| leap (_ipint_unreachable), t1 |
| addq t1, t0 |
| emit "jmp *(%eax)" |
| else |
| break |
| end |
| end |
| |
| # Stack operations |
| # Every value on the stack is always 16 bytes! This makes life easy. |
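| # For example, on ARM64 pushQuad() pushes the register twice (push reg, reg) so that a single |
| # i32/i64 still occupies one full 16B slot and sp stays 16B-aligned. |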
| |
| macro pushQuad(reg) |
| if ARM64 or ARM64E |
| push reg, reg |
| else |
| push reg |
| end |
| end |
| |
| macro pushQuadPair(reg1, reg2) |
| push reg1, reg2 |
| end |
| |
| macro popQuad(reg, scratch) |
| if ARM64 or ARM64E |
| pop reg, scratch |
| else |
| pop reg |
| end |
| end |
| |
| macro pushVectorReg0() |
| if ARM64 or ARM64E |
| emit "str q0, [sp, #-16]!" |
| else |
| emit "sub $16, %esp" |
| emit "movdqu %xmm0, (%esp)" |
| end |
| end |
| |
| macro pushVectorReg1() |
| if ARM64 or ARM64E |
| emit "str q1, [sp, #-16]!" |
| else |
| emit "sub $16, %esp" |
| emit "movdqu %xmm1, (%esp)" |
| end |
| end |
| |
| macro pushVectorReg2() |
| if ARM64 or ARM64E |
| emit "str q2, [sp, #-16]!" |
| else |
| emit "sub $16, %esp" |
| emit "movdqu %xmm2, (%esp)" |
| end |
| end |
| |
| macro popVectorReg0() |
| if ARM64 or ARM64E |
| emit "ldr q0, [sp], #16" |
| elsif X86_64 |
| emit "movdqu (%esp), %xmm0" |
| emit "add $16, %esp" |
| end |
| end |
| |
| macro popVectorReg1() |
| if ARM64 or ARM64E |
| emit "ldr q1, [sp], #16" |
| elsif X86_64 |
| emit "movdqu (%esp), %xmm1" |
| emit "add $16, %esp" |
| end |
| end |
| |
| macro popVectorReg2() |
| if ARM64 or ARM64E |
| emit "ldr q2, [sp], #16" |
| elsif X86_64 |
| emit "movdqu (%esp), %xmm2" |
| emit "add $16, %esp" |
| end |
| end |
| |
| # Pushes ft0; the emitted assembly hardcodes q0 / xmm0, so these macros can't take a register argument |
| macro pushFPR() |
| if ARM64 or ARM64E |
| emit "str q0, [sp, #-16]!" |
| else |
| emit "sub $16, %esp" |
| emit "movdqu %xmm0, (%esp)" |
| end |
| end |
| |
| macro pushFPR1() |
| if ARM64 or ARM64E |
| emit "str q1, [sp, #-16]!" |
| else |
| emit "sub $16, %esp" |
| emit "movdqu %xmm1, (%esp)" |
| end |
| end |
| |
| macro popFPR() |
| if ARM64 or ARM64E |
| # We'll just drop the entire q0 register in here |
| # to keep stack aligned to 16 |
| # We'll never actually use q0 as a whole for FP, |
| # since we only work with f32 (s0) or f64 (d0) |
| emit "ldr q0, [sp], #16" |
| elsif X86_64 |
| emit "movdqu (%esp), %xmm0" |
| emit "add $16, %esp" |
| end |
| end |
| |
| macro popFPR1() |
| if ARM64 or ARM64E |
| emit "ldr q1, [sp], #16" |
| elsif X86_64 |
| emit "movdqu (%esp), %xmm1" |
| emit "add $16, %esp" |
| end |
| end |
| |
| # Typed push/pop to make code pretty |
| |
| macro pushInt32(reg) |
| pushQuad(reg) |
| end |
| |
| macro popInt32(reg, scratch) |
| popQuad(reg, scratch) |
| end |
| |
| macro pushInt64(reg) |
| pushQuad(reg) |
| end |
| |
| macro popInt64(reg, scratch) |
| popQuad(reg, scratch) |
| end |
| |
| macro pushFloat32FT0() |
| pushFPR() |
| end |
| |
| macro pushFloat32FT1() |
| pushFPR1() |
| end |
| |
| macro popFloat32FT0() |
| popFPR() |
| end |
| |
| macro popFloat32FT1() |
| popFPR1() |
| end |
| |
| macro pushFloat64FT0() |
| pushFPR() |
| end |
| |
| macro pushFloat64FT1() |
| pushFPR1() |
| end |
| |
| macro popFloat64FT0() |
| popFPR() |
| end |
| |
| macro popFloat64FT1() |
| popFPR1() |
| end |
| |
| # Instruction labels |
| # Important Note: If the unaligned global label is not referenced from C++ (in our case, we |
| # reference the labels in InPlaceInterpreter.cpp), some linkers will strip the definition, |
| # which causes all kinds of problems. |
| |
| # FIXME: switch offlineasm unalignedglobal to take alignment and optionally pad with breakpoint instructions (rdar://113594783) |
| macro alignment() |
| if ARM64 or ARM64E |
| # fill with brk instructions |
| emit ".balignl 256, 0xd4388e20" |
| elsif X86_64 |
| # fill with int 3 instructions |
| emit ".balign 256, 0xcc" |
| end |
| end |
| |
| macro instructionLabel(instrname) |
| alignment() |
| unalignedglobal _ipint%instrname%_validate |
| _ipint%instrname%: |
| _ipint%instrname%_validate: |
| end |
| |
| macro unimplementedInstruction(instrname) |
| alignment() |
| instructionLabel(instrname) |
| break |
| end |
| |
| macro reservedOpcode(opcode) |
| alignment() |
| break |
| end |
| |
| # Memory |
| |
| macro ipintReloadMemory() |
| if ARM64 or ARM64E |
| loadpairq Wasm::Instance::m_cachedMemory[wasmInstance], memoryBase, boundsCheckingSize |
| else |
| loadp Wasm::Instance::m_cachedMemory[wasmInstance], memoryBase |
| loadp Wasm::Instance::m_cachedBoundsCheckingSize[wasmInstance], boundsCheckingSize |
| end |
| if not ARMv7 |
| cagedPrimitiveMayBeNull(memoryBase, t2) |
| end |
| end |
| |
| # Operation Calls |
| |
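| # These wrap calls into C++. PC, MC, PL, and ws0 live in caller-saved registers, so we spill |
| # them around the call; on ARM64(E), IB (x7) is caller-saved too, so it is rematerialized with |
| # pcrtoaddr afterwards. |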
| macro operationCall(fn) |
| move wasmInstance, a0 |
| push PC, MC |
| push PL, ws0 |
| fn() |
| pop ws0, PL |
| pop MC, PC |
| if ARM64 or ARM64E |
| pcrtoaddr _ipint_unreachable, IB |
| end |
| end |
| |
| macro operationCallMayThrow(fn) |
| storei PC, CallSiteIndex[cfr] |
| |
| move wasmInstance, a0 |
| push PC, MC |
| push PL, ws0 |
| fn() |
| bqneq r0, 1, .continuation |
| storei r1, ArgumentCountIncludingThis + PayloadOffset[cfr] |
| jmp _wasm_throw_from_slow_path_trampoline |
| .continuation: |
| pop ws0, PL |
| pop MC, PC |
| if ARM64 or ARM64E |
| pcrtoaddr _ipint_unreachable, IB |
| end |
| end |
| |
| # Exception handling |
| |
| macro ipintException(exception) |
| # move PL, sp |
| # loadi Wasm::IPIntCallee::m_localSizeToAlloc[ws0], PM |
| # mulq SlotSize, PM |
| # addq PM, sp |
| # restoreCallerPCAndCFR() |
| storei constexpr Wasm::ExceptionType::%exception%, ArgumentCountIncludingThis + PayloadOffset[cfr] |
| jmp _wasm_throw_from_slow_path_trampoline |
| end |
| |
| # OSR |
| |
| macro ipintPrologueOSR(increment) |
| if JIT |
| loadp UnboxedWasmCalleeStackSlot[cfr], ws0 |
| baddis increment, Wasm::IPIntCallee::m_tierUpCounter + Wasm::LLIntTierUpCounter::m_counter[ws0], .continue |
| |
| subq (NumberOfWasmArgumentJSRs + NumberOfWasmArgumentFPRs) * 8, sp |
| if ARM64 or ARM64E |
| forEachArgumentJSR(macro (offset, gpr1, gpr2) |
| storepairq gpr2, gpr1, offset[sp] |
| end) |
| elsif JSVALUE64 |
| forEachArgumentJSR(macro (offset, gpr) |
| storeq gpr, offset[sp] |
| end) |
| else |
| forEachArgumentJSR(macro (offset, gprMsw, gpLsw) |
| store2ia gpLsw, gprMsw, offset[sp] |
| end) |
| end |
| if ARM64 or ARM64E |
| forEachArgumentFPR(macro (offset, fpr1, fpr2) |
| storepaird fpr2, fpr1, offset[sp] |
| end) |
| else |
| forEachArgumentFPR(macro (offset, fpr) |
| stored fpr, offset[sp] |
| end) |
| end |
| |
| ipintReloadMemory() |
| push memoryBase, boundsCheckingSize |
| |
| move cfr, a1 |
| operationCall(macro() cCall2(_ipint_extern_prologue_osr) end) |
| move r0, ws0 |
| |
| pop boundsCheckingSize, memoryBase |
| |
| if ARM64 or ARM64E |
| forEachArgumentFPR(macro (offset, fpr1, fpr2) |
| loadpaird offset[sp], fpr2, fpr1 |
| end) |
| else |
| forEachArgumentFPR(macro (offset, fpr) |
| loadd offset[sp], fpr |
| end) |
| end |
| |
| if ARM64 or ARM64E |
| forEachArgumentJSR(macro (offset, gpr1, gpr2) |
| loadpairq offset[sp], gpr2, gpr1 |
| end) |
| elsif JSVALUE64 |
| forEachArgumentJSR(macro (offset, gpr) |
| loadq offset[sp], gpr |
| end) |
| else |
| forEachArgumentJSR(macro (offset, gprMsw, gpLsw) |
| load2ia offset[sp], gpLsw, gprMsw |
| end) |
| end |
| addq (NumberOfWasmArgumentJSRs + NumberOfWasmArgumentFPRs) * 8, sp |
| |
| btpz ws0, .recover |
| |
| restoreIPIntRegisters() |
| restoreCallerPCAndCFR() |
| |
| if ARM64E |
| leap _g_config, ws1 |
| jmp JSCConfigGateMapOffset + (constexpr Gate::wasmOSREntry) * PtrSize[ws1], NativeToJITGatePtrTag # WasmEntryPtrTag |
| else |
| jmp ws0, WasmEntryPtrTag |
| end |
| |
| .recover: |
| loadp UnboxedWasmCalleeStackSlot[cfr], ws0 |
| .continue: |
| end |
| end |
| |
| macro ipintLoopOSR(increment) |
| if JIT |
| loadp UnboxedWasmCalleeStackSlot[cfr], ws0 |
| baddis increment, Wasm::IPIntCallee::m_tierUpCounter + Wasm::LLIntTierUpCounter::m_counter[ws0], .continue |
| |
| move cfr, a1 |
| move PC, a2 |
| # Add 1 to the index due to WTF::HashMap not supporting 0 as a key |
| addq 1, a2 |
| move PL, a3 |
| operationCall(macro() cCall4(_ipint_extern_loop_osr) end) |
| btpz r1, .recover |
| restoreIPIntRegisters() |
| restoreCallerPCAndCFR() |
| move r0, a0 |
| |
| if ARM64E |
| move r1, ws0 |
| leap _g_config, ws1 |
| jmp JSCConfigGateMapOffset + (constexpr Gate::wasmOSREntry) * PtrSize[ws1], NativeToJITGatePtrTag # WasmEntryPtrTag |
| else |
| jmp r1, WasmEntryPtrTag |
| end |
| |
| .recover: |
| loadp UnboxedWasmCalleeStackSlot[cfr], ws0 |
| .continue: |
| end |
| end |
| |
| macro ipintEpilogueOSR(increment) |
| if JIT |
| loadp UnboxedWasmCalleeStackSlot[cfr], ws0 |
| baddis increment, Wasm::IPIntCallee::m_tierUpCounter + Wasm::LLIntTierUpCounter::m_counter[ws0], .continue |
| |
| move cfr, a1 |
| operationCall(macro() cCall2(_ipint_extern_epilogue_osr) end) |
| .continue: |
| end |
| end |
| |
| |
| |
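| # Decodes the LEB128-encoded unsigned 32-bit integer at PB[PC + offset] into dst. For example, |
| # the bytes e5 8e 26 decode as |
| #     0x65 + (0x0e << 7) + (0x26 << 14) = 101 + 1792 + 622592 = 624485 |
| # where the 0x80 bit of each byte marks a continuation. |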
| macro decodeLEBVarUInt32(offset, dst, scratch1, scratch2, scratch3, scratch4) |
| # if it's a single byte, fastpath it |
| const tempPC = scratch4 |
| leap offset[PC], tempPC |
| loadb [PB, tempPC], dst |
| |
| bbb dst, 0x80, .fastpath |
| # otherwise, set up for second iteration |
| # next shift is 7 |
| move 7, scratch1 |
| # take off high bit |
| subi 0x80, dst |
| .loop: |
| addp 1, tempPC |
| loadb [PB, tempPC], scratch2 |
| # scratch3 = continuation bit (bit 7) |
| # leave scratch2 with the low bits 6-0 |
| move 0x80, scratch3 |
| andi scratch2, scratch3 |
| xori scratch3, scratch2 |
| lshifti scratch1, scratch2 |
| addi 7, scratch1 |
| ori scratch2, dst |
| bbneq scratch3, 0, .loop |
| .fastpath: |
| end |
| |
| ######################## |
| # In-Place Interpreter # |
| ######################## |
| |
| # FIXME: switch offlineasm unalignedglobal to take alignment and optionally pad with breakpoint instructions (rdar://113594783) |
| macro argumINTAlign() |
| emit ".balign 64" |
| end |
| |
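| # argumINT is a mini-interpreter that copies arguments into locals: each metadata byte selects |
| # one of the 64B-aligned _argumINT_* handlers, dispatched as _argumINT_a0 + byte * 64. |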
| macro argumINTDispatch() |
| loadb [PM], csr0 |
| addq 1, PM |
| lshiftq 6, csr0 |
| if ARM64 or ARM64E |
| pcrtoaddr _argumINT_a0, csr4 |
| addq csr0, csr4 |
| emit "br x23" |
| elsif X86_64 |
| leap (_argumINT_a0), csr4 |
| addq csr0, csr4 |
| emit "jmp *(%r13)" |
| end |
| end |
| |
| global _ipint_entry |
| _ipint_entry: |
| if WEBASSEMBLY and (ARM64 or ARM64E or X86_64) |
| preserveCallerPCAndCFR() |
| saveIPIntRegisters() |
| storep wasmInstance, CodeBlock[cfr] |
| getIPIntCallee() |
| |
| # Allocate space for locals and rethrow values |
| if ARM64 or ARM64E |
| loadpairi Wasm::IPIntCallee::m_localSizeToAlloc[ws0], csr0, csr3 |
| else |
| loadi Wasm::IPIntCallee::m_localSizeToAlloc[ws0], csr0 |
| loadi Wasm::IPIntCallee::m_numRethrowSlotsToAlloc[ws0], csr3 |
| end |
| addq csr3, csr0 |
| mulq LocalSize, csr0 |
| move sp, csr3 |
| subq csr0, sp |
| move sp, csr4 |
| loadp Wasm::IPIntCallee::m_argumINTBytecodePointer[ws0], PM |
| |
| push csr0, csr1, csr2, csr3 |
| |
| # PM = location in argumINT bytecode |
| # csr0 = tmp |
| # csr1 = dst |
| # csr2 = src |
| # csr3 = end |
| # csr4 = for dispatch |
| |
| const argumINTDest = csr1 |
| const argumINTSrc = csr2 |
| move csr4, argumINTDest |
| leap FirstArgumentOffset[cfr], argumINTSrc |
| |
| argumINTDispatch() |
| |
| .ipint_entry_end_local: |
| # zero out remaining locals |
| bqeq argumINTDest, csr3, .ipint_entry_finish_zero |
| storeq 0, [argumINTDest] |
| addq 8, argumINTDest |
| |
| jmp .ipint_entry_end_local |
| .ipint_entry_finish_zero: |
| pop csr3, csr2, csr1, csr0 |
| |
| loadp CodeBlock[cfr], wasmInstance |
| # OSR Check |
| ipintPrologueOSR(5) |
| |
| move sp, PL |
| |
| if ARM64 or ARM64E |
| pcrtoaddr _ipint_unreachable, IB |
| end |
| loadp Wasm::IPIntCallee::m_bytecode[ws0], PB |
| move 0, PC |
| loadp Wasm::IPIntCallee::m_metadata[ws0], PM |
| move 0, MC |
| # Load memory |
| ipintReloadMemory() |
| |
| nextIPIntInstruction() |
| |
| .ipint_exit: |
| # Clean up locals |
| # Don't overwrite the return registers |
| # Will use PM as a temp because we don't want to use the actual temps. |
| # move PL, sp |
| # loadi Wasm::IPIntCallee::m_localSizeToAlloc[ws0], PM |
| # mulq LocalSize, PM |
| # addq PM, sp |
| ipintReloadMemory() |
| |
| restoreIPIntRegisters() |
| restoreCallerPCAndCFR() |
| ret |
| else |
| ret |
| end |
| |
| global _ipint_entry_simd |
| _ipint_entry_simd: |
| if WEBASSEMBLY and (ARM64 or ARM64E or X86_64) |
| preserveCallerPCAndCFR() |
| saveIPIntRegisters() |
| storep wasmInstance, CodeBlock[cfr] |
| getIPIntCallee() |
| |
| # Allocate space for locals and rethrow values |
| if ARM64 or ARM64E |
| loadpairi Wasm::IPIntCallee::m_localSizeToAlloc[ws0], csr0, csr3 |
| else |
| loadi Wasm::IPIntCallee::m_localSizeToAlloc[ws0], csr0 |
| loadi Wasm::IPIntCallee::m_numRethrowSlotsToAlloc[ws0], csr3 |
| end |
| addq csr3, csr0 |
| mulq LocalSize, csr0 |
| move sp, csr3 |
| subq csr0, sp |
| move sp, csr4 |
| loadp Wasm::IPIntCallee::m_argumINTBytecodePointer[ws0], PM |
| |
| push csr0, csr1, csr2, csr3 |
| |
| # PM = location in argumINT bytecode |
| # csr0 = tmp |
| # csr1 = dst |
| # csr2 = src |
| # csr3 = end |
| # csr4 = for dispatch |
| |
| const argumINTDest = csr1 |
| const argumINTSrc = csr2 |
| move csr4, argumINTDest |
| leap FirstArgumentOffset[cfr], argumINTSrc |
| |
| argumINTDispatch() |
| |
| .ipint_entry_end_local_simd: |
| # zero out remaining locals |
| bqeq argumINTDest, csr3, .ipint_entry_finish_zero_simd |
| storeq 0, [argumINTDest] |
| addq 8, argumINTDest |
| |
| jmp .ipint_entry_end_local_simd |
| .ipint_entry_finish_zero_simd: |
| pop csr3, csr2, csr1, csr0 |
| |
| loadp CodeBlock[cfr], wasmInstance |
| # OSR Check |
| ipintPrologueOSR(5) |
| |
| move sp, PL |
| |
| if ARM64 or ARM64E |
| pcrtoaddr _ipint_unreachable, IB |
| end |
| loadp Wasm::IPIntCallee::m_bytecode[ws0], PB |
| move 0, PC |
| loadp Wasm::IPIntCallee::m_metadata[ws0], PM |
| move 0, MC |
| # Load memory |
| ipintReloadMemory() |
| |
| nextIPIntInstruction() |
| else |
| ret |
| end |
| |
| macro ipintCatchCommon() |
| getVMFromCallFrame(t3, t0) |
| restoreCalleeSavesFromVMEntryFrameCalleeSavesBuffer(t3, t0) |
| |
| loadp VM::callFrameForCatch[t3], cfr |
| storep 0, VM::callFrameForCatch[t3] |
| |
| loadp VM::targetInterpreterPCForThrow[t3], PC |
| loadp VM::targetInterpreterMetadataPCForThrow[t3], MC |
| |
| getIPIntCallee() |
| |
| loadp CodeBlock[cfr], wasmInstance |
| if ARM64 or ARM64E |
| pcrtoaddr _ipint_unreachable, IB |
| end |
| loadp Wasm::IPIntCallee::m_bytecode[ws0], PB |
| loadp Wasm::IPIntCallee::m_metadata[ws0], PM |
| |
| # Recompute PL |
| if ARM64 or ARM64E |
| loadpairi Wasm::IPIntCallee::m_localSizeToAlloc[ws0], t0, t1 |
| else |
| loadi Wasm::IPIntCallee::m_localSizeToAlloc[ws0], t0 |
| loadi Wasm::IPIntCallee::m_numRethrowSlotsToAlloc[ws0], t1 |
| end |
| addq t1, t0 |
| # FIXME: Can this be a leaq? |
| mulq LocalSize, t0 |
| addq IPIntCalleeSaveSpaceStackAligned, t0 |
| subq cfr, t0, PL |
| |
| loadi [PM, MC], t0 |
| # 1 << 4 == StackValueSize |
| lshiftq 4, t0 |
| addq IPIntCalleeSaveSpaceStackAligned, t0 |
| subp cfr, t0, sp |
| end |
| |
| global _ipint_catch_entry |
| _ipint_catch_entry: |
| if WEBASSEMBLY and (ARM64 or ARM64E or X86_64) |
| ipintCatchCommon() |
| |
| move cfr, a1 |
| move sp, a2 |
| move PL, a3 |
| operationCall(macro() cCall4(_ipint_extern_retrieve_and_clear_exception) end) |
| |
| ipintReloadMemory() |
| advanceMC(4) |
| nextIPIntInstruction() |
| end |
| |
| global _ipint_catch_all_entry |
| _ipint_catch_all_entry: |
| if WEBASSEMBLY and (ARM64 or ARM64E or X86_64) |
| ipintCatchCommon() |
| |
| move cfr, a1 |
| move 0, a2 |
| move PL, a3 |
| operationCall(macro() cCall4(_ipint_extern_retrieve_and_clear_exception) end) |
| |
| ipintReloadMemory() |
| advanceMC(4) |
| nextIPIntInstruction() |
| end |
| |
| if WEBASSEMBLY and (ARM64 or ARM64E or X86_64) |
| # Put all instructions after this `if`, or the 32-bit build will fail. |
| |
| ############################# |
| # 0x00 - 0x11: control flow # |
| ############################# |
| |
| instructionLabel(_unreachable) |
| # unreachable |
| ipintException(Unreachable) |
| |
| instructionLabel(_nop) |
| # nop |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_block) |
| # block |
| loadi [PM, MC], PC |
| loadi 4[PM, MC], MC |
| nextIPIntInstruction() |
| |
| instructionLabel(_loop) |
| # loop |
| ipintLoopOSR(1) |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_if) |
| # if |
| popInt32(t0, t1) |
| bqneq 0, t0, .ipint_if_taken |
| loadi [PM, MC], PC |
| loadi 4[PM, MC], MC |
| nextIPIntInstruction() |
| .ipint_if_taken: |
| # Skip LEB128 |
| loadb 8[PM, MC], t0 |
| advanceMC(9) |
| advancePCByReg(t0) |
| nextIPIntInstruction() |
| |
| instructionLabel(_else) |
| # else |
| # Counterintuitively, we only run this instruction if the if |
| # clause is TAKEN. This is used to branch to the end of the |
| # block. |
| loadi [PM, MC], PC |
| loadi 4[PM, MC], MC |
| nextIPIntInstruction() |
| |
| instructionLabel(_try) |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_catch) |
| # Counterintuitively, like else, we only run this instruction |
| # if no exception was thrown during the preceding try or catch block. |
| loadi [PM, MC], PC |
| loadi 4[PM, MC], MC |
| nextIPIntInstruction() |
| |
| instructionLabel(_throw) |
| storei PC, CallSiteIndex[cfr] |
| |
| loadp Wasm::Instance::m_vm[wasmInstance], t0 |
| loadp VM::topEntryFrame[t0], t0 |
| copyCalleeSavesToEntryFrameCalleeSavesBuffer(t0) |
| |
| move cfr, a1 |
| move sp, a2 |
| loadi [PM, MC], a3 |
| operationCall(macro() cCall4(_ipint_extern_throw_exception) end) |
| jumpToException() |
| |
| instructionLabel(_rethrow) |
| storei PC, CallSiteIndex[cfr] |
| |
| loadp Wasm::Instance::m_vm[wasmInstance], t0 |
| loadp VM::topEntryFrame[t0], t0 |
| copyCalleeSavesToEntryFrameCalleeSavesBuffer(t0) |
| |
| move cfr, a1 |
| move PL, a2 |
| loadi [PM, MC], a3 |
| operationCall(macro() cCall4(_ipint_extern_rethrow_exception) end) |
| jumpToException() |
| |
| reservedOpcode(0x0a) |
| |
| # FIXME: switch offlineasm unalignedglobal to take alignment and optionally pad with breakpoint instructions (rdar://113594783) |
| macro uintAlign() |
| emit ".balign 64" |
| end |
| |
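| # uint is the mini-interpreter that moves return values into place on exit: each metadata byte |
| # (0 to 4) selects one of the 64B-aligned _uint_* handlers, dispatched as _uint_r0 + byte * 64. |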
| macro uintDispatch() |
| if ARM64 or ARM64E |
| loadb [PM], ws2 |
| addq 1, PM |
| bilt ws2, 5, .safe |
| break |
| .safe: |
| lshiftq 6, ws2 |
| pcrtoaddr _uint_r0, ws3 |
| addq ws2, ws3 |
| # ws3 = x12 |
| emit "br x12" |
| elsif X86_64 |
| loadb [PM], r1 |
| addq 1, PM |
| bilt r1, 5, .safe |
| break |
| .safe: |
| lshiftq 6, r1 |
| leap (_uint_r0), t0 |
| addq r1, t0 |
| emit "jmp *(%rax)" |
| end |
| end |
| |
| instructionLabel(_end) |
| loadi Wasm::IPIntCallee::m_bytecodeLength[ws0], t0 |
| subq 1, t0 |
| bqeq PC, t0, .ipint_end_ret |
| advancePC(1) |
| nextIPIntInstruction() |
| .ipint_end_ret: |
| ipintEpilogueOSR(10) |
| addq MC, PM |
| uintDispatch() |
| |
| instructionLabel(_br) |
| # br |
| # number to pop |
| loadh 8[PM, MC], t0 |
| # number to keep |
| loadh 10[PM, MC], t1 |
| |
| # ex. pop 3 and keep 2 |
| # |
| # +4 +3 +2 +1 sp |
| # a b c d e |
| # d e |
| # |
| # [sp + k + numToPop] = [sp + k] for k in numToKeep-1 -> 0 |
| move t0, t2 |
| lshiftq 4, t2 |
| leap [sp, t2], t2 |
| |
| .ipint_br_poploop: |
| bqeq t1, 0, .ipint_br_popend |
| subq 1, t1 |
| move t1, t3 |
| lshiftq 4, t3 |
| loadq [sp, t3], t0 |
| storeq t0, [t2, t3] |
| loadq 8[sp, t3], t0 |
| storeq t0, 8[t2, t3] |
| jmp .ipint_br_poploop |
| .ipint_br_popend: |
| loadh 8[PM, MC], t0 |
| lshiftq 4, t0 |
| leap [sp, t0], sp |
| loadi [PM, MC], PC |
| loadi 4[PM, MC], MC |
| nextIPIntInstruction() |
| |
| instructionLabel(_br_if) |
| # pop i32 |
| popInt32(t0, t2) |
| bineq t0, 0, _ipint_br |
| loadb 12[PM, MC], t0 |
| advanceMC(13) |
| advancePCByReg(t0) |
| nextIPIntInstruction() |
| |
| instructionLabel(_br_table) |
| # br_table |
| popInt32(t0, t2) |
| loadi [PM, MC], t1 |
| advanceMC(4) |
| biaeq t0, t1, .ipint_br_table_maxout |
| move t0, t1 |
| lshiftq 3, t0 |
| lshiftq 2, t1 |
| addq t1, t0 |
| addq t0, MC |
| jmp _ipint_br |
| .ipint_br_table_maxout: |
| subq 1, t1 |
| move t1, t2 |
| lshiftq 3, t1 |
| lshiftq 2, t2 |
| addq t2, t1 |
| addq t1, MC |
| jmp _ipint_br |
| |
| instructionLabel(_return) |
| # ret |
| loadi Wasm::IPIntCallee::m_bytecodeLength[ws0], PC |
| loadi Wasm::IPIntCallee::m_returnMetadata[ws0], MC |
| subq 1, PC |
| # This is guaranteed to go to an end instruction, so skip |
| # dispatch and the end-of-program check for speed |
| jmp .ipint_end_ret |
| |
| instructionLabel(_call) |
| storei PC, CallSiteIndex[cfr] |
| |
| # call |
| jmp _ipint_call_impl |
| |
| instructionLabel(_call_indirect) |
| storei PC, CallSiteIndex[cfr] |
| |
| # Get ref |
| # Load pre-computed values from metadata |
| popInt32(t0, t1) |
| push PC, MC # a4 |
| move t0, a2 |
| leap 1[PM, MC], a3 |
| move wasmInstance, a0 |
| move cfr, a1 |
| operationCall(macro() cCall4(_ipint_extern_call_indirect) end) |
| pop MC, PC |
| btpz r1, .ipint_call_indirect_throw |
| |
| loadb [PM, MC], t2 |
| advancePCByReg(t2) |
| advanceMC(9) |
| |
| jmp .ipint_call_common |
| .ipint_call_indirect_throw: |
| jmp _wasm_throw_from_slow_path_trampoline |
| |
| reservedOpcode(0x12) |
| reservedOpcode(0x13) |
| reservedOpcode(0x14) |
| reservedOpcode(0x15) |
| reservedOpcode(0x16) |
| reservedOpcode(0x17) |
| |
| instructionLabel(_delegate) |
| # Counterintuitively, like else, we only run this instruction |
| # if no exception was thrown during the preceding try or catch block. |
| loadi [PM, MC], PC |
| loadi 4[PM, MC], MC |
| nextIPIntInstruction() |
| |
| instructionLabel(_catch_all) |
| # Counterintuitively, like else, we only run this instruction |
| # if no exception was thrown during the preceding try or catch block. |
| loadi [PM, MC], PC |
| loadi 4[PM, MC], MC |
| nextIPIntInstruction() |
| |
| instructionLabel(_drop) |
| addq StackValueSize, sp |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_select) |
| popInt32(t0, t2) |
| bieq t0, 0, .ipint_select_val2 |
| addq 16, sp |
| advancePC(1) |
| advanceMC(8) |
| nextIPIntInstruction() |
| .ipint_select_val2: |
| popQuad(t1, t2) |
| popQuad(t0, t2) |
| pushQuad(t1) |
| advancePC(1) |
| advanceMC(8) |
| nextIPIntInstruction() |
| |
| instructionLabel(_select_t) |
| popInt32(t0, t2) |
| bieq t0, 0, .ipint_select_t_val2 |
| addq 16, sp |
| loadi [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(8) |
| nextIPIntInstruction() |
| .ipint_select_t_val2: |
| popQuad(t1, t2) |
| popQuad(t0, t3) |
| pushQuadPair(t2, t1) |
| loadi [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(8) |
| nextIPIntInstruction() |
| |
| reservedOpcode(0x1d) |
| reservedOpcode(0x1e) |
| reservedOpcode(0x1f) |
| |
| ################################### |
| # 0x20 - 0x26: get and set values # |
| ################################### |
| |
| instructionLabel(_local_get) |
| # local.get |
| loadb [PM, MC], t1 |
| bineq t1, 0, .ipint_local_get_longpath |
| |
| loadb 1[PB, PC], t0 |
| # Index into locals |
| loadq [PL, t0, LocalSize], t0 |
| # Push to stack |
| pushQuad(t0) |
| advancePC(2) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
| .ipint_local_get_longpath: |
| # Load pre-computed index from metadata |
| loadi 1[PM, MC], t0 |
| # Index into locals |
| loadq [PL, t0, LocalSize], t0 |
| # Push to stack |
| pushQuad(t0) |
| |
| advancePCByReg(t1) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_local_set) |
| # local.set |
| # Pop from stack |
| loadb [PM, MC], t1 |
| popQuad(t2, t3) |
| |
| bineq t1, 0, .ipint_local_set_longpath |
| |
| loadb 1[PB, PC], t0 |
| # Store to locals |
| storeq t2, [PL, t0, LocalSize] |
| advancePC(2) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
| .ipint_local_set_longpath: |
| # Load pre-computed index from metadata |
| loadi 1[PM, MC], t0 |
| # Store to locals |
| storeq t2, [PL, t0, LocalSize] |
| |
| advancePCByReg(t1) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_local_tee) |
| # local.tee |
| loadb [PM, MC], t1 |
| loadq [sp], t2 |
| bineq t1, 0, .ipint_local_tee_longpath |
| |
| loadb 1[PB, PC], t0 |
| storeq t2, [PL, t0, LocalSize] |
| |
| advancePC(2) |
| advanceMC(2) |
| nextIPIntInstruction() |
| |
| .ipint_local_tee_longpath: |
| loadi 1[PM, MC], t0 |
| storeq t2, [PL, t0, LocalSize] |
| |
| advancePCByReg(t1) |
| advanceMC(10) |
| nextIPIntInstruction() |
| |
| instructionLabel(_global_get) |
| # Load pre-computed index from metadata |
| loadh 6[PM, MC], t2 |
| loadi [PM, MC], t1 |
| loadp Wasm::Instance::m_globals[wasmInstance], t0 |
| lshiftp 1, t1 |
| loadq [t0, t1, 8], t0 |
| bieq t2, 0, .ipint_global_get_embedded |
| loadq [t0], t0 |
| .ipint_global_get_embedded: |
| pushQuad(t0) |
| |
| loadh 4[PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(8) |
| nextIPIntInstruction() |
| |
| instructionLabel(_global_set) |
| # b7 = 1 => ref, use slowpath |
| loadb 7[PM, MC], t0 |
| bineq t0, 0, .ipint_global_set_refpath |
| # b6 = 1 => portable |
| loadb 6[PM, MC], t2 |
| # get global addr |
| loadp Wasm::Instance::m_globals[wasmInstance], t0 |
| # get value to store |
| popQuad(t3, t1) |
| # get index |
| loadi [PM, MC], t1 |
| lshiftp 1, t1 |
| bieq t2, 0, .ipint_global_set_embedded |
| # portable: dereference then set |
| loadq [t0, t1, 8], t0 |
| storeq t3, [t0] |
| loadh 4[PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(8) |
| nextIPIntInstruction() |
| .ipint_global_set_embedded: |
| # embedded: set directly |
| storeq t3, [t0, t1, 8] |
| loadh 4[PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(8) |
| nextIPIntInstruction() |
| |
| .ipint_global_set_refpath: |
| loadi [PM, MC], a1 |
| # Pop from stack |
| popQuad(a2, t3) |
| operationCall(macro() cCall3(_ipint_extern_set_global_64) end) |
| |
| loadh 4[PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(8) |
| nextIPIntInstruction() |
| |
| instructionLabel(_table_get) |
| # Load pre-computed index from metadata |
| loadi 1[PM, MC], a1 |
| popInt32(a2, t3) |
| |
| operationCallMayThrow(macro() cCall3(_ipint_extern_table_get) end) |
| |
| pushQuad(t0) |
| |
| loadb [PM, MC], t0 |
| |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_table_set) |
| # Load pre-computed index from metadata |
| loadi 1[PM, MC], a1 |
| popQuad(a3, t0) |
| popInt32(a2, t0) |
| operationCallMayThrow(macro() cCall4(_ipint_extern_table_set) end) |
| |
| loadb [PM, MC], t0 |
| |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| reservedOpcode(0x27) |
| |
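| # Traps with OutOfBoundsMemoryAccess unless the last byte accessed (mem + size - 1) lies below |
| # boundsCheckingSize. |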
| macro ipintCheckMemoryBound(mem, scratch, size) |
| leap size - 1[mem], scratch |
| bpb scratch, boundsCheckingSize, .continuation |
| ipintException(OutOfBoundsMemoryAccess) |
| .continuation: |
| end |
| |
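| # All of the plain load/store opcodes share one shape: pop the index, add the pre-decoded |
| # offset from metadata (at PM[MC + 1]), bounds-check, access memory, then advance PC by the |
| # instruction length stored at PM[MC]. |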
| instructionLabel(_i32_load_mem) |
| # i32.load |
| # pop index |
| popInt32(t0, t2) |
| loadi 1[PM, MC], t2 |
| addq t2, t0 |
| ipintCheckMemoryBound(t0, t2, 4) |
| # load memory location |
| loadi [memoryBase, t0], t1 |
| pushInt32(t1) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_load_mem) |
| # i64.load |
| # pop index |
| popInt32(t0, t2) |
| loadi 1[PM, MC], t2 |
| addq t2, t0 |
| ipintCheckMemoryBound(t0, t2, 8) |
| # load memory location |
| loadq [memoryBase, t0], t1 |
| pushInt64(t1) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f32_load_mem) |
| # f32.load |
| # pop index |
| popInt32(t0, t2) |
| loadi 1[PM, MC], t2 |
| addq t2, t0 |
| ipintCheckMemoryBound(t0, t2, 4) |
| # load memory location |
| loadf [memoryBase, t0], ft0 |
| pushFloat32FT0() |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f64_load_mem) |
| # f64.load |
| # pop index |
| popInt32(t0, t2) |
| loadi 1[PM, MC], t2 |
| addq t2, t0 |
| ipintCheckMemoryBound(t0, t2, 8) |
| # load memory location |
| loadd [memoryBase, t0], ft0 |
| pushFloat64FT0() |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| |
| instructionLabel(_i32_load8s_mem) |
| # i32.load8_s |
| # pop index |
| popInt32(t0, t2) |
| loadi 1[PM, MC], t2 |
| addq t2, t0 |
| ipintCheckMemoryBound(t0, t2, 1) |
| # load memory location |
| loadb [memoryBase, t0], t1 |
| sxb2i t1, t1 |
| pushInt32(t1) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_load8u_mem) |
| # i32.load8_u |
| # pop index |
| popInt32(t0, t2) |
| loadi 1[PM, MC], t2 |
| addq t2, t0 |
| ipintCheckMemoryBound(t0, t2, 1) |
| # load memory location |
| loadb [memoryBase, t0], t1 |
| pushInt32(t1) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_load16s_mem) |
| # i32.load16_s |
| # pop index |
| popInt32(t0, t2) |
| loadi 1[PM, MC], t2 |
| addq t2, t0 |
| ipintCheckMemoryBound(t0, t2, 2) |
| # load memory location |
| loadh [memoryBase, t0], t1 |
| sxh2i t1, t1 |
| pushInt32(t1) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_load16u_mem) |
| # i32.load16_u |
| # pop index |
| popInt32(t0, t2) |
| loadi 1[PM, MC], t2 |
| addq t2, t0 |
| ipintCheckMemoryBound(t0, t2, 2) |
| # load memory location |
| loadh [memoryBase, t0], t1 |
| pushInt32(t1) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| |
| instructionLabel(_i64_load8s_mem) |
| # i64.load8_s |
| # pop index |
| popInt32(t0, t2) |
| loadi 1[PM, MC], t2 |
| addq t2, t0 |
| ipintCheckMemoryBound(t0, t2, 1) |
| # load memory location |
| loadb [memoryBase, t0], t1 |
| sxb2q t1, t1 |
| pushInt64(t1) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_load8u_mem) |
| # i64.load8_u |
| # pop index |
| popInt32(t0, t2) |
| loadi 1[PM, MC], t2 |
| addq t2, t0 |
| ipintCheckMemoryBound(t0, t2, 1) |
| # load memory location |
| loadb [memoryBase, t0], t1 |
| pushInt64(t1) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_load16s_mem) |
| # i64.load16_s |
| # pop index |
| popInt32(t0, t2) |
| loadi 1[PM, MC], t2 |
| addq t2, t0 |
| ipintCheckMemoryBound(t0, t2, 2) |
| # load memory location |
| loadh [memoryBase, t0], t1 |
| sxh2q t1, t1 |
| pushInt64(t1) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_load16u_mem) |
| # i64.load16_u |
| # pop index |
| popInt32(t0, t2) |
| loadi 1[PM, MC], t2 |
| addq t2, t0 |
| ipintCheckMemoryBound(t0, t2, 2) |
| # load memory location |
| loadh [memoryBase, t0], t1 |
| pushInt64(t1) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_load32s_mem) |
| # i64.load32_s |
| # pop index |
| popInt32(t0, t2) |
| loadi 1[PM, MC], t2 |
| addq t2, t0 |
| ipintCheckMemoryBound(t0, t2, 4) |
| # load memory location |
| loadi [memoryBase, t0], t1 |
| sxi2q t1, t1 |
| pushInt64(t1) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_load32u_mem) |
| # i64.load32_u |
| # pop index |
| popInt32(t0, t2) |
| loadi 1[PM, MC], t2 |
| addq t2, t0 |
| ipintCheckMemoryBound(t0, t2, 4) |
| # load memory location |
| loadi [memoryBase, t0], t1 |
| pushInt64(t1) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| |
| instructionLabel(_i32_store_mem) |
| # i32.store |
| # pop data |
| popInt32(t1, t2) |
| # pop index |
| popInt32(t0, t2) |
| loadi 1[PM, MC], t2 |
| addq t2, t0 |
| ipintCheckMemoryBound(t0, t2, 4) |
| # load memory location |
| storei t1, [memoryBase, t0] |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_store_mem) |
| # i64.store |
| # pop data |
| popInt64(t1, t2) |
| # pop index |
| popInt64(t0, t2) |
| loadi 1[PM, MC], t2 |
| addq t2, t0 |
| ipintCheckMemoryBound(t0, t2, 8) |
| # load memory location |
| storeq t1, [memoryBase, t0] |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f32_store_mem) |
| # f32.store |
| # pop data |
| popFloat32FT0() |
| # pop index |
| popInt32(t0, t2) |
| loadi 1[PM, MC], t2 |
| addq t2, t0 |
| ipintCheckMemoryBound(t0, t2, 4) |
| # load memory location |
| storef ft0, [memoryBase, t0] |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f64_store_mem) |
| # f64.store |
| # pop data |
| popFloat64FT0() |
| # pop index |
| popInt32(t0, t2) |
| loadi 1[PM, MC], t2 |
| addq t2, t0 |
| ipintCheckMemoryBound(t0, t2, 8) |
| # load memory location |
| stored ft0, [memoryBase, t0] |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_store8_mem) |
| # i32.store8 |
| # pop data |
| popInt32(t1, t2) |
| # pop index |
| popInt32(t0, t2) |
| loadi 1[PM, MC], t2 |
| addq t2, t0 |
| ipintCheckMemoryBound(t0, t2, 1) |
| # load memory location |
| storeb t1, [memoryBase, t0] |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_store16_mem) |
| # i32.store16 |
| # pop data |
| popInt32(t1, t2) |
| # pop index |
| popInt32(t0, t2) |
| loadi 1[PM, MC], t2 |
| addq t2, t0 |
| ipintCheckMemoryBound(t0, t2, 2) |
| # load memory location |
| storeh t1, [memoryBase, t0] |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_store8_mem) |
| # i64.store8 |
| # pop data |
| popInt64(t1, t2) |
| # pop index |
| popInt64(t0, t2) |
| loadi 1[PM, MC], t2 |
| addq t2, t0 |
| ipintCheckMemoryBound(t0, t2, 1) |
| # load memory location |
| storeb t1, [memoryBase, t0] |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_store16_mem) |
| # i64.store16 |
| # pop data |
| popInt64(t1, t2) |
| # pop index |
| popInt64(t0, t2) |
| loadi 1[PM, MC], t2 |
| addq t2, t0 |
| ipintCheckMemoryBound(t0, t2, 2) |
| # load memory location |
| storeh t1, [memoryBase, t0] |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_store32_mem) |
| # i64.store32 |
| # pop data |
| popInt64(t1, t2) |
| # pop index |
| popInt64(t0, t2) |
| loadi 1[PM, MC], t2 |
| addq t2, t0 |
| ipintCheckMemoryBound(t0, t2, 4) |
| # load memory location |
| storei t1, [memoryBase, t0] |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| |
| instructionLabel(_memory_size) |
| operationCall(macro() cCall2(_ipint_extern_current_memory) end) |
| pushInt32(r0) |
| advancePC(2) |
| nextIPIntInstruction() |
| |
| instructionLabel(_memory_grow) |
| popInt32(a1, t2) |
| operationCall(macro() cCall2(_ipint_extern_memory_grow) end) |
| pushInt32(r0) |
| ipintReloadMemory() |
| advancePC(2) |
| nextIPIntInstruction() |
| |
| ################################ |
| # 0x41 - 0x44: constant values # |
| ################################ |
| |
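| # For i32.const, the metadata byte is either the sign bit (0 or 1) of a one-byte LEB128 |
| # immediate (fast path), or, if >= 2, the instruction length, with the decoded value stored |
| # at PM[MC + 1] (slow path). |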
| instructionLabel(_i32_const) |
| # i32.const |
| loadb [PM, MC], t1 |
| bigteq t1, 2, .ipint_i32_const_slowpath |
| loadb 1[PB, PC], t0 |
| lshiftq 7, t1 |
| orq t1, t0 |
| sxb2i t0, t0 |
| pushInt32(t0) |
| advancePC(2) |
| advanceMC(1) |
| nextIPIntInstruction() |
| .ipint_i32_const_slowpath: |
| # Load pre-computed value from metadata |
| loadi 1[PM, MC], t0 |
| # Push to stack |
| pushInt32(t0) |
| |
| advancePCByReg(t1) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_const) |
| # i64.const |
| # Load pre-computed value from metadata |
| loadq 1[PM, MC], t0 |
| # Push to stack |
| pushInt64(t0) |
| loadb [PM, MC], t0 |
| |
| advancePCByReg(t0) |
| advanceMC(9) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f32_const) |
| # f32.const |
| # Load the value directly from the bytecode |
| loadf 1[PB, PC], ft0 |
| pushFloat32FT0() |
| |
| advancePC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f64_const) |
| # f64.const |
| # Load the value directly from the bytecode |
| loadd 1[PB, PC], ft0 |
| pushFloat64FT0() |
| |
| advancePC(9) |
| nextIPIntInstruction() |
| |
| ############################### |
| # 0x45 - 0x4f: i32 comparison # |
| ############################### |
| |
| instructionLabel(_i32_eqz) |
| # i32.eqz |
| popInt32(t0, t2) |
| cieq t0, 0, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_eq) |
| # i32.eq |
| popInt32(t1, t2) |
| popInt32(t0, t2) |
| cieq t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_ne) |
| # i32.ne |
| popInt32(t1, t2) |
| popInt32(t0, t2) |
| cineq t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_lt_s) |
| # i32.lt_s |
| popInt32(t1, t2) |
| popInt32(t0, t2) |
| cilt t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_lt_u) |
| # i32.lt_u |
| popInt32(t1, t2) |
| popInt32(t0, t2) |
| cib t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_gt_s) |
| # i32.gt_s |
| popInt32(t1, t2) |
| popInt32(t0, t2) |
| cigt t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_gt_u) |
| # i32.gt_u |
| popInt32(t1, t2) |
| popInt32(t0, t2) |
| cia t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_le_s) |
| # i32.le_s |
| popInt32(t1, t2) |
| popInt32(t0, t2) |
| cilteq t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_le_u) |
| # i32.le_u |
| popInt32(t1, t2) |
| popInt32(t0, t2) |
| cibeq t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_ge_s) |
| # i32.ge_s |
| popInt32(t1, t2) |
| popInt32(t0, t2) |
| cigteq t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_ge_u) |
| # i32.ge_u |
| popInt32(t1, t2) |
| popInt32(t0, t2) |
| ciaeq t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| ############################### |
| # 0x50 - 0x5a: i64 comparison # |
| ############################### |
| |
| instructionLabel(_i64_eqz) |
| # i64.eqz |
| popInt64(t0, t2) |
| cqeq t0, 0, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_eq) |
| # i64.eq |
| popInt64(t1, t2) |
| popInt64(t0, t2) |
| cqeq t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_ne) |
| # i64.ne |
| popInt64(t1, t2) |
| popInt64(t0, t2) |
| cqneq t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_lt_s) |
| # i64.lt_s |
| popInt64(t1, t2) |
| popInt64(t0, t2) |
| cqlt t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_lt_u) |
| # i64.lt_u |
| popInt64(t1, t2) |
| popInt64(t0, t2) |
| cqb t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_gt_s) |
| # i64.gt_s |
| popInt64(t1, t2) |
| popInt64(t0, t2) |
| cqgt t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_gt_u) |
| # i64.gt_u |
| popInt64(t1, t2) |
| popInt64(t0, t2) |
| cqa t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_le_s) |
| # i64.le_s |
| popInt64(t1, t2) |
| popInt64(t0, t2) |
| cqlteq t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_le_u) |
| # i64.le_u |
| popInt64(t1, t2) |
| popInt64(t0, t2) |
| cqbeq t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_ge_s) |
| # i64.ge_s |
| popInt64(t1, t2) |
| popInt64(t0, t2) |
| cqgteq t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_ge_u) |
| # i64.ge_u |
| popInt64(t1, t2) |
| popInt64(t0, t2) |
| cqaeq t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| ############################### |
| # 0x5b - 0x60: f32 comparison # |
| ############################### |
| |
| instructionLabel(_f32_eq) |
| # f32.eq |
| popFloat32FT1() |
| popFloat32FT0() |
| cfeq ft0, ft1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f32_ne) |
| # f32.ne |
| popFloat32FT1() |
| popFloat32FT0() |
| cfnequn ft0, ft1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f32_lt) |
| # f32.lt |
| popFloat32FT1() |
| popFloat32FT0() |
| cflt ft0, ft1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f32_gt) |
| # f32.gt |
| popFloat32FT1() |
| popFloat32FT0() |
| cfgt ft0, ft1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f32_le) |
| # f32.le |
| popFloat32FT1() |
| popFloat32FT0() |
| cflteq ft0, ft1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f32_ge) |
| # f32.ge |
| popFloat32FT1() |
| popFloat32FT0() |
| cfgteq ft0, ft1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| |
| ############################### |
| # 0x61 - 0x66: f64 comparison # |
| ############################### |
| |
| instructionLabel(_f64_eq) |
| # f64.eq |
| popFloat64FT1() |
| popFloat64FT0() |
| cdeq ft0, ft1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f64_ne) |
| # f64.ne |
| popFloat64FT1() |
| popFloat64FT0() |
| cdnequn ft0, ft1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f64_lt) |
| # f64.lt |
| popFloat64FT1() |
| popFloat64FT0() |
| cdlt ft0, ft1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f64_gt) |
| # f64.gt |
| popFloat64FT1() |
| popFloat64FT0() |
| cdgt ft0, ft1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f64_le) |
| # f64.le |
| popFloat64FT1() |
| popFloat64FT0() |
| cdlteq ft0, ft1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f64_ge) |
| # f64.ge |
| popFloat64FT1() |
| popFloat64FT0() |
| cdgteq ft0, ft1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| ############################### |
| # 0x67 - 0x78: i32 operations # |
| ############################### |
| |
| instructionLabel(_i32_clz) |
| # i32.clz |
| popInt32(t0, t2) |
| lzcnti t0, t1 |
| pushInt32(t1) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_ctz) |
| # i32.ctz |
| popInt32(t0, t2) |
| tzcnti t0, t1 |
| pushInt32(t1) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_popcnt) |
| # i32.popcnt |
| popInt32(t1, t2) |
| operationCall(macro() cCall2(_slow_path_wasm_popcount) end) |
| pushInt32(r1) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_add) |
| # i32.add |
| popInt32(t1, t2) |
| popInt32(t0, t2) |
| addi t1, t0 |
| pushInt32(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_sub) |
| # i32.sub |
| popInt32(t1, t2) |
| popInt32(t0, t2) |
| subi t1, t0 |
| pushInt32(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_mul) |
| # i32.mul |
| popInt32(t1, t2) |
| popInt32(t0, t2) |
| muli t1, t0 |
| pushInt32(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_div_s) |
| # i32.div_s |
| popInt32(t1, t2) |
| popInt32(t0, t2) |
| btiz t1, .ipint_i32_div_s_throwDivisionByZero |
| |
| bineq t1, -1, .ipint_i32_div_s_safe |
| bieq t0, constexpr INT32_MIN, .ipint_i32_div_s_throwIntegerOverflow |
| |
| .ipint_i32_div_s_safe: |
| if X86_64 |
| # FIXME: Add a way to static_assert that t0 is rax and t2 is rdx |
| # https://bugs.webkit.org/show_bug.cgi?id=203692 |
| cdqi |
| idivi t1 |
| elsif ARM64 or ARM64E or RISCV64 |
| divis t1, t0 |
| else |
| error |
| end |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_i32_div_s_throwDivisionByZero: |
| ipintException(DivisionByZero) |
| |
| .ipint_i32_div_s_throwIntegerOverflow: |
| ipintException(IntegerOverflow) |
| |
| instructionLabel(_i32_div_u) |
| # i32.div_u |
| popInt32(t1, t2) |
| popInt32(t0, t2) |
| btiz t1, .ipint_i32_div_u_throwDivisionByZero |
| |
| if X86_64 |
| xori t2, t2 |
| udivi t1 |
| elsif ARM64 or ARM64E or RISCV64 |
| divi t1, t0 |
| else |
| error |
| end |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_i32_div_u_throwDivisionByZero: |
| ipintException(DivisionByZero) |
| |
| instructionLabel(_i32_rem_s) |
| # i32.rem_s |
| popInt32(t1, t2) |
| popInt32(t0, t2) |
| |
| btiz t1, .ipint_i32_rem_s_throwDivisionByZero |
| |
| bineq t1, -1, .ipint_i32_rem_s_safe |
| bineq t0, constexpr INT32_MIN, .ipint_i32_rem_s_safe |
| |
| move 0, t2 |
| jmp .ipint_i32_rem_s_return |
| |
| .ipint_i32_rem_s_safe: |
| if X86_64 |
| # FIXME: Add a way to static_assert that t0 is rax and t2 is rdx |
| # https://bugs.webkit.org/show_bug.cgi?id=203692 |
| cdqi |
| idivi t1 |
| elsif ARM64 or ARM64E |
| divis t1, t0, t2 |
| muli t1, t2 |
| subi t0, t2, t2 |
| elsif RISCV64 |
| remis t0, t1, t2 |
| else |
| error |
| end |
| |
| .ipint_i32_rem_s_return: |
| pushInt32(t2) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_i32_rem_s_throwDivisionByZero: |
| ipintException(DivisionByZero) |
| |
| instructionLabel(_i32_rem_u) |
| # i32.rem_u |
| popInt32(t1, t2) |
| popInt32(t0, t2) |
| btiz t1, .ipint_i32_rem_u_throwDivisionByZero |
| |
| if X86_64 |
| xori t2, t2 |
| udivi t1 |
| elsif ARM64 or ARM64E |
| divi t1, t0, t2 |
| muli t1, t2 |
| subi t0, t2, t2 |
| elsif RISCV64 |
| remi t0, t1, t2 |
| else |
| error |
| end |
| pushInt32(t2) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_i32_rem_u_throwDivisionByZero: |
| ipintException(DivisionByZero) |
| |
| instructionLabel(_i32_and) |
| # i32.and |
| popInt32(t1, t2) |
| popInt32(t0, t2) |
| andi t1, t0 |
| pushInt32(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_or) |
| # i32.or |
| popInt32(t1, t2) |
| popInt32(t0, t2) |
| ori t1, t0 |
| pushInt32(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_xor) |
| # i32.xor |
| popInt32(t1, t2) |
| popInt32(t0, t2) |
| xori t1, t0 |
| pushInt32(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_shl) |
| # i32.shl |
| popInt32(t1, t2) |
| popInt32(t0, t2) |
| lshifti t1, t0 |
| pushInt32(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_shr_s) |
| # i32.shr_s |
| popInt32(t1, t2) |
| popInt32(t0, t2) |
| rshifti t1, t0 |
| pushInt32(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_shr_u) |
| # i32.shr_u |
| popInt32(t1, t2) |
| popInt32(t0, t2) |
| urshifti t1, t0 |
| pushInt32(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_rotl) |
| # i32.rotl |
| popInt32(t1, t2) |
| popInt32(t0, t2) |
| lrotatei t1, t0 |
| pushInt32(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_rotr) |
| # i32.rotr |
| popInt32(t1, t2) |
| popInt32(t0, t2) |
| rrotatei t1, t0 |
| pushInt32(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| ############################### |
| # 0x79 - 0x8a: i64 operations # |
| ############################### |
| |
| instructionLabel(_i64_clz) |
| # i64.clz |
| popInt64(t0, t2) |
| lzcntq t0, t1 |
| pushInt64(t1) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_ctz) |
| # i64.ctz |
| popInt64(t0, t2) |
| tzcntq t0, t1 |
| pushInt64(t1) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_popcnt) |
| # i64.popcnt |
| popInt64(t1, t2) |
| operationCall(macro() cCall2(_slow_path_wasm_popcountll) end) |
| pushInt64(r1) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_add) |
| # i64.add |
| popInt64(t1, t2) |
| popInt64(t0, t2) |
| addq t1, t0 |
| pushInt64(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_sub) |
| # i64.sub |
| popInt64(t1, t2) |
| popInt64(t0, t2) |
| subq t1, t0 |
| pushInt64(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_mul) |
| # i64.mul |
| popInt64(t1, t2) |
| popInt64(t0, t2) |
| mulq t1, t0 |
| pushInt64(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_div_s) |
| # i64.div_s |
| popInt64(t1, t2) |
| popInt64(t0, t2) |
| btqz t1, .ipint_i64_div_s_throwDivisionByZero |
| |
| bqneq t1, -1, .ipint_i64_div_s_safe |
| bqeq t0, constexpr INT64_MIN, .ipint_i64_div_s_throwIntegerOverflow |
| |
| .ipint_i64_div_s_safe: |
| if X86_64 |
| # FIXME: Add a way to static_assert that t0 is rax and t2 is rdx |
| # https://bugs.webkit.org/show_bug.cgi?id=203692 |
| cqoq |
| idivq t1 |
| elsif ARM64 or ARM64E or RISCV64 |
| divqs t1, t0 |
| else |
| error |
| end |
| pushInt64(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_i64_div_s_throwDivisionByZero: |
| ipintException(DivisionByZero) |
| |
| .ipint_i64_div_s_throwIntegerOverflow: |
| ipintException(IntegerOverflow) |
| |
| instructionLabel(_i64_div_u) |
| # i64.div_u |
| popInt64(t1, t2) |
| popInt64(t0, t2) |
| btqz t1, .ipint_i64_div_u_throwDivisionByZero |
| |
| if X86_64 |
| xorq t2, t2 |
| udivq t1 |
| elsif ARM64 or ARM64E or RISCV64 |
| divq t1, t0 |
| else |
| error |
| end |
| pushInt64(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_i64_div_u_throwDivisionByZero: |
| ipintException(DivisionByZero) |
| |
| instructionLabel(_i64_rem_s) |
| # i64.rem_s |
| popInt64(t1, t2) |
| popInt64(t0, t2) |
| |
| btqz t1, .ipint_i64_rem_s_throwDivisionByZero |
| |
| bqneq t1, -1, .ipint_i64_rem_s_safe |
| bqneq t0, constexpr INT64_MIN, .ipint_i64_rem_s_safe |
| |
| move 0, t2 |
| jmp .ipint_i64_rem_s_return |
| |
| .ipint_i64_rem_s_safe: |
| if X86_64 |
# FIXME: Add a way to static_assert that t0 is rax and t2 is rdx
| # https://bugs.webkit.org/show_bug.cgi?id=203692 |
| cqoq |
| idivq t1 |
| elsif ARM64 or ARM64E |
| divqs t1, t0, t2 |
| mulq t1, t2 |
| subq t0, t2, t2 |
| elsif RISCV64 |
| remqs t0, t1, t2 |
| else |
| error |
| end |
| |
| .ipint_i64_rem_s_return: |
| pushInt64(t2) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_i64_rem_s_throwDivisionByZero: |
| ipintException(DivisionByZero) |
| |
| instructionLabel(_i64_rem_u) |
| # i64.rem_u |
| popInt64(t1, t2) |
| popInt64(t0, t2) |
| btqz t1, .ipint_i64_rem_u_throwDivisionByZero |
| |
| if X86_64 |
| xorq t2, t2 |
| udivq t1 |
| elsif ARM64 or ARM64E |
| divq t1, t0, t2 |
| mulq t1, t2 |
| subq t0, t2, t2 |
| elsif RISCV64 |
| remq t0, t1, t2 |
| else |
| error |
| end |
| pushInt64(t2) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_i64_rem_u_throwDivisionByZero: |
| ipintException(DivisionByZero) |
| |
| instructionLabel(_i64_and) |
| # i64.and |
| popInt64(t1, t2) |
| popInt64(t0, t2) |
| andq t1, t0 |
| pushInt64(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_or) |
| # i64.or |
| popInt64(t1, t2) |
| popInt64(t0, t2) |
| orq t1, t0 |
| pushInt64(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_xor) |
| # i64.xor |
| popInt64(t1, t2) |
| popInt64(t0, t2) |
| xorq t1, t0 |
| pushInt64(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_shl) |
| # i64.shl |
| popInt64(t1, t2) |
| popInt64(t0, t2) |
| lshiftq t1, t0 |
| pushInt64(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_shr_s) |
| # i64.shr_s |
| popInt64(t1, t2) |
| popInt64(t0, t2) |
| rshiftq t1, t0 |
| pushInt64(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_shr_u) |
| # i64.shr_u |
| popInt64(t1, t2) |
| popInt64(t0, t2) |
| urshiftq t1, t0 |
| pushInt64(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_rotl) |
| # i64.rotl |
| popInt64(t1, t2) |
| popInt64(t0, t2) |
| lrotateq t1, t0 |
| pushInt64(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_rotr) |
| # i64.rotr |
| popInt64(t1, t2) |
| popInt64(t0, t2) |
| rrotateq t1, t0 |
| pushInt64(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| ############################### |
| # 0x8b - 0x98: f32 operations # |
| ############################### |
| |
| instructionLabel(_f32_abs) |
| # f32.abs |
| popFloat32FT0() |
| absf ft0, ft0 |
| pushFloat32FT0() |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f32_neg) |
| # f32.neg |
| popFloat32FT0() |
| negf ft0, ft0 |
| pushFloat32FT0() |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f32_ceil) |
| # f32.ceil |
| popFloat32FT0() |
| ceilf ft0, ft0 |
| pushFloat32FT0() |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f32_floor) |
| # f32.floor |
| popFloat32FT0() |
| floorf ft0, ft0 |
| pushFloat32FT0() |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f32_trunc) |
| # f32.trunc |
| popFloat32FT0() |
| truncatef ft0, ft0 |
| pushFloat32FT0() |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f32_nearest) |
| # f32.nearest |
| popFloat32FT0() |
| roundf ft0, ft0 |
| pushFloat32FT0() |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f32_sqrt) |
| # f32.sqrt |
| popFloat32FT0() |
| sqrtf ft0, ft0 |
| pushFloat32FT0() |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f32_add) |
| # f32.add |
| popFloat32FT1() |
| popFloat32FT0() |
| addf ft1, ft0 |
| pushFloat32FT0() |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f32_sub) |
| # f32.sub |
| popFloat32FT1() |
| popFloat32FT0() |
| subf ft1, ft0 |
| pushFloat32FT0() |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f32_mul) |
| # f32.mul |
| popFloat32FT1() |
| popFloat32FT0() |
| mulf ft1, ft0 |
| pushFloat32FT0() |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f32_div) |
| # f32.div |
| popFloat32FT1() |
| popFloat32FT0() |
| divf ft1, ft0 |
| pushFloat32FT0() |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
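# f32.min/f32.max follow WASM's IEEE-style semantics: any NaN operand yields a
# NaN (the addf in the NaN path propagates one), and -0.0 orders below +0.0.
# The equal case uses a bitwise trick: or-ing the two bit patterns yields -0.0
# if either operand is -0.0 (correct for min), while max uses andf to prefer
# +0.0. Roughly (a sketch, not the exact codegen):
#   if (a == b) return or_bits(a, b);  // min; max uses and_bits
#   if (a < b)  return a;
#   if (a > b)  return b;
#   return a + b;                      // unordered: some NaN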
| instructionLabel(_f32_min) |
| # f32.min |
| popFloat32FT1() |
| popFloat32FT0() |
| bfeq ft0, ft1, .ipint_f32_min_equal |
| bflt ft0, ft1, .ipint_f32_min_lt |
| bfgt ft0, ft1, .ipint_f32_min_return |
| |
| .ipint_f32_min_NaN: |
| addf ft0, ft1 |
| pushFloat32FT1() |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_f32_min_equal: |
| orf ft0, ft1 |
| pushFloat32FT1() |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_f32_min_lt: |
| moved ft0, ft1 |
| pushFloat32FT1() |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_f32_min_return: |
| pushFloat32FT1() |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f32_max) |
| # f32.max |
| popFloat32FT1() |
| popFloat32FT0() |
| |
| bfeq ft1, ft0, .ipint_f32_max_equal |
| bflt ft1, ft0, .ipint_f32_max_lt |
| bfgt ft1, ft0, .ipint_f32_max_return |
| |
| .ipint_f32_max_NaN: |
| addf ft0, ft1 |
| pushFloat32FT1() |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_f32_max_equal: |
| andf ft0, ft1 |
| pushFloat32FT1() |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_f32_max_lt: |
| moved ft0, ft1 |
| pushFloat32FT1() |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_f32_max_return: |
| pushFloat32FT1() |
| advancePC(1) |
| nextIPIntInstruction() |
| |
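# f32.copysign is pure bit manipulation: keep the magnitude bits (0x7fffffff)
# of the first operand, take the sign bit (0x80000000) of the second, and or
# them together, round-tripping through integer registers.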
| instructionLabel(_f32_copysign) |
| # f32.copysign |
| popFloat32FT1() |
| popFloat32FT0() |
| |
| ff2i ft1, t1 |
| move 0x80000000, t2 |
| andi t2, t1 |
| |
| ff2i ft0, t0 |
| move 0x7fffffff, t2 |
| andi t2, t0 |
| |
| ori t1, t0 |
| fi2f t0, ft0 |
| |
| pushFloat32FT0() |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| ############################### |
| # 0x99 - 0xa6: f64 operations # |
| ############################### |
| |
| instructionLabel(_f64_abs) |
| # f64.abs |
| popFloat64FT0() |
| absd ft0, ft0 |
| pushFloat64FT0() |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f64_neg) |
| # f64.neg |
| popFloat64FT0() |
| negd ft0, ft0 |
| pushFloat64FT0() |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f64_ceil) |
| # f64.ceil |
| popFloat64FT0() |
| ceild ft0, ft0 |
| pushFloat64FT0() |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f64_floor) |
| # f64.floor |
| popFloat64FT0() |
| floord ft0, ft0 |
| pushFloat64FT0() |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f64_trunc) |
| # f64.trunc |
| popFloat64FT0() |
| truncated ft0, ft0 |
| pushFloat64FT0() |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f64_nearest) |
| # f64.nearest |
| popFloat64FT0() |
| roundd ft0, ft0 |
| pushFloat64FT0() |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f64_sqrt) |
| # f64.sqrt |
| popFloat64FT0() |
| sqrtd ft0, ft0 |
| pushFloat64FT0() |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f64_add) |
| # f64.add |
| popFloat64FT1() |
| popFloat64FT0() |
| addd ft1, ft0 |
| pushFloat64FT0() |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f64_sub) |
| # f64.sub |
| popFloat64FT1() |
| popFloat64FT0() |
| subd ft1, ft0 |
| pushFloat64FT0() |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f64_mul) |
| # f64.mul |
| popFloat64FT1() |
| popFloat64FT0() |
| muld ft1, ft0 |
| pushFloat64FT0() |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f64_div) |
| # f64.div |
| popFloat64FT1() |
| popFloat64FT0() |
| divd ft1, ft0 |
| pushFloat64FT0() |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f64_min) |
| # f64.min |
| popFloat64FT1() |
| popFloat64FT0() |
| bdeq ft0, ft1, .ipint_f64_min_equal |
| bdlt ft0, ft1, .ipint_f64_min_lt |
| bdgt ft0, ft1, .ipint_f64_min_return |
| |
| .ipint_f64_min_NaN: |
| addd ft0, ft1 |
| pushFloat64FT1() |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_f64_min_equal: |
| ord ft0, ft1 |
| pushFloat64FT1() |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_f64_min_lt: |
| moved ft0, ft1 |
| pushFloat64FT1() |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_f64_min_return: |
| pushFloat64FT1() |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f64_max) |
| # f64.max |
| popFloat64FT1() |
| popFloat64FT0() |
| |
| bdeq ft1, ft0, .ipint_f64_max_equal |
| bdlt ft1, ft0, .ipint_f64_max_lt |
| bdgt ft1, ft0, .ipint_f64_max_return |
| |
| .ipint_f64_max_NaN: |
| addd ft0, ft1 |
| pushFloat64FT1() |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_f64_max_equal: |
| andd ft0, ft1 |
| pushFloat64FT1() |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_f64_max_lt: |
| moved ft0, ft1 |
| pushFloat64FT1() |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_f64_max_return: |
| pushFloat64FT1() |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f64_copysign) |
| # f64.copysign |
| popFloat64FT1() |
| popFloat64FT0() |
| |
| fd2q ft1, t1 |
| move 0x8000000000000000, t2 |
| andq t2, t1 |
| |
| fd2q ft0, t0 |
| move 0x7fffffffffffffff, t2 |
| andq t2, t0 |
| |
| orq t1, t0 |
| fq2d t0, ft0 |
| |
| pushFloat64FT0() |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| ############################ |
| # 0xa7 - 0xc4: conversions # |
| ############################ |
| |
| instructionLabel(_i32_wrap_i64) |
# i32.wrap_i64
# No-op: stack slots are a full 16B regardless of type, and i32 consumers only read the low 32 bits.
| advancePC(1) |
| nextIPIntInstruction() |
| |
| |
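# The trunc family validates its operand by comparing against the bit patterns
# of float constants just outside the representable range; the unordered branch
# variants (bfltun, bfgtequn, etc.) also catch NaN, which must trap. The
# boundary constants differ case by case because, e.g., INT32_MIN is exactly
# representable as a float while INT32_MAX is not.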
| instructionLabel(_i32_trunc_f32_s) |
| popFloat32FT0() |
| move 0xcf000000, t0 # INT32_MIN (Note that INT32_MIN - 1.0 in float is the same as INT32_MIN in float). |
| fi2f t0, ft1 |
| bfltun ft0, ft1, .ipint_trunc_i32_f32_s_outOfBoundsTrunc |
| |
| move 0x4f000000, t0 # -INT32_MIN |
| fi2f t0, ft1 |
| bfgtequn ft0, ft1, .ipint_trunc_i32_f32_s_outOfBoundsTrunc |
| |
| truncatef2is ft0, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_trunc_i32_f32_s_outOfBoundsTrunc: |
| ipintException(OutOfBoundsTrunc) |
| |
| instructionLabel(_i32_trunc_f32_u) |
| popFloat32FT0() |
| move 0xbf800000, t0 # -1.0 |
| fi2f t0, ft1 |
| bfltequn ft0, ft1, .ipint_trunc_i32_f32_u_outOfBoundsTrunc |
| |
| move 0x4f800000, t0 # INT32_MIN * -2.0 |
| fi2f t0, ft1 |
| bfgtequn ft0, ft1, .ipint_trunc_i32_f32_u_outOfBoundsTrunc |
| |
| truncatef2i ft0, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_trunc_i32_f32_u_outOfBoundsTrunc: |
| ipintException(OutOfBoundsTrunc) |
| |
| instructionLabel(_i32_trunc_f64_s) |
| popFloat64FT0() |
| move 0xc1e0000000200000, t0 # INT32_MIN - 1.0 |
| fq2d t0, ft1 |
| bdltequn ft0, ft1, .ipint_trunc_i32_f64_s_outOfBoundsTrunc |
| |
| move 0x41e0000000000000, t0 # -INT32_MIN |
| fq2d t0, ft1 |
| bdgtequn ft0, ft1, .ipint_trunc_i32_f64_s_outOfBoundsTrunc |
| |
| truncated2is ft0, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_trunc_i32_f64_s_outOfBoundsTrunc: |
| ipintException(OutOfBoundsTrunc) |
| |
| instructionLabel(_i32_trunc_f64_u) |
| popFloat64FT0() |
| move 0xbff0000000000000, t0 # -1.0 |
| fq2d t0, ft1 |
| bdltequn ft0, ft1, .ipint_trunc_i32_f64_u_outOfBoundsTrunc |
| |
| move 0x41f0000000000000, t0 # INT32_MIN * -2.0 |
| fq2d t0, ft1 |
| bdgtequn ft0, ft1, .ipint_trunc_i32_f64_u_outOfBoundsTrunc |
| |
| truncated2i ft0, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_trunc_i32_f64_u_outOfBoundsTrunc: |
| ipintException(OutOfBoundsTrunc) |
| |
| instructionLabel(_i64_extend_i32_s) |
| popInt32(t0, t1) |
| sxi2q t0, t0 |
| pushInt64(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_extend_i32_u) |
| popInt32(t0, t1) |
| pushInt64(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_trunc_f32_s) |
| popFloat32FT0() |
| move 0xdf000000, t0 # INT64_MIN |
| fi2f t0, ft1 |
| bfltun ft0, ft1, .ipint_trunc_i64_f32_s_outOfBoundsTrunc |
| |
| move 0x5f000000, t0 # -INT64_MIN |
| fi2f t0, ft1 |
| bfgtequn ft0, ft1, .ipint_trunc_i64_f32_s_outOfBoundsTrunc |
| |
| truncatef2qs ft0, t0 |
| pushInt64(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_trunc_i64_f32_s_outOfBoundsTrunc: |
| ipintException(OutOfBoundsTrunc) |
| |
| instructionLabel(_i64_trunc_f32_u) |
| popFloat32FT0() |
| move 0xbf800000, t0 # -1.0 |
| fi2f t0, ft1 |
| bfltequn ft0, ft1, .ipint_i64_f32_u_outOfBoundsTrunc |
| |
| move 0x5f800000, t0 # INT64_MIN * -2.0 |
| fi2f t0, ft1 |
| bfgtequn ft0, ft1, .ipint_i64_f32_u_outOfBoundsTrunc |
| |
| truncatef2q ft0, t0 |
| pushInt64(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_i64_f32_u_outOfBoundsTrunc: |
| ipintException(OutOfBoundsTrunc) |
| |
instructionLabel(_i64_trunc_f64_s)
popFloat64FT0()
move 0xc3e0000000000000, t0 # INT64_MIN
| fq2d t0, ft1 |
| bdltun ft0, ft1, .ipint_i64_f64_s_outOfBoundsTrunc |
| |
| move 0x43e0000000000000, t0 # -INT64_MIN |
| fq2d t0, ft1 |
| bdgtequn ft0, ft1, .ipint_i64_f64_s_outOfBoundsTrunc |
| |
| truncated2qs ft0, t0 |
| pushInt64(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_i64_f64_s_outOfBoundsTrunc: |
| ipintException(OutOfBoundsTrunc) |
| |
instructionLabel(_i64_trunc_f64_u)
popFloat64FT0()
move 0xbff0000000000000, t0 # -1.0
| fq2d t0, ft1 |
| bdltequn ft0, ft1, .ipint_i64_f64_u_outOfBoundsTrunc |
| |
| move 0x43f0000000000000, t0 # INT64_MIN * -2.0 |
| fq2d t0, ft1 |
| bdgtequn ft0, ft1, .ipint_i64_f64_u_outOfBoundsTrunc |
| |
| truncated2q ft0, t0 |
| pushInt64(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_i64_f64_u_outOfBoundsTrunc: |
| ipintException(OutOfBoundsTrunc) |
| |
| instructionLabel(_f32_convert_i32_s) |
| popInt32(t0, t1) |
| ci2fs t0, ft0 |
| pushFloat32FT0() |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f32_convert_i32_u) |
| popInt32(t0, t1) |
| ci2f t0, ft0 |
| pushFloat32FT0() |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f32_convert_i64_s) |
| popInt64(t0, t1) |
| cq2fs t0, ft0 |
| pushFloat32FT0() |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f32_convert_i64_u) |
| popInt64(t0, t1) |
| if X86_64 |
| cq2f t0, t1, ft0 |
| else |
| cq2f t0, ft0 |
| end |
| pushFloat32FT0() |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f32_demote_f64) |
| popFloat64FT0() |
| cd2f ft0, ft0 |
| pushFloat32FT0() |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f64_convert_i32_s) |
| popInt32(t0, t1) |
| ci2ds t0, ft0 |
| pushFloat64FT0() |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f64_convert_i32_u) |
| popInt32(t0, t1) |
| ci2d t0, ft0 |
| pushFloat64FT0() |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f64_convert_i64_s) |
| popInt64(t0, t1) |
| cq2ds t0, ft0 |
| pushFloat64FT0() |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f64_convert_i64_u) |
| popInt64(t0, t1) |
| if X86_64 |
| cq2d t0, t1, ft0 |
| else |
| cq2d t0, ft0 |
| end |
| pushFloat64FT0() |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_f64_promote_f32) |
| popFloat32FT0() |
| cf2d ft0, ft0 |
| pushFloat64FT0() |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_reinterpret_f32) |
| popFloat32FT0() |
| ff2i ft0, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_reinterpret_f64) |
| popFloat64FT0() |
| fd2q ft0, t0 |
| pushInt64(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
instructionLabel(_f32_reinterpret_i32)
popInt32(t0, t1)
fi2f t0, ft0
pushFloat32FT0()
| advancePC(1) |
| nextIPIntInstruction() |
| |
instructionLabel(_f64_reinterpret_i64)
popInt64(t0, t1)
fq2d t0, ft0
pushFloat64FT0()
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_extend8_s) |
| # i32.extend8_s |
| popInt32(t0, t1) |
| sxb2i t0, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_extend16_s) |
# i32.extend16_s
| popInt32(t0, t1) |
| sxh2i t0, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_extend8_s) |
| # i64.extend8_s |
| popInt64(t0, t1) |
| sxb2q t0, t0 |
| pushInt64(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_extend16_s) |
# i64.extend16_s
| popInt64(t0, t1) |
| sxh2q t0, t0 |
| pushInt64(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_extend32_s) |
# i64.extend32_s
| popInt64(t0, t1) |
| sxi2q t0, t0 |
| pushInt64(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| reservedOpcode(0xc5) |
| reservedOpcode(0xc6) |
| reservedOpcode(0xc7) |
| reservedOpcode(0xc8) |
| reservedOpcode(0xc9) |
| reservedOpcode(0xca) |
| reservedOpcode(0xcb) |
| reservedOpcode(0xcc) |
| reservedOpcode(0xcd) |
| reservedOpcode(0xce) |
| reservedOpcode(0xcf) |
| |
| ##################### |
| # 0xd0 - 0xd2: refs # |
| ##################### |
| |
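# Instructions with LEB128-encoded immediates have a variable byte length, so
# their metadata entry begins with the instruction's length in bytes; the
# loadb [PM, MC] / advancePCByReg pattern below skips exactly that many bytes.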
| instructionLabel(_ref_null_t) |
| loadi 1[PM, MC], t0 |
| pushQuad(t0) |
| loadb [PM, MC], t0 |
advancePCByReg(t0)
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_ref_is_null) |
| popQuad(t0, t1) |
| cqeq t0, ValueNull, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_ref_func) |
| move wasmInstance, a0 |
| loadi 1[PM, MC], a1 |
| operationCall(macro() cCall2(_ipint_extern_ref_func) end) |
| pushQuad(t0) |
| loadb [PM, MC], t0 |
advancePCByReg(t0)
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| reservedOpcode(0xd3) |
| reservedOpcode(0xd4) |
| reservedOpcode(0xd5) |
| reservedOpcode(0xd6) |
| reservedOpcode(0xd7) |
| reservedOpcode(0xd8) |
| reservedOpcode(0xd9) |
| reservedOpcode(0xda) |
| reservedOpcode(0xdb) |
| reservedOpcode(0xdc) |
| reservedOpcode(0xdd) |
| reservedOpcode(0xde) |
| reservedOpcode(0xdf) |
| reservedOpcode(0xe0) |
| reservedOpcode(0xe1) |
| reservedOpcode(0xe2) |
| reservedOpcode(0xe3) |
| reservedOpcode(0xe4) |
| reservedOpcode(0xe5) |
| reservedOpcode(0xe6) |
| reservedOpcode(0xe7) |
| reservedOpcode(0xe8) |
| reservedOpcode(0xe9) |
| reservedOpcode(0xea) |
| reservedOpcode(0xeb) |
| reservedOpcode(0xec) |
| reservedOpcode(0xed) |
| reservedOpcode(0xee) |
| reservedOpcode(0xef) |
| reservedOpcode(0xf0) |
| reservedOpcode(0xf1) |
| reservedOpcode(0xf2) |
| reservedOpcode(0xf3) |
| reservedOpcode(0xf4) |
| reservedOpcode(0xf5) |
| reservedOpcode(0xf6) |
| reservedOpcode(0xf7) |
| reservedOpcode(0xf8) |
| reservedOpcode(0xf9) |
| reservedOpcode(0xfa) |
| reservedOpcode(0xfb) |
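
# 0xFC, 0xFD, and 0xFE are prefix bytes: the real opcode follows as a LEB128
# value, which is decoded and dispatched through a second jump table. On
# ARM64, "add x0, x1, x0, lsl 8" computes base + subopcode * 256, reusing the
# 256-byte alignment of instruction bodies; conceptually, goto *(base + op << 8).
# (The x86_64 paths are untested, per the header disclaimer.)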
| instructionLabel(_fc_block) |
| decodeLEBVarUInt32(1, t0, t1, t2, t3, ws1) |
| # Security guarantee: always less than 18 (0x00 -> 0x11) |
| biaeq t0, 18, .ipint_fc_nonexistent |
| if ARM64 or ARM64E |
| pcrtoaddr _ipint_i32_trunc_sat_f32_s, t1 |
| emit "add x0, x1, x0, lsl 8" |
| emit "br x0" |
| elsif X86_64 |
| lshifti 4, t0 |
| leap (_ipint_i32_trunc_sat_f32_s), t1 |
| addq t1, t0 |
| emit "jmp *(%eax)" |
| end |
| |
| .ipint_fc_nonexistent: |
| break |
| |
| instructionLabel(_simd) |
| # TODO: for relaxed SIMD, handle parsing the value. |
| # Metadata? Could just hardcode loading two bytes though |
| decodeLEBVarUInt32(1, t0, t1, t2, t3, ws1) |
| if ARM64 or ARM64E |
| pcrtoaddr _ipint_simd_v128_load_mem, t1 |
| emit "add x0, x1, x0, lsl 8" |
| emit "br x0" |
| elsif X86_64 |
| lshifti 4, t0 |
| leap (_ipint_simd_v128_load_mem), t1 |
| addq t1, t0 |
| emit "jmp *(%eax)" |
| end |
| |
| instructionLabel(_atomic) |
| decodeLEBVarUInt32(1, t0, t1, t2, t3, ws1) |
# Security guarantee: always less than 79 (0x00 -> 0x4e)
| biaeq t0, 0x4f, .ipint_atomic_nonexistent |
| if ARM64 or ARM64E |
| pcrtoaddr _ipint_memory_atomic_notify, t1 |
| emit "add x0, x1, x0, lsl 8" |
| emit "br x0" |
| elsif X86_64 |
| lshifti 4, t0 |
| leap (_ipint_memory_atomic_notify), t1 |
| addq t1, t0 |
| emit "jmp *(%eax)" |
| end |
| |
| .ipint_atomic_nonexistent: |
| break |
| |
| reservedOpcode(0xff) |
| |
| ####################### |
| ## 0xFC instructions ## |
| ####################### |
| |
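# The trunc_sat family never traps: out-of-range inputs saturate to the
# destination type's min/max, and NaN produces 0. The MinOrNaN paths use a
# self-comparison (bfeq/bdeq ft0, ft0), which fails only for NaN, to pick
# between the two results.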
| instructionLabel(_i32_trunc_sat_f32_s) |
| popFloat32FT0() |
| |
| move 0xcf000000, t0 # INT32_MIN (Note that INT32_MIN - 1.0 in float is the same as INT32_MIN in float). |
| fi2f t0, ft1 |
| bfltun ft0, ft1, .ipint_i32_trunc_sat_f32_s_outOfBoundsTruncSatMinOrNaN |
| |
| move 0x4f000000, t0 # -INT32_MIN |
| fi2f t0, ft1 |
| bfgtequn ft0, ft1, .ipint_i32_trunc_sat_f32_s_outOfBoundsTruncSatMax |
| |
| truncatef2is ft0, t0 |
| pushInt32(t0) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
| .ipint_i32_trunc_sat_f32_s_outOfBoundsTruncSatMinOrNaN: |
bfeq ft0, ft0, .ipint_i32_trunc_sat_f32_s_outOfBoundsTruncSatMin
| move 0, t0 |
| pushInt32(t0) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
| .ipint_i32_trunc_sat_f32_s_outOfBoundsTruncSatMax: |
| move (constexpr INT32_MAX), t0 |
| pushInt32(t0) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
| .ipint_i32_trunc_sat_f32_s_outOfBoundsTruncSatMin: |
| move (constexpr INT32_MIN), t0 |
| pushInt32(t0) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_trunc_sat_f32_u) |
| popFloat32FT0() |
| |
| move 0xbf800000, t0 # -1.0 |
| fi2f t0, ft1 |
| bfltequn ft0, ft1, .ipint_i32_trunc_sat_f32_u_outOfBoundsTruncSatMin |
| |
| move 0x4f800000, t0 # INT32_MIN * -2.0 |
| fi2f t0, ft1 |
| bfgtequn ft0, ft1, .ipint_i32_trunc_sat_f32_u_outOfBoundsTruncSatMax |
| |
| truncatef2i ft0, t0 |
| pushInt32(t0) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
| .ipint_i32_trunc_sat_f32_u_outOfBoundsTruncSatMin: |
| move 0, t0 |
| pushInt32(t0) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
| .ipint_i32_trunc_sat_f32_u_outOfBoundsTruncSatMax: |
| move (constexpr UINT32_MAX), t0 |
| pushInt32(t0) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_trunc_sat_f64_s) |
| popFloat64FT0() |
| |
| move 0xc1e0000000200000, t0 # INT32_MIN - 1.0 |
| fq2d t0, ft1 |
| bdltequn ft0, ft1, .ipint_i32_trunc_sat_f64_s_outOfBoundsTruncSatMinOrNaN |
| |
| move 0x41e0000000000000, t0 # -INT32_MIN |
| fq2d t0, ft1 |
| bdgtequn ft0, ft1, .ipint_i32_trunc_sat_f64_s_outOfBoundsTruncSatMax |
| |
| truncated2is ft0, t0 |
| pushInt32(t0) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
| .ipint_i32_trunc_sat_f64_s_outOfBoundsTruncSatMinOrNaN: |
| bdeq ft0, ft0, .ipint_i32_trunc_sat_f64_s_outOfBoundsTruncSatMin |
| move 0, t0 |
| pushInt32(t0) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
| .ipint_i32_trunc_sat_f64_s_outOfBoundsTruncSatMax: |
| move (constexpr INT32_MAX), t0 |
| pushInt32(t0) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
| .ipint_i32_trunc_sat_f64_s_outOfBoundsTruncSatMin: |
| move (constexpr INT32_MIN), t0 |
| pushInt32(t0) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i32_trunc_sat_f64_u) |
| popFloat64FT0() |
| |
| move 0xbff0000000000000, t0 # -1.0 |
| fq2d t0, ft1 |
| bdltequn ft0, ft1, .ipint_i32_trunc_sat_f64_u_outOfBoundsTruncSatMin |
| |
| move 0x41f0000000000000, t0 # INT32_MIN * -2.0 |
| fq2d t0, ft1 |
| bdgtequn ft0, ft1, .ipint_i32_trunc_sat_f64_u_outOfBoundsTruncSatMax |
| |
| truncated2i ft0, t0 |
| pushInt32(t0) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
| .ipint_i32_trunc_sat_f64_u_outOfBoundsTruncSatMin: |
| move 0, t0 |
| pushInt32(t0) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
| .ipint_i32_trunc_sat_f64_u_outOfBoundsTruncSatMax: |
| move (constexpr UINT32_MAX), t0 |
| pushInt32(t0) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_trunc_sat_f32_s) |
| popFloat32FT0() |
| |
| move 0xdf000000, t0 # INT64_MIN |
| fi2f t0, ft1 |
| bfltun ft0, ft1, .ipint_i64_trunc_sat_f32_s_outOfBoundsTruncSatMinOrNaN |
| |
| move 0x5f000000, t0 # -INT64_MIN |
| fi2f t0, ft1 |
| bfgtequn ft0, ft1, .ipint_i64_trunc_sat_f32_s_outOfBoundsTruncSatMax |
| |
| truncatef2qs ft0, t0 |
| pushInt64(t0) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
| .ipint_i64_trunc_sat_f32_s_outOfBoundsTruncSatMinOrNaN: |
bfeq ft0, ft0, .ipint_i64_trunc_sat_f32_s_outOfBoundsTruncSatMin
| move 0, t0 |
| pushInt64(t0) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
.ipint_i64_trunc_sat_f32_s_outOfBoundsTruncSatMax:
move (constexpr INT64_MAX), t0
pushInt64(t0)

loadb [PM, MC], t0
advancePCByReg(t0)
advanceMC(1)
nextIPIntInstruction()

.ipint_i64_trunc_sat_f32_s_outOfBoundsTruncSatMin:
move (constexpr INT64_MIN), t0
pushInt64(t0)

loadb [PM, MC], t0
advancePCByReg(t0)
advanceMC(1)
nextIPIntInstruction()
| |
| instructionLabel(_i64_trunc_sat_f32_u) |
| popFloat32FT0() |
| |
| move 0xbf800000, t0 # -1.0 |
| fi2f t0, ft1 |
| bfltequn ft0, ft1, .ipint_i64_trunc_sat_f32_u_outOfBoundsTruncSatMin |
| |
| move 0x5f800000, t0 # INT64_MIN * -2.0 |
| fi2f t0, ft1 |
| bfgtequn ft0, ft1, .ipint_i64_trunc_sat_f32_u_outOfBoundsTruncSatMax |
| |
| truncatef2q ft0, t0 |
| pushInt64(t0) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
| .ipint_i64_trunc_sat_f32_u_outOfBoundsTruncSatMin: |
| move 0, t0 |
| pushInt64(t0) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
| .ipint_i64_trunc_sat_f32_u_outOfBoundsTruncSatMax: |
| move (constexpr UINT64_MAX), t0 |
| pushInt64(t0) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_trunc_sat_f64_s) |
| popFloat64FT0() |
| move 0xc3e0000000000000, t0 # INT64_MIN |
| fq2d t0, ft1 |
bdltun ft0, ft1, .ipint_i64_trunc_sat_f64_s_outOfBoundsTruncSatMinOrNaN
| |
| move 0x43e0000000000000, t0 # -INT64_MIN |
| fq2d t0, ft1 |
bdgtequn ft0, ft1, .ipint_i64_trunc_sat_f64_s_outOfBoundsTruncSatMax
| |
| truncated2qs ft0, t0 |
| pushInt64(t0) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
.ipint_i64_trunc_sat_f64_s_outOfBoundsTruncSatMinOrNaN:
bdeq ft0, ft0, .ipint_i64_trunc_sat_f64_s_outOfBoundsTruncSatMin
| move 0, t0 |
| pushInt64(t0) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
.ipint_i64_trunc_sat_f64_s_outOfBoundsTruncSatMax:
| move (constexpr INT64_MAX), t0 |
| pushInt64(t0) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
.ipint_i64_trunc_sat_f64_s_outOfBoundsTruncSatMin:
| move (constexpr INT64_MIN), t0 |
| pushInt64(t0) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_i64_trunc_sat_f64_u) |
| popFloat64FT0() |
| |
| move 0xbff0000000000000, t0 # -1.0 |
| fq2d t0, ft1 |
| bdltequn ft0, ft1, .ipint_i64_trunc_sat_f64_u_outOfBoundsTruncSatMin |
| |
| move 0x43f0000000000000, t0 # INT64_MIN * -2.0 |
| fq2d t0, ft1 |
| bdgtequn ft0, ft1, .ipint_i64_trunc_sat_f64_u_outOfBoundsTruncSatMax |
| |
| truncated2q ft0, t0 |
| pushInt64(t0) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
| .ipint_i64_trunc_sat_f64_u_outOfBoundsTruncSatMin: |
| move 0, t0 |
| pushInt64(t0) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
| .ipint_i64_trunc_sat_f64_u_outOfBoundsTruncSatMax: |
| move (constexpr UINT64_MAX), t0 |
| pushInt64(t0) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
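# Several of the bulk-memory ops below take more 32-bit operands than there
# are free argument registers, so two of them are packed into one 64-bit
# register (high << 32 | low) and, presumably, unpacked on the C++ side.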
| instructionLabel(_memory_init) |
| # memory.init |
| popQuad(t0, t3) # n |
| popQuad(t1, t3) # s |
| popQuad(a2, t3) # d |
| lshiftq 32, t1 |
| orq t1, t0 |
| move t0, a3 |
| loadi 1[PM, MC], a1 |
| operationCallMayThrow(macro() cCall4(_ipint_extern_memory_init) end) |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_data_drop) |
| # data.drop |
| loadi 1[PM, MC], a1 |
| operationCall(macro() cCall2(_ipint_extern_data_drop) end) |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_memory_copy) |
| # memory.copy |
| popQuad(a3, t0) # n |
| popQuad(a2, t0) # s |
| popQuad(a1, t0) # d |
| operationCallMayThrow(macro() cCall4(_ipint_extern_memory_copy) end) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_memory_fill) |
| # memory.fill |
| popQuad(a3, t0) # n |
| popQuad(a2, t0) # val |
| popQuad(a1, t0) # d |
| operationCallMayThrow(macro() cCall4(_ipint_extern_memory_fill) end) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
| instructionLabel(_table_init) |
# table.init
| popQuad(t0, t3) # n |
| popQuad(t1, t3) # s |
| popQuad(a2, t3) # d |
| lshiftq 32, t1 |
| orq t1, t0 |
| move t0, a3 |
| leap [PM, MC], a1 |
| operationCallMayThrow(macro() cCall4(_ipint_extern_table_init) end) |
| loadb 8[PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(9) |
| nextIPIntInstruction() |
| |
| instructionLabel(_elem_drop) |
| # elem.drop |
| loadi 1[PM, MC], a1 |
| operationCall(macro() cCall2(_ipint_extern_elem_drop) end) |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_table_copy) |
| # table.copy |
| popQuad(t0, t3) # n |
| popQuad(t1, t3) # s |
| popQuad(a2, t3) # d |
| lshiftq 32, t1 |
| orq t1, t0 |
| move t0, a3 |
| leap [PM, MC], a1 |
| operationCallMayThrow(macro() cCall4(_ipint_extern_table_copy) end) |
| loadb 8[PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(9) |
| nextIPIntInstruction() |
| |
| instructionLabel(_table_grow) |
| # table.grow |
| loadi 1[PM, MC], a1 |
| popQuad(a3, t0) # n |
| popQuad(a2, t0) # fill |
| operationCall(macro() cCall4(_ipint_extern_table_grow) end) |
| pushQuad(t0) |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_table_size) |
| # table.size |
| loadi 1[PM, MC], a1 |
| operationCall(macro() cCall2(_ipint_extern_table_size) end) |
| pushQuad(t0) |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| instructionLabel(_table_fill) |
| # table.fill |
| popQuad(t0, t3) # n |
| popQuad(a2, t3) # val |
| popQuad(t3, t1) # i |
| lshiftq 32, t3 |
| orq t0, t3 |
| loadi 1[PM, MC], a1 |
| operationCallMayThrow(macro() cCall4(_ipint_extern_table_fill) end) |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| ####################### |
| ## SIMD Instructions ## |
| ####################### |
| |
| # 0xFD 0x00 - 0xFD 0x0B: memory |
| unimplementedInstruction(_simd_v128_load_mem) |
| unimplementedInstruction(_simd_v128_load_8x8s_mem) |
| unimplementedInstruction(_simd_v128_load_8x8u_mem) |
| unimplementedInstruction(_simd_v128_load_16x4s_mem) |
| unimplementedInstruction(_simd_v128_load_16x4u_mem) |
| unimplementedInstruction(_simd_v128_load_32x2s_mem) |
| unimplementedInstruction(_simd_v128_load_32x2u_mem) |
| unimplementedInstruction(_simd_v128_load8_splat_mem) |
| unimplementedInstruction(_simd_v128_load16_splat_mem) |
| unimplementedInstruction(_simd_v128_load32_splat_mem) |
| unimplementedInstruction(_simd_v128_load64_splat_mem) |
| unimplementedInstruction(_simd_v128_store_mem) |
| |
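# v128.const reads its 16-byte immediate from the metadata stream rather than
# re-decoding it from bytecode: one length byte followed by the raw vector,
# hence loadv at offset 1 and advanceMC(17).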
| # 0xFD 0x0C: v128.const |
| instructionLabel(_simd_v128_const) |
| # v128.const |
| leap [PM, MC], t0 |
| loadv 1[t0], v0 |
| loadb [t0], t0 |
| pushv v0 |
| advancePCByReg(t0) |
| advanceMC(17) |
| nextIPIntInstruction() |
| |
| # 0xFD 0x0D - 0xFD 0x14: splat (+ shuffle/swizzle) |
| unimplementedInstruction(_simd_i8x16_shuffle) |
| unimplementedInstruction(_simd_i8x16_swizzle) |
| unimplementedInstruction(_simd_i8x16_splat) |
| unimplementedInstruction(_simd_i16x8_splat) |
| unimplementedInstruction(_simd_i32x4_splat) |
| unimplementedInstruction(_simd_i64x2_splat) |
| unimplementedInstruction(_simd_f32x4_splat) |
| unimplementedInstruction(_simd_f64x2_splat) |
| |
| # 0xFD 0x15 - 0xFD 0x22: extract and replace lanes |
| unimplementedInstruction(_simd_i8x16_extract_lane_s) |
| unimplementedInstruction(_simd_i8x16_extract_lane_u) |
| unimplementedInstruction(_simd_i8x16_replace_lane) |
| unimplementedInstruction(_simd_i16x8_extract_lane_s) |
| unimplementedInstruction(_simd_i16x8_extract_lane_u) |
| unimplementedInstruction(_simd_i16x8_replace_lane) |
| |
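# i32x4.extract_lane dispatches on the lane immediate through a small inline
# jump table: each (umovi + jmp) pair below is exactly 8 bytes on ARM64, so
# the branch target is table base + lane * 8.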
| instructionLabel(_simd_i32x4_extract_lane) |
| # i32x4.extract_lane (lane) |
| loadb 2[PB, PC], t0 # lane index |
| popv v0 |
if ARM64 or ARM64E
pcrtoaddr _simd_i32x4_extract_lane_0, t1
leap [t1, t0, 8], t0
emit "br x0"
_simd_i32x4_extract_lane_0:
umovi t0, v0_i, 0
jmp _simd_i32x4_extract_lane_end
umovi t0, v0_i, 1
jmp _simd_i32x4_extract_lane_end
umovi t0, v0_i, 2
jmp _simd_i32x4_extract_lane_end
umovi t0, v0_i, 3
jmp _simd_i32x4_extract_lane_end
| elsif X86_64 |
| # FIXME: implement SIMD instructions for x86 and finish this implementation! |
| end |
| _simd_i32x4_extract_lane_end: |
| pushInt32(t0) |
| advancePC(3) |
| nextIPIntInstruction() |
| |
| unimplementedInstruction(_simd_i32x4_replace_lane) |
| unimplementedInstruction(_simd_i64x2_extract_lane) |
| unimplementedInstruction(_simd_i64x2_replace_lane) |
| unimplementedInstruction(_simd_f32x4_extract_lane) |
| unimplementedInstruction(_simd_f32x4_replace_lane) |
| unimplementedInstruction(_simd_f64x2_extract_lane) |
| unimplementedInstruction(_simd_f64x2_replace_lane) |
| |
| # 0xFD 0x23 - 0xFD 0x2C: i8x16 operations |
| unimplementedInstruction(_simd_i8x16_eq) |
| unimplementedInstruction(_simd_i8x16_ne) |
| unimplementedInstruction(_simd_i8x16_lt_s) |
| unimplementedInstruction(_simd_i8x16_lt_u) |
| unimplementedInstruction(_simd_i8x16_gt_s) |
| unimplementedInstruction(_simd_i8x16_gt_u) |
| unimplementedInstruction(_simd_i8x16_le_s) |
| unimplementedInstruction(_simd_i8x16_le_u) |
| unimplementedInstruction(_simd_i8x16_ge_s) |
| unimplementedInstruction(_simd_i8x16_ge_u) |
| |
# 0xFD 0x2D - 0xFD 0x36: i16x8 operations
| unimplementedInstruction(_simd_i16x8_eq) |
| unimplementedInstruction(_simd_i16x8_ne) |
| unimplementedInstruction(_simd_i16x8_lt_s) |
| unimplementedInstruction(_simd_i16x8_lt_u) |
| unimplementedInstruction(_simd_i16x8_gt_s) |
| unimplementedInstruction(_simd_i16x8_gt_u) |
| unimplementedInstruction(_simd_i16x8_le_s) |
| unimplementedInstruction(_simd_i16x8_le_u) |
| unimplementedInstruction(_simd_i16x8_ge_s) |
| unimplementedInstruction(_simd_i16x8_ge_u) |
| |
| # 0xFD 0x37 - 0xFD 0x40: i32x4 operations |
| unimplementedInstruction(_simd_i32x4_eq) |
| unimplementedInstruction(_simd_i32x4_ne) |
| unimplementedInstruction(_simd_i32x4_lt_s) |
| unimplementedInstruction(_simd_i32x4_lt_u) |
| unimplementedInstruction(_simd_i32x4_gt_s) |
| unimplementedInstruction(_simd_i32x4_gt_u) |
| unimplementedInstruction(_simd_i32x4_le_s) |
| unimplementedInstruction(_simd_i32x4_le_u) |
| unimplementedInstruction(_simd_i32x4_ge_s) |
| unimplementedInstruction(_simd_i32x4_ge_u) |
| |
| # 0xFD 0x41 - 0xFD 0x46: f32x4 operations |
| unimplementedInstruction(_simd_f32x4_eq) |
| unimplementedInstruction(_simd_f32x4_ne) |
| unimplementedInstruction(_simd_f32x4_lt) |
| unimplementedInstruction(_simd_f32x4_gt) |
| unimplementedInstruction(_simd_f32x4_le) |
| unimplementedInstruction(_simd_f32x4_ge) |
| |
| # 0xFD 0x47 - 0xFD 0x4c: f64x2 operations |
| unimplementedInstruction(_simd_f64x2_eq) |
| unimplementedInstruction(_simd_f64x2_ne) |
| unimplementedInstruction(_simd_f64x2_lt) |
| unimplementedInstruction(_simd_f64x2_gt) |
| unimplementedInstruction(_simd_f64x2_le) |
| unimplementedInstruction(_simd_f64x2_ge) |
| |
| # 0xFD 0x4D - 0xFD 0x53: v128 operations |
| unimplementedInstruction(_simd_v128_not) |
| unimplementedInstruction(_simd_v128_and) |
| unimplementedInstruction(_simd_v128_andnot) |
| unimplementedInstruction(_simd_v128_or) |
| unimplementedInstruction(_simd_v128_xor) |
| unimplementedInstruction(_simd_v128_bitselect) |
| unimplementedInstruction(_simd_v128_any_true) |
| |
| # 0xFD 0x54 - 0xFD 0x5D: v128 load/store lane |
| unimplementedInstruction(_simd_v128_load8_lane_mem) |
| unimplementedInstruction(_simd_v128_load16_lane_mem) |
| unimplementedInstruction(_simd_v128_load32_lane_mem) |
| unimplementedInstruction(_simd_v128_load64_lane_mem) |
| unimplementedInstruction(_simd_v128_store8_lane_mem) |
| unimplementedInstruction(_simd_v128_store16_lane_mem) |
| unimplementedInstruction(_simd_v128_store32_lane_mem) |
| unimplementedInstruction(_simd_v128_store64_lane_mem) |
| unimplementedInstruction(_simd_v128_load32_zero_mem) |
| unimplementedInstruction(_simd_v128_load64_zero_mem) |
| |
| # 0xFD 0x5E - 0xFD 0x5F: f32x4/f64x2 conversion |
| unimplementedInstruction(_simd_f32x4_demote_f64x2_zero) |
| unimplementedInstruction(_simd_f64x2_promote_low_f32x4) |
| |
# 0xFD 0x60 - 0xFD 0x66: i8x16 operations
| unimplementedInstruction(_simd_i8x16_abs) |
| unimplementedInstruction(_simd_i8x16_neg) |
| unimplementedInstruction(_simd_i8x16_popcnt) |
| unimplementedInstruction(_simd_i8x16_all_true) |
| unimplementedInstruction(_simd_i8x16_bitmask) |
| unimplementedInstruction(_simd_i8x16_narrow_i16x8_s) |
| unimplementedInstruction(_simd_i8x16_narrow_i16x8_u) |
| |
| # 0xFD 0x67 - 0xFD 0x6A: f32x4 operations |
| unimplementedInstruction(_simd_f32x4_ceil) |
| unimplementedInstruction(_simd_f32x4_floor) |
| unimplementedInstruction(_simd_f32x4_trunc) |
| unimplementedInstruction(_simd_f32x4_nearest) |
| |
| # 0xFD 0x6B - 0xFD 0x73: i8x16 binary operations |
| unimplementedInstruction(_simd_i8x16_shl) |
| unimplementedInstruction(_simd_i8x16_shr_s) |
| unimplementedInstruction(_simd_i8x16_shr_u) |
| unimplementedInstruction(_simd_i8x16_add) |
| unimplementedInstruction(_simd_i8x16_add_sat_s) |
| unimplementedInstruction(_simd_i8x16_add_sat_u) |
| unimplementedInstruction(_simd_i8x16_sub) |
| unimplementedInstruction(_simd_i8x16_sub_sat_s) |
| unimplementedInstruction(_simd_i8x16_sub_sat_u) |
| |
| # 0xFD 0x74 - 0xFD 0x75: f64x2 operations |
| unimplementedInstruction(_simd_f64x2_ceil) |
| unimplementedInstruction(_simd_f64x2_floor) |
| |
| # 0xFD 0x76 - 0xFD 0x79: i8x16 binary operations |
| unimplementedInstruction(_simd_i8x16_min_s) |
| unimplementedInstruction(_simd_i8x16_min_u) |
| unimplementedInstruction(_simd_i8x16_max_s) |
| unimplementedInstruction(_simd_i8x16_max_u) |
| |
| # 0xFD 0x7A: f64x2 trunc |
| unimplementedInstruction(_simd_f64x2_trunc) |
| |
| # 0xFD 0x7B: i8x16 avgr_u |
| unimplementedInstruction(_simd_i8x16_avgr_u) |
| |
| # 0xFD 0x7C - 0xFD 0x7F: extadd_pairwise |
| unimplementedInstruction(_simd_i16x8_extadd_pairwise_i8x16_s) |
| unimplementedInstruction(_simd_i16x8_extadd_pairwise_i8x16_u) |
| unimplementedInstruction(_simd_i32x4_extadd_pairwise_i16x8_s) |
| unimplementedInstruction(_simd_i32x4_extadd_pairwise_i16x8_u) |
| |
| # 0xFD 0x80 0x01 - 0xFD 0x93 0x01: i16x8 operations |
| |
| unimplementedInstruction(_simd_i16x8_abs) |
| unimplementedInstruction(_simd_i16x8_neg) |
| unimplementedInstruction(_simd_i16x8_q15mulr_sat_s) |
| unimplementedInstruction(_simd_i16x8_all_true) |
| unimplementedInstruction(_simd_i16x8_bitmask) |
| unimplementedInstruction(_simd_i16x8_narrow_i32x4_s) |
| unimplementedInstruction(_simd_i16x8_narrow_i32x4_u) |
| unimplementedInstruction(_simd_i16x8_extend_low_i8x16_s) |
| unimplementedInstruction(_simd_i16x8_extend_high_i8x16_s) |
| unimplementedInstruction(_simd_i16x8_extend_low_i8x16_u) |
| unimplementedInstruction(_simd_i16x8_extend_high_i8x16_u) |
| unimplementedInstruction(_simd_i16x8_shl) |
| unimplementedInstruction(_simd_i16x8_shr_s) |
| unimplementedInstruction(_simd_i16x8_shr_u) |
| unimplementedInstruction(_simd_i16x8_add) |
| unimplementedInstruction(_simd_i16x8_add_sat_s) |
| unimplementedInstruction(_simd_i16x8_add_sat_u) |
| unimplementedInstruction(_simd_i16x8_sub) |
| unimplementedInstruction(_simd_i16x8_sub_sat_s) |
| unimplementedInstruction(_simd_i16x8_sub_sat_u) |
| |
| # 0xFD 0x94 0x01: f64x2.nearest |
| |
| unimplementedInstruction(_simd_f64x2_nearest) |
| |
| # 0xFD 0x95 0x01 - 0xFD 0x9F 0x01: i16x8 operations |
| |
| unimplementedInstruction(_simd_i16x8_mul) |
| unimplementedInstruction(_simd_i16x8_min_s) |
| unimplementedInstruction(_simd_i16x8_min_u) |
| unimplementedInstruction(_simd_i16x8_max_s) |
| unimplementedInstruction(_simd_i16x8_max_u) |
| reservedOpcode(0xFD9A01) |
| unimplementedInstruction(_simd_i16x8_avgr_u) |
| unimplementedInstruction(_simd_i16x8_extmul_low_i8x16_s) |
| unimplementedInstruction(_simd_i16x8_extmul_high_i8x16_s) |
| unimplementedInstruction(_simd_i16x8_extmul_low_i8x16_u) |
| unimplementedInstruction(_simd_i16x8_extmul_high_i8x16_u) |
| |
| # 0xFD 0xA0 0x01 - 0xFD 0xBF 0x01: i32x4 operations |
| |
| unimplementedInstruction(_simd_i32x4_abs) |
| unimplementedInstruction(_simd_i32x4_neg) |
| reservedOpcode(0xFDA201) |
| unimplementedInstruction(_simd_i32x4_all_true) |
| unimplementedInstruction(_simd_i32x4_bitmask) |
| reservedOpcode(0xFDA501) |
| reservedOpcode(0xFDA601) |
| unimplementedInstruction(_simd_i32x4_extend_low_i16x8_s) |
| unimplementedInstruction(_simd_i32x4_extend_high_i16x8_s) |
| unimplementedInstruction(_simd_i32x4_extend_low_i16x8_u) |
| unimplementedInstruction(_simd_i32x4_extend_high_i16x8_u) |
| unimplementedInstruction(_simd_i32x4_shl) |
| unimplementedInstruction(_simd_i32x4_shr_s) |
| unimplementedInstruction(_simd_i32x4_shr_u) |
| unimplementedInstruction(_simd_i32x4_add) |
| reservedOpcode(0xFDAF01) |
| reservedOpcode(0xFDB001) |
| unimplementedInstruction(_simd_i32x4_sub) |
| reservedOpcode(0xFDB201) |
| reservedOpcode(0xFDB301) |
| reservedOpcode(0xFDB401) |
| unimplementedInstruction(_simd_i32x4_mul) |
| unimplementedInstruction(_simd_i32x4_min_s) |
| unimplementedInstruction(_simd_i32x4_min_u) |
| unimplementedInstruction(_simd_i32x4_max_s) |
| unimplementedInstruction(_simd_i32x4_max_u) |
| unimplementedInstruction(_simd_i32x4_dot_i16x8_s) |
| reservedOpcode(0xFDBB01) |
| unimplementedInstruction(_simd_i32x4_extmul_low_i16x8_s) |
| unimplementedInstruction(_simd_i32x4_extmul_high_i16x8_s) |
| unimplementedInstruction(_simd_i32x4_extmul_low_i16x8_u) |
| unimplementedInstruction(_simd_i32x4_extmul_high_i16x8_u) |
| |
| # 0xFD 0xC0 0x01 - 0xFD 0xDF 0x01: i64x2 operations |
| |
| unimplementedInstruction(_simd_i64x2_abs) |
| unimplementedInstruction(_simd_i64x2_neg) |
| reservedOpcode(0xFDC201) |
| unimplementedInstruction(_simd_i64x2_all_true) |
| unimplementedInstruction(_simd_i64x2_bitmask) |
| reservedOpcode(0xFDC501) |
| reservedOpcode(0xFDC601) |
| unimplementedInstruction(_simd_i64x2_extend_low_i32x4_s) |
| unimplementedInstruction(_simd_i64x2_extend_high_i32x4_s) |
| unimplementedInstruction(_simd_i64x2_extend_low_i32x4_u) |
| unimplementedInstruction(_simd_i64x2_extend_high_i32x4_u) |
| unimplementedInstruction(_simd_i64x2_shl) |
| unimplementedInstruction(_simd_i64x2_shr_s) |
| unimplementedInstruction(_simd_i64x2_shr_u) |
| unimplementedInstruction(_simd_i64x2_add) |
| reservedOpcode(0xFDCF01) |
| reservedOpcode(0xFDD001) |
| unimplementedInstruction(_simd_i64x2_sub) |
| reservedOpcode(0xFDD201) |
| reservedOpcode(0xFDD301) |
| reservedOpcode(0xFDD401) |
| unimplementedInstruction(_simd_i64x2_mul) |
| unimplementedInstruction(_simd_i64x2_eq) |
| unimplementedInstruction(_simd_i64x2_ne) |
| unimplementedInstruction(_simd_i64x2_lt_s) |
| unimplementedInstruction(_simd_i64x2_gt_s) |
| unimplementedInstruction(_simd_i64x2_le_s) |
| unimplementedInstruction(_simd_i64x2_ge_s) |
| unimplementedInstruction(_simd_i64x2_extmul_low_i32x4_s) |
| unimplementedInstruction(_simd_i64x2_extmul_high_i32x4_s) |
| unimplementedInstruction(_simd_i64x2_extmul_low_i32x4_u) |
| unimplementedInstruction(_simd_i64x2_extmul_high_i32x4_u) |
| |
| # 0xFD 0xE0 0x01 - 0xFD 0xEB 0x01: f32x4 operations |
| |
| unimplementedInstruction(_simd_f32x4_abs) |
| unimplementedInstruction(_simd_f32x4_neg) |
reservedOpcode(0xFDE201)
| unimplementedInstruction(_simd_f32x4_sqrt) |
| unimplementedInstruction(_simd_f32x4_add) |
| unimplementedInstruction(_simd_f32x4_sub) |
| unimplementedInstruction(_simd_f32x4_mul) |
| unimplementedInstruction(_simd_f32x4_div) |
| unimplementedInstruction(_simd_f32x4_min) |
| unimplementedInstruction(_simd_f32x4_max) |
| unimplementedInstruction(_simd_f32x4_pmin) |
| unimplementedInstruction(_simd_f32x4_pmax) |
| |
| # 0xFD 0xEC 0x01 - 0xFD 0xF7 0x01: f64x2 operations |
| |
| unimplementedInstruction(_simd_f64x2_abs) |
| unimplementedInstruction(_simd_f64x2_neg) |
| reservedOpcode(0xFDEE01) |
| unimplementedInstruction(_simd_f64x2_sqrt) |
| unimplementedInstruction(_simd_f64x2_add) |
| unimplementedInstruction(_simd_f64x2_sub) |
| unimplementedInstruction(_simd_f64x2_mul) |
| unimplementedInstruction(_simd_f64x2_div) |
| unimplementedInstruction(_simd_f64x2_min) |
| unimplementedInstruction(_simd_f64x2_max) |
| unimplementedInstruction(_simd_f64x2_pmin) |
| unimplementedInstruction(_simd_f64x2_pmax) |
| |
| # 0xFD 0xF8 0x01 - 0xFD 0xFF 0x01: trunc/convert |
| |
| unimplementedInstruction(_simd_i32x4_trunc_sat_f32x4_s) |
| unimplementedInstruction(_simd_i32x4_trunc_sat_f32x4_u) |
| unimplementedInstruction(_simd_f32x4_convert_i32x4_s) |
| unimplementedInstruction(_simd_f32x4_convert_i32x4_u) |
| unimplementedInstruction(_simd_i32x4_trunc_sat_f64x2_s_zero) |
| unimplementedInstruction(_simd_i32x4_trunc_sat_f64x2_u_zero) |
| unimplementedInstruction(_simd_f64x2_convert_low_i32x4_s) |
| unimplementedInstruction(_simd_f64x2_convert_low_i32x4_u) |
| |
| ######################### |
| ## Atomic instructions ## |
| ######################### |
| |
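# Unlike plain memory accesses, WASM atomics trap on unaligned addresses, so
# the bounds-check macros below additionally test address & (size - 1).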
| macro ipintCheckMemoryBoundWithAlignmentCheck(mem, scratch, size) |
| leap size - 1[mem], scratch |
| bpb scratch, boundsCheckingSize, .continuation |
| .throw: |
| ipintException(OutOfBoundsMemoryAccess) |
| .continuation: |
| btpnz mem, (size - 1), .throw |
| end |
| |
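# 1-byte accesses are always aligned, so only the plain bounds check is needed.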
| macro ipintCheckMemoryBoundWithAlignmentCheck1(mem, scratch) |
| ipintCheckMemoryBound(mem, scratch, 1) |
| end |
| |
| macro ipintCheckMemoryBoundWithAlignmentCheck2(mem, scratch) |
| ipintCheckMemoryBoundWithAlignmentCheck(mem, scratch, 2) |
| end |
| |
| macro ipintCheckMemoryBoundWithAlignmentCheck4(mem, scratch) |
| ipintCheckMemoryBoundWithAlignmentCheck(mem, scratch, 4) |
| end |
| |
| macro ipintCheckMemoryBoundWithAlignmentCheck8(mem, scratch) |
| ipintCheckMemoryBoundWithAlignmentCheck(mem, scratch, 8) |
| end |
| |
| instructionLabel(_memory_atomic_notify) |
| # pop count |
| popInt32(a3, t0) |
| # pop pointer |
| popInt32(a1, t0) |
| # load offset |
| loadi 1[PM, MC], a2 |
| |
| move wasmInstance, a0 |
| operationCall(macro() cCall4(_ipint_extern_memory_atomic_notify) end) |
| bilt r0, 0, .atomic_notify_throw |
| |
| pushInt32(r0) |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| .atomic_notify_throw: |
| ipintException(OutOfBoundsMemoryAccess) |
| |
| instructionLabel(_memory_atomic_wait32) |
# pop timeout (an i64 in the spec)
popInt64(a3, t0)
| # pop value |
| popInt32(a2, t0) |
| # pop pointer |
| popInt32(a1, t0) |
| # load offset |
| loadi 1[PM, MC], t0 |
| # merge them since the slow path takes the combined pointer + offset. |
| addq t0, a1 |
| |
| move wasmInstance, a0 |
| operationCall(macro() cCall4(_ipint_extern_memory_atomic_wait32) end) |
| bilt r0, 0, .atomic_wait32_throw |
| |
| pushInt32(r0) |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| .atomic_wait32_throw: |
| ipintException(OutOfBoundsMemoryAccess) |
| |
| instructionLabel(_memory_atomic_wait64) |
# pop timeout (an i64 in the spec)
popInt64(a3, t0)
| # pop value |
| popInt64(a2, t0) |
| # pop pointer |
| popInt32(a1, t0) |
| # load offset |
| loadi 1[PM, MC], t0 |
| # merge them since the slow path takes the combined pointer + offset. |
| addq t0, a1 |
| |
| move wasmInstance, a0 |
| operationCall(macro() cCall4(_ipint_extern_memory_atomic_wait64) end) |
| bilt r0, 0, .atomic_wait64_throw |
| |
| pushInt32(r0) |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| |
| .atomic_wait64_throw: |
| ipintException(OutOfBoundsMemoryAccess) |
| |
| instructionLabel(_atomic_fence) |
| fence |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(1) |
| nextIPIntInstruction() |
| |
| reservedOpcode(_atomic_0x4) |
| reservedOpcode(_atomic_0x5) |
| reservedOpcode(_atomic_0x6) |
| reservedOpcode(_atomic_0x7) |
| reservedOpcode(_atomic_0x8) |
| reservedOpcode(_atomic_0x9) |
| reservedOpcode(_atomic_0xa) |
| reservedOpcode(_atomic_0xb) |
| reservedOpcode(_atomic_0xc) |
| reservedOpcode(_atomic_0xd) |
| reservedOpcode(_atomic_0xe) |
| reservedOpcode(_atomic_0xf) |
| |
| macro atomicLoadOp(boundsAndAlignmentCheck, loadAndPush) |
| # pop index |
| popInt32(t0, t2) |
| # load offset |
| loadi 1[PM, MC], t2 |
| addq t2, t0 |
| boundsAndAlignmentCheck(t0, t3) |
| addq memoryBase, t0 |
| loadAndPush(t0, t2) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| end |
| |
| instructionLabel(_i32_atomic_load) |
| atomicLoadOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, scratch) |
| if ARM64 or ARM64E or X86_64 |
| atomicloadi [mem], scratch |
| else |
| error |
| end |
| pushInt32(scratch) |
| end) |
| |
| instructionLabel(_i64_atomic_load) |
| atomicLoadOp(ipintCheckMemoryBoundWithAlignmentCheck8, macro(mem, scratch) |
| if ARM64 or ARM64E or X86_64 |
| atomicloadq [mem], scratch |
| else |
| error |
| end |
| pushInt64(scratch) |
| end) |
| |
| instructionLabel(_i32_atomic_load8_u) |
| atomicLoadOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, scratch) |
| if ARM64 or ARM64E or X86_64 |
| atomicloadb [mem], scratch |
| else |
| error |
| end |
| pushInt32(scratch) |
| end) |
| |
| instructionLabel(_i32_atomic_load16_u) |
| atomicLoadOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, scratch) |
| if ARM64 or ARM64E or X86_64 |
| atomicloadh [mem], scratch |
| else |
| error |
| end |
| pushInt32(scratch) |
| end) |
| |
| instructionLabel(_i64_atomic_load8_u) |
| atomicLoadOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, scratch) |
| if ARM64 or ARM64E or X86_64 |
| atomicloadb [mem], scratch |
| else |
| error |
| end |
| pushInt64(scratch) |
| end) |
| |
| instructionLabel(_i64_atomic_load16_u) |
| atomicLoadOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, scratch) |
| if ARM64 or ARM64E or X86_64 |
| atomicloadh [mem], scratch |
| else |
| error |
| end |
| pushInt64(scratch) |
| end) |
| |
| instructionLabel(_i64_atomic_load32_u) |
| atomicLoadOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, scratch) |
| if ARM64 or ARM64E or X86_64 |
| atomicloadi [mem], scratch |
| else |
| error |
| end |
| pushInt64(scratch) |
| end) |
| |
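# The weakCASLoop* macros implement atomic read-modify-write for targets
# without a suitable single instruction: x86_64 loops on a weak compare-and-
# swap (a failed cmpxchg refreshes the expected value in scratch1AndOldValue),
# while plain ARM64 uses a load-linked / store-conditional loop. Roughly
# (a sketch):
#   do { old = *mem; new = fn(value, old); } while (!CAS(mem, old, new));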
| macro weakCASLoopByte(mem, value, scratch1AndOldValue, scratch2, fn) |
if X86_64
loadb [mem], scratch1AndOldValue
.loop:
fn(value, scratch1AndOldValue, scratch2)
batomicweakcasb scratch1AndOldValue, scratch2, [mem], .loop
| else |
| .loop: |
| loadlinkacqb [mem], scratch1AndOldValue |
| fn(value, scratch1AndOldValue, scratch2) |
| storecondrelb ws2, scratch2, [mem] |
| bineq ws2, 0, .loop |
| end |
| end |
| |
| macro weakCASLoopHalf(mem, value, scratch1AndOldValue, scratch2, fn) |
if X86_64
loadh [mem], scratch1AndOldValue
.loop:
fn(value, scratch1AndOldValue, scratch2)
batomicweakcash scratch1AndOldValue, scratch2, [mem], .loop
| else |
| .loop: |
| loadlinkacqh [mem], scratch1AndOldValue |
| fn(value, scratch1AndOldValue, scratch2) |
| storecondrelh ws2, scratch2, [mem] |
| bineq ws2, 0, .loop |
| end |
| end |
| |
| macro weakCASLoopInt(mem, value, scratch1AndOldValue, scratch2, fn) |
if X86_64
loadi [mem], scratch1AndOldValue
.loop:
fn(value, scratch1AndOldValue, scratch2)
batomicweakcasi scratch1AndOldValue, scratch2, [mem], .loop
| else |
| .loop: |
| loadlinkacqi [mem], scratch1AndOldValue |
| fn(value, scratch1AndOldValue, scratch2) |
| storecondreli ws2, scratch2, [mem] |
| bineq ws2, 0, .loop |
| end |
| end |
| |
| macro weakCASLoopQuad(mem, value, scratch1AndOldValue, scratch2, fn) |
if X86_64
loadq [mem], scratch1AndOldValue
.loop:
fn(value, scratch1AndOldValue, scratch2)
batomicweakcasq scratch1AndOldValue, scratch2, [mem], .loop
| else |
| .loop: |
| loadlinkacqq [mem], scratch1AndOldValue |
| fn(value, scratch1AndOldValue, scratch2) |
| storecondrelq ws2, scratch2, [mem] |
| bineq ws2, 0, .loop |
| end |
| end |
| |
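# Atomic stores use an exchange (discarding the old value) rather than a plain
# store so the access gets sequentially consistent ordering; plain ARM64 falls
# back to the CAS loops above with an fn that simply returns the new value.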
| macro atomicStoreOp(boundsAndAlignmentCheck, popAndStore) |
| # pop value |
| popInt64(t1, t0) |
| # pop index |
| popInt32(t2, t0) |
| # load offset |
| loadi 1[PM, MC], t0 |
| addq t0, t2 |
| boundsAndAlignmentCheck(t2, t3) |
| addq memoryBase, t2 |
| popAndStore(t2, t1, t0, t3) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| end |
| |
| instructionLabel(_i32_atomic_store) |
| atomicStoreOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgi value, [mem], value |
| elsif X86_64 |
| atomicxchgi value, [mem] |
| elsif ARM64 |
| weakCASLoopInt(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| move value, newValue |
| end) |
| else |
| error |
| end |
| end) |
| |
| instructionLabel(_i64_atomic_store) |
| atomicStoreOp(ipintCheckMemoryBoundWithAlignmentCheck8, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgq value, [mem], value |
| elsif X86_64 |
| atomicxchgq value, [mem] |
| elsif ARM64 |
| weakCASLoopQuad(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| move value, newValue |
| end) |
| else |
| error |
| end |
| end) |
| |
| instructionLabel(_i32_atomic_store8_u) |
| atomicStoreOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgb value, [mem], value |
| elsif X86_64 |
| atomicxchgb value, [mem] |
| elsif ARM64 |
| weakCASLoopByte(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| move value, newValue |
| end) |
| else |
| error |
| end |
| end) |
| |
| instructionLabel(_i32_atomic_store16_u) |
| atomicStoreOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgh value, [mem], value |
| elsif X86_64 |
| atomicxchgh value, [mem] |
| elsif ARM64 |
| weakCASLoopHalf(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| move value, newValue |
| end) |
| else |
| error |
| end |
| end) |
| |
| instructionLabel(_i64_atomic_store8_u) |
| atomicStoreOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgb value, [mem], value |
| elsif X86_64 |
| atomicxchgb value, [mem] |
| elsif ARM64 |
| weakCASLoopByte(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| move value, newValue |
| end) |
| else |
| error |
| end |
| end) |
| |
| instructionLabel(_i64_atomic_store16_u) |
| atomicStoreOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgh value, [mem], value |
| elsif X86_64 |
| atomicxchgh value, [mem] |
| elsif ARM64 |
| weakCASLoopHalf(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| move value, newValue |
| end) |
| else |
| error |
| end |
| end) |
| |
| instructionLabel(_i64_atomic_store32_u) |
| atomicStoreOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgi value, [mem], value |
| elsif X86_64 |
| atomicxchgi value, [mem] |
| elsif ARM64 |
| weakCASLoopInt(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| move value, newValue |
| end) |
| else |
| error |
| end |
| end) |
| |
| |
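# RMW ops push the value memory held *before* the operation. ARM64E uses a
# single atomic (e.g. atomicxchgaddi, which returns the old value); x86_64's
# lock xadd leaves the old value in the source register, hence the extra move;
# plain ARM64 uses the CAS loops, whose scratch1 ends up holding the old value.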
| macro atomicRMWOp(boundsAndAlignmentCheck, rmw) |
| # pop value |
| popInt64(t1, t0) |
| # pop index |
| popInt32(t2, t0) |
| # load offset |
| loadi 1[PM, MC], t0 |
| addq t0, t2 |
| boundsAndAlignmentCheck(t2, t3) |
| addq memoryBase, t2 |
| rmw(t2, t1, t0, t3) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| end |
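
# Same shape as atomicStoreOp, except the rmw callback also produces the old
# value at the address, which each instruction body pushes back onto the stack
# (that is the wasm atomic.rmw.* result). A C sketch, names illustrative:
#
#     uint64_t value = popInt64();
#     uint64_t addr  = (uint32_t)popInt32() + offsetFromMetadata;
#     boundsAndAlignmentCheck(addr);
#     uint64_t old = atomicRMW(memoryBase + addr, value); // add/sub/and/or/xor/xchg
#     push(old);                                          // done inside rmw()
#     pc += metadata[0]; mc += 5;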
| |
| instructionLabel(_i32_atomic_rmw_add) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgaddi value, [mem], scratch1 |
| elsif X86_64 |
| atomicxchgaddi value, [mem] |
| move value, scratch1 |
| elsif ARM64 |
| weakCASLoopInt(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| addi value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| pushInt32(scratch1) |
| end) |
| |
| instructionLabel(_i64_atomic_rmw_add) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck8, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgaddq value, [mem], scratch1 |
| elsif X86_64 |
| atomicxchgaddq value, [mem] |
| move value, scratch1 |
| elsif ARM64 |
| weakCASLoopQuad(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| addq value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| pushInt64(scratch1) |
| end) |
| |
| instructionLabel(_i32_atomic_rmw8_add_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgaddb value, [mem], scratch1 |
| elsif X86_64 |
| atomicxchgaddb value, [mem] |
| move value, scratch1 |
| elsif ARM64 |
| weakCASLoopByte(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| addi value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| pushInt32(scratch1) |
| end) |
| |
| instructionLabel(_i32_atomic_rmw16_add_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgaddh value, [mem], scratch1 |
| elsif X86_64 |
| atomicxchgaddh value, [mem] |
| move value, scratch1 |
| elsif ARM64 |
| weakCASLoopHalf(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| addi value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| pushInt32(scratch1) |
| end) |
| |
| instructionLabel(_i64_atomic_rmw8_add_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgaddb value, [mem], scratch1 |
| elsif X86_64 |
| atomicxchgaddb value, [mem] |
| move value, scratch1 |
| elsif ARM64 |
| weakCASLoopByte(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| addi value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| pushInt64(scratch1) |
| end) |
| |
| instructionLabel(_i64_atomic_rmw16_add_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgaddh value, [mem], scratch1 |
| elsif X86_64 |
| atomicxchgaddh value, [mem] |
| move value, scratch1 |
| elsif ARM64 |
| weakCASLoopHalf(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| addi value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| pushInt64(scratch1) |
| end) |
| |
| instructionLabel(_i64_atomic_rmw32_add_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgaddi value, [mem], scratch1 |
| elsif X86_64 |
| atomicxchgaddi value, [mem] |
| move value, scratch1 |
| elsif ARM64 |
| weakCASLoopInt(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| addi value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| pushInt64(scratch1) |
| end) |
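
# Note: neither x86_64 nor ARM64E has a native atomic subtract, so rmw_sub is
# implemented as an atomic add of the negated operand; the weak-CAS fallback
# computes the subtraction directly.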
| |
| instructionLabel(_i32_atomic_rmw_sub) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| negi value |
| atomicxchgaddi value, [mem], scratch1 |
| elsif X86_64 |
| negi value |
| atomicxchgaddi value, [mem] |
| move value, scratch1 |
| elsif ARM64 |
| weakCASLoopInt(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| subi oldValue, value, newValue |
| end) |
| else |
| error |
| end |
| pushInt32(scratch1) |
| end) |
| |
| |
| instructionLabel(_i64_atomic_rmw_sub) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck8, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| negq value |
| atomicxchgaddq value, [mem], scratch1 |
| elsif X86_64 |
| negq value |
| atomicxchgaddq value, [mem] |
| move value, scratch1 |
| elsif ARM64 |
| weakCASLoopQuad(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| subq oldValue, value, newValue |
| end) |
| else |
| error |
| end |
| pushInt64(scratch1) |
| end) |
| |
| instructionLabel(_i32_atomic_rmw8_sub_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| negi value |
| atomicxchgaddb value, [mem], scratch1 |
| elsif X86_64 |
| negi value |
| atomicxchgaddb value, [mem] |
| move value, scratch1 |
| elsif ARM64 |
| weakCASLoopByte(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| subi oldValue, value, newValue |
| end) |
| else |
| error |
| end |
| pushInt32(scratch1) |
| end) |
| |
| instructionLabel(_i32_atomic_rmw16_sub_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| negi value |
| atomicxchgaddh value, [mem], scratch1 |
| elsif X86_64 |
| negi value |
| atomicxchgaddh value, [mem] |
| move value, scratch1 |
| elsif ARM64 |
| weakCASLoopHalf(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| subi oldValue, value, newValue |
| end) |
| else |
| error |
| end |
| pushInt32(scratch1) |
| end) |
| |
| instructionLabel(_i64_atomic_rmw8_sub_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| negq value |
| atomicxchgaddb value, [mem], scratch1 |
| elsif X86_64 |
| negq value |
| atomicxchgaddb value, [mem] |
| move value, scratch1 |
| elsif ARM64 |
| weakCASLoopByte(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| subi oldValue, value, newValue |
| end) |
| else |
| error |
| end |
| pushInt64(scratch1) |
| end) |
| |
| instructionLabel(_i64_atomic_rmw16_sub_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| negq value |
| atomicxchgaddh value, [mem], scratch1 |
| elsif X86_64 |
| negq value |
| atomicxchgaddh value, [mem] |
| move value, scratch1 |
| elsif ARM64 |
| weakCASLoopHalf(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| subi oldValue, value, newValue |
| end) |
| else |
| error |
| end |
| pushInt64(scratch1) |
| end) |
| |
| instructionLabel(_i64_atomic_rmw32_sub_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| negq value |
| atomicxchgaddi value, [mem], scratch1 |
| elsif X86_64 |
| negq value |
| atomicxchgaddi value, [mem] |
| move value, scratch1 |
| elsif ARM64 |
| weakCASLoopInt(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| subi oldValue, value, newValue |
| end) |
| else |
| error |
| end |
| pushInt64(scratch1) |
| end) |
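
# Note: ARM64E (LSE) has no native atomic AND, but it has an atomic bit-clear
# (mem &= ~src), so rmw_and is implemented as a clear of the complemented
# operand: x & v == x & ~(~v).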
| |
| instructionLabel(_i32_atomic_rmw_and) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| noti value |
| atomicxchgcleari value, [mem], scratch1 |
| elsif X86_64 |
| weakCASLoopInt(mem, value, scratch1, scratch2, macro (value, dst) |
| andq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopInt(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| andi value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| pushInt32(scratch1) |
| end) |
| |
| instructionLabel(_i64_atomic_rmw_and) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck8, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| notq value |
| atomicxchgclearq value, [mem], scratch1 |
| elsif X86_64 |
| weakCASLoopQuad(mem, value, scratch1, scratch2, macro (value, dst) |
| andq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopQuad(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| andq value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| pushInt64(scratch1) |
| end) |
| |
| instructionLabel(_i32_atomic_rmw8_and_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| noti value |
| atomicxchgclearb value, [mem], scratch1 |
| elsif X86_64 |
| weakCASLoopByte(mem, value, scratch1, scratch2, macro (value, dst) |
| andq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopByte(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| andi value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| pushInt32(scratch1) |
| end) |
| |
| instructionLabel(_i32_atomic_rmw16_and_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| noti value |
| atomicxchgclearh value, [mem], scratch1 |
| elsif X86_64 |
| weakCASLoopHalf(mem, value, scratch1, scratch2, macro (value, dst) |
| andq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopHalf(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| andi value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| pushInt32(scratch1) |
| end) |
| |
| instructionLabel(_i64_atomic_rmw8_and_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| notq value |
| atomicxchgclearb value, [mem], scratch1 |
| elsif X86_64 |
| weakCASLoopByte(mem, value, scratch1, scratch2, macro (value, dst) |
| andq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopByte(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| andi value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| pushInt64(scratch1) |
| end) |
| |
| instructionLabel(_i64_atomic_rmw16_and_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| notq value |
| atomicxchgclearh value, [mem], scratch1 |
| elsif X86_64 |
| weakCASLoopHalf(mem, value, scratch1, scratch2, macro (value, dst) |
| andq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopHalf(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| andi value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| pushInt64(scratch1) |
| end) |
| |
| instructionLabel(_i64_atomic_rmw32_and_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| notq value |
| atomicxchgcleari value, [mem], scratch1 |
| elsif X86_64 |
| weakCASLoopInt(mem, value, scratch1, scratch2, macro (value, dst) |
| andq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopInt(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| andi value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| pushInt64(scratch1) |
| end) |
| |
| instructionLabel(_i32_atomic_rmw_or) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgori value, [mem], scratch1 |
| elsif X86_64 |
| weakCASLoopInt(mem, value, scratch1, scratch2, macro (value, dst) |
| ori value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopInt(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| ori value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| pushInt32(scratch1) |
| end) |
| |
| instructionLabel(_i64_atomic_rmw_or) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck8, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgorq value, [mem], scratch1 |
| elsif X86_64 |
| weakCASLoopQuad(mem, value, scratch1, scratch2, macro (value, dst) |
| orq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopQuad(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| orq value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| pushInt64(scratch1) |
| end) |
| |
| instructionLabel(_i32_atomic_rmw8_or_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgorb value, [mem], scratch1 |
| elsif X86_64 |
| weakCASLoopByte(mem, value, scratch1, scratch2, macro (value, dst) |
| orq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopByte(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| ori value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| pushInt32(scratch1) |
| end) |
| |
| instructionLabel(_i32_atomic_rmw16_or_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgorh value, [mem], scratch1 |
| elsif X86_64 |
| weakCASLoopHalf(mem, value, scratch1, scratch2, macro (value, dst) |
| orq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopHalf(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| ori value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| pushInt32(scratch1) |
| end) |
| |
| instructionLabel(_i64_atomic_rmw8_or_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgorb value, [mem], scratch1 |
| elsif X86_64 |
| weakCASLoopByte(mem, value, scratch1, scratch2, macro (value, dst) |
| orq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopByte(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| ori value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| pushInt64(scratch1) |
| end) |
| |
| instructionLabel(_i64_atomic_rmw16_or_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgorh value, [mem], scratch1 |
| elsif X86_64 |
| weakCASLoopHalf(mem, value, scratch1, scratch2, macro (value, dst) |
| orq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopHalf(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| ori value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| pushInt64(scratch1) |
| end) |
| |
| instructionLabel(_i64_atomic_rmw32_or_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgori value, [mem], scratch1 |
| elsif X86_64 |
| weakCASLoopInt(mem, value, scratch1, scratch2, macro (value, dst) |
| orq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopInt(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| ori value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| pushInt64(scratch1) |
| end) |
| |
| instructionLabel(_i32_atomic_rmw_xor) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgxori value, [mem], scratch1 |
| elsif X86_64 |
| weakCASLoopInt(mem, value, scratch1, scratch2, macro (value, dst) |
| xorq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopInt(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| xori value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| pushInt32(scratch1) |
| end) |
| |
| instructionLabel(_i64_atomic_rmw_xor) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck8, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgxorq value, [mem], scratch1 |
| elsif X86_64 |
| weakCASLoopQuad(mem, value, scratch1, scratch2, macro (value, dst) |
| xorq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopQuad(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| xorq value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| pushInt64(scratch1) |
| end) |
| |
| |
| instructionLabel(_i32_atomic_rmw8_xor_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgxorb value, [mem], scratch1 |
| elsif X86_64 |
| weakCASLoopByte(mem, value, scratch1, scratch2, macro (value, dst) |
| xorq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopByte(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| xori value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| pushInt32(scratch1) |
| end) |
| |
| instructionLabel(_i32_atomic_rmw16_xor_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgxorh value, [mem], scratch1 |
| elsif X86_64 |
| weakCASLoopHalf(mem, value, scratch1, scratch2, macro (value, dst) |
| xorq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopHalf(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| xori value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| pushInt32(scratch1) |
| end) |
| |
| instructionLabel(_i64_atomic_rmw8_xor_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgxorb value, [mem], scratch1 |
| elsif X86_64 |
| weakCASLoopByte(mem, value, scratch1, scratch2, macro (value, dst) |
| xorq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopByte(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| xori value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| pushInt64(scratch1) |
| end) |
| |
| instructionLabel(_i64_atomic_rmw16_xor_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgxorh value, [mem], scratch1 |
| elsif X86_64 |
| weakCASLoopHalf(mem, value, scratch1, scratch2, macro (value, dst) |
| xorq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopHalf(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| xori value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| pushInt64(scratch1) |
| end) |
| |
| instructionLabel(_i64_atomic_rmw32_xor_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgxori value, [mem], scratch1 |
| elsif X86_64 |
| weakCASLoopInt(mem, value, scratch1, scratch2, macro (value, dst) |
| xorq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopInt(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| xori value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| pushInt64(scratch1) |
| end) |
| |
| instructionLabel(_i32_atomic_rmw_xchg) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgi value, [mem], scratch1 |
| elsif X86_64 |
| weakCASLoopInt(mem, value, scratch1, scratch2, macro (value, dst) |
| move value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopInt(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| move value, newValue |
| end) |
| else |
| error |
| end |
| pushInt32(scratch1) |
| end) |
| |
| instructionLabel(_i64_atomic_rmw_xchg) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck8, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgq value, [mem], scratch1 |
| elsif X86_64 |
| weakCASLoopQuad(mem, value, scratch1, scratch2, macro (value, dst) |
| move value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopQuad(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| move value, newValue |
| end) |
| else |
| error |
| end |
| pushInt64(scratch1) |
| end) |
| |
| instructionLabel(_i32_atomic_rmw8_xchg_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgb value, [mem], scratch1 |
| elsif X86_64 |
| weakCASLoopByte(mem, value, scratch1, scratch2, macro (value, dst) |
| move value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopByte(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| move value, newValue |
| end) |
| else |
| error |
| end |
| pushInt32(scratch1) |
| end) |
| |
| instructionLabel(_i32_atomic_rmw16_xchg_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgh value, [mem], scratch1 |
| elsif X86_64 |
| weakCASLoopHalf(mem, value, scratch1, scratch2, macro (value, dst) |
| move value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopHalf(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| move value, newValue |
| end) |
| else |
| error |
| end |
| pushInt32(scratch1) |
| end) |
| |
| instructionLabel(_i64_atomic_rmw8_xchg_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgb value, [mem], scratch1 |
| elsif X86_64 |
| weakCASLoopByte(mem, value, scratch1, scratch2, macro (value, dst) |
| move value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopByte(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| move value, newValue |
| end) |
| else |
| error |
| end |
| pushInt64(scratch1) |
| end) |
| |
| instructionLabel(_i64_atomic_rmw16_xchg_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgh value, [mem], scratch1 |
| elsif X86_64 |
| weakCASLoopHalf(mem, value, scratch1, scratch2, macro (value, dst) |
| move value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopHalf(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| move value, newValue |
| end) |
| else |
| error |
| end |
| pushInt64(scratch1) |
| end) |
| |
| instructionLabel(_i64_atomic_rmw32_xchg_u) |
| atomicRMWOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, scratch1, scratch2) |
| if ARM64E |
| atomicxchgi value, [mem], scratch1 |
| elsif X86_64 |
| weakCASLoopInt(mem, value, scratch1, scratch2, macro (value, dst) |
| move value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopInt(mem, value, scratch1, scratch2, macro(value, oldValue, newValue) |
| move value, newValue |
| end) |
| else |
| error |
| end |
| pushInt64(scratch1) |
| end) |
| |
| macro atomicCmpxchgOp(boundsAndAlignmentCheck, cmpxchg) |
| # pop value |
| popInt64(t1, t2) |
| # pop expected |
| popInt64(t0, t2) |
| # pop index |
| popInt32(t3, t2) |
| # load offset |
| loadi 1[PM, MC], t2 |
| addq t2, t3 |
| boundsAndAlignmentCheck(t3, t2) |
| addq memoryBase, t3 |
| cmpxchg(t3, t1, t0, t2) |
| |
| loadb [PM, MC], t0 |
| advancePCByReg(t0) |
| advanceMC(5) |
| nextIPIntInstruction() |
| end |
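
# A C sketch of the cmpxchg flow (names illustrative). The wasm result is the
# value observed at the address, whether or not the exchange happened; the
# narrow _u variants first mask `expected` down to the access width, since the
# comparison happens at that width.
#
#     uint64_t value    = popInt64();
#     uint64_t expected = popInt64();
#     uint64_t addr     = (uint32_t)popInt32() + offsetFromMetadata;
#     boundsAndAlignmentCheck(addr);
#     uint64_t old = compareAndSwap(memoryBase + addr, expected, value);
#     push(old);   // done inside cmpxchg()
#     pc += metadata[0]; mc += 5;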
| |
| macro weakCASExchangeByte(mem, value, expected, scratch) |
| if ARM64 |
| .loop: |
| loadlinkacqb [mem], scratch |
| bqneq expected, scratch, .fail |
| storecondrelb scratch, value, [mem] |
| bieq scratch, 0, .done |
| jmp .loop |
| .fail: |
| emit "clrex" |
| move scratch, expected |
| .done: |
| else |
| error |
| end |
| end |
| |
| macro weakCASExchangeHalf(mem, value, expected, scratch) |
| if ARM64 |
| .loop: |
| loadlinkacqh [mem], scratch |
| bqneq expected, scratch, .fail |
| storecondrelh scratch, value, [mem] |
| bieq scratch, 0, .done |
| jmp .loop |
| .fail: |
| emit "clrex" |
| move scratch, expected |
| .done: |
| else |
| error |
| end |
| end |
| |
| macro weakCASExchangeInt(mem, value, expected, scratch) |
| if ARM64 |
| .loop: |
| loadlinkacqi [mem], scratch |
| bqneq expected, scratch, .fail |
| storecondreli scratch, value, [mem] |
| bieq scratch, 0, .done |
| jmp .loop |
| .fail: |
| emit "clrex" |
| move scratch, expected |
| .done: |
| else |
| error |
| end |
| end |
| |
| macro weakCASExchangeQuad(mem, value, expected, scratch) |
| if ARM64 |
| .loop: |
| loadlinkacqq [mem], scratch |
| bqneq expected, scratch, .fail |
| storecondrelq scratch, value, [mem] |
| bieq scratch, 0, .done |
| jmp .loop |
| .fail: |
| emit "clrex" |
| move scratch, expected |
| .done: |
| else |
| error |
| end |
| end |
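
# The weakCASExchange* macros implement compare-and-swap as a load-linked /
# store-conditional retry loop on plain ARM64 (no LSE). On a value mismatch we
# clrex to drop the exclusive monitor and return the observed value in
# `expected`; on a spurious store-conditional failure we just retry. Roughly:
#
#     for (;;) {
#         uint64_t old = loadLinkAcquire(mem);
#         if (old != expected) { clearExclusive(); expected = old; break; }
#         if (storeConditionalRelease(mem, value) == 0)
#             break;  // success; expected already equals old
#     }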
| |
| instructionLabel(_i32_atomic_rmw_cmpxchg) |
| atomicCmpxchgOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, expected, scratch2) |
| if ARM64E or X86_64 |
| atomicweakcasi expected, value, [mem] |
| elsif ARM64 |
| weakCASExchangeInt(mem, value, expected, scratch2) |
| else |
| error |
| end |
| pushInt32(expected) |
| end) |
| |
| instructionLabel(_i64_atomic_rmw_cmpxchg) |
| atomicCmpxchgOp(ipintCheckMemoryBoundWithAlignmentCheck8, macro(mem, value, expected, scratch2) |
| if ARM64E or X86_64 |
| atomicweakcasq expected, value, [mem] |
| elsif ARM64 |
| weakCASExchangeQuad(mem, value, expected, scratch2) |
| else |
| error |
| end |
| pushInt64(expected) |
| end) |
| |
| instructionLabel(_i32_atomic_rmw8_cmpxchg_u) |
| atomicCmpxchgOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, expected, scratch2) |
| andq 0xff, expected |
| if ARM64E or X86_64 |
| atomicweakcasb expected, value, [mem] |
| elsif ARM64 |
| weakCASExchangeByte(mem, value, expected, scratch2) |
| else |
| error |
| end |
| pushInt32(expected) |
| end) |
| |
| instructionLabel(_i32_atomic_rmw16_cmpxchg_u) |
| atomicCmpxchgOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, expected, scratch2) |
| andq 0xffff, expected |
| if ARM64E or X86_64 |
| atomicweakcash expected, value, [mem] |
| elsif ARM64 |
| weakCASExchangeHalf(mem, value, expected, scratch2) |
| else |
| error |
| end |
| pushInt32(expected) |
| end) |
| |
| instructionLabel(_i64_atomic_rmw8_cmpxchg_u) |
| atomicCmpxchgOp(ipintCheckMemoryBoundWithAlignmentCheck1, macro(mem, value, expected, scratch2) |
| andq 0xff, expected |
| if ARM64E or X86_64 |
| atomicweakcasb expected, value, [mem] |
| elsif ARM64 |
| weakCASExchangeByte(mem, value, expected, scratch2) |
| else |
| error |
| end |
| pushInt64(expected) |
| end) |
| |
| instructionLabel(_i64_atomic_rmw16_cmpxchg_u) |
| atomicCmpxchgOp(ipintCheckMemoryBoundWithAlignmentCheck2, macro(mem, value, expected, scratch2) |
| andq 0xffff, expected |
| if ARM64E or X86_64 |
| atomicweakcash expected, value, [mem] |
| elsif ARM64 |
| weakCASExchangeHalf(mem, value, expected, scratch2) |
| else |
| error |
| end |
| pushInt64(expected) |
| end) |
| |
| instructionLabel(_i64_atomic_rmw32_cmpxchg_u) |
| atomicCmpxchgOp(ipintCheckMemoryBoundWithAlignmentCheck4, macro(mem, value, expected, scratch2) |
| andq 0xffffffff, expected |
| if ARM64E or X86_64 |
| atomicweakcasi expected, value, [mem] |
| elsif ARM64 |
| weakCASExchangeInt(mem, value, expected, scratch2) |
| else |
| error |
| end |
| pushInt64(expected) |
| end) |
| |
| ################################## |
| ## "Out of line" logic for call ## |
| ################################## |
| |
| # FIXME: switch offlineasm unalignedglobal to take alignment and optionally pad with breakpoint instructions (rdar://113594783) |
| macro mintAlign() |
| emit ".balign 64" |
| end |
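
# Values on the interpreter stack occupy 16-byte slots; only the low 8 bytes
# carry the value, which is why these pop 8 bytes but advance by 16.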
| |
| macro mintPop(reg) |
| loadq [ws1], reg |
| addq 16, ws1 |
| end |
| |
| macro mintPopF(reg) |
| loadd [ws1], reg |
| addq 16, ws1 |
| end |
| |
| macro mintArgDispatch() |
| loadb [PM], ws0 |
| addq 1, PM |
| andq 15, ws0 |
| lshiftq 6, ws0 |
| if ARM64 or ARM64E |
| pcrtoaddr _mint_a0, csr4 |
| addq ws0, csr4 |
| # csr4 = x23 |
| emit "br x23" |
| elsif X86_64 |
| leap (_mint_a0), csr4 |
| addq ws0, csr4 |
| # csr4 = r13 |
| emit "jmp *(%r13)" |
| end |
| end |
| |
| macro mintRetDispatch() |
| loadb [PM], ws0 |
| addq 1, PM |
| bilt ws0, 14, .safe |
| break |
| .safe: |
| lshiftq 6, ws0 |
| if ARM64 or ARM64E |
| pcrtoaddr _mint_r0, csr4 |
| addq ws0, csr4 |
| # csr4 = x23 |
| emit "br x23" |
| elsif X86_64 |
| leap (_mint_r0), csr4 |
| addq ws0, csr4 |
| # csr4 = r13 |
| emit "jmp *(%r13)" |
| end |
| end |
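
# Both dispatchers are computed gotos: every mint handler below is aligned to
# a 64-byte boundary, so the jump target is just base + opcode * 64. A C-like
# sketch of mintArgDispatch():
#
#     unsigned op = *pm++;                              // one mint opcode byte
#     goto *((char*)&&_mint_a0 + ((op & 15) << 6));
#
# mintRetDispatch() does the same from _mint_r0, but bounds-checks the opcode
# (must be < 14) instead of masking it.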
| |
| _ipint_call_impl: |
| # 0 - 3: function index |
| # 4 - 7: PC post call |
| # 8 - 9: length of mint bytecode |
| # 10 - : mint bytecode |
| |
| # function index |
| loadi 1[PM, MC], t0 |
| |
| loadb [PM, MC], t1 |
| advancePCByReg(t1) |
| advanceMC(5) |
| |
| # Get function data |
| move t0, a1 |
| move wasmInstance, a0 |
| operationCall(macro() cCall2(_ipint_extern_call) end) |
| |
| .ipint_call_common: |
| # wasmInstance = csr0 |
| # PM = csr1 |
| # PB = csr2 |
| # memoryBase = csr3 |
| # boundsCheckingSize = csr4 |
| |
# Must keep: entrypoint, new instance, PM
# Can discard immediately: memoryBase, boundsCheckingSize, PB
# Across the call: MC must be preserved; PB/PM are reloaded from the callee
| |
| # csr0 = wasmInstance, then PC |
| # csr1 = PM (later PM + PB) |
| # csr2 = new entrypoint |
| # csr3 = new instance, then old instance |
| # csr4 = temp |
| # ws0 = temp |
| |
| const ipintCallSavedEntrypoint = PB |
| const ipintCallNewInstance = memoryBase |
| |
| # shadow stack pointer |
| const ipintCallShadowSP = ws1 |
| |
# Save PL and the current instance under the argument values; the shadow
# stack pointer then points at the arguments themselves.
push PL, wasmInstance
move sp, ipintCallShadowSP
addq 16, ipintCallShadowSP
# Repurpose the wasmInstance register to carry PC (see the register map above)
move PC, wasmInstance
| |
| # Free up r0, r1 to be used as argument registers |
| move r0, ipintCallSavedEntrypoint |
| move r1, ipintCallNewInstance |
| |
| # We'll update PM to be the value that the return metadata starts at |
| addq MC, PM |
| mintArgDispatch() |
| |
| mintAlign() |
| _mint_a0: |
| mintPop(a0) |
| mintArgDispatch() |
| |
| mintAlign() |
| _mint_a1: |
| mintPop(a1) |
| mintArgDispatch() |
| |
| mintAlign() |
| _mint_a2: |
| mintPop(a2) |
| mintArgDispatch() |
| |
| mintAlign() |
| _mint_a3: |
| mintPop(a3) |
| mintArgDispatch() |
| |
| mintAlign() |
| _mint_a4: |
| if ARM64 or ARM64E |
| mintPop(a4) |
| mintArgDispatch() |
| else |
| break |
| end |
| |
| mintAlign() |
| _mint_a5: |
| if ARM64 or ARM64E |
| mintPop(a5) |
| mintArgDispatch() |
| else |
| break |
| end |
| |
| mintAlign() |
| _mint_a6: |
| if ARM64 or ARM64E |
| mintPop(a6) |
| mintArgDispatch() |
| else |
| break |
| end |
| |
| mintAlign() |
| _mint_a7: |
| if ARM64 or ARM64E |
| mintPop(a7) |
| mintArgDispatch() |
| else |
| break |
| end |
| |
| mintAlign() |
| _mint_fa0: |
| mintPopF(fa0) |
| mintArgDispatch() |
| |
| mintAlign() |
| _mint_fa1: |
| mintPopF(fa1) |
| mintArgDispatch() |
| |
| mintAlign() |
| _mint_fa2: |
| mintPopF(fa2) |
| mintArgDispatch() |
| |
| mintAlign() |
| _mint_fa3: |
| mintPopF(fa3) |
| mintArgDispatch() |
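
# Native stack arguments are presumably laid out in 16-byte chunks:
# _mint_gap reserves an empty chunk, _mint_stackeight pushes a fresh 8-byte
# slot, and _mint_stackzero fills the low 8 bytes at the current sp.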
| |
| mintAlign() |
| _mint_stackzero: |
| mintPop(ws0) |
| storeq ws0, [sp] |
| mintArgDispatch() |
| |
| mintAlign() |
| _mint_stackeight: |
| mintPop(ws0) |
| pushQuad(ws0) |
| mintArgDispatch() |
| |
| mintAlign() |
| _mint_gap: |
| subq 16, sp |
| mintArgDispatch() |
| |
| mintAlign() |
| _mint_call: |
| # Set up the rest of the stack frame |
| subp FirstArgumentOffset - 16, sp |
| |
| # wasmInstance = PC |
| storeq wasmInstance, ThisArgumentOffset - 16[sp] |
| |
| # Swap instances |
| move ipintCallNewInstance, wasmInstance |
| |
| # Set up memory |
| push t2, t3 |
| ipintReloadMemory() |
| pop t3, t2 |
| |
| # Make the call |
| call ipintCallSavedEntrypoint, JSEntrySlowPathPtrTag |
| |
# Recover the saved PC (stashed in the 'this' slot before the call)
loadq ThisArgumentOffset - 16[sp], PB
| # Restore the stack pointer |
| addp FirstArgumentOffset - 16, sp |
| |
# PM is callee-saved and hasn't been repurposed, so it still points at the
# return metadata; load what we need from it directly.
| loadh [PM], ws0 # number of stack args |
| leap [sp, ws0, 8], sp |
| |
| const ipintCallSavedPL = memoryBase |
| |
| # Grab PL |
| pop wasmInstance, ipintCallSavedPL |
| |
loadh 2[PM], ws0 # number of 16B stack slots to discard (the arguments we popped)
| lshiftq 4, ws0 |
| addq ws0, sp |
| addq 4, PM |
| mintRetDispatch() |
| |
| mintAlign() |
| _mint_r0: |
| pushQuad(r0) |
| mintRetDispatch() |
| |
| mintAlign() |
| _mint_r1: |
| pushQuad(r1) |
| mintRetDispatch() |
| |
| mintAlign() |
| _mint_r2: |
| pushQuad(t2) |
| mintRetDispatch() |
| |
| mintAlign() |
| _mint_r3: |
| pushQuad(t3) |
| mintRetDispatch() |
| |
| mintAlign() |
| _mint_r4: |
| if ARM64 or ARM64E |
| pushQuad(t4) |
| mintRetDispatch() |
| else |
| break |
| end |
| |
| mintAlign() |
| _mint_r5: |
| if ARM64 or ARM64E |
| pushQuad(t5) |
| mintRetDispatch() |
| else |
| break |
| end |
| |
| mintAlign() |
| _mint_r6: |
| if ARM64 or ARM64E |
| pushQuad(t6) |
| mintRetDispatch() |
| else |
| break |
| end |
| |
| mintAlign() |
| _mint_r7: |
| if ARM64 or ARM64E |
| pushQuad(t7) |
| mintRetDispatch() |
| else |
| break |
| end |
| |
| mintAlign() |
| _mint_fr0: |
| if ARM64 or ARM64E |
| emit "str q0, [sp, #-16]!" |
| else |
| emit "sub $16, %esp" |
| emit "movdqu %xmm0, (%esp)" |
| end |
| mintRetDispatch() |
| |
| mintAlign() |
| _mint_fr1: |
| if ARM64 or ARM64E |
| emit "str q1, [sp, #-16]!" |
| else |
| emit "sub $16, %esp" |
| emit "movdqu %xmm1, (%esp)" |
| end |
| mintRetDispatch() |
| |
| mintAlign() |
| _mint_fr2: |
| if ARM64 or ARM64E |
| emit "str q2, [sp, #-16]!" |
| else |
| emit "sub $16, %esp" |
| emit "movdqu %xmm2, (%esp)" |
| end |
| mintRetDispatch() |
| |
| mintAlign() |
| _mint_fr3: |
| if ARM64 or ARM64E |
| emit "str q3, [sp, #-16]!" |
| else |
| emit "sub $16, %esp" |
| emit "movdqu %xmm3, (%esp)" |
| end |
| mintRetDispatch() |
| |
| mintAlign() |
| _mint_ret_stack: |
| # TODO |
| break |
| |
| mintAlign() |
| _mint_end: |
| |
# Stash the current metadata pointer; the subq below turns it back into an
# index once PM is reloaded from the callee.
move PM, MC
# PB carried the saved PC across the return sequence; restore it.
move PB, PC
| # Restore PL |
| move ipintCallSavedPL, PL |
| # Restore PB/PM |
| getIPIntCallee() |
| loadp Wasm::IPIntCallee::m_bytecode[ws0], PB |
| loadp Wasm::IPIntCallee::m_metadata[ws0], PM |
| subq PM, MC |
| # Restore IB |
| if ARM64 or ARM64E |
| pcrtoaddr _ipint_unreachable, IB |
| end |
| # Restore memory |
| ipintReloadMemory() |
| nextIPIntInstruction() |
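
# uINT: a small bytecode interpreter run on function exit to move return
# values from the interpreter stack into the system return registers.
# Returning values on the stack (_uint_stack) is not implemented yet; it
# breaks.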
| |
| uintAlign() |
| _uint_r0: |
| popQuad(r0, t3) |
| uintDispatch() |
| |
| uintAlign() |
| _uint_r1: |
| popQuad(r1, t3) |
| uintDispatch() |
| |
| uintAlign() |
| _uint_fr1: |
| popFPR() |
| uintDispatch() |
| |
| uintAlign() |
| _uint_stack: |
| break |
| |
| uintAlign() |
| _uint_ret: |
| jmp .ipint_exit |
| |
| # PM = location in argumINT bytecode |
| # csr0 = tmp |
| # csr1 = dst |
| # csr2 = src |
| # csr3 |
| # csr4 = for dispatch |
| |
| # const argumINTDest = csr3 |
| # const argumINTSrc = PB |
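
# argumINT: the entry-side analogue of mint. It copies incoming arguments from
# registers (and from the caller's stack, via _argumINT_stack) into the
# function's locals array, 8 bytes per local.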
| |
| argumINTAlign() |
| _argumINT_a0: |
| storeq a0, [argumINTDest] |
| addq 8, argumINTDest |
| argumINTDispatch() |
| |
| argumINTAlign() |
| _argumINT_a1: |
| storeq a1, [argumINTDest] |
| addq 8, argumINTDest |
| argumINTDispatch() |
| |
| argumINTAlign() |
| _argumINT_a2: |
| storeq a2, [argumINTDest] |
| addq 8, argumINTDest |
| argumINTDispatch() |
| |
| argumINTAlign() |
| _argumINT_a3: |
| storeq a3, [argumINTDest] |
| addq 8, argumINTDest |
| argumINTDispatch() |
| |
| argumINTAlign() |
| _argumINT_a4: |
| if ARM64 or ARM64E |
| storeq a4, [argumINTDest] |
| addq 8, argumINTDest |
| argumINTDispatch() |
| else |
| break |
| end |
| |
| argumINTAlign() |
| _argumINT_a5: |
| if ARM64 or ARM64E |
| storeq a5, [argumINTDest] |
| addq 8, argumINTDest |
| argumINTDispatch() |
| else |
| break |
| end |
| |
| argumINTAlign() |
| _argumINT_a6: |
| if ARM64 or ARM64E |
| storeq a6, [argumINTDest] |
| addq 8, argumINTDest |
| argumINTDispatch() |
| else |
| break |
| end |
| |
| argumINTAlign() |
| _argumINT_a7: |
| if ARM64 or ARM64E |
| storeq a7, [argumINTDest] |
| addq 8, argumINTDest |
| argumINTDispatch() |
| else |
| break |
| end |
| |
| argumINTAlign() |
| _argumINT_fa0: |
| stored fa0, [argumINTDest] |
| addq 8, argumINTDest |
| argumINTDispatch() |
| |
| argumINTAlign() |
| _argumINT_fa1: |
| stored fa1, [argumINTDest] |
| addq 8, argumINTDest |
| argumINTDispatch() |
| |
| argumINTAlign() |
| _argumINT_fa2: |
| stored fa2, [argumINTDest] |
| addq 8, argumINTDest |
| argumINTDispatch() |
| |
| argumINTAlign() |
| _argumINT_fa3: |
| stored fa3, [argumINTDest] |
| addq 8, argumINTDest |
| argumINTDispatch() |
| |
| argumINTAlign() |
| _argumINT_stack: |
| loadq [argumINTSrc], csr0 |
| addq 8, argumINTSrc |
| storeq csr0, [argumINTDest] |
| addq 8, argumINTDest |
| argumINTDispatch() |
| |
| argumINTAlign() |
| _argumINT_end: |
| jmp .ipint_entry_end_local |
| |
| # Put all operations before this `else`, or else 32-bit architectures will fail to build. |
| else |
| # For 32-bit architectures: make sure that the assertions can still find the labels |
| unimplementedInstruction(_unreachable) |
| unimplementedInstruction(_nop) |
| unimplementedInstruction(_block) |
| unimplementedInstruction(_loop) |
| unimplementedInstruction(_if) |
| unimplementedInstruction(_else) |
| unimplementedInstruction(_try) |
| unimplementedInstruction(_catch) |
| unimplementedInstruction(_throw) |
| unimplementedInstruction(_rethrow) |
| reservedOpcode(0x0a) |
| unimplementedInstruction(_end) |
| unimplementedInstruction(_br) |
| unimplementedInstruction(_br_if) |
| unimplementedInstruction(_br_table) |
| unimplementedInstruction(_return) |
| unimplementedInstruction(_call) |
| unimplementedInstruction(_call_indirect) |
| reservedOpcode(0x12) |
| reservedOpcode(0x13) |
| reservedOpcode(0x14) |
| reservedOpcode(0x15) |
| reservedOpcode(0x16) |
| reservedOpcode(0x17) |
| unimplementedInstruction(_delegate) |
| unimplementedInstruction(_catch_all) |
| unimplementedInstruction(_drop) |
| unimplementedInstruction(_select) |
| unimplementedInstruction(_select_t) |
| reservedOpcode(0x1d) |
| reservedOpcode(0x1e) |
| reservedOpcode(0x1f) |
| unimplementedInstruction(_local_get) |
| unimplementedInstruction(_local_set) |
| unimplementedInstruction(_local_tee) |
| unimplementedInstruction(_global_get) |
| unimplementedInstruction(_global_set) |
| unimplementedInstruction(_table_get) |
| unimplementedInstruction(_table_set) |
| reservedOpcode(0x27) |
| unimplementedInstruction(_i32_load_mem) |
| unimplementedInstruction(_i64_load_mem) |
| unimplementedInstruction(_f32_load_mem) |
| unimplementedInstruction(_f64_load_mem) |
| unimplementedInstruction(_i32_load8s_mem) |
| unimplementedInstruction(_i32_load8u_mem) |
| unimplementedInstruction(_i32_load16s_mem) |
| unimplementedInstruction(_i32_load16u_mem) |
| unimplementedInstruction(_i64_load8s_mem) |
| unimplementedInstruction(_i64_load8u_mem) |
| unimplementedInstruction(_i64_load16s_mem) |
| unimplementedInstruction(_i64_load16u_mem) |
| unimplementedInstruction(_i64_load32s_mem) |
| unimplementedInstruction(_i64_load32u_mem) |
| unimplementedInstruction(_i32_store_mem) |
| unimplementedInstruction(_i64_store_mem) |
| unimplementedInstruction(_f32_store_mem) |
| unimplementedInstruction(_f64_store_mem) |
| unimplementedInstruction(_i32_store8_mem) |
| unimplementedInstruction(_i32_store16_mem) |
| unimplementedInstruction(_i64_store8_mem) |
| unimplementedInstruction(_i64_store16_mem) |
| unimplementedInstruction(_i64_store32_mem) |
| unimplementedInstruction(_memory_size) |
| unimplementedInstruction(_memory_grow) |
| unimplementedInstruction(_i32_const) |
| unimplementedInstruction(_i64_const) |
| unimplementedInstruction(_f32_const) |
| unimplementedInstruction(_f64_const) |
| unimplementedInstruction(_i32_eqz) |
| unimplementedInstruction(_i32_eq) |
| unimplementedInstruction(_i32_ne) |
| unimplementedInstruction(_i32_lt_s) |
| unimplementedInstruction(_i32_lt_u) |
| unimplementedInstruction(_i32_gt_s) |
| unimplementedInstruction(_i32_gt_u) |
| unimplementedInstruction(_i32_le_s) |
| unimplementedInstruction(_i32_le_u) |
| unimplementedInstruction(_i32_ge_s) |
| unimplementedInstruction(_i32_ge_u) |
| unimplementedInstruction(_i64_eqz) |
| unimplementedInstruction(_i64_eq) |
| unimplementedInstruction(_i64_ne) |
| unimplementedInstruction(_i64_lt_s) |
| unimplementedInstruction(_i64_lt_u) |
| unimplementedInstruction(_i64_gt_s) |
| unimplementedInstruction(_i64_gt_u) |
| unimplementedInstruction(_i64_le_s) |
| unimplementedInstruction(_i64_le_u) |
| unimplementedInstruction(_i64_ge_s) |
| unimplementedInstruction(_i64_ge_u) |
| unimplementedInstruction(_f32_eq) |
| unimplementedInstruction(_f32_ne) |
| unimplementedInstruction(_f32_lt) |
| unimplementedInstruction(_f32_gt) |
| unimplementedInstruction(_f32_le) |
| unimplementedInstruction(_f32_ge) |
| unimplementedInstruction(_f64_eq) |
| unimplementedInstruction(_f64_ne) |
| unimplementedInstruction(_f64_lt) |
| unimplementedInstruction(_f64_gt) |
| unimplementedInstruction(_f64_le) |
| unimplementedInstruction(_f64_ge) |
| unimplementedInstruction(_i32_clz) |
| unimplementedInstruction(_i32_ctz) |
| unimplementedInstruction(_i32_popcnt) |
| unimplementedInstruction(_i32_add) |
| unimplementedInstruction(_i32_sub) |
| unimplementedInstruction(_i32_mul) |
| unimplementedInstruction(_i32_div_s) |
| unimplementedInstruction(_i32_div_u) |
| unimplementedInstruction(_i32_rem_s) |
| unimplementedInstruction(_i32_rem_u) |
| unimplementedInstruction(_i32_and) |
| unimplementedInstruction(_i32_or) |
| unimplementedInstruction(_i32_xor) |
| unimplementedInstruction(_i32_shl) |
| unimplementedInstruction(_i32_shr_s) |
| unimplementedInstruction(_i32_shr_u) |
| unimplementedInstruction(_i32_rotl) |
| unimplementedInstruction(_i32_rotr) |
| unimplementedInstruction(_i64_clz) |
| unimplementedInstruction(_i64_ctz) |
| unimplementedInstruction(_i64_popcnt) |
| unimplementedInstruction(_i64_add) |
| unimplementedInstruction(_i64_sub) |
| unimplementedInstruction(_i64_mul) |
| unimplementedInstruction(_i64_div_s) |
| unimplementedInstruction(_i64_div_u) |
| unimplementedInstruction(_i64_rem_s) |
| unimplementedInstruction(_i64_rem_u) |
| unimplementedInstruction(_i64_and) |
| unimplementedInstruction(_i64_or) |
| unimplementedInstruction(_i64_xor) |
| unimplementedInstruction(_i64_shl) |
| unimplementedInstruction(_i64_shr_s) |
| unimplementedInstruction(_i64_shr_u) |
| unimplementedInstruction(_i64_rotl) |
| unimplementedInstruction(_i64_rotr) |
| unimplementedInstruction(_f32_abs) |
| unimplementedInstruction(_f32_neg) |
| unimplementedInstruction(_f32_ceil) |
| unimplementedInstruction(_f32_floor) |
| unimplementedInstruction(_f32_trunc) |
| unimplementedInstruction(_f32_nearest) |
| unimplementedInstruction(_f32_sqrt) |
| unimplementedInstruction(_f32_add) |
| unimplementedInstruction(_f32_sub) |
| unimplementedInstruction(_f32_mul) |
| unimplementedInstruction(_f32_div) |
| unimplementedInstruction(_f32_min) |
| unimplementedInstruction(_f32_max) |
| unimplementedInstruction(_f32_copysign) |
| unimplementedInstruction(_f64_abs) |
| unimplementedInstruction(_f64_neg) |
| unimplementedInstruction(_f64_ceil) |
| unimplementedInstruction(_f64_floor) |
| unimplementedInstruction(_f64_trunc) |
| unimplementedInstruction(_f64_nearest) |
| unimplementedInstruction(_f64_sqrt) |
| unimplementedInstruction(_f64_add) |
| unimplementedInstruction(_f64_sub) |
| unimplementedInstruction(_f64_mul) |
| unimplementedInstruction(_f64_div) |
| unimplementedInstruction(_f64_min) |
| unimplementedInstruction(_f64_max) |
| unimplementedInstruction(_f64_copysign) |
| unimplementedInstruction(_i32_wrap_i64) |
| unimplementedInstruction(_i32_trunc_f32_s) |
| unimplementedInstruction(_i32_trunc_f32_u) |
| unimplementedInstruction(_i32_trunc_f64_s) |
| unimplementedInstruction(_i32_trunc_f64_u) |
| unimplementedInstruction(_i64_extend_i32_s) |
| unimplementedInstruction(_i64_extend_i32_u) |
| unimplementedInstruction(_i64_trunc_f32_s) |
| unimplementedInstruction(_i64_trunc_f32_u) |
| unimplementedInstruction(_i64_trunc_f64_s) |
| unimplementedInstruction(_i64_trunc_f64_u) |
| unimplementedInstruction(_f32_convert_i32_s) |
| unimplementedInstruction(_f32_convert_i32_u) |
| unimplementedInstruction(_f32_convert_i64_s) |
| unimplementedInstruction(_f32_convert_i64_u) |
| unimplementedInstruction(_f32_demote_f64) |
| unimplementedInstruction(_f64_convert_i32_s) |
| unimplementedInstruction(_f64_convert_i32_u) |
| unimplementedInstruction(_f64_convert_i64_s) |
| unimplementedInstruction(_f64_convert_i64_u) |
| unimplementedInstruction(_f64_promote_f32) |
| unimplementedInstruction(_i32_reinterpret_f32) |
| unimplementedInstruction(_i64_reinterpret_f64) |
| unimplementedInstruction(_f32_reinterpret_i32) |
| unimplementedInstruction(_f64_reinterpret_i64) |
| unimplementedInstruction(_i32_extend8_s) |
| unimplementedInstruction(_i32_extend16_s) |
| unimplementedInstruction(_i64_extend8_s) |
| unimplementedInstruction(_i64_extend16_s) |
| unimplementedInstruction(_i64_extend32_s) |
| reservedOpcode(0xc5) |
| reservedOpcode(0xc6) |
| reservedOpcode(0xc7) |
| reservedOpcode(0xc8) |
| reservedOpcode(0xc9) |
| reservedOpcode(0xca) |
| reservedOpcode(0xcb) |
| reservedOpcode(0xcc) |
| reservedOpcode(0xcd) |
| reservedOpcode(0xce) |
| reservedOpcode(0xcf) |
| unimplementedInstruction(_ref_null_t) |
| unimplementedInstruction(_ref_is_null) |
| unimplementedInstruction(_ref_func) |
| reservedOpcode(0xd3) |
| reservedOpcode(0xd4) |
| reservedOpcode(0xd5) |
| reservedOpcode(0xd6) |
| reservedOpcode(0xd7) |
| reservedOpcode(0xd8) |
| reservedOpcode(0xd9) |
| reservedOpcode(0xda) |
| reservedOpcode(0xdb) |
| reservedOpcode(0xdc) |
| reservedOpcode(0xdd) |
| reservedOpcode(0xde) |
| reservedOpcode(0xdf) |
| reservedOpcode(0xe0) |
| reservedOpcode(0xe1) |
| reservedOpcode(0xe2) |
| reservedOpcode(0xe3) |
| reservedOpcode(0xe4) |
| reservedOpcode(0xe5) |
| reservedOpcode(0xe6) |
| reservedOpcode(0xe7) |
| reservedOpcode(0xe8) |
| reservedOpcode(0xe9) |
| reservedOpcode(0xea) |
| reservedOpcode(0xeb) |
| reservedOpcode(0xec) |
| reservedOpcode(0xed) |
| reservedOpcode(0xee) |
| reservedOpcode(0xef) |
| reservedOpcode(0xf0) |
| reservedOpcode(0xf1) |
| reservedOpcode(0xf2) |
| reservedOpcode(0xf3) |
| reservedOpcode(0xf4) |
| reservedOpcode(0xf5) |
| reservedOpcode(0xf6) |
| reservedOpcode(0xf7) |
| reservedOpcode(0xf8) |
| reservedOpcode(0xf9) |
| reservedOpcode(0xfa) |
| reservedOpcode(0xfb) |
| unimplementedInstruction(_fc_block) |
| unimplementedInstruction(_simd) |
| unimplementedInstruction(_atomic) |
| reservedOpcode(0xff) |
| end |