ljx

FORK: LuaJIT with native 5.2 and 5.3 support
git clone https://git.neptards.moe/neptards/ljx.git
Log | Files | Refs | README

vm_x86.dasc (157287B)


      1 |// Low-level VM code for x86 CPUs.
      2 |// Bytecode interpreter, fast functions and helper functions.
      3 |// Copyright (C) 2005-2016 Mike Pall. See Copyright Notice in luajit.h
      4 |
      5 |// Lua 5.2 modifications: ESETV, setmetatable __gc.
      6 |// Copyright (C) 2014 Karel Tuma. See Copyright Notice in luajit.h
      7 |
      8 |.if P64
      9 |.arch x64
     10 |.else
     11 |.arch x86
     12 |.endif
     13 |.section code_op, code_sub
     14 |
     15 |.actionlist build_actionlist
     16 |.globals GLOB_
     17 |.globalnames globnames
     18 |.externnames extnames
     19 |
     20 |//-----------------------------------------------------------------------
     21 |
     22 |.if P64
     23 |.define X64, 1
     24 |.if WIN
     25 |.define X64WIN, 1
     26 |.endif
     27 |.endif
     28 |
     29 |// Fixed register assignments for the interpreter.
     30 |// This is very fragile and has many dependencies. Caveat emptor.
     31 |.define BASE,		edx		// Not C callee-save, refetched anyway.
     32 |.if not X64		// x86: the address-sized aliases (KBASEa/PCa) are the 32 bit registers.
     33 |.define KBASE,		edi		// Must be C callee-save.
     34 |.define KBASEa,	KBASE
     35 |.define PC,		esi		// Must be C callee-save.
     36 |.define PCa,		PC
     37 |.define DISPATCH,	ebx		// Must be C callee-save.
     38 |.elif X64WIN		// x64/Windows: same registers as x86; KBASEa/PCa are their 64 bit views.
     39 |.define KBASE,		edi		// Must be C callee-save.
     40 |.define KBASEa,	rdi
     41 |.define PC,		esi		// Must be C callee-save.
     42 |.define PCa,		rsi
     43 |.define DISPATCH,	ebx		// Must be C callee-save.
     44 |.else			// x64/POSIX: r15d/ebx/r14d; KBASEa/PCa are the 64 bit views (r15/rbx).
     45 |.define KBASE,		r15d		// Must be C callee-save.
     46 |.define KBASEa,	r15
     47 |.define PC,		ebx		// Must be C callee-save.
     48 |.define PCa,		rbx
     49 |.define DISPATCH,	r14d		// Must be C callee-save.
     50 |.endif
     51 |
     52 |.define RA,		ecx
     53 |.define RAH,		ch
     54 |.define RAL,		cl
     55 |.define RB,		ebp		// Must be ebp (C callee-save).
     56 |.define RC,		eax		// Must be eax.
     57 |.define RCW,		ax
     58 |.define RCH,		ah
     59 |.define RCL,		al
     60 |.define OP,		RB		// OP is an alias for RB (ebp).
     61 |.define RD,		RC		// RD (and RDW/RDL below) alias RC (eax).
     62 |.define RDW,		RCW
     63 |.define RDL,		RCL
     64 |.if X64			// Address-sized (64 bit) views of the registers above.
     65 |.define RAa, rcx
     66 |.define RBa, rbp
     67 |.define RCa, rax
     68 |.define RDa, rax
     69 |.else			// x86: the address-sized names are the 32 bit registers themselves.
     70 |.define RAa, RA
     71 |.define RBa, RB
     72 |.define RCa, RC
     73 |.define RDa, RD
     74 |.endif
     75 |
     76 |.if not X64		// x86: only the two fastcall argument registers get names.
     77 |.define FCARG1,	ecx		// x86 fastcall arguments.
     78 |.define FCARG2,	edx
     79 |.elif X64WIN		// Four register arguments; CARGnd are the 32 bit views of CARGn.
     80 |.define CARG1,		rcx		// x64/WIN64 C call arguments.
     81 |.define CARG2,		rdx
     82 |.define CARG3,		r8
     83 |.define CARG4,		r9
     84 |.define CARG1d,	ecx
     85 |.define CARG2d,	edx
     86 |.define CARG3d,	r8d
     87 |.define CARG4d,	r9d
     88 |.define FCARG1,	CARG1d		// Upwards compatible to x86 fastcall.
     89 |.define FCARG2,	CARG2d
     90 |.else			// Six register arguments; CARGnd are the 32 bit views of CARGn.
     91 |.define CARG1,		rdi		// x64/POSIX C call arguments.
     92 |.define CARG2,		rsi
     93 |.define CARG3,		rdx
     94 |.define CARG4,		rcx
     95 |.define CARG5,		r8
     96 |.define CARG6,		r9
     97 |.define CARG1d,	edi
     98 |.define CARG2d,	esi
     99 |.define CARG3d,	edx
    100 |.define CARG4d,	ecx
    101 |.define CARG5d,	r8d
    102 |.define CARG6d,	r9d
    103 |.define FCARG1,	CARG1d		// Simulate x86 fastcall.
    104 |.define FCARG2,	CARG2d
    105 |.endif
    106 |
    107 |// Type definitions. Some of these are only used for documentation.
    108 |.type L,		lua_State
    109 |.type GL,		global_State
    110 |.type TVALUE,		TValue
    111 |.type GCOBJ,		GCobj
    112 |.type STR,		GCstr
    113 |.type TAB,		GCtab
    114 |.type LFUNC,		GCfuncL
    115 |.type CFUNC,		GCfuncC
    116 |.type PROTO,		GCproto
    117 |.type UPVAL,		GCupval
    118 |.type NODE,		Node
    119 |.type NARGS,		int
    120 |.type TRACE,		GCtrace
    121 |.type SBUF,		SBuf
    122 |
    123 |// Stack layout while in interpreter. Must match with lj_frame.h.
    124 |//-----------------------------------------------------------------------
    125 |.if not X64		// x86 stack layout.
    126 |
    127 |.if WIN
    128 |// x86/Windows: the C frame additionally holds two SEH slots (handler + previous fs:[0]).
    129 |.define CFRAME_SPACE,	aword*9			// Delta for esp (see <--).
    130 |.macro saveregs_			// Save edi/esi/ebx, link the SEH slots, reserve CFRAME_SPACE.
    131 |  push edi; push esi; push ebx
    132 |  push extern lj_err_unwind_win	// Lands in the SEH_FUNC slot.
    133 |  fs; push dword [0]			// Old fs:[0], lands in the SEH_NEXT slot.
    134 |  fs; mov [0], esp			// fs:[0] now points at the SEH_NEXT slot.
    135 |  sub esp, CFRAME_SPACE
    136 |.endmacro
    137 |.macro restoreregs			// Inverse of saveregs (also pops the ebp pushed there).
    138 |  add esp, CFRAME_SPACE
    139 |  fs; pop dword [0]			// Restore the previous fs:[0] from SEH_NEXT.
    140 |  pop edi	// Short for esp += 4.
    141 |  pop ebx; pop esi; pop edi; pop ebp
    142 |.endmacro
    143 |
    144 |.else
    145 |// x86/non-Windows: no SEH slots, so the frame is two slots smaller.
    146 |.define CFRAME_SPACE,	aword*7			// Delta for esp (see <--).
    147 |.macro saveregs_			// Save edi/esi/ebx and reserve CFRAME_SPACE.
    148 |  push edi; push esi; push ebx
    149 |  sub esp, CFRAME_SPACE
    150 |.endmacro
    151 |.macro restoreregs			// Inverse of saveregs (also pops the ebp pushed there).
    152 |  add esp, CFRAME_SPACE
    153 |  pop ebx; pop esi; pop edi; pop ebp
    154 |.endmacro
    155 |
    156 |.endif
    157 |// Full register save: push ebp first, then the platform-specific saveregs_.
    158 |.macro saveregs
    159 |  push ebp; saveregs_
    160 |.endmacro
    161 |
    162 |.if WIN			// SAVE_RET slot = CFRAME_SPACE/aword (9) + 6 pushes done by saveregs.
    163 |.define SAVE_ERRF,	aword [esp+aword*19]	// vm_pcall/vm_cpcall only.
    164 |.define SAVE_NRES,	aword [esp+aword*18]
    165 |.define SAVE_CFRAME,	aword [esp+aword*17]
    166 |.define SAVE_L,	aword [esp+aword*16]
    167 |//----- 16 byte aligned, ^^^ arguments from C caller
    168 |.define SAVE_RET,	aword [esp+aword*15]	//<-- esp entering interpreter.
    169 |.define SAVE_R4,	aword [esp+aword*14]	// ebp (pushed first by saveregs).
    170 |.define SAVE_R3,	aword [esp+aword*13]	// edi
    171 |.define SAVE_R2,	aword [esp+aword*12]	// esi
    172 |//----- 16 byte aligned
    173 |.define SAVE_R1,	aword [esp+aword*11]	// ebx
    174 |.define SEH_FUNC,	aword [esp+aword*10]	// lj_err_unwind_win pushed by saveregs_.
    175 |.define SEH_NEXT,	aword [esp+aword*9]	//<-- esp after register saves.
    176 |.define UNUSED2,	aword [esp+aword*8]
    177 |//----- 16 byte aligned
    178 |.define UNUSED1,	aword [esp+aword*7]
    179 |.define SAVE_PC,	aword [esp+aword*6]
    180 |.define TMP2,		aword [esp+aword*5]
    181 |.define TMP1,		aword [esp+aword*4]
    182 |//----- 16 byte aligned
    183 |.define ARG4,		aword [esp+aword*3]
    184 |.define ARG3,		aword [esp+aword*2]
    185 |.define ARG2,		aword [esp+aword*1]
    186 |.define ARG1,		aword [esp]		//<-- esp while in interpreter.
    187 |//----- 16 byte aligned, ^^^ arguments for C callee
    188 |.else			// SAVE_RET slot = CFRAME_SPACE/aword (7) + 4 pushes done by saveregs.
    189 |.define SAVE_ERRF,	aword [esp+aword*15]	// vm_pcall/vm_cpcall only.
    190 |.define SAVE_NRES,	aword [esp+aword*14]
    191 |.define SAVE_CFRAME,	aword [esp+aword*13]
    192 |.define SAVE_L,	aword [esp+aword*12]
    193 |//----- 16 byte aligned, ^^^ arguments from C caller
    194 |.define SAVE_RET,	aword [esp+aword*11]	//<-- esp entering interpreter.
    195 |.define SAVE_R4,	aword [esp+aword*10]	// ebp (pushed first by saveregs).
    196 |.define SAVE_R3,	aword [esp+aword*9]	// edi
    197 |.define SAVE_R2,	aword [esp+aword*8]	// esi
    198 |//----- 16 byte aligned
    199 |.define SAVE_R1,	aword [esp+aword*7]	//<-- esp after register saves.
    200 |.define SAVE_PC,	aword [esp+aword*6]
    201 |.define TMP2,		aword [esp+aword*5]
    202 |.define TMP1,		aword [esp+aword*4]
    203 |//----- 16 byte aligned
    204 |.define ARG4,		aword [esp+aword*3]
    205 |.define ARG3,		aword [esp+aword*2]
    206 |.define ARG2,		aword [esp+aword*1]
    207 |.define ARG1,		aword [esp]		//<-- esp while in interpreter.
    208 |//----- 16 byte aligned, ^^^ arguments for C callee
    209 |.endif
    210 |
    211 |// FPARGx overlaps ARGx and ARG(x+1) on x86.
    212 |.define FPARG3,	qword [esp+qword*1]
    213 |.define FPARG1,	qword [esp]
    214 |// TMPQ overlaps TMP1/TMP2. ARG5/MULTRES overlap TMP1/TMP2 (and TMPQ).
    215 |.define TMPQ,		qword [esp+aword*4]
    216 |.define TMP3,		ARG4			// Shares the ARG4 slot.
    217 |.define ARG5,		TMP1			// Fifth outgoing argument sits directly above ARG4.
    218 |.define TMPa,		TMP1			// Address-sized temporary; same slot as TMP1.
    219 |.define MULTRES,	TMP2
    220 |// The INARG_* names below alias SAVE_* slots: read them before the SAVE_* slot is written.
    221 |// Arguments for vm_call and vm_pcall.
    222 |.define INARG_BASE,	SAVE_CFRAME		// Overwritten by SAVE_CFRAME!
    223 |
    224 |// Arguments for vm_cpcall.
    225 |.define INARG_CP_CALL,	SAVE_ERRF
    226 |.define INARG_CP_UD,	SAVE_NRES
    227 |.define INARG_CP_FUNC,	SAVE_CFRAME
    228 |
    229 |//-----------------------------------------------------------------------
    230 |.elif X64WIN		// x64/Windows stack layout
    231 |
    232 |.define CFRAME_SPACE,	aword*5			// Delta for rsp (see <--).
    233 |.macro saveregs_			// Save rdi/rsi/rbx and reserve CFRAME_SPACE.
    234 |  push rdi; push rsi; push rbx
    235 |  sub rsp, CFRAME_SPACE
    236 |.endmacro
    237 |.macro saveregs			// Full register save: rbp first, then saveregs_.
    238 |  push rbp; saveregs_
    239 |.endmacro
    240 |.macro restoreregs			// Inverse of saveregs.
    241 |  add rsp, CFRAME_SPACE
    242 |  pop rbx; pop rsi; pop rdi; pop rbp
    243 |.endmacro
    244 |// Slots above SAVE_RET are addressed partly as awords and partly as dwords (2 dwords per aword).
    245 |.define SAVE_CFRAME,	aword [rsp+aword*13]
    246 |.define SAVE_PC,	dword [rsp+dword*25]
    247 |.define SAVE_L,	dword [rsp+dword*24]
    248 |.define SAVE_ERRF,	dword [rsp+dword*23]
    249 |.define SAVE_NRES,	dword [rsp+dword*22]
    250 |.define TMP2,		dword [rsp+dword*21]
    251 |.define TMP1,		dword [rsp+dword*20]
    252 |//----- 16 byte aligned, ^^^ 32 byte register save area, owned by interpreter
    253 |.define SAVE_RET,	aword [rsp+aword*9]	//<-- rsp entering interpreter.
    254 |.define SAVE_R4,	aword [rsp+aword*8]	// rbp (pushed first by saveregs).
    255 |.define SAVE_R3,	aword [rsp+aword*7]	// rdi
    256 |.define SAVE_R2,	aword [rsp+aword*6]	// rsi
    257 |.define SAVE_R1,	aword [rsp+aword*5]	//<-- rsp after register saves.
    258 |.define ARG5,		aword [rsp+aword*4]
    259 |.define CSAVE_4,	aword [rsp+aword*3]
    260 |.define CSAVE_3,	aword [rsp+aword*2]
    261 |.define CSAVE_2,	aword [rsp+aword*1]
    262 |.define CSAVE_1,	aword [rsp]		//<-- rsp while in interpreter.
    263 |//----- 16 byte aligned, ^^^ 32 byte register save area, owned by callee
    264 |
    265 |// TMPQ overlaps TMP1/TMP2. MULTRES overlaps TMP2 (and TMPQ).
    266 |.define TMPQ,		qword [rsp+aword*10]
    267 |.define MULTRES,	TMP2
    268 |.define TMPa,		ARG5			// Address-sized temporary in the ARG5 slot.
    269 |.define ARG5d,		dword [rsp+aword*4]	// 32 bit view of the ARG5 slot.
    270 |.define TMP3,		ARG5d
    271 |
    272 |//-----------------------------------------------------------------------
    273 |.else			// x64/POSIX stack layout
    274 |
    275 |.define CFRAME_SPACE,	aword*5			// Delta for rsp (see <--).
    276 |.macro saveregs_			// Save rbx/r15/r14 (plus r13/r12 if NO_UNWIND), reserve CFRAME_SPACE.
    277 |  push rbx; push r15; push r14
    278 |.if NO_UNWIND
    279 |  push r13; push r12
    280 |.endif
    281 |  sub rsp, CFRAME_SPACE
    282 |.endmacro
    283 |.macro saveregs			// Full register save: rbp first, then saveregs_.
    284 |  push rbp; saveregs_
    285 |.endmacro
    286 |.macro restoreregs			// Inverse of saveregs; pops in reverse push order.
    287 |  add rsp, CFRAME_SPACE
    288 |.if NO_UNWIND
    289 |  pop r12; pop r13
    290 |.endif
    291 |  pop r14; pop r15; pop rbx; pop rbp
    292 |.endmacro
    293 |// SAVE_RET slot = CFRAME_SPACE/aword (5) + number of pushes in saveregs (6 with NO_UNWIND, else 4).
    294 |//----- 16 byte aligned,
    295 |.if NO_UNWIND
    296 |.define SAVE_RET,	aword [rsp+aword*11]	//<-- rsp entering interpreter.
    297 |.define SAVE_R4,	aword [rsp+aword*10]	// rbp (pushed first by saveregs).
    298 |.define SAVE_R3,	aword [rsp+aword*9]	// rbx
    299 |.define SAVE_R2,	aword [rsp+aword*8]	// r15
    300 |.define SAVE_R1,	aword [rsp+aword*7]	// r14
    301 |.define SAVE_RU2,	aword [rsp+aword*6]	// r13
    302 |.define SAVE_RU1,	aword [rsp+aword*5]	//<-- rsp after register saves.
    303 |.else
    304 |.define SAVE_RET,	aword [rsp+aword*9]	//<-- rsp entering interpreter.
    305 |.define SAVE_R4,	aword [rsp+aword*8]	// rbp (pushed first by saveregs).
    306 |.define SAVE_R3,	aword [rsp+aword*7]	// rbx
    307 |.define SAVE_R2,	aword [rsp+aword*6]	// r15
    308 |.define SAVE_R1,	aword [rsp+aword*5]	//<-- rsp after register saves.
    309 |.endif
    310 |.define SAVE_CFRAME,	aword [rsp+aword*4]
    311 |.define SAVE_PC,	dword [rsp+dword*7]
    312 |.define SAVE_L,	dword [rsp+dword*6]
    313 |.define SAVE_ERRF,	dword [rsp+dword*5]
    314 |.define SAVE_NRES,	dword [rsp+dword*4]
    315 |.define TMPa,		aword [rsp+aword*1]
    316 |.define TMP2,		dword [rsp+dword*1]
    317 |.define TMP1,		dword [rsp]		//<-- rsp while in interpreter.
    318 |//----- 16 byte aligned
    319 |
    320 |// TMPQ overlaps TMP1/TMP2. MULTRES overlaps TMP2 (and TMPQ).
    321 |.define TMPQ,		qword [rsp]
    322 |.define TMP3,		dword [rsp+aword*1]	// Same address as TMPa (its low 32 bits).
    323 |.define MULTRES,	TMP2
    324 |
    325 |.endif
    326 |
    327 |//-----------------------------------------------------------------------
    328 |
    329 |// Instruction headers. On entry RC holds the upper 16 bits of the instruction (see ins_NEXT).
    330 |.macro ins_A; .endmacro				// Nothing left to decode.
    331 |.macro ins_AD; .endmacro			// Nothing left to decode (RD aliases RC).
    332 |.macro ins_AJ; .endmacro			// Nothing left to decode.
    333 |.macro ins_ABC; movzx RB, RCH; movzx RC, RCL; .endmacro	// RB = high byte, RC = low byte; RB first, since the second movzx overwrites RCH.
    334 |.macro ins_AB_; movzx RB, RCH; .endmacro	// RB = high byte; RC is left undecoded.
    335 |.macro ins_A_C; movzx RC, RCL; .endmacro	// RC = low byte; RB is not set.
    336 |.macro ins_AND; not RDa; .endmacro		// RD = bitwise complement of RD (address-sized).
    337 |
    338 |// Instruction decode+dispatch. Carefully tuned (nope, lodsd is not faster).
    339 |.macro ins_NEXT
    340 |  mov RC, [PC]				// Fetch the 32 bit instruction word.
    341 |  movzx RA, RCH				// RA = byte 1 of the instruction.
    342 |  movzx OP, RCL				// OP = byte 0 of the instruction (index into the dispatch table).
    343 |  add PC, 4				// PC now points past the fetched instruction.
    344 |  shr RC, 16				// RC (= RD) = upper 16 bits, decoded further by the ins_* headers.
    345 |.if X64
    346 |  jmp aword [DISPATCH+OP*8]		// 8 byte dispatch table entries.
    347 |.else
    348 |  jmp aword [DISPATCH+OP*4]		// 4 byte dispatch table entries.
    349 |.endif
    350 |.endmacro
    351 |
    352 |.macro ins_refetch			// Re-decode RA and RD from the instruction just before PC.
    353 |  mov RC, [PC-4]			// PC was already advanced, so the instruction is at PC-4.
    354 |  movzx RA, RCH				// RA = byte 1 of the instruction.
    355 |  shr RC, 16				// RC (= RD) = upper 16 bits. OP is not reloaded.
    356 |.endmacro
    357 |
    358 |// Instruction footer. The constant condition below selects one of two dispatch strategies.
    359 |.if 1
    360 |  // Replicated dispatch. Less unpredictable branches, but higher I-Cache use.
    361 |  .define ins_next, ins_NEXT
    362 |  .define ins_next_, ins_NEXT
    363 |.else
    364 |  // Common dispatch. Lower I-Cache use, only one (very) unpredictable branch.
    365 |  // Affects only certain kinds of benchmarks (and only with -j off).
    366 |  // Around 10%-30% slower on Core2, a lot slower on P4.
    367 |  .macro ins_next			// Jump to the single shared dispatch sequence.
    368 |    jmp ->ins_next
    369 |  .endmacro
    370 |  .macro ins_next_			// Emits the shared dispatch sequence and its global label.
    371 |  ->ins_next:
    372 |    ins_NEXT
    373 |  .endmacro
    374 |.endif
    375 |
    376 |// Call decode and dispatch. Enters the callee at its first instruction; RD is left untouched.
    377 |.macro ins_callt
    378 |  // BASE = new base, RB = LFUNC, RD = nargs+1, [BASE-4] = PC
    379 |  mov PC, LFUNC:RB->pc			// PC = callee's pc field.
    380 |  mov RA, [PC]				// Fetch the instruction at PC into RA (RC/RD stay intact).
    381 |  movzx OP, RAL				// OP = byte 0. Overwrites RB (OP aliases RB).
    382 |  movzx RA, RAH				// RA = byte 1. Must come after the RAL read above.
    383 |  add PC, 4				// PC now points past the first instruction.
    384 |.if X64
    385 |  jmp aword [DISPATCH+OP*8]		// 8 byte dispatch table entries.
    386 |.else
    387 |  jmp aword [DISPATCH+OP*4]		// 4 byte dispatch table entries.
    388 |.endif
    389 |.endmacro
    390 |
    391 |.macro ins_call				// Like ins_callt, but first stores the caller PC into the new frame.
    392 |  // BASE = new base, RB = LFUNC, RD = nargs+1
    393 |  mov [BASE-4], PC			// Establishes the [BASE-4] = PC precondition of ins_callt.
    394 |  ins_callt
    395 |.endmacro
    396 |
    397 |//-----------------------------------------------------------------------
    398 |
    399 |// Macros to test operand types. reg is a slot index: slots are 8 bytes, the tag is the dword at +4.
    400 |.macro checktp, reg, tp;  cmp dword [BASE+reg*8+4], tp; .endmacro	// Only sets flags; the caller branches.
    401 |.macro checknum, reg, target; checktp reg, LJ_TISNUM; jae target; .endmacro	// Jump if tag >= LJ_TISNUM (unsigned).
    402 |.macro checkint, reg, target; checktp reg, LJ_TISNUM; jne target; .endmacro	// Jump if tag != LJ_TISNUM.
    403 |.macro checkstr, reg, target; checktp reg, LJ_TSTR; jne target; .endmacro	// Jump if tag != LJ_TSTR.
    404 |.macro checktab, reg, target; checktp reg, LJ_TTAB; jne target; .endmacro	// Jump if tag != LJ_TTAB.
    405 |
    406 |// These operands must be used with movzx. They address the instruction just before PC.
    407 |.define PC_OP, byte [PC-4]		// Byte 0 of the instruction.
    408 |.define PC_RA, byte [PC-3]		// Byte 1.
    409 |.define PC_RB, byte [PC-1]		// Byte 3.
    410 |.define PC_RC, byte [PC-2]		// Byte 2.
    411 |.define PC_RD, word [PC-2]		// Bytes 2-3 as one 16 bit operand (spans PC_RC and PC_RB).
    412 |
    413 |.macro branchPC, reg			// PC += (reg - BCBIAS_J) * 4. Uses lea, so flags are preserved.
    414 |  lea PC, [PC+reg*4-BCBIAS_J*4]
    415 |.endmacro
    416 |
    417 |// Assumes DISPATCH is relative to GL. Both macros yield the offset of a field relative to DISPATCH.
    418 #define DISPATCH_GL(field)	(GG_DISP2G + (int)offsetof(global_State, field))
    419 #define DISPATCH_J(field)	(GG_DISP2J + (int)offsetof(jit_State, field))
    420 |// PC2PROTO(field): offset of a GCproto field measured from the end of the GCproto struct.
    421 #define PC2PROTO(field)  ((int)offsetof(GCproto, field)-(int)sizeof(GCproto))
    422 |
    423 |// Decrement hashed hotcount and trigger trace recorder if zero. Overwrites reg and the flags.
    424 |.macro hotloop, reg
    425 |  mov reg, PC
    426 |  shr reg, 1				// Hash: (PC >> 1) & HOTCOUNT_PCMASK ...
    427 |  and reg, HOTCOUNT_PCMASK		// ... used as a byte offset to a 16 bit counter.
    428 |  sub word [DISPATCH+reg+GG_DISP2HOT], HOTCOUNT_LOOP
    429 |  jb ->vm_hotloop			// Taken when the subtraction borrows (counter underflow).
    430 |.endmacro
    431 |// Same as hotloop, but subtracts HOTCOUNT_CALL and branches to ->vm_hotcall.
    432 |.macro hotcall, reg
    433 |  mov reg, PC
    434 |  shr reg, 1				// Hash: (PC >> 1) & HOTCOUNT_PCMASK ...
    435 |  and reg, HOTCOUNT_PCMASK		// ... used as a byte offset to a 16 bit counter.
    436 |  sub word [DISPATCH+reg+GG_DISP2HOT], HOTCOUNT_CALL
    437 |  jb ->vm_hotcall			// Taken when the subtraction borrows (counter underflow).
    438 |.endmacro
    439 |
    440 |// Set current VM state. st is pasted onto LJ_VMST_ (e.g. set_vmstate C stores ~LJ_VMST_C).
    441 |.macro set_vmstate, st
    442 |  mov dword [DISPATCH+DISPATCH_GL(vmstate)], ~LJ_VMST_..st	// Stored as the bitwise complement.
    443 |.endmacro
    444 |
    445 |// x87 compares.
    446 |.macro fcomparepp			// Compare and pop st0 >< st1.
    447 |  fucomip st1				// Unordered compare of st0 with st1; result in EFLAGS; pops once.
    448 |  fpop					// Second pop, so both operands are removed.
    449 |.endmacro
    450 |// fpop1: store st0 into st1 and pop, i.e. drop the second stack entry and keep the top value.
    451 |.macro fpop1; fstp st1; .endmacro
    452 |
    453 |// Synthesize SSE FP constants. reg is the destination XMM register, tmp an integer scratch register.
    454 |.macro sseconst_abs, reg, tmp		// Synthesize abs mask.
    455 |.if X64
    456 |  mov64 tmp, U64x(7fffffff,ffffffff); movd reg, tmp	// All bits set except the sign bit.
    457 |.else
    458 |  pxor reg, reg; pcmpeqd reg, reg; psrlq reg, 1	// All ones, then shift each qword right by 1; tmp is unused.
    459 |.endif
    460 |.endmacro
    461 |// sseconst_hi: val is 8 hex digits (no 0x prefix); it becomes the upper 32 bits, the lower 32 are zero.
    462 |.macro sseconst_hi, reg, tmp, val	// Synthesize hi-32 bit const.
    463 |.if X64
    464 |  mov64 tmp, U64x(val,00000000); movd reg, tmp
    465 |.else
    466 |  mov tmp, 0x .. val; movd reg, tmp; pshufd reg, reg, 0x51	// 0x51: val moves to dword 1, other dwords take the zero dword.
    467 |.endif
    468 |.endmacro
    469 |// The wrappers below pass the upper 32 bits of the respective IEEE-754 double bit pattern.
    470 |.macro sseconst_sign, reg, tmp		// Synthesize sign mask.
    471 |  sseconst_hi reg, tmp, 80000000
    472 |.endmacro
    473 |.macro sseconst_1, reg, tmp		// Synthesize 1.0.
    474 |  sseconst_hi reg, tmp, 3ff00000
    475 |.endmacro
    476 |.macro sseconst_m1, reg, tmp		// Synthesize -1.0.
    477 |  sseconst_hi reg, tmp, bff00000
    478 |.endmacro
    479 |.macro sseconst_2p52, reg, tmp		// Synthesize 2^52.
    480 |  sseconst_hi reg, tmp, 43300000
    481 |.endmacro
    482 |.macro sseconst_tobit, reg, tmp	// Synthesize 2^52 + 2^51.
    483 |  sseconst_hi reg, tmp, 43380000
    484 |.endmacro
    485 |
    486 |// Move table write barrier back. Overwrites reg. Clears the black bit and pushes tab onto gc.grayagain.
    487 |.macro barrierback, tab, reg
    488 |  and byte tab->marked, (uint8_t)~LJ_GC_BLACK	// black2gray(tab)
    489 |  mov reg, [DISPATCH+DISPATCH_GL(gc.grayagain)]	// reg = current head of the grayagain list.
    490 |  mov [DISPATCH+DISPATCH_GL(gc.grayagain)], tab	// tab becomes the new head.
    491 |  mov tab->gclist, reg				// Link tab to the previous head.
    492 |.endmacro
    493 |
    494 |//-----------------------------------------------------------------------
    495 
    496 /* Generate subroutines used by opcodes and other parts of the VM. */
    497 /* The .code_sub section should be last to help static branch prediction. */
    498 static void build_subroutines(BuildCtx *ctx)
    499 {
    500   |.code_sub
    501   |
    502   |//-----------------------------------------------------------------------
    503   |//-- Return handling ----------------------------------------------------
    504   |//-----------------------------------------------------------------------
    505   |
    506   |->vm_returnp:
    507   |  test PC, FRAME_P
    508   |  jz ->cont_dispatch
    509   |
    510   |  // Return from pcall or xpcall fast func.
    511   |  and PC, -8
    512   |  sub BASE, PC			// Restore caller base.
    513   |  lea RAa, [RA+PC-8]			// Rebase RA and prepend one result.
    514   |  mov PC, [BASE-4]			// Fetch PC of previous frame.
    515   |  // Prepending may overwrite the pcall frame, so do it at the end.
    516   |  mov dword [BASE+RA+4], LJ_TTRUE	// Prepend true to results.
    517   |
    518   |->vm_returnc:
    519   |  add RD, 1				// RD = nresults+1
    520   |  jz ->vm_unwind_yield
    521   |  mov MULTRES, RD
    522   |  test PC, FRAME_TYPE
    523   |  jz ->BC_RET_Z			// Handle regular return to Lua.
    524   |
    525   |->vm_return:
    526   |  // BASE = base, RA = resultofs, RD = nresults+1 (= MULTRES), PC = return
    527   |  xor PC, FRAME_C
    528   |  test PC, FRAME_TYPE
    529   |  jnz ->vm_returnp
    530   |
    531   |  // Return to C.
    532   |  set_vmstate C
    533   |  and PC, -8
    534   |  sub PC, BASE
    535   |  neg PC				// Previous base = BASE - delta.
    536   |
    537   |  sub RD, 1
    538   |  jz >2
    539   |1:  // Move results down.
    540   |.if X64
    541   |  mov RBa, [BASE+RA]
    542   |  mov [BASE-8], RBa
    543   |.else
    544   |  mov RB, [BASE+RA]
    545   |  mov [BASE-8], RB
    546   |  mov RB, [BASE+RA+4]
    547   |  mov [BASE-4], RB
    548   |.endif
    549   |  add BASE, 8
    550   |  sub RD, 1
    551   |  jnz <1
    552   |2:
    553   |  mov L:RB, SAVE_L
    554   |  mov L:RB->base, PC
    555   |3:
    556   |  mov RD, MULTRES
    557   |  mov RA, SAVE_NRES			// RA = wanted nresults+1
    558   |4:
    559   |  cmp RA, RD
    560   |  jne >6				// More/less results wanted?
    561   |5:
    562   |  sub BASE, 8
    563   |  mov L:RB->top, BASE
    564   |
    565   |->vm_leave_cp:
    566   |  mov RAa, SAVE_CFRAME		// Restore previous C frame.
    567   |  mov L:RB->cframe, RAa
    568   |  xor eax, eax			// Ok return status for vm_pcall.
    569   |
    570   |->vm_leave_unw:
    571   |  restoreregs
    572   |  ret
    573   |
    574   |6:
    575   |  jb >7				// Less results wanted?
    576   |  // More results wanted. Check stack size and fill up results with nil.
    577   |  cmp BASE, L:RB->maxstack
    578   |  ja >8
    579   |  mov dword [BASE-4], LJ_TNIL
    580   |  add BASE, 8
    581   |  add RD, 1
    582   |  jmp <4
    583   |
    584   |7:  // Less results wanted.
    585   |  test RA, RA
    586   |  jz <5				// But check for LUA_MULTRET+1.
    587   |  sub RA, RD				// Negative result!
    588   |  lea BASE, [BASE+RA*8]		// Correct top.
    589   |  jmp <5
    590   |
    591   |8:  // Corner case: need to grow stack for filling up results.
    592   |  // This can happen if:
    593   |  // - A C function grows the stack (a lot).
    594   |  // - The GC shrinks the stack in between.
    595   |  // - A return back from a lua_call() with (high) nresults adjustment.
    596   |  mov L:RB->top, BASE		// Save current top held in BASE (yes).
    597   |  mov MULTRES, RD			// Need to fill only remainder with nil.
    598   |  mov FCARG2, RA
    599   |  mov FCARG1, L:RB
    600   |  call extern lj_state_growstack@8	// (lua_State *L, int n)
    601   |  mov BASE, L:RB->top		// Need the (realloced) L->top in BASE.
    602   |  jmp <3
    603   |
    604   |->vm_unwind_yield:
    605   |  mov al, LUA_YIELD
    606   |  jmp ->vm_unwind_c_eh
    607   |
    608   |->vm_unwind_c@8:			// Unwind C stack, return from vm_pcall.
    609   |  // (void *cframe, int errcode)
    610   |.if X64
    611   |  mov eax, CARG2d			// Error return status for vm_pcall.
    612   |  mov rsp, CARG1
    613   |.else
    614   |  mov eax, FCARG2			// Error return status for vm_pcall.
    615   |  mov esp, FCARG1
    616   |.if WIN
    617   |  lea FCARG1, SEH_NEXT
    618   |  fs; mov [0], FCARG1
    619   |.endif
    620   |.endif
    621   |->vm_unwind_c_eh:			// Landing pad for external unwinder.
    622   |  mov L:RB, SAVE_L
    623   |  mov GL:RB, L:RB->glref
    624   |  mov dword GL:RB->vmstate, ~LJ_VMST_C
    625   |  jmp ->vm_leave_unw
    626   |
    627   |->vm_unwind_rethrow:
    628   |.if X64 and not X64WIN
    629   |  mov FCARG1, SAVE_L
    630   |  mov FCARG2, eax
    631   |  restoreregs
    632   |  jmp extern lj_err_throw@8		// (lua_State *L, int errcode)
    633   |.endif
    634   |
    635   |->vm_unwind_ff@4:			// Unwind C stack, return from ff pcall.
    636   |  // (void *cframe)
    637   |.if X64
    638   |  and CARG1, CFRAME_RAWMASK
    639   |  mov rsp, CARG1
    640   |.else
    641   |  and FCARG1, CFRAME_RAWMASK
    642   |  mov esp, FCARG1
    643   |.if WIN
    644   |  lea FCARG1, SEH_NEXT
    645   |  fs; mov [0], FCARG1
    646   |.endif
    647   |.endif
    648   |->vm_unwind_ff_eh:			// Landing pad for external unwinder.
    649   |  mov L:RB, SAVE_L
    650   |  mov RAa, -8			// Results start at BASE+RA = BASE-8.
    651   |  mov RD, 1+1			// Really 1+2 results, incr. later.
    652   |  mov BASE, L:RB->base
    653   |  mov DISPATCH, L:RB->glref		// Setup pointer to dispatch table.
    654   |  add DISPATCH, GG_G2DISP
    655   |  mov PC, [BASE-4]			// Fetch PC of previous frame.
    656   |  mov dword [BASE-4], LJ_TFALSE	// Prepend false to error message.
    657   |  set_vmstate INTERP
    658   |  jmp ->vm_returnc			// Increments RD/MULTRES and returns.
    659   |
    660   |.if WIN and not X64
    661   |->vm_rtlunwind@16:			// Thin layer around RtlUnwind.
    662   |  // (void *cframe, void *excptrec, void *unwinder, int errcode)
    663   |  mov [esp], FCARG1			// Return value for RtlUnwind.
    664   |  push FCARG2			// Exception record for RtlUnwind.
    665   |  push 0				// Ignored by RtlUnwind.
    666   |  push dword [FCARG1+CFRAME_OFS_SEH]
    667   |  call extern RtlUnwind@16		// Violates ABI (clobbers too much).
    668   |  mov FCARG1, eax
    669   |  mov FCARG2, [esp+4]		// errcode (for vm_unwind_c).
    670   |  ret				// Jump to unwinder.
    671   |.endif
    672   |
    673   |//-----------------------------------------------------------------------
    674   |//-- Grow stack for calls -----------------------------------------------
    675   |//-----------------------------------------------------------------------
    676   |
    677   |->vm_growstack_c:			// Grow stack for C function.
    678   |  mov FCARG2, LUA_MINSTACK
    679   |  jmp >2
    680   |
    681   |->vm_growstack_v:			// Grow stack for vararg Lua function.
    682   |  sub RD, 8
    683   |  jmp >1
    684   |
    685   |->vm_growstack_f:			// Grow stack for fixarg Lua function.
    686   |  // BASE = new base, RD = nargs+1, RB = L, PC = first PC
    687   |  lea RD, [BASE+NARGS:RD*8-8]
    688   |1:
    689   |  movzx RA, byte [PC-4+PC2PROTO(framesize)]
    690   |  add PC, 4				// Must point after first instruction.
    691   |  mov L:RB->base, BASE
    692   |  mov L:RB->top, RD
    693   |  mov SAVE_PC, PC
    694   |  mov FCARG2, RA
    695   |2:
    696   |  // RB = L, L->base = new base, L->top = top
    697   |  mov FCARG1, L:RB
    698   |  call extern lj_state_growstack@8	// (lua_State *L, int n)
    699   |  mov BASE, L:RB->base
    700   |  mov RD, L:RB->top
    701   |  mov LFUNC:RB, [BASE-8]
    702   |  sub RD, BASE
    703   |  shr RD, 3
    704   |  add NARGS:RD, 1
    705   |  // BASE = new base, RB = LFUNC, RD = nargs+1
    706   |  ins_callt				// Just retry the call.
    707   |
    708   |//-----------------------------------------------------------------------
    709   |//-- Entry points into the assembler VM ---------------------------------
    710   |//-----------------------------------------------------------------------
    711   |
    712   |->vm_resume:				// Setup C frame and resume thread.
    713   |  // (lua_State *L, TValue *base, int nres1 = 0, ptrdiff_t ef = 0)
    714   |  saveregs
    715   |.if X64
    716   |  mov L:RB, CARG1d			// Caveat: CARG1d may be RA.
    717   |  mov SAVE_L, CARG1d
    718   |  mov RA, CARG2d
    719   |.else
    720   |  mov L:RB, SAVE_L
    721   |  mov RA, INARG_BASE			// Caveat: overlaps SAVE_CFRAME!
    722   |.endif
    723   |  mov PC, FRAME_CP
    724   |  xor RD, RD
    725   |  lea KBASEa, [esp+CFRAME_RESUME]
    726   |  mov DISPATCH, L:RB->glref		// Setup pointer to dispatch table.
    727   |  add DISPATCH, GG_G2DISP
    728   |  mov SAVE_PC, RD			// Any value outside of bytecode is ok.
    729   |  mov SAVE_CFRAME, RDa
    730   |.if X64
    731   |  mov SAVE_NRES, RD
    732   |  mov SAVE_ERRF, RD
    733   |.endif
    734   |  mov L:RB->cframe, KBASEa
    735   |  cmp byte L:RB->status, RDL
    736   |  je >2				// Initial resume (like a call).
    737   |
    738   |  // Resume after yield (like a return).
    739   |  mov [DISPATCH+DISPATCH_GL(cur_L)], L:RB
    740   |  set_vmstate INTERP
    741   |  mov byte L:RB->status, RDL
    742   |  mov BASE, L:RB->base
    743   |  mov RD, L:RB->top
    744   |  sub RD, RA
    745   |  shr RD, 3
    746   |  add RD, 1				// RD = nresults+1
    747   |  sub RA, BASE			// RA = resultofs
    748   |  mov PC, [BASE-4]
    749   |  mov MULTRES, RD
    750   |  test PC, FRAME_TYPE
    751   |  jz ->BC_RET_Z
    752   |  jmp ->vm_return
    753   |
    754   |->vm_pcall:				// Setup protected C frame and enter VM.
    755   |  // (lua_State *L, TValue *base, int nres1, ptrdiff_t ef)
    756   |  saveregs
    757   |  mov PC, FRAME_CP
    758   |.if X64
    759   |  mov SAVE_ERRF, CARG4d
    760   |.endif
    761   |  jmp >1
    762   |
    763   |->vm_call:				// Setup C frame and enter VM.
    764   |  // (lua_State *L, TValue *base, int nres1)
    765   |  saveregs
    766   |  mov PC, FRAME_C
    767   |
    768   |1:  // Entry point for vm_pcall above (PC = ftype).
    769   |.if X64
    770   |  mov SAVE_NRES, CARG3d
    771   |  mov L:RB, CARG1d			// Caveat: CARG1d may be RA.
    772   |  mov SAVE_L, CARG1d
    773   |  mov RA, CARG2d
    774   |.else
    775   |  mov L:RB, SAVE_L
    776   |  mov RA, INARG_BASE			// Caveat: overlaps SAVE_CFRAME!
    777   |.endif
    778   |
    779   |  mov DISPATCH, L:RB->glref		// Setup pointer to dispatch table.
    780   |  mov KBASEa, L:RB->cframe		// Add our C frame to cframe chain.
    781   |  mov SAVE_CFRAME, KBASEa
    782   |  mov SAVE_PC, L:RB			// Any value outside of bytecode is ok.
    783   |  add DISPATCH, GG_G2DISP
    784   |.if X64
    785   |  mov L:RB->cframe, rsp
    786   |.else
    787   |  mov L:RB->cframe, esp
    788   |.endif
    789   |
    790   |2:  // Entry point for vm_resume/vm_cpcall (RA = base, RB = L, PC = ftype).
    791   |  mov [DISPATCH+DISPATCH_GL(cur_L)], L:RB
    792   |  set_vmstate INTERP
    793   |  mov BASE, L:RB->base		// BASE = old base (used in vmeta_call).
    794   |  add PC, RA
    795   |  sub PC, BASE			// PC = frame delta + frame type
    796   |
    797   |  mov RD, L:RB->top
    798   |  sub RD, RA
    799   |  shr NARGS:RD, 3
    800   |  add NARGS:RD, 1			// RD = nargs+1
    801   |
    802   |->vm_call_dispatch:
    803   |  mov LFUNC:RB, [RA-8]
    804   |  cmp dword [RA-4], LJ_TFUNC
    805   |  jne ->vmeta_call			// Ensure KBASE defined and != BASE.
    806   |
    807   |->vm_call_dispatch_f:
    808   |  mov BASE, RA
    809   |  ins_call
    810   |  // BASE = new base, RB = func, RD = nargs+1, PC = caller PC
    811   |
    812   |->vm_cpcall:				// Setup protected C frame, call C.
    813   |  // (lua_State *L, lua_CFunction func, void *ud, lua_CPFunction cp)
    814   |  saveregs
    815   |.if X64
    816   |  mov L:RB, CARG1d			// Caveat: CARG1d may be RA.
    817   |  mov SAVE_L, CARG1d
    818   |.else
    819   |  mov L:RB, SAVE_L
    820   |  // Caveat: INARG_CP_* and SAVE_CFRAME/SAVE_NRES/SAVE_ERRF overlap!
    821   |  mov RC, INARG_CP_UD		// Get args before they are overwritten.
    822   |  mov RA, INARG_CP_FUNC
    823   |  mov BASE, INARG_CP_CALL		// BASE temporarily holds cp (called below).
    824   |.endif
    825   |  mov SAVE_PC, L:RB			// Any value outside of bytecode is ok.
    826   |
    827   |  mov KBASE, L:RB->stack		// Compute -savestack(L, L->top).
    828   |  sub KBASE, L:RB->top
    829   |   mov DISPATCH, L:RB->glref		// Setup pointer to dispatch table.
    830   |  mov SAVE_ERRF, 0			// No error function.
    831   |  mov SAVE_NRES, KBASE		// Neg. delta means cframe w/o frame.
    832   |   add DISPATCH, GG_G2DISP
    833   |  // Handler may change cframe_nres(L->cframe) or cframe_errfunc(L->cframe).
    834   |
    835   |.if X64
    836   |  mov KBASEa, L:RB->cframe		// Add our C frame to cframe chain.
    837   |  mov SAVE_CFRAME, KBASEa
    838   |  mov L:RB->cframe, rsp
    839   |  mov [DISPATCH+DISPATCH_GL(cur_L)], L:RB
    840   |
    841   |  call CARG4			// cp(lua_State *L, lua_CFunction func, void *ud)
    842   |.else
    843   |  mov ARG3, RC			// Have to copy args downwards.
    844   |  mov ARG2, RA
    845   |  mov ARG1, L:RB
    846   |
    847   |  mov KBASE, L:RB->cframe		// Add our C frame to cframe chain.
    848   |  mov SAVE_CFRAME, KBASE
    849   |  mov L:RB->cframe, esp
    850   |  mov [DISPATCH+DISPATCH_GL(cur_L)], L:RB
    851   |
    852   |  call BASE			// cp(lua_State *L, lua_CFunction func, void *ud)
    853   |.endif
    854   |  // TValue * (new base) or NULL returned in eax (RC).
    855   |  test RC, RC
    856   |  jz ->vm_leave_cp			// No base? Just remove C frame.
    857   |  mov RA, RC				// RA = new base returned by cp.
    858   |  mov PC, FRAME_CP
    859   |  jmp <2				// Else continue with the call (label 2 is defined earlier).
    860   |
    861   |//-----------------------------------------------------------------------
    862   |//-- Metamethod handling ------------------------------------------------
    863   |//-----------------------------------------------------------------------
    864   |
    865   |//-- Continuation dispatch ----------------------------------------------
    866   |
    867   |->cont_dispatch:			// Restore caller state, then jump to the stored continuation.
    868   |  // BASE = meta base, RA = resultofs, RD = nresults+1 (also in MULTRES)
    869   |  add RA, BASE			// RA = address of the results.
    870   |  and PC, -8			// Clear the low 3 bits, leaving the frame delta.
    871   |  mov RB, BASE			// RB = meta base.
    872   |  sub BASE, PC			// Restore caller BASE.
    873   |  mov dword [RA+RD*8-4], LJ_TNIL	// Ensure one valid arg.
    874   |  mov RC, RA				// ... in [RC]
    875   |  mov PC, [RB-12]			// Restore PC from [cont|PC].
    876   |.if X64
    877   |  movsxd RAa, dword [RB-16]		// May be negative on WIN64 with debug.
    878   |.if FFI
    879   |  cmp RA, 1			// cont = 0 or 1 is handled at label 1 below.
    880   |  jbe >1
    881   |.endif
    882   |  lea KBASEa, qword [=>0]		// cont is a 32 bit offset relative to =>0.
    883   |  add RAa, KBASEa
    884   |.else
    885   |  mov RA, dword [RB-16]		// cont is used as loaded (no rebasing).
    886   |.if FFI
    887   |  cmp RA, 1			// cont = 0 or 1 is handled at label 1 below.
    888   |  jbe >1
    889   |.endif
    890   |.endif
    891   |  mov LFUNC:KBASE, [BASE-8]		// Reload KBASE via the function at [BASE-8].
    892   |  mov KBASE, LFUNC:KBASE->pc
    893   |  mov KBASE, [KBASE+PC2PROTO(k)]
    894   |  // BASE = base, RC = result, RB = meta base
    895   |  jmp RAa				// Jump to continuation.
    896   |
    897   |.if FFI
    898   |1:
    899   |  je ->cont_ffi_callback		// cont = 1: return from FFI callback.
    900   |  // cont = 0: Tail call from C function.
    901   |  sub RB, BASE
    902   |  shr RB, 3
    903   |  lea RD, [RB-1]			// RD = (meta base - base)/8 - 1.
    904   |  jmp ->vm_call_tail
    905   |.endif
    906   |
    907   |->cont_cat:				// BASE = base, RC = result, RB = mbase
    908   |  movzx RA, PC_RB
    909   |  sub RB, 16
    910   |  lea RA, [BASE+RA*8]
    911   |  sub RA, RB			// Zero if RB (after -16) is the slot BASE[PC_RB].
    912   |  je ->cont_ra			// ... then just store the result.
    913   |  neg RA
    914   |  shr RA, 3			// RA = slot count from BASE[PC_RB] up to RB.
    915   |.if X64WIN
    916   |  mov CARG3d, RA			// Slot count is the third argument.
    917   |  mov L:CARG1d, SAVE_L
    918   |  mov L:CARG1d->base, BASE
    919   |  mov RCa, [RC]			// Copy the result TValue to [RB].
    920   |  mov [RB], RCa
    921   |  mov CARG2d, RB
    922   |.elif X64
    923   |  mov L:CARG1d, SAVE_L
    924   |  mov L:CARG1d->base, BASE
    925   |  mov CARG3d, RA			// Slot count is the third argument.
    926   |  mov RAa, [RC]			// Copy the result TValue to [RB].
    927   |  mov [RB], RAa
    928   |  mov CARG2d, RB
    929   |.else
    930   |  mov ARG3, RA			// Slot count is the third argument.
    931   |  mov RA, [RC+4]			// Copy the result TValue to [RB] in two halves.
    932   |  mov RC, [RC]
    933   |  mov [RB+4], RA
    934   |  mov [RB], RC
    935   |  mov ARG2, RB
    936   |.endif
    937   |  jmp ->BC_CAT_Z			// Continue there with the args set up above.
    938   |
    939   |//-- Table indexing metamethods -----------------------------------------
    940   |
    941   |->vmeta_tgets:			// Metamethod path for table get with a string key.
    942   |  mov TMP1, RC			// RC = GCstr *
    943   |  mov TMP2, LJ_TSTR
    944   |  lea RCa, TMP1			// Store temp. TValue in TMP1/TMP2.
    945   |  cmp PC_OP, BC_GGET
    946   |  jne >1				// Not GGET: table operand is reloaded from PC_RB.
    947   |  lea RA, [DISPATCH+DISPATCH_GL(tmptv)]  // Store fn->l.env in g->tmptv.
    948   |  mov [RA], TAB:RB			// RB = GCtab *
    949   |  mov dword [RA+4], LJ_TTAB
    950   |  mov RB, RA			// RB = &g->tmptv, passed as TValue *o.
    951   |  jmp >2
    952   |
    953   |->vmeta_tgetb:			// Metamethod path for table get; key is the literal in PC_RC.
    954   |  movzx RC, PC_RC
    955   |.if DUALNUM
    956   |  mov TMP2, LJ_TISNUM
    957   |  mov TMP1, RC
    958   |.else
    959   |  cvtsi2sd xmm0, RC
    960   |  movsd TMPQ, xmm0
    961   |.endif
    962   |  lea RCa, TMPQ			// Store temp. TValue in TMPQ.
    963   |  jmp >1
    964   |
    965   |->vmeta_tgetv:			// Metamethod path for table get; key is a stack slot.
    966   |  movzx RC, PC_RC			// Reload TValue *k from RC.
    967   |  lea RC, [BASE+RC*8]
    968   |1:
    969   |  movzx RB, PC_RB			// Reload TValue *t from RB.
    970   |  lea RB, [BASE+RB*8]
    971   |2:  // RB = TValue *o, RC = TValue *k
    972   |.if X64
    973   |  mov L:CARG1d, SAVE_L
    974   |  mov L:CARG1d->base, BASE		// Caveat: CARG2d/CARG3d may be BASE.
    975   |  mov CARG2d, RB
    976   |  mov CARG3, RCa			// May be 64 bit ptr to stack.
    977   |  mov L:RB, L:CARG1d
    978   |.else
    979   |  mov ARG2, RB
    980   |  mov L:RB, SAVE_L
    981   |  mov ARG3, RC
    982   |  mov ARG1, L:RB
    983   |  mov L:RB->base, BASE
    984   |.endif
    985   |  mov SAVE_PC, PC
    986   |  call extern lj_meta_tget		// (lua_State *L, TValue *o, TValue *k)
    987   |  // TValue * (finished) or NULL (metamethod) returned in eax (RC).
    988   |  mov BASE, L:RB->base
    989   |  test RC, RC
    990   |  jz >3
    991   |->cont_ra:				// BASE = base, RC = result
    992   |  movzx RA, PC_RA
    993   |.if X64
    994   |  mov RBa, [RC]			// Copy the result TValue to BASE[PC_RA].
    995   |  mov [BASE+RA*8], RBa
    996   |.else
    997   |  mov RB, [RC+4]			// Copy the result TValue to BASE[PC_RA] in two halves.
    998   |  mov RC, [RC]
    999   |  mov [BASE+RA*8+4], RB
   1000   |  mov [BASE+RA*8], RC
   1001   |.endif
   1002   |  ins_next
   1003   |
   1004   |3:  // Call __index metamethod.
   1005   |  // BASE = base, L->top = new base, stack = cont/func/t/k/origt
   1006   |  mov RA, L:RB->top
   1007   |  mov [RA-12], PC			// [cont|PC]
   1008   |  lea PC, [RA+FRAME_CONT]
   1009   |  sub PC, BASE
   1010   |  mov LFUNC:RB, [RA-8]		// Guaranteed to be a function here.
   1011   |  mov NARGS:RD, 3+1			// 3 args for func(t, k, origt), per the stack layout above.
   1012   |  jmp ->vm_call_dispatch_f
   1013   |
   1014   |->vmeta_tgetr:			// Integer key lookup via lj_tab_getinth: RB = GCtab *, RC = key.
   1015   |  mov FCARG1, TAB:RB
   1016   |  mov RB, BASE			// Save BASE.
   1017   |  mov FCARG2, RC			// Caveat: FCARG2 == BASE
   1018   |  call extern lj_tab_getinth@8	// (GCtab *t, int32_t key)
   1019   |  // cTValue * or NULL returned in eax (RC).
   1020   |  movzx RA, PC_RA
   1021   |  mov BASE, RB			// Restore BASE.
   1022   |  test RC, RC
   1023   |  jnz ->BC_TGETR_Z			// Found: continue there with RC = slot.
   1024   |  mov dword [BASE+RA*8+4], LJ_TNIL	// NULL: store nil tag into BASE[PC_RA].
   1025   |  jmp ->BC_TGETR2_Z
   1026   |
   1027   |//-----------------------------------------------------------------------
   1028   |
   1029   |->vmeta_tsets:			// Metamethod path for table set with a string key.
   1030   |  mov TMP1, RC			// RC = GCstr *
   1031   |  mov TMP2, LJ_TSTR
   1032   |  lea RCa, TMP1			// Store temp. TValue in TMP1/TMP2.
   1033   |  cmp PC_OP, BC_GSET
   1034   |  jne >1				// Not GSET: table operand is reloaded from PC_RB.
   1035   |  lea RA, [DISPATCH+DISPATCH_GL(tmptv)]  // Store fn->l.env in g->tmptv.
   1036   |  mov [RA], TAB:RB			// RB = GCtab *
   1037   |  mov dword [RA+4], LJ_TTAB
   1038   |  mov RB, RA			// RB = &g->tmptv, passed as TValue *o.
   1039   |  jmp >2
   1040   |
   1041   |->vmeta_tsetb:			// Metamethod path for table set; key is the literal in PC_RC.
   1042   |  movzx RC, PC_RC
   1043   |.if DUALNUM
   1044   |  mov TMP2, LJ_TISNUM
   1045   |  mov TMP1, RC
   1046   |.else
   1047   |  cvtsi2sd xmm0, RC
   1048   |  movsd TMPQ, xmm0
   1049   |.endif
   1050   |  lea RCa, TMPQ			// Store temp. TValue in TMPQ.
   1051   |  jmp >1
   1052   |
   1053   |->vmeta_tsetv:			// Metamethod path for table set; key is a stack slot.
   1054   |  movzx RC, PC_RC			// Reload TValue *k from RC.
   1055   |  lea RC, [BASE+RC*8]
   1056   |1:
   1057   |  movzx RB, PC_RB			// Reload TValue *t from RB.
   1058   |  lea RB, [BASE+RB*8]
   1059   |2:  // RB = TValue *o, RC = TValue *k
   1060   |.if X64
   1061   |  mov L:CARG1d, SAVE_L
   1062   |  mov L:CARG1d->base, BASE		// Caveat: CARG2d/CARG3d may be BASE.
   1063   |  mov CARG2d, RB
   1064   |  mov CARG3, RCa			// May be 64 bit ptr to stack.
   1065   |  mov L:RB, L:CARG1d
   1066   |.else
   1067   |  mov ARG2, RB
   1068   |  mov L:RB, SAVE_L
   1069   |  mov ARG3, RC
   1070   |  mov ARG1, L:RB
   1071   |  mov L:RB->base, BASE
   1072   |.endif
   1073   |  mov SAVE_PC, PC
   1074   |  call extern lj_meta_tset		// (lua_State *L, TValue *o, TValue *k)
   1075   |  // TValue * (finished) or NULL (metamethod) returned in eax (RC).
   1076   |  mov BASE, L:RB->base
   1077   |  test RC, RC
   1078   |  jz >3
   1079   |  // NOBARRIER: lj_meta_tset ensures the table is not black.
   1080   |  movzx RA, PC_RA
   1081   |.if X64
   1082   |  mov RBa, [BASE+RA*8]		// Copy the value BASE[PC_RA] into the returned slot.
   1083   |  mov [RC], RBa
   1084   |.else
   1085   |  mov RB, [BASE+RA*8+4]		// Copy the value BASE[PC_RA] into the returned slot.
   1086   |  mov RA, [BASE+RA*8]
   1087   |  mov [RC+4], RB
   1088   |  mov [RC], RA
   1089   |.endif
   1090   |->cont_nop:				// BASE = base, (RC = result)
   1091   |  ins_next
   1092   |
   1093   |3:  // Call __newindex metamethod.
   1094   |  // BASE = base, L->top = new base, stack = cont/func/t/k/(v)
   1095   |  mov RA, L:RB->top
   1096   |  mov [RA-12], PC			// [cont|PC]
   1097   |  movzx RC, PC_RA
   1098   |  // Copy value to third argument.
   1099   |.if X64
   1100   |  mov RBa, [BASE+RC*8]
   1101   |  mov [RA+16], RBa
   1102   |.else
   1103   |  mov RB, [BASE+RC*8+4]
   1104   |  mov RC, [BASE+RC*8]
   1105   |  mov [RA+20], RB
   1106   |  mov [RA+16], RC
   1107   |.endif
   1108   |  lea PC, [RA+FRAME_CONT]
   1109   |  sub PC, BASE
   1110   |  mov LFUNC:RB, [RA-8]		// Guaranteed to be a function here.
   1111   |  mov NARGS:RD, 3+1			// 3 args for func(t, k, v).
   1112   |  jmp ->vm_call_dispatch_f
   1113   |
   1114   |->vmeta_tsetr:			// Integer key store via lj_tab_setinth: RB = GCtab *, RC = key.
   1115   |.if X64WIN
   1116   |  mov L:CARG1d, SAVE_L
   1117   |  mov CARG3d, RC
   1118   |  mov L:CARG1d->base, BASE
   1119   |  xchg CARG2d, TAB:RB		// Caveat: CARG2d == BASE. RB now holds the saved BASE.
   1120   |.elif X64
   1121   |  mov L:CARG1d, SAVE_L
   1122   |  mov CARG2d, TAB:RB
   1123   |  mov L:CARG1d->base, BASE
   1124   |  mov RB, BASE			// Save BASE.
   1125   |  mov CARG3d, RC			// Caveat: CARG3d == BASE.
   1126   |.else
   1127   |  mov L:RA, SAVE_L
   1128   |  mov ARG2, TAB:RB
   1129   |  mov RB, BASE			// Save BASE.
   1130   |  mov ARG3, RC
   1131   |  mov ARG1, L:RA
   1132   |  mov L:RA->base, BASE
   1133   |.endif
   1134   |  mov SAVE_PC, PC
   1135   |  call extern lj_tab_setinth  // (lua_State *L, GCtab *t, int32_t key)
   1136   |  // TValue * returned in eax (RC).
   1137   |  movzx RA, PC_RA
   1138   |  mov BASE, RB			// Restore BASE.
   1139   |  jmp ->BC_TSETR_Z			// Continue there with RC = slot, RA = PC_RA.
   1140   |
   1141   |//-- Comparison metamethods ---------------------------------------------
   1142   |
   1143   |->vmeta_comp:			// Comparison via lj_meta_comp: operands are BASE[RA] and BASE[RD].
   1144   |.if X64
   1145   |  mov L:RB, SAVE_L
   1146   |  mov L:RB->base, BASE		// Caveat: CARG2d/CARG3d == BASE.
   1147   |.if X64WIN
   1148   |  lea CARG3d, [BASE+RD*8]
   1149   |  lea CARG2d, [BASE+RA*8]
   1150   |.else
   1151   |  lea CARG2d, [BASE+RA*8]
   1152   |  lea CARG3d, [BASE+RD*8]
   1153   |.endif
   1154   |  mov CARG1d, L:RB			// Caveat: CARG1d/CARG4d == RA.
   1155   |  movzx CARG4d, PC_OP
   1156   |.else
   1157   |  movzx RB, PC_OP
   1158   |  lea RD, [BASE+RD*8]
   1159   |  lea RA, [BASE+RA*8]
   1160   |  mov ARG4, RB
   1161   |  mov L:RB, SAVE_L
   1162   |  mov ARG3, RD
   1163   |  mov ARG2, RA
   1164   |  mov ARG1, L:RB
   1165   |  mov L:RB->base, BASE
   1166   |.endif
   1167   |  mov SAVE_PC, PC
   1168   |  call extern lj_meta_comp	// (lua_State *L, TValue *o1, *o2, int op)
   1169   |  // 0/1 or TValue * (metamethod) returned in eax (RC).
   1170   |3:
   1171   |  mov BASE, L:RB->base
   1172   |  cmp RC, 1
   1173   |  ja ->vmeta_binop			// RC > 1: it is a TValue *, call the metamethod.
   1174   |4:
   1175   |  lea PC, [PC+4]			// Advance PC; lea leaves the flags untouched.
   1176   |  jb >6				// Below (RC = 0 from the cmp above): skip the branch.
   1177   |5:
   1178   |  movzx RD, PC_RD			// Otherwise take the branch.
   1179   |  branchPC RD
   1180   |6:
   1181   |  ins_next
   1182   |
   1183   |->cont_condt:			// BASE = base, RC = result
   1184   |  add PC, 4				// Advance PC before the compare sets the flags.
   1185   |  cmp dword [RC+4], LJ_TISTRUECOND	// Branch if result is true.
   1186   |  jb <5				// True: take the branch at label 5.
   1187   |  jmp <6				// False: just dispatch the next instruction.
   1188   |
   1189   |->cont_condf:			// BASE = base, RC = result
   1190   |  cmp dword [RC+4], LJ_TISTRUECOND	// Branch if result is false.
   1191   |  jmp <4				// Label 4 advances PC and tests these flags with jb.
   1192   |
   1193   |->vmeta_equal:			// Equality via lj_meta_equal: RA = o1, RD = o2, RB = ne flag.
   1194   |  sub PC, 4				// Label 4 adds 4 to PC again after the call.
   1195   |.if X64WIN
   1196   |  mov CARG3d, RD
   1197   |  mov CARG4d, RB
   1198   |  mov L:RB, SAVE_L
   1199   |  mov L:RB->base, BASE		// Caveat: CARG2d == BASE.
   1200   |  mov CARG2d, RA
   1201   |  mov CARG1d, L:RB			// Caveat: CARG1d == RA.
   1202   |.elif X64
   1203   |  mov CARG2d, RA
   1204   |  mov CARG4d, RB			// Caveat: CARG4d == RA.
   1205   |  mov L:RB, SAVE_L
   1206   |  mov L:RB->base, BASE		// Caveat: CARG3d == BASE.
   1207   |  mov CARG3d, RD
   1208   |  mov CARG1d, L:RB
   1209   |.else
   1210   |  mov ARG4, RB
   1211   |  mov L:RB, SAVE_L
   1212   |  mov ARG3, RD
   1213   |  mov ARG2, RA
   1214   |  mov ARG1, L:RB
   1215   |  mov L:RB->base, BASE
   1216   |.endif
   1217   |  mov SAVE_PC, PC
   1218   |  call extern lj_meta_equal	// (lua_State *L, GCobj *o1, *o2, int ne)
   1219   |  // 0/1 or TValue * (metamethod) returned in eax (RC).
   1220   |  jmp <3				// Shared result handling at label 3.
   1221   |
   1222   |->vmeta_equal_cd:			// FFI only: equality via lj_meta_equal_cd.
   1223   |.if FFI
   1224   |  sub PC, 4				// Label 4 adds 4 to PC again after the call.
   1225   |  mov L:RB, SAVE_L
   1226   |  mov L:RB->base, BASE
   1227   |  mov FCARG1, L:RB
   1228   |  mov FCARG2, dword [PC-4]		// The instruction word is passed as BCIns ins.
   1229   |  mov SAVE_PC, PC
   1230   |  call extern lj_meta_equal_cd@8	// (lua_State *L, BCIns ins)
   1231   |  // 0/1 or TValue * (metamethod) returned in eax (RC).
   1232   |  jmp <3				// Shared result handling at label 3.
   1233   |.endif
   1234   |
   1235   |->vmeta_istype:			// Type check via lj_meta_istype: RA = ra, PC_RD = tp.
   1236   |.if X64
   1237   |  mov L:RB, SAVE_L
   1238   |  mov L:RB->base, BASE		// Caveat: CARG2d/CARG3d may be BASE.
   1239   |  mov CARG2d, RA
   1240   |  movzx CARG3d, PC_RD
   1241   |  mov L:CARG1d, L:RB
   1242   |.else
   1243   |  movzx RD, PC_RD
   1244   |  mov ARG2, RA
   1245   |  mov L:RB, SAVE_L
   1246   |  mov ARG3, RD
   1247   |  mov ARG1, L:RB
   1248   |  mov L:RB->base, BASE
   1249   |.endif
   1250   |  mov SAVE_PC, PC
   1251   |  call extern lj_meta_istype  // (lua_State *L, BCReg ra, BCReg tp)
   1252   |  mov BASE, L:RB->base
   1253   |  jmp <6				// Return value is not inspected: dispatch next instruction.
   1254   |
   1255   |//-- Arithmetic metamethods ---------------------------------------------
   1256   |
   1257   |->vmeta_arith_vno:			// As vmeta_arith_vn, but reloads RB from PC_RB if DUALNUM.
   1258   |.if DUALNUM
   1259   |  movzx RB, PC_RB
   1260   |.endif
   1261   |->vmeta_arith_vn:			// rb = BASE[RB], rc = KBASE[RC]
   1262   |  lea RC, [KBASE+RC*8]
   1263   |  jmp >1
   1264   |
   1265   |->vmeta_arith_nvo:			// As vmeta_arith_nv, but reloads RC from PC_RC if DUALNUM.
   1266   |.if DUALNUM
   1267   |  movzx RC, PC_RC
   1268   |.endif
   1269   |->vmeta_arith_nv:			// rb = KBASE[RC], rc = BASE[RB] (swapped by the xchg)
   1270   |  lea RC, [KBASE+RC*8]
   1271   |  lea RB, [BASE+RB*8]
   1272   |  xchg RB, RC
   1273   |  jmp >2
   1274   |
   1275   |->vmeta_unm:			// Both operand pointers are BASE[RD].
   1276   |  lea RC, [BASE+RD*8]
   1277   |  mov RB, RC
   1278   |  jmp >2
   1279   |
   1280   |->vmeta_arith_vvo:			// As vmeta_arith_vv, but reloads RB from PC_RB if DUALNUM.
   1281   |.if DUALNUM
   1282   |  movzx RB, PC_RB
   1283   |.endif
   1284   |->vmeta_arith_vv:			// rb = BASE[RB], rc = BASE[RC]
   1285   |  lea RC, [BASE+RC*8]
   1286   |1:
   1287   |  lea RB, [BASE+RB*8]
   1288   |2:  // RB = TValue *rb, RC = TValue *rc
   1289   |  lea RA, [BASE+RA*8]		// RA = TValue *ra
   1290   |.if X64WIN
   1291   |  mov CARG3d, RB
   1292   |  mov CARG4d, RC
   1293   |  movzx RC, PC_OP
   1294   |  mov ARG5d, RC
   1295   |  mov L:RB, SAVE_L
   1296   |  mov L:RB->base, BASE		// Caveat: CARG2d == BASE.
   1297   |  mov CARG2d, RA
   1298   |  mov CARG1d, L:RB			// Caveat: CARG1d == RA.
   1299   |.elif X64
   1300   |  movzx CARG5d, PC_OP
   1301   |  mov CARG2d, RA
   1302   |  mov CARG4d, RC			// Caveat: CARG4d == RA.
   1303   |  mov L:CARG1d, SAVE_L
   1304   |  mov L:CARG1d->base, BASE		// Caveat: CARG3d == BASE.
   1305   |  mov CARG3d, RB
   1306   |  mov L:RB, L:CARG1d
   1307   |.else
   1308   |  mov ARG3, RB
   1309   |  mov L:RB, SAVE_L
   1310   |  mov ARG4, RC
   1311   |  movzx RC, PC_OP
   1312   |  mov ARG2, RA
   1313   |  mov ARG5, RC
   1314   |  mov ARG1, L:RB
   1315   |  mov L:RB->base, BASE
   1316   |.endif
   1317   |  mov SAVE_PC, PC
   1318   |  call extern lj_meta_arith	// (lua_State *L, TValue *ra,*rb,*rc, BCReg op)
   1319   |  // NULL (finished) or TValue * (metamethod) returned in eax (RC).
   1320   |  mov BASE, L:RB->base
   1321   |  test RC, RC
   1322   |  jz ->cont_nop			// Finished: dispatch next instruction. Else fall through.
   1323   |
   1324   |  // Call metamethod for binary op.
   1325   |->vmeta_binop:
   1326   |  // BASE = base, RC = new base, stack = cont/func/o1/o2
   1327   |  mov RA, RC				// RA = new base.
   1328   |  sub RC, BASE			// RC = new base - base.
   1329   |  mov [RA-12], PC			// [cont|PC]
   1330   |  lea PC, [RC+FRAME_CONT]		// PC = delta + FRAME_CONT.
   1331   |  mov NARGS:RD, 2+1			// 2 args for func(o1, o2).
   1332   |  jmp ->vm_call_dispatch
   1333   |
   1334   |->vmeta_len:			// Length via lj_meta_len: operand is BASE[RD].
   1335   |  mov L:RB, SAVE_L
   1336   |  mov L:RB->base, BASE
   1337   |  lea FCARG2, [BASE+RD*8]		// Caveat: FCARG2 == BASE
   1338   |  mov L:FCARG1, L:RB
   1339   |  mov SAVE_PC, PC
   1340   |  call extern lj_meta_len@8		// (lua_State *L, TValue *o)
   1341   |  // NULL (retry) or TValue * (metamethod) returned in eax (RC).
   1342   |  mov BASE, L:RB->base
   1343   |  test RC, RC
   1344   |  jne ->vmeta_binop			// Binop call for compatibility.
   1345   |  movzx RD, PC_RD			// NULL: reload the operand and retry at BC_LEN_Z.
   1346   |  mov TAB:FCARG1, [BASE+RD*8]
   1347   |  jmp ->BC_LEN_Z
   1348   |
   1349   |//-- Call metamethod ----------------------------------------------------
   1350   |
   1351   |->vmeta_call_ra:			// Entry that first computes RA = &BASE[RA] + 8.
   1352   |  lea RA, [BASE+RA*8+8]
   1353   |->vmeta_call:			// Resolve and call __call metamethod.
   1354   |  // BASE = old base, RA = new base, RC = nargs+1, PC = return
   1355   |  mov TMP2, RA			// Save RA, RC for us.
   1356   |  mov TMP1, NARGS:RD
   1357   |  sub RA, 8				// RA = TValue *func (slot below the new base).
   1358   |.if X64
   1359   |  mov L:RB, SAVE_L
   1360   |  mov L:RB->base, BASE		// Caveat: CARG2d/CARG3d may be BASE.
   1361   |  mov CARG2d, RA
   1362   |  lea CARG3d, [RA+NARGS:RD*8]	// TValue *top = func + (nargs+1) slots.
   1363   |  mov CARG1d, L:RB			// Caveat: CARG1d may be RA.
   1364   |.else
   1365   |  lea RC, [RA+NARGS:RD*8]		// TValue *top = func + (nargs+1) slots.
   1366   |  mov L:RB, SAVE_L
   1367   |  mov ARG2, RA
   1368   |  mov ARG3, RC
   1369   |  mov ARG1, L:RB
   1370   |  mov L:RB->base, BASE		// This is the callers base!
   1371   |.endif
   1372   |  mov SAVE_PC, PC
   1373   |  call extern lj_meta_call	// (lua_State *L, TValue *func, TValue *top)
   1374   |  mov BASE, L:RB->base
   1375   |  mov RA, TMP2			// Restore new base and nargs+1.
   1376   |  mov NARGS:RD, TMP1
   1377   |  mov LFUNC:RB, [RA-8]		// Load the function now in the func slot.
   1378   |  add NARGS:RD, 1			// One more arg (presumably the original callee; confirm in lj_meta_call).
   1379   |  // This is fragile. L->base must not move, KBASE must always be defined.
   1380   |  cmp KBASE, BASE			// Continue with CALLT if flag set.
   1381   |  je ->BC_CALLT_Z
   1382   |  mov BASE, RA
   1383   |  ins_call				// Otherwise call resolved metamethod.
   1384   |
   1385   |//-- Argument coercion for 'for' statement ------------------------------
   1386   |
   1387   |->vmeta_for:			// Coerce 'for' args via lj_meta_for, then retry the instruction.
   1388   |  mov L:RB, SAVE_L
   1389   |  mov L:RB->base, BASE
   1390   |  mov FCARG2, RA			// Caveat: FCARG2 == BASE
   1391   |  mov L:FCARG1, L:RB			// Caveat: FCARG1 == RA
   1392   |  mov SAVE_PC, PC
   1393   |  call extern lj_meta_for@8	// (lua_State *L, TValue *base)
   1394   |  mov BASE, L:RB->base
   1395   |  mov RC, [PC-4]			// Re-decode the instruction word at PC-4.
   1396   |  movzx RA, RCH			// RA = second byte.
   1397   |  movzx OP, RCL			// OP = low byte.
   1398   |  shr RC, 16			// RC = upper 16 bits.
   1399   |.if X64
   1400   |  jmp aword [DISPATCH+OP*8+GG_DISP2STATIC]	// Retry FORI or JFORI.
   1401   |.else
   1402   |  jmp aword [DISPATCH+OP*4+GG_DISP2STATIC]	// Retry FORI or JFORI.
   1403   |.endif
   1404   |
   1405   |//-----------------------------------------------------------------------
   1406   |//-- Fast functions -----------------------------------------------------
   1407   |//-----------------------------------------------------------------------
   1408   |
   1409   |.macro .ffunc, name			// Emit global label ff_<name>, no argument check.
   1410   |->ff_ .. name:
   1411   |.endmacro
   1412   |
   1413   |.macro .ffunc_1, name			// Label ff_<name>; fallback with fewer than 1 arg.
   1414   |->ff_ .. name:
   1415   |  cmp NARGS:RD, 1+1;  jb ->fff_fallback	// Flags from this cmp stay live for the user.
   1416   |.endmacro
   1417   |
   1418   |.macro .ffunc_2, name			// Label ff_<name>; fallback with fewer than 2 args.
   1419   |->ff_ .. name:
   1420   |  cmp NARGS:RD, 2+1;  jb ->fff_fallback
   1421   |.endmacro
   1422   |
   1423   |.macro .ffunc_nsse, name, op		// 1 number arg, loaded into xmm0 with 'op'.
   1424   |  .ffunc_1 name
   1425   |  cmp dword [BASE+4], LJ_TISNUM;  jae ->fff_fallback	// Tag at or above LJ_TISNUM: fallback.
   1426   |  op xmm0, qword [BASE]
   1427   |.endmacro
   1428   |
   1429   |.macro .ffunc_nsse, name		// Two-parameter form with op = movsd.
   1430   |  .ffunc_nsse name, movsd
   1431   |.endmacro
   1432   |
   1433   |.macro .ffunc_nnsse, name		// 2 number args, loaded into xmm0 and xmm1.
   1434   |  .ffunc_2 name
   1435   |  cmp dword [BASE+4], LJ_TISNUM;  jae ->fff_fallback
   1436   |  cmp dword [BASE+12], LJ_TISNUM;  jae ->fff_fallback
   1437   |  movsd xmm0, qword [BASE]		// xmm0 = 1st arg.
   1438   |  movsd xmm1, qword [BASE+8]		// xmm1 = 2nd arg.
   1439   |.endmacro
   1440   |
   1441   |.macro .ffunc_nnr, name		// 2 number args, pushed on the x87 stack.
   1442   |  .ffunc_2 name
   1443   |  cmp dword [BASE+4], LJ_TISNUM;  jae ->fff_fallback
   1444   |  cmp dword [BASE+12], LJ_TISNUM;  jae ->fff_fallback
   1445   |  fld qword [BASE+8]			// 2nd arg is pushed first ...
   1446   |  fld qword [BASE]			// ... so the 1st arg ends up in st0.
   1447   |.endmacro
   1448   |
   1449   |// Inlined GC threshold check. Caveat: uses label 1 and clobbers RB.
   1450   |.macro ffgccheck
   1451   |  mov RB, [DISPATCH+DISPATCH_GL(gc.total)]
   1452   |  cmp RB, [DISPATCH+DISPATCH_GL(gc.threshold)]
   1453   |  jb >1				// Below threshold (unsigned): skip the GC step.
   1454   |  call ->fff_gcstep
   1455   |1:
   1456   |.endmacro
   1457   |
   1458   |//-- Base library: checks -----------------------------------------------
   1459   |
   1460   |.ffunc_1 assert			// Inline case: 1st arg is true; all args are returned.
   1461   |  mov RB, [BASE+4]
   1462   |  cmp RB, LJ_TISTRUECOND;  jae ->fff_fallback	// 1st arg not true: fallback.
   1463   |  mov PC, [BASE-4]
   1464   |  mov MULTRES, RD			// Remember nargs+1 as the result count.
   1465   |  mov [BASE-4], RB			// Copy 1st arg to the slot at BASE-8.
   1466   |  mov RB, [BASE]
   1467   |  mov [BASE-8], RB
   1468   |  sub RD, 2				// RD = number of remaining args.
   1469   |  jz >2
   1470   |  mov RA, BASE
   1471   |1:  // Move each remaining arg down by one slot.
   1472   |  add RA, 8
   1473   |.if X64
   1474   |  mov RBa, [RA]
   1475   |  mov [RA-8], RBa
   1476   |.else
   1477   |  mov RB, [RA+4]
   1478   |  mov [RA-4], RB
   1479   |  mov RB, [RA]
   1480   |  mov [RA-8], RB
   1481   |.endif
   1482   |  sub RD, 1
   1483   |  jnz <1
   1484   |2:
   1485   |  mov RD, MULTRES
   1486   |  jmp ->fff_res_
   1487   |
   1488   |.ffunc_1 type			// Returns a string picked from the C function's upvalues.
   1489   |  mov RB, [BASE+4]			// RB = type tag of the 1st arg.
   1490   |.if X64
   1491   |  mov RA, RB
   1492   |  sar RA, 15
   1493   |  cmp RA, -2			// tag >> 15 == -2 selects the LJ_TLIGHTUD case below.
   1494   |  je >3
   1495   |.endif
   1496   |  mov RC, ~LJ_TNUMX
   1497   |  not RB
   1498   |  cmp RC, RB
   1499   |  cmova RC, RB			// RC = unsigned min(~LJ_TNUMX, ~tag).
   1500   |2:
   1501   |  mov CFUNC:RB, [BASE-8]
   1502   |  mov STR:RC, [CFUNC:RB+RC*8+((char *)(&((GCfuncC *)0)->upvalue))]
   1503   |  mov PC, [BASE-4]
   1504   |  mov dword [BASE-4], LJ_TSTR		// Result: string from upvalue[RC].
   1505   |  mov [BASE-8], STR:RC
   1506   |  jmp ->fff_res1
   1507   |.if X64
   1508   |3:
   1509   |  mov RC, ~LJ_TLIGHTUD
   1510   |  jmp <2
   1511   |.endif
   1512   |
   1513   |//-- Base library: getters and setters ---------------------------------
   1514   |
   1515   |.ffunc_1 getmetatable		// Returns mt.__metatable if set and non-nil, else mt (or nil).
   1516   |  mov RB, [BASE+4]
   1517   |  mov PC, [BASE-4]
   1518   |  cmp RB, LJ_TTAB;  jne >6
   1519   |1:  // Field metatable must be at same offset for GCtab and GCudata!
   1520   |  mov TAB:RB, [BASE]
   1521   |  mov TAB:RB, TAB:RB->metatable
   1522   |2:
   1523   |  test TAB:RB, TAB:RB
   1524   |  mov dword [BASE-4], LJ_TNIL		// mov keeps the flags from test.
   1525   |  jz ->fff_res1			// No metatable: return nil.
   1526   |  mov STR:RC, [DISPATCH+DISPATCH_GL(gcroot)+4*(GCROOT_MMNAME+MM_metatable)]
   1527   |  mov dword [BASE-4], LJ_TTAB	// Store metatable as default result.
   1528   |  mov [BASE-8], TAB:RB
   1529   |  mov RA, TAB:RB->hmask		// Start at node[hmask & str->hash] of the hash part.
   1530   |  and RA, STR:RC->hash
   1531   |  imul RA, #NODE
   1532   |  add NODE:RA, TAB:RB->node
   1533   |3:  // Rearranged logic, because we expect _not_ to find the key.
   1534   |  cmp dword NODE:RA->key.it, LJ_TSTR
   1535   |  jne >4
   1536   |  cmp dword NODE:RA->key.gcr, STR:RC
   1537   |  je >5
   1538   |4:
   1539   |  mov NODE:RA, NODE:RA->next		// Follow the node chain.
   1540   |  test NODE:RA, NODE:RA
   1541   |  jnz <3
   1542   |  jmp ->fff_res1			// Not found, keep default result.
   1543   |5:
   1544   |  mov RB, [RA+4]
   1545   |  cmp RB, LJ_TNIL;  je ->fff_res1	// Ditto for nil value.
   1546   |  mov RC, [RA]
   1547   |  mov [BASE-4], RB			// Return value of mt.__metatable.
   1548   |  mov [BASE-8], RC
   1549   |  jmp ->fff_res1
   1550   |
   1551   |6:  // Not a table: userdata uses its own metatable, others a base metatable.
   1552   |  cmp RB, LJ_TUDATA;  je <1
   1553   |.if X64
   1554   |  cmp RB, LJ_TNUMX;  ja >8
   1555   |  cmp RB, LJ_TISNUM;  jbe >7
   1556   |  mov RB, LJ_TLIGHTUD
   1557   |  jmp >8
   1558   |7:
   1559   |.else
   1560   |  cmp RB, LJ_TISNUM;  ja >8
   1561   |.endif
   1562   |  mov RB, LJ_TNUMX
   1563   |8:
   1564   |  not RB				// ~tag indexes gcroot[GCROOT_BASEMT + ...].
   1565   |  mov TAB:RB, [DISPATCH+RB*4+DISPATCH_GL(gcroot[GCROOT_BASEMT])]
   1566   |  jmp <2
   1567   |
   1568   |.ffunc_2 setmetatable
   1569   |  cmp dword [BASE+4], LJ_TTAB;  jne ->fff_fallback
   1570   |  // Fast path: no mt for table yet and not clearing the mt.
   1571   |  mov TAB:RB, [BASE]
   1572   |  cmp dword TAB:RB->metatable, 0;  jne ->fff_fallback
   1573   |  cmp dword [BASE+12], LJ_TTAB;  jne ->fff_fallback
   1574   |  mov TAB:RA, [BASE+8]
   1575   |  // fallback if metatable contains __gc
   1576   |  test byte TAB:RA->nomm, 1<<MM_gc; jz ->fff_fallback	// MM_gc bit clear in nomm: fallback.
   1577   |  mov TAB:RB->metatable, TAB:RA
   1578   |  mov PC, [BASE-4]
   1579   |  mov dword [BASE-4], LJ_TTAB		// Return original table.
   1580   |  mov [BASE-8], TAB:RB
   1581   |  test byte TAB:RB->marked, LJ_GC_BLACK	// isblack(table)
   1582   |  jz >1
   1583   |  // Possible write barrier. Table is black, but skip iswhite(mt) check.
   1584   |  barrierback TAB:RB, RA
   1585   |1:
   1586   |  jmp ->fff_res1
   1587   |
   1588   |.ffunc_2 rawget			// Inline case: 1st arg is a table; lookup via lj_tab_get.
   1589   |  cmp dword [BASE+4], LJ_TTAB;  jne ->fff_fallback
   1590   |.if X64WIN
   1591   |  mov RB, BASE			// Save BASE.
   1592   |  lea CARG3d, [BASE+8]
   1593   |  mov CARG2d, [BASE]			// Caveat: CARG2d == BASE.
   1594   |  mov CARG1d, SAVE_L
   1595   |.elif X64
   1596   |  mov RB, BASE			// Save BASE.
   1597   |  mov CARG2d, [BASE]
   1598   |  lea CARG3d, [BASE+8]		// Caveat: CARG3d == BASE.
   1599   |  mov CARG1d, SAVE_L
   1600   |.else
   1601   |  mov TAB:RD, [BASE]
   1602   |  mov L:RB, SAVE_L
   1603   |  mov ARG2, TAB:RD
   1604   |  mov ARG1, L:RB
   1605   |  mov RB, BASE			// Save BASE.
   1606   |  add BASE, 8			// BASE+8 = address of the key (2nd arg).
   1607   |  mov ARG3, BASE
   1608   |.endif
   1609   |  call extern lj_tab_get	// (lua_State *L, GCtab *t, cTValue *key)
   1610   |  // cTValue * returned in eax (RD).
   1611   |  mov BASE, RB			// Restore BASE.
   1612   |  // Copy table slot.
   1613   |.if X64
   1614   |  mov RBa, [RD]
   1615   |  mov PC, [BASE-4]			// Load PC before the slot at BASE-8 is overwritten.
   1616   |  mov [BASE-8], RBa
   1617   |.else
   1618   |  mov RB, [RD]
   1619   |  mov RD, [RD+4]
   1620   |  mov PC, [BASE-4]			// Load PC before [BASE-4] is overwritten.
   1621   |  mov [BASE-8], RB
   1622   |  mov [BASE-4], RD
   1623   |.endif
   1624   |  jmp ->fff_res1
   1625   |
   1626   |//-- Base library: conversions ------------------------------------------
   1627   |
   1628   |.ffunc tonumber
   1629   |  // Only handles the number case inline (without a base argument).
   1630   |  cmp NARGS:RD, 1+1;  jne ->fff_fallback	// Exactly one argument.
   1631   |  cmp dword [BASE+4], LJ_TISNUM
   1632   |.if DUALNUM
   1633   |  jne >1
   1634   |  mov RB, dword [BASE]; jmp ->fff_resi	// Tag == LJ_TISNUM: return the 32 bit value.
   1635   |1:
   1636   |  ja ->fff_fallback			// Flags still from the cmp above.
   1637   |.else
   1638   |  jae ->fff_fallback
   1639   |.endif
   1640   |  movsd xmm0, qword [BASE]; jmp ->fff_resxmm0	// Return the arg unchanged via xmm0.
   1641   |
   1642   |.ffunc_1 tostring
   1643   |  // Only handles the string or number case inline.
   1644   |  mov PC, [BASE-4]
   1645   |  cmp dword [BASE+4], LJ_TSTR;  jne >3
   1646   |  // A __tostring method in the string base metatable is ignored.
   1647   |  mov STR:RD, [BASE]
   1648   |2:  // Return the string in RD.
   1649   |  mov dword [BASE-4], LJ_TSTR
   1650   |  mov [BASE-8], STR:RD
   1651   |  jmp ->fff_res1
   1652   |3:  // Handle numbers inline, unless a number base metatable is present.
   1653   |  cmp dword [BASE+4], LJ_TISNUM;  ja ->fff_fallback
   1654   |  cmp dword [DISPATCH+DISPATCH_GL(gcroot[GCROOT_BASEMT_NUM])], 0
   1655   |  jne ->fff_fallback
   1656   |  ffgccheck				// Caveat: uses label 1.
   1657   |  mov L:RB, SAVE_L
   1658   |  mov L:RB->base, BASE		// Add frame since C call can throw.
   1659   |  mov SAVE_PC, PC			// Redundant (but a defined value).
   1660   |.if X64 and not X64WIN
   1661   |  mov FCARG2, BASE			// Otherwise: FCARG2 == BASE
   1662   |.endif
   1663   |  mov L:FCARG1, L:RB
   1664   |.if DUALNUM
   1665   |  call extern lj_strfmt_number@8	// (lua_State *L, cTValue *o)
   1666   |.else
   1667   |  call extern lj_strfmt_num@8	// (lua_State *L, lua_Number *np)
   1668   |.endif
   1669   |  // GCstr returned in eax (RD).
   1670   |  mov BASE, L:RB->base
   1671   |  jmp <2
   1672   |
   1673   |//-- Base library: iterators -------------------------------------------
   1674   |
   1675   |.ffunc_1 next			// Inline case: 1st arg is a table; step via lj_tab_next.
   1676   |  je >2				// Missing 2nd arg?
   1677   |1:
   1678   |  cmp dword [BASE+4], LJ_TTAB;  jne ->fff_fallback
   1679   |  mov L:RB, SAVE_L
   1680   |  mov L:RB->base, BASE		// Add frame since C call can throw.
   1681   |  mov L:RB->top, BASE		// Dummy frame length is ok.
   1682   |  mov PC, [BASE-4]
   1683   |.if X64WIN
   1684   |  lea CARG3d, [BASE+8]
   1685   |  mov CARG2d, [BASE]			// Caveat: CARG2d == BASE.
   1686   |  mov CARG1d, L:RB
   1687   |.elif X64
   1688   |  mov CARG2d, [BASE]
   1689   |  lea CARG3d, [BASE+8]		// Caveat: CARG3d == BASE.
   1690   |  mov CARG1d, L:RB
   1691   |.else
   1692   |  mov TAB:RD, [BASE]
   1693   |  mov ARG2, TAB:RD
   1694   |  mov ARG1, L:RB
   1695   |  add BASE, 8			// BASE+8 = address of the key (2nd arg).
   1696   |  mov ARG3, BASE
   1697   |.endif
   1698   |  mov SAVE_PC, PC			// Needed for ITERN fallback.
   1699   |  call extern lj_tab_next	// (lua_State *L, GCtab *t, TValue *key)
   1700   |  // Flag returned in eax (RD).
   1701   |  mov BASE, L:RB->base
   1702   |  test RD, RD;  jz >3		// End of traversal?
   1703   |  // Copy key and value to results.
   1704   |.if X64
   1705   |  mov RBa, [BASE+8]
   1706   |  mov RDa, [BASE+16]
   1707   |  mov [BASE-8], RBa
   1708   |  mov [BASE], RDa
   1709   |.else
   1710   |  mov RB, [BASE+8]
   1711   |  mov RD, [BASE+12]
   1712   |  mov [BASE-8], RB
   1713   |  mov [BASE-4], RD
   1714   |  mov RB, [BASE+16]
   1715   |  mov RD, [BASE+20]
   1716   |  mov [BASE], RB
   1717   |  mov [BASE+4], RD
   1718   |.endif
   1719   |->fff_res2:				// Return 2 results (RD = 1+2).
   1720   |  mov RD, 1+2
   1721   |  jmp ->fff_res
   1722   |2:  // Set missing 2nd arg to nil.
   1723   |  mov dword [BASE+12], LJ_TNIL
   1724   |  jmp <1
   1725   |3:  // End of traversal: return nil.
   1726   |  mov dword [BASE-4], LJ_TNIL
   1727   |  jmp ->fff_res1
   1728   |
   1729   |.ffunc_1 pairs			// Inline case: table without metatable. Returns upvalue[0], t, nil.
   1730   |  mov TAB:RB, [BASE]
   1731   |  cmp dword [BASE+4], LJ_TTAB;  jne ->fff_fallback
   1732   |  cmp dword TAB:RB->metatable, 0; jne ->fff_fallback
   1733   |  mov CFUNC:RB, [BASE-8]
   1734   |  mov CFUNC:RD, CFUNC:RB->upvalue[0]
   1735   |  mov PC, [BASE-4]
   1736   |  mov dword [BASE-4], LJ_TFUNC	// 1st result: function from upvalue[0].
   1737   |  mov [BASE-8], CFUNC:RD
   1738   |  mov dword [BASE+12], LJ_TNIL	// 3rd result: nil. The table stays in place as 2nd.
   1739   |  mov RD, 1+3
   1740   |  jmp ->fff_res
   1741   |
   1742   |.ffunc_2 ipairs_aux			// Inline case: (table, number). Returns i+1, t[i+1] or nothing.
   1743   |  cmp dword [BASE+4], LJ_TTAB;  jne ->fff_fallback
   1744   |  cmp dword [BASE+12], LJ_TISNUM
   1745   |.if DUALNUM
   1746   |  jne ->fff_fallback
   1747   |.else
   1748   |  jae ->fff_fallback
   1749   |.endif
   1750   |  mov PC, [BASE-4]
   1751   |.if DUALNUM
   1752   |  mov RD, dword [BASE+8]
   1753   |  add RD, 1				// RD = i+1, stored as 1st result.
   1754   |  mov dword [BASE-4], LJ_TISNUM
   1755   |  mov dword [BASE-8], RD
   1756   |.else
   1757   |  movsd xmm0, qword [BASE+8]
   1758   |  sseconst_1 xmm1, RBa
   1759   |  addsd xmm0, xmm1			// xmm0 = i+1, stored as 1st result.
   1760   |  cvttsd2si RD, xmm0			// RD = i+1 as integer index.
   1761   |  movsd qword [BASE-8], xmm0
   1762   |.endif
   1763   |  mov TAB:RB, [BASE]
   1764   |  cmp RD, TAB:RB->asize;  jae >2	// Not in array part?
   1765   |  shl RD, 3
   1766   |  add RD, TAB:RB->array		// RD = &t->array[i+1].
   1767   |1:
   1768   |  cmp dword [RD+4], LJ_TNIL;  je ->fff_res0	// Nil slot: return no results.
   1769   |  // Copy array slot.
   1770   |.if X64
   1771   |  mov RBa, [RD]
   1772   |  mov [BASE], RBa
   1773   |.else
   1774   |  mov RB, [RD]
   1775   |  mov RD, [RD+4]
   1776   |  mov [BASE], RB
   1777   |  mov [BASE+4], RD
   1778   |.endif
   1779   |  jmp ->fff_res2
   1780   |2:  // Check for empty hash part first. Otherwise call C function.
   1781   |  cmp dword TAB:RB->hmask, 0; je ->fff_res0
   1782   |  mov FCARG1, TAB:RB
   1783   |  mov RB, BASE			// Save BASE.
   1784   |  mov FCARG2, RD			// Caveat: FCARG2 == BASE
   1785   |  call extern lj_tab_getinth@8	// (GCtab *t, int32_t key)
   1786   |  // cTValue * or NULL returned in eax (RD).
   1787   |  mov BASE, RB			// Restore BASE.
   1788   |  test RD, RD
   1789   |  jnz <1
   1790   |->fff_res0:				// Return no results (RD = 1+0).
   1791   |  mov RD, 1+0
   1792   |  jmp ->fff_res
   1793   |
   1794   |.ffunc_1 ipairs			// Fast path for ipairs() on a table without metatable.
   1795   |  mov TAB:RB, [BASE]
   1796   |  cmp dword [BASE+4], LJ_TTAB;  jne ->fff_fallback
   1797   |  cmp dword TAB:RB->metatable, 0; jne ->fff_fallback	// Table has a metatable: use fallback.
   1798   |  mov CFUNC:RB, [BASE-8]
   1799   |  mov CFUNC:RD, CFUNC:RB->upvalue[0]	// 1st result: function held in upvalue[0].
   1800   |  mov PC, [BASE-4]			// Load PC before the slot is overwritten.
   1801   |  mov dword [BASE-4], LJ_TFUNC
   1802   |  mov [BASE-8], CFUNC:RD
   1803   |.if DUALNUM
   1804   |  mov dword [BASE+12], LJ_TISNUM
   1805   |  mov dword [BASE+8], 0		// 3rd result: integer 0.
   1806   |.else
   1807   |  xorps xmm0, xmm0
   1808   |  movsd qword [BASE+8], xmm0		// 3rd result: number 0.
   1809   |.endif
   1810   |  mov RD, 1+3				// nresults+1 = 1 + 3.
   1811   |  jmp ->fff_res
   1812   |
   1813   |//-- Base library: catch errors ----------------------------------------
   1814   |
   1815   |.ffunc_1 pcall			// Fast path for pcall(): set up a FRAME_PCALL frame and dispatch.
   1816   |  lea RA, [BASE+8]			// RA = BASE+8, handed to vm_call_dispatch.
   1817   |  sub NARGS:RD, 1			// One argument less.
   1818   |  mov PC, 8+FRAME_PCALL		// Frame link: delta 8 plus frame type bits.
   1819   |1:  // Shared tail, also entered from xpcall.
   1820   |  movzx RB, byte [DISPATCH+DISPATCH_GL(hookmask)]
   1821   |  shr RB, HOOK_ACTIVE_SHIFT
   1822   |  and RB, 1				// RB = 1 if the hook-active bit is set, else 0.
   1823   |  add PC, RB				// Remember active hook before pcall.
   1824   |  jmp ->vm_call_dispatch
   1825   |
   1826   |.ffunc_2 xpcall			// Fast path for xpcall() when the 2nd argument is a function.
   1827   |  cmp dword [BASE+12], LJ_TFUNC;  jne ->fff_fallback
   1828   |  mov RB, [BASE+4]			// Swap function and traceback.
   1829   |  mov [BASE+12], RB
   1830   |  mov dword [BASE+4], LJ_TFUNC
   1831   |  mov LFUNC:RB, [BASE]
   1832   |  mov PC, [BASE+8]			// PC is only a temporary for the swap here.
   1833   |  mov [BASE+8], LFUNC:RB
   1834   |  mov [BASE], PC
   1835   |  lea RA, [BASE+16]			// RA = BASE+16, handed to vm_call_dispatch.
   1836   |  sub NARGS:RD, 2			// Two arguments less.
   1837   |  mov PC, 16+FRAME_PCALL		// Frame link: delta 16 plus frame type bits.
   1838   |  jmp <1				// Continue in the shared tail of pcall (label 1 above).
   1839   |
   1840   |//-- Coroutine library --------------------------------------------------
   1841   |
   1842   |.macro coroutine_resume_wrap, resume
   1843   |.if resume
   1844   |.ffunc_1 coroutine_resume
   1845   |  mov L:RB, [BASE]			// 1st argument: the coroutine (tag is checked below).
   1846   |.else
   1847   |.ffunc coroutine_wrap_aux
   1848   |  mov CFUNC:RB, [BASE-8]
   1849   |  mov L:RB, CFUNC:RB->upvalue[0].gcr	// The coroutine is held in upvalue[0].
   1850   |.endif
   1851   |  mov PC, [BASE-4]
   1852   |  mov SAVE_PC, PC			// Reloaded at label 7 for the return.
   1853   |.if X64
   1854   |  mov TMP1, L:RB			// Keep the coroutine pointer across the call.
   1855   |.else
   1856   |  mov ARG1, L:RB			// 1st argument for vm_resume, also reloaded after the call.
   1857   |.endif
   1858   |.if resume
   1859   |  cmp dword [BASE+4], LJ_TTHREAD;  jne ->fff_fallback
   1860   |.endif
   1861   |  cmp aword L:RB->cframe, 0; jne ->fff_fallback	// cframe is set: use fallback.
   1862   |  cmp byte L:RB->status, LUA_YIELD;  ja ->fff_fallback	// Status above LUA_YIELD: use fallback.
   1863   |  mov RA, L:RB->top			// mov keeps the flags of the status compare.
   1864   |  je >1				// Status != LUA_YIELD (i.e. 0)?
   1865   |  cmp RA, L:RB->base			// Check for presence of initial func.
   1866   |  je ->fff_fallback
   1867   |1:
   1868   |.if resume
   1869   |  lea PC, [RA+NARGS:RD*8-16]		// Check stack space (-1-thread).
   1870   |.else
   1871   |  lea PC, [RA+NARGS:RD*8-8]		// Check stack space (-1).
   1872   |.endif
   1873   |  cmp PC, L:RB->maxstack; ja ->fff_fallback
   1874   |  mov L:RB->top, PC			// New top of the coroutine stack, after the arguments.
   1875   |
   1876   |  mov L:RB, SAVE_L			// RB = current thread from here on.
   1877   |  mov L:RB->base, BASE
   1878   |.if resume
   1879   |  add BASE, 8			// Keep resumed thread in stack for GC.
   1880   |.endif
   1881   |  mov L:RB->top, BASE
   1882   |.if resume
   1883   |  lea RB, [BASE+NARGS:RD*8-24]	// RB = end of source for stack move.
   1884   |.else
   1885   |  lea RB, [BASE+NARGS:RD*8-16]	// RB = end of source for stack move.
   1886   |.endif
   1887   |  sub RBa, PCa			// Relative to PC.
   1888   |
   1889   |  cmp PC, RA
   1890   |  je >3				// No arguments to move?
   1891   |2:  // Move args to coroutine.
   1892   |.if X64
   1893   |  mov RCa, [PC+RB]
   1894   |  mov [PC-8], RCa
   1895   |.else
   1896   |  mov RC, [PC+RB+4]
   1897   |  mov [PC-4], RC
   1898   |  mov RC, [PC+RB]
   1899   |  mov [PC-8], RC
   1900   |.endif
   1901   |  sub PC, 8				// Copy runs from the last slot down to RA.
   1902   |  cmp PC, RA
   1903   |  jne <2
   1904   |3:
   1905   |.if X64
   1906   |  mov CARG2d, RA
   1907   |  mov CARG1d, TMP1
   1908   |.else
   1909   |  mov ARG2, RA
   1910   |  xor RA, RA
   1911   |  mov ARG4, RA			// 4th argument = 0.
   1912   |  mov ARG3, RA			// 3rd argument = 0.
   1913   |.endif
   1914   |  call ->vm_resume			// (lua_State *L, TValue *base, 0, 0)
   1915   |
   1916   |  mov L:RB, SAVE_L
   1917   |.if X64
   1918   |  mov L:PC, TMP1			// PC = coroutine.
   1919   |.else
   1920   |  mov L:PC, ARG1			// The callee doesn't modify SAVE_L.
   1921   |.endif
   1922   |  mov BASE, L:RB->base
   1923   |  mov [DISPATCH+DISPATCH_GL(cur_L)], L:RB
   1924   |  set_vmstate INTERP
   1925   |
   1926   |  cmp eax, LUA_YIELD
   1927   |  ja >8				// Status above LUA_YIELD is an error.
   1928   |4:  // Move results; also re-entered from label 9 after growing the stack.
   1929   |  mov RA, L:PC->base
   1930   |  mov KBASE, L:PC->top		// KBASE = end of results (used as a temporary).
   1931   |  mov L:PC->top, RA			// Clear coroutine stack.
   1932   |  mov PC, KBASE
   1933   |  sub PC, RA			// PC = size of the results in bytes.
   1934   |  je >6				// No results?
   1935   |  lea RD, [BASE+PC]
   1936   |  shr PC, 3				// PC = number of results.
   1937   |  cmp RD, L:RB->maxstack
   1938   |  ja >9				// Need to grow stack?
   1939   |
   1940   |  mov RB, BASE
   1941   |  sub RBa, RAa			// RB = BASE-RA: offset from source to destination.
   1942   |5:  // Move results from coroutine.
   1943   |.if X64
   1944   |  mov RDa, [RA]
   1945   |  mov [RA+RB], RDa
   1946   |.else
   1947   |  mov RD, [RA]
   1948   |  mov [RA+RB], RD
   1949   |  mov RD, [RA+4]
   1950   |  mov [RA+RB+4], RD
   1951   |.endif
   1952   |  add RA, 8
   1953   |  cmp RA, KBASE
   1954   |  jne <5
   1955   |6:
   1956   |.if resume
   1957   |  lea RD, [PC+2]			// nresults+1 = 1 + true + results.
   1958   |  mov dword [BASE-4], LJ_TTRUE	// Prepend true to results.
   1959   |.else
   1960   |  lea RD, [PC+1]			// nresults+1 = 1 + results.
   1961   |.endif
   1962   |7:  // Return RD-1 results to the caller.
   1963   |  mov PC, SAVE_PC
   1964   |  mov MULTRES, RD
   1965   |.if resume
   1966   |  mov RAa, -8			// Results start at BASE+RA = BASE-8.
   1967   |.else
   1968   |  xor RA, RA			// Results start at BASE+RA = BASE.
   1969   |.endif
   1970   |  test PC, FRAME_TYPE
   1971   |  jz ->BC_RET_Z			// Frame type bits clear: return via BC_RET_Z.
   1972   |  jmp ->vm_return
   1973   |
   1974   |8:  // Coroutine returned with error (at co->top-1).
   1975   |.if resume
   1976   |  mov dword [BASE-4], LJ_TFALSE	// Prepend false to results.
   1977   |  mov RA, L:PC->top
   1978   |  sub RA, 8
   1979   |  mov L:PC->top, RA			// Clear error from coroutine stack.
   1980   |  // Copy error message.
   1981   |.if X64
   1982   |  mov RDa, [RA]
   1983   |  mov [BASE], RDa
   1984   |.else
   1985   |  mov RD, [RA]
   1986   |  mov [BASE], RD
   1987   |  mov RD, [RA+4]
   1988   |  mov [BASE+4], RD
   1989   |.endif
   1990   |  mov RD, 1+2			// nresults+1 = 1 + false + error.
   1991   |  jmp <7
   1992   |.else
   1993   |  mov FCARG2, L:PC
   1994   |  mov FCARG1, L:RB
   1995   |  call extern lj_ffh_coroutine_wrap_err@8  // (lua_State *L, lua_State *co)
   1996   |  // Error function does not return.
   1997   |.endif
   1998   |
   1999   |9:  // Handle stack expansion on return from yield.
   2000   |.if X64
   2001   |  mov L:RA, TMP1
   2002   |.else
   2003   |  mov L:RA, ARG1			// The callee doesn't modify SAVE_L.
   2004   |.endif
   2005   |  mov L:RA->top, KBASE		// Undo coroutine stack clearing.
   2006   |  mov FCARG2, PC			// n = number of results.
   2007   |  mov FCARG1, L:RB
   2008   |  call extern lj_state_growstack@8	// (lua_State *L, int n)
   2009   |.if X64
   2010   |  mov L:PC, TMP1			// PC held n; reload the coroutine pointer.
   2011   |.else
   2012   |  mov L:PC, ARG1			// PC held n; reload the coroutine pointer.
   2013   |.endif
   2014   |  mov BASE, L:RB->base
   2015   |  jmp <4				// Retry the stack move.
   2016   |.endmacro
   2017   |
   2018   |  coroutine_resume_wrap 1		// coroutine.resume
   2019   |  coroutine_resume_wrap 0		// coroutine.wrap
   2020   |
   2021   |.ffunc coroutine_yield		// Fast path for coroutine.yield().
   2022   |  mov L:RB, SAVE_L
   2023   |  test aword L:RB->cframe, CFRAME_RESUME
   2024   |  jz ->fff_fallback			// CFRAME_RESUME flag not set: use fallback.
   2025   |  mov L:RB->base, BASE
   2026   |  lea RD, [BASE+NARGS:RD*8-8]		// RD = BASE + nargs*8.
   2027   |  mov L:RB->top, RD
   2028   |  xor RD, RD
   2029   |  mov aword L:RB->cframe, RDa		// Clear cframe.
   2030   |  mov al, LUA_YIELD			// eax = LUA_YIELD (upper bits were cleared above).
   2031   |  mov byte L:RB->status, al
   2032   |  jmp ->vm_leave_unw
   2033   |
   2034   |//-- Math library -------------------------------------------------------
   2035   |
   2036   |.if not DUALNUM
   2037   |->fff_resi:  // Dummy. The real fff_resi only exists in the DUALNUM build.
   2038   |.endif
   2039   |
   2040   |->fff_resn:				// Return one number taken from x87 st0.
   2041   |  mov PC, [BASE-4]
   2042   |  fstp qword [BASE-8]			// Store st0 as the 1st result and pop it.
   2043   |  jmp ->fff_res1
   2044   |
   2045   |  .ffunc_1 math_abs			// math.abs, followed by the shared result return paths.
   2046   |.if DUALNUM
   2047   |  cmp dword [BASE+4], LJ_TISNUM; jne >2
   2048   |  mov RB, dword [BASE]
   2049   |  cmp RB, 0; jns ->fff_resi		// Non-negative integer: return as is.
   2050   |  neg RB; js >1			// Still negative after neg: only -2^31.
   2051   |->fff_resbit:
   2052   |->fff_resi:				// Return the integer in RB.
   2053   |  mov PC, [BASE-4]
   2054   |  mov dword [BASE-4], LJ_TISNUM
   2055   |  mov dword [BASE-8], RB
   2056   |  jmp ->fff_res1
   2057   |1:  // Return 2^31 as a double.
   2058   |  mov PC, [BASE-4]
   2059   |  mov dword [BASE-4], 0x41e00000  // 2^31.
   2060   |  mov dword [BASE-8], 0
   2061   |  jmp ->fff_res1
   2062   |2:
   2063   |  ja ->fff_fallback			// Uses the flags of the LJ_TISNUM compare above.
   2064   |.else
   2065   |  cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
   2066   |.endif
   2067   |  movsd xmm0, qword [BASE]
   2068   |  sseconst_abs xmm1, RDa
   2069   |  andps xmm0, xmm1			// Mask with the sseconst_abs constant (presumably clears the sign bit).
   2070   |->fff_resxmm0:			// Return the double in xmm0.
   2071   |  mov PC, [BASE-4]
   2072   |  movsd qword [BASE-8], xmm0
   2073   |  // fallthrough
   2074   |
   2075   |->fff_res1:				// Return one result.
   2076   |  mov RD, 1+1
   2077   |->fff_res:				// Return RD-1 results (RD = nresults+1).
   2078   |  mov MULTRES, RD
   2079   |->fff_res_:				// Entry after MULTRES has been set.
   2080   |  test PC, FRAME_TYPE
   2081   |  jnz >7
   2082   |5:
   2083   |  cmp PC_RB, RDL			// More results expected?
   2084   |  ja >6
   2085   |  // Adjust BASE. KBASE is assumed to be set for the calling frame.
   2086   |  movzx RA, PC_RA
   2087   |  not RAa				// Note: ~RA = -(RA+1)
   2088   |  lea BASE, [BASE+RA*8]		// base = base - (RA+1)*8
   2089   |  ins_next
   2090   |
   2091   |6:  // Fill up results with nil.
   2092   |  mov dword [BASE+RD*8-12], LJ_TNIL	// Tag of the next result slot.
   2093   |  add RD, 1
   2094   |  jmp <5
   2095   |
   2096   |7:  // Non-standard return case.
   2097   |  mov RAa, -8			// Results start at BASE+RA = BASE-8.
   2098   |  jmp ->vm_return
   2099   |
   2100   |.if X64
   2101   |.define fff_resfp, fff_resxmm0	// X64: results of C math calls are stored from xmm0.
   2102   |.else
   2103   |.define fff_resfp, fff_resn		// x86: results of C math calls are stored from x87 st0.
   2104   |.endif
   2105   |
   2106   |.macro math_round, func
   2107   |  .ffunc math_ .. func
   2108   |.if DUALNUM
   2109   |  cmp dword [BASE+4], LJ_TISNUM; jne >1
   2110   |  mov RB, dword [BASE]; jmp ->fff_resi	// Integer argument: return it unchanged.
   2111   |1:
   2112   |  ja ->fff_fallback			// Uses the flags of the LJ_TISNUM compare above.
   2113   |.else
   2114   |  cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
   2115   |.endif
   2116   |  movsd xmm0, qword [BASE]
   2117   |  call ->vm_ .. func .. _sse
   2118   |.if DUALNUM
   2119   |  cvttsd2si RB, xmm0
   2120   |  cmp RB, 0x80000000			// Also the value cvttsd2si yields when the conversion fails.
   2121   |  jne ->fff_resi			// Any other value: return as integer.
   2122   |  cvtsi2sd xmm1, RB			// Convert back to check for an exact -2^31.
   2123   |  ucomisd xmm0, xmm1
   2124   |  jp ->fff_resxmm0			// Unordered (NaN): return as double.
   2125   |  je ->fff_resi			// Exactly -2^31: return as integer.
   2126   |.endif
   2127   |  jmp ->fff_resxmm0
   2128   |.endmacro
   2129   |
   2130   |  math_round floor			// Defines ffunc math_floor, calls vm_floor_sse.
   2131   |  math_round ceil			// Defines ffunc math_ceil, calls vm_ceil_sse.
   2132   |
   2133   |.ffunc_nsse math_sqrt, sqrtsd; jmp ->fff_resxmm0	// NOTE(review): .ffunc_nsse is defined elsewhere; presumably applies sqrtsd to the number argument in xmm0.
   2134   |
   2135   |.ffunc math_log			// math.log fast path, one-argument form only.
   2136   |  cmp NARGS:RD, 1+1; jne ->fff_fallback	// Exactly one argument.
   2137   |  cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
   2138   |  movsd xmm0, qword [BASE]
   2139   |.if not X64
   2140   |  movsd FPARG1, xmm0			// x86 only: store the argument to FPARG1 for the call.
   2141   |.endif
   2142   |  mov RB, BASE				// Save BASE across the C call.
   2143   |  call extern log
   2144   |  mov BASE, RB				// Restore BASE.
   2145   |  jmp ->fff_resfp
   2146   |
   2147   |.macro math_extern, func
   2148   |  .ffunc_nsse math_ .. func
   2149   |.if not X64
   2150   |  movsd FPARG1, xmm0			// x86 only: store the argument to FPARG1 for the call.
   2151   |.endif
   2152   |  mov RB, BASE				// Save BASE across the C call.
   2153   |  call extern func
   2154   |  mov BASE, RB				// Restore BASE.
   2155   |  jmp ->fff_resfp
   2156   |.endmacro
   2157   |
   2158   |.macro math_extern2, func
   2159   |  .ffunc_nnsse math_ .. func
   2160   |.if not X64
   2161   |  movsd FPARG1, xmm0			// x86 only: store both arguments for the call.
   2162   |  movsd FPARG3, xmm1
   2163   |.endif
   2164   |  mov RB, BASE				// Save BASE across the C call.
   2165   |  call extern func
   2166   |  mov BASE, RB				// Restore BASE.
   2167   |  jmp ->fff_resfp
   2168   |.endmacro
   2169   |
   2170   |  math_extern log10			// One-argument wrappers around the extern C functions.
   2171   |  math_extern exp
   2172   |  math_extern sin
   2173   |  math_extern cos
   2174   |  math_extern tan
   2175   |  math_extern asin
   2176   |  math_extern acos
   2177   |  math_extern atan
   2178   |  math_extern sinh
   2179   |  math_extern cosh
   2180   |  math_extern tanh
   2181   |  math_extern2 pow			// Two-argument wrappers around the extern C functions.
   2182   |  math_extern2 atan2
   2183   |  math_extern2 fmod
   2184   |
   2185   |.ffunc_nnr math_ldexp;	fscale; fpop1;	jmp ->fff_resn	// Result is stored from x87 st0 by fff_resn.
   2186   |
   2187   |.ffunc_1 math_frexp			// math.frexp: returns mantissa and exponent.
   2188   |  mov RB, [BASE+4]
   2189   |  cmp RB, LJ_TISNUM;  jae ->fff_fallback
   2190   |  mov PC, [BASE-4]
   2191   |  mov RC, [BASE]
   2192   |  mov [BASE-4], RB; mov [BASE-8], RC	// Copy the argument to the 1st result.
   2193   |  shl RB, 1; cmp RB, 0xffe00000; jae >3	// Shift out the sign bit. +-Inf or NaN?
   2194   |  or RC, RB; jz >3			// +-0?
   2195   |  mov RC, 1022			// Exponent bias to subtract.
   2196   |  cmp RB, 0x00200000; jb >4		// Exponent field is zero: denormal.
   2197   |1:
   2198   |  shr RB, 21; sub RB, RC		// Extract and unbias exponent.
   2199   |  cvtsi2sd xmm0, RB
   2200   |  mov RB, [BASE-4]
   2201   |  and RB, 0x800fffff			// Mask off exponent.
   2202   |  or RB, 0x3fe00000			// Put mantissa in range [0.5,1) or 0.
   2203   |  mov [BASE-4], RB
   2204   |2:
   2205   |  movsd qword [BASE], xmm0		// 2nd result: the exponent.
   2206   |  mov RD, 1+2				// nresults+1 = 1 + 2.
   2207   |  jmp ->fff_res
   2208   |3:  // Return +-0, +-Inf, NaN unmodified and an exponent of 0.
   2209   |  xorps xmm0, xmm0; jmp <2
   2210   |4:  // Handle denormals by multiplying with 2^54 and adjusting the bias.
   2211   |  movsd xmm0, qword [BASE]
   2212   |  sseconst_hi xmm1, RBa, 43500000  // 2^54.
   2213   |  mulsd xmm0, xmm1
   2214   |  movsd qword [BASE-8], xmm0
   2215   |  mov RB, [BASE-4]; mov RC, 1076; shl RB, 1; jmp <1	// Bias is 1022+54 for the scaled value.
   2216   |
   2217   |.ffunc_nsse math_modf		// math.modf: returns integral and fractional part.
   2218   |  mov RB, [BASE+4]
   2219   |  mov PC, [BASE-4]
   2220   |  shl RB, 1; cmp RB, 0xffe00000; je >4	// +-Inf?
   2221   |  movaps xmm4, xmm0			// Keep a copy of the argument.
   2222   |  call ->vm_trunc_sse
   2223   |  subsd xmm4, xmm0			// xmm4 = argument - truncated value.
   2224   |1:
   2225   |  movsd qword [BASE-8], xmm0		// 1st result: integral part.
   2226   |  movsd qword [BASE], xmm4		// 2nd result: fractional part.
   2227   |  mov RC, [BASE-4]; mov RB, [BASE+4]	// High words of both results.
   2228   |  xor RC, RB; js >3				// Need to adjust sign?
   2229   |2:
   2230   |  mov RD, 1+2				// nresults+1 = 1 + 2.
   2231   |  jmp ->fff_res
   2232   |3:
   2233   |  xor RB, 0x80000000; mov [BASE+4], RB	// Flip sign of fraction.
   2234   |  jmp <2
   2235   |4:
   2236   |  xorps xmm4, xmm4; jmp <1			// Return +-Inf and +-0.
   2237   |
   2238   |.macro math_minmax, name, cmovop, sseop
   2239   |  .ffunc name
   2240   |  mov RA, 2				// RA = 1-based index of the next argument.
   2241   |  cmp dword [BASE+4], LJ_TISNUM
   2242   |.if DUALNUM
   2243   |  jne >4
   2244   |  mov RB, dword [BASE]
   2245   |1:  // Handle integers.
   2246   |  cmp RA, RD; jae ->fff_resi		// All arguments done? (RD = nargs+1)
   2247   |  cmp dword [BASE+RA*8-4], LJ_TISNUM; jne >3
   2248   |  cmp RB, dword [BASE+RA*8-8]
   2249   |  cmovop RB, dword [BASE+RA*8-8]	// Take the argument if it wins the compare.
   2250   |  add RA, 1
   2251   |  jmp <1
   2252   |3:
   2253   |  ja ->fff_fallback			// Uses the flags of the tag compare above.
   2254   |  // Convert intermediate result to number and continue below.
   2255   |  cvtsi2sd xmm0, RB
   2256   |  jmp >6
   2257   |4:
   2258   |  ja ->fff_fallback			// Uses the flags of the tag compare above.
   2259   |.else
   2260   |  jae ->fff_fallback
   2261   |.endif
   2262   |
   2263   |  movsd xmm0, qword [BASE]
   2264   |5:  // Handle numbers or integers.
   2265   |  cmp RA, RD; jae ->fff_resxmm0	// All arguments done? (RD = nargs+1)
   2266   |  cmp dword [BASE+RA*8-4], LJ_TISNUM
   2267   |.if DUALNUM
   2268   |  jb >6
   2269   |  ja ->fff_fallback
   2270   |  cvtsi2sd xmm1, dword [BASE+RA*8-8]	// Integer argument: convert to double.
   2271   |  jmp >7
   2272   |.else
   2273   |  jae ->fff_fallback
   2274   |.endif
   2275   |6:
   2276   |  movsd xmm1, qword [BASE+RA*8-8]
   2277   |7:
   2278   |  sseop xmm0, xmm1			// xmm0 = running result.
   2279   |  add RA, 1
   2280   |  jmp <5
   2281   |.endmacro
   2282   |
   2283   |  math_minmax math_min, cmovg, minsd	// Integer path uses cmovg, number path uses minsd.
   2284   |  math_minmax math_max, cmovl, maxsd	// Integer path uses cmovl, number path uses maxsd.
   2285   |
   2286   |//-- String library -----------------------------------------------------
   2287   |
   2288   |.ffunc string_byte			// Only handle the 1-arg case here.
   2289   |  cmp NARGS:RD, 1+1;  jne ->fff_fallback
   2290   |  cmp dword [BASE+4], LJ_TSTR;  jne ->fff_fallback
   2291   |  mov STR:RB, [BASE]
   2292   |  mov PC, [BASE-4]			// PC is needed by the result paths.
   2293   |  cmp dword STR:RB->len, 1
   2294   |  jb ->fff_res0			// Return no results for empty string.
   2295   |  movzx RB, byte STR:RB[1]		// First byte following the GCstr header.
   2296   |.if DUALNUM
   2297   |  jmp ->fff_resi
   2298   |.else
   2299   |  cvtsi2sd xmm0, RB; jmp ->fff_resxmm0
   2300   |.endif
   2301   |
   2302   |.ffunc string_char			// Only handle the 1-arg case here.
   2303   |  ffgccheck
   2304   |  cmp NARGS:RD, 1+1;  jne ->fff_fallback	// *Exactly* 1 arg.
   2305   |  cmp dword [BASE+4], LJ_TISNUM
   2306   |.if DUALNUM
   2307   |  jne ->fff_fallback
   2308   |  mov RB, dword [BASE]
   2309   |  cmp RB, 255;  ja ->fff_fallback	// Unsigned compare also rejects negative values.
   2310   |  mov TMP2, RB
   2311   |.else
   2312   |  jae ->fff_fallback
   2313   |  cvttsd2si RB, qword [BASE]
   2314   |  cmp RB, 255;  ja ->fff_fallback	// Unsigned compare also rejects negative values.
   2315   |  mov TMP2, RB
   2316   |.endif
   2317   |.if X64
   2318   |  mov TMP3, 1				// Length = 1.
   2319   |.else
   2320   |  mov ARG3, 1				// Length = 1.
   2321   |.endif
   2322   |  lea RDa, TMP2			// Points to stack. Little-endian.
   2323   |->fff_newstr:			// In: RD = data pointer, TMP3 (X64) or ARG3 (x86) = length.
   2324   |  mov L:RB, SAVE_L
   2325   |  mov L:RB->base, BASE
   2326   |.if X64
   2327   |  mov CARG3d, TMP3			// Zero-extended to size_t.
   2328   |  mov CARG2, RDa			// May be 64 bit ptr to stack.
   2329   |  mov CARG1d, L:RB
   2330   |.else
   2331   |  mov ARG2, RD
   2332   |  mov ARG1, L:RB
   2333   |.endif
   2334   |  mov SAVE_PC, PC
   2335   |  call extern lj_str_new		// (lua_State *L, char *str, size_t l)
   2336   |->fff_resstr:			// Return the string in RD. Expects L in RB.
   2337   |  // GCstr * returned in eax (RD).
   2338   |  mov BASE, L:RB->base
   2339   |  mov PC, [BASE-4]
   2340   |  mov dword [BASE-4], LJ_TSTR
   2341   |  mov [BASE-8], STR:RD
   2342   |  jmp ->fff_res1
   2343   |
   2344   |.ffunc string_sub			// string.sub(s, start [, end]) fast path.
   2345   |  ffgccheck
   2346   |  mov TMP2, -1				// Default end = -1.
   2347   |  cmp NARGS:RD, 1+2;  jb ->fff_fallback	// Need at least 2 arguments.
   2348   |  jna >1				// Exactly 2 arguments: keep the default end.
   2349   |  cmp dword [BASE+20], LJ_TISNUM
   2350   |.if DUALNUM
   2351   |  jne ->fff_fallback
   2352   |  mov RB, dword [BASE+16]
   2353   |  mov TMP2, RB				// TMP2 = end.
   2354   |.else
   2355   |  jae ->fff_fallback
   2356   |  cvttsd2si RB, qword [BASE+16]
   2357   |  mov TMP2, RB				// TMP2 = end.
   2358   |.endif
   2359   |1:
   2360   |  cmp dword [BASE+4], LJ_TSTR;  jne ->fff_fallback
   2361   |  cmp dword [BASE+12], LJ_TISNUM
   2362   |.if DUALNUM
   2363   |  jne ->fff_fallback
   2364   |.else
   2365   |  jae ->fff_fallback
   2366   |.endif
   2367   |  mov STR:RB, [BASE]
   2368   |  mov TMP3, STR:RB			// Keep the string pointer; RB is reused for len.
   2369   |  mov RB, STR:RB->len
   2370   |.if DUALNUM
   2371   |  mov RA, dword [BASE+8]		// RA = start.
   2372   |.else
   2373   |  cvttsd2si RA, qword [BASE+8]		// RA = start.
   2374   |.endif
   2375   |  mov RC, TMP2				// RC = end.
   2376   |  cmp RB, RC				// len < end? (unsigned compare)
   2377   |  jb >5
   2378   |2:
   2379   |  test RA, RA			// start <= 0?
   2380   |  jle >7
   2381   |3:
   2382   |  mov STR:RB, TMP3
   2383   |  sub RC, RA				// start > end?
   2384   |  jl ->fff_emptystr
   2385   |  lea RB, [STR:RB+RA+#STR-1]		// RB = pointer to the first byte of the substring.
   2386   |  add RC, 1				// RC = length = end-start+1.
   2387   |4:
   2388   |.if X64
   2389   |  mov TMP3, RC
   2390   |.else
   2391   |  mov ARG3, RC
   2392   |.endif
   2393   |  mov RD, RB
   2394   |  jmp ->fff_newstr
   2395   |
   2396   |5:  // Negative end or overflow.
   2397   |  jl >6				// Uses the flags of cmp RB, RC: signed len < end?
   2398   |  lea RC, [RC+RB+1]			// end = end+(len+1)
   2399   |  jmp <2
   2400   |6:  // Overflow.
   2401   |  mov RC, RB				// end = len
   2402   |  jmp <2
   2403   |
   2404   |7:  // Negative start or underflow.
   2405   |  je >8				// Uses the flags of test RA, RA: start == 0?
   2406   |  add RA, RB				// start = start+(len+1)
   2407   |  add RA, 1
   2408   |  jg <3				// start > 0?
   2409   |8:  // Underflow.
   2410   |  mov RA, 1				// start = 1
   2411   |  jmp <3
   2412   |
   2413   |->fff_emptystr:  // Range underflow.
   2414   |  xor RC, RC				// Zero length. Any ptr in RB is ok.
   2415   |  jmp <4
   2416   |
   2417   |.macro ffstring_op, name
   2418   |  .ffunc_1 string_ .. name
   2419   |  ffgccheck
   2420   |  cmp dword [BASE+4], LJ_TSTR;  jne ->fff_fallback
   2421   |  mov L:RB, SAVE_L
   2422   |   lea SBUF:FCARG1, [DISPATCH+DISPATCH_GL(tmpbuf)]	// FCARG1 = address of the tmpbuf field.
   2423   |  mov L:RB->base, BASE
   2424   |  mov STR:FCARG2, [BASE]		// Caveat: FCARG2 == BASE
   2425   |   mov RC, SBUF:FCARG1->b
   2426   |   mov SBUF:FCARG1->L, L:RB
   2427   |   mov SBUF:FCARG1->p, RC		// Reset the buffer: p = b.
   2428   |  mov SAVE_PC, PC
   2429   |  call extern lj_buf_putstr_ .. name .. @8
   2430   |  mov FCARG1, eax			// Pass the returned value on to lj_buf_tostr.
   2431   |  call extern lj_buf_tostr@4
   2432   |  jmp ->fff_resstr			// fff_resstr reloads BASE from L (RB).
   2433   |.endmacro
   2434   |
   2435   |ffstring_op reverse			// Defines ffunc string_reverse.
   2436   |ffstring_op lower			// Defines ffunc string_lower.
   2437   |ffstring_op upper			// Defines ffunc string_upper.
   2438   |
   2439   |//-- Bit library --------------------------------------------------------
   2440   |
   2441   |.macro .ffunc_bit, name, kind, fdef
   2442   |  fdef name
   2443   |.if kind == 2
   2444   |  sseconst_tobit xmm1, RBa		// kind 2: load the constant up front, before the tag check.
   2445   |.endif
   2446   |  cmp dword [BASE+4], LJ_TISNUM
   2447   |.if DUALNUM
   2448   |  jne >1
   2449   |  mov RB, dword [BASE]		// Integer argument: use it directly.
   2450   |.if kind > 0
   2451   |  jmp >2				// Continue with the code following the macro.
   2452   |.else
   2453   |  jmp ->fff_resbit			// kind 0: return the integer right away.
   2454   |.endif
   2455   |1:
   2456   |  ja ->fff_fallback			// Uses the flags of the LJ_TISNUM compare above.
   2457   |.else
   2458   |  jae ->fff_fallback
   2459   |.endif
   2460   |  movsd xmm0, qword [BASE]
   2461   |.if kind < 2
   2462   |  sseconst_tobit xmm1, RBa
   2463   |.endif
   2464   |  addsd xmm0, xmm1			// Add the tobit constant, so movd can take the low dword.
   2465   |  movd RB, xmm0			// RB = 1st argument as a 32 bit integer.
   2466   |2:
   2467   |.endmacro
   2468   |
   2469   |.macro .ffunc_bit, name, kind
   2470   |  .ffunc_bit name, kind, .ffunc_1
   2471   |.endmacro
   2472   |
   2473   |.ffunc_bit bit_tobit, 0
   2474   |  jmp ->fff_resbit			// Return the converted value in RB.
   2475   |
   2476   |.macro .ffunc_bit_op, name, ins
   2477   |  .ffunc_bit name, 2
   2478   |  mov TMP2, NARGS:RD			// Save for fallback.
   2479   |  lea RD, [BASE+NARGS:RD*8-16]		// RD = pointer to the last argument.
   2480   |1:
   2481   |  cmp RD, BASE
   2482   |  jbe ->fff_resbit			// Back at the 1st argument: done.
   2483   |  cmp dword [RD+4], LJ_TISNUM
   2484   |.if DUALNUM
   2485   |  jne >2
   2486   |  ins RB, dword [RD]			// Integer argument: combine directly.
   2487   |  sub RD, 8
   2488   |  jmp <1
   2489   |2:
   2490   |  ja ->fff_fallback_bit_op		// Uses the flags of the LJ_TISNUM compare above.
   2491   |.else
   2492   |  jae ->fff_fallback_bit_op
   2493   |.endif
   2494   |  movsd xmm0, qword [RD]
   2495   |  addsd xmm0, xmm1			// xmm1 still holds the tobit constant.
   2496   |  movd RA, xmm0
   2497   |  ins RB, RA
   2498   |  sub RD, 8				// Step to the previous argument.
   2499   |  jmp <1
   2500   |.endmacro
   2501   |
   2502   |.ffunc_bit_op bit_band, and
   2503   |.ffunc_bit_op bit_bor, or
   2504   |.ffunc_bit_op bit_bxor, xor
   2505   |
   2506   |.ffunc_bit bit_bswap, 1
   2507   |  bswap RB
   2508   |  jmp ->fff_resbit
   2509   |
   2510   |.ffunc_bit bit_bnot, 1
   2511   |  not RB
   2512   |.if DUALNUM
   2513   |  jmp ->fff_resbit
   2514   |.else
   2515   |->fff_resbit:				// Non-DUALNUM: return the integer in RB as a double.
   2516   |  cvtsi2sd xmm0, RB
   2517   |  jmp ->fff_resxmm0
   2518   |.endif
   2519   |
   2520   |->fff_fallback_bit_op:		// Fallback for bit_band/bit_bor/bit_bxor, which reuse RD.
   2521   |  mov NARGS:RD, TMP2			// Restore for fallback.
   2522   |  jmp ->fff_fallback
   2523   |
   2524   |.macro .ffunc_bit_sh, name, ins
   2525   |.if DUALNUM
   2526   |  .ffunc_bit name, 1, .ffunc_2
   2527   |  // Note: no inline conversion from number for 2nd argument!
   2528   |  cmp dword [BASE+12], LJ_TISNUM; jne ->fff_fallback
   2529   |  mov RA, dword [BASE+8]		// RA = shift count.
   2530   |.else
   2531   |  .ffunc_nnsse name
   2532   |  sseconst_tobit xmm2, RBa
   2533   |  addsd xmm0, xmm2
   2534   |  addsd xmm1, xmm2
   2535   |  movd RB, xmm0			// RB = value.
   2536   |  movd RA, xmm1			// RA = shift count.
   2537   |.endif
   2538   |  ins RB, cl				// Assumes RA is ecx.
   2539   |  jmp ->fff_resbit
   2540   |.endmacro
   2541   |
   2542   |.ffunc_bit_sh bit_lshift, shl
   2543   |.ffunc_bit_sh bit_rshift, shr
   2544   |.ffunc_bit_sh bit_arshift, sar
   2545   |.ffunc_bit_sh bit_rol, rol
   2546   |.ffunc_bit_sh bit_ror, ror
   2547   |
   2548   |//-----------------------------------------------------------------------
   2549   |
   2550   |->fff_fallback_2:
   2551   |  mov NARGS:RD, 1+2			// Other args are ignored, anyway.
   2552   |  jmp ->fff_fallback
   2553   |->fff_fallback_1:
   2554   |  mov NARGS:RD, 1+1			// Other args are ignored, anyway.
   2555   |->fff_fallback:			// Call fast function fallback handler.
   2556   |  // BASE = new base, RD = nargs+1
   2557   |  mov L:RB, SAVE_L
   2558   |  mov PC, [BASE-4]			// Fallback may overwrite PC.
   2559   |  mov SAVE_PC, PC			// Redundant (but a defined value).
   2560   |  mov L:RB->base, BASE
   2561   |  lea RD, [BASE+NARGS:RD*8-8]		// RD = BASE + nargs*8, the top of the arguments.
   2562   |  lea RA, [RD+8*LUA_MINSTACK]	// Ensure enough space for handler.
   2563   |  mov L:RB->top, RD
   2564   |  mov CFUNC:RD, [BASE-8]
   2565   |  cmp RA, L:RB->maxstack
   2566   |  ja >5				// Need to grow stack.
   2567   |.if X64
   2568   |  mov CARG1d, L:RB
   2569   |.else
   2570   |  mov ARG1, L:RB
   2571   |.endif
   2572   |  call aword CFUNC:RD->f		// (lua_State *L)
   2573   |  mov BASE, L:RB->base
   2574   |  // Either throws an error, or recovers and returns -1, 0 or nresults+1.
   2575   |  test RD, RD;  jg ->fff_res		// Returned nresults+1?
   2576   |1:
   2577   |  mov RA, L:RB->top
   2578   |  sub RA, BASE
   2579   |  shr RA, 3				// RA = number of slots between BASE and top.
   2580   |  test RD, RD
   2581   |  lea NARGS:RD, [RA+1]		// lea and mov keep the flags of the test.
   2582   |  mov LFUNC:RB, [BASE-8]
   2583   |  jne ->vm_call_tail			// Returned -1?
   2584   |  ins_callt				// Returned 0: retry fast path.
   2585   |
   2586   |// Reconstruct previous base for vmeta_call during tailcall.
   2587   |->vm_call_tail:
   2588   |  mov RA, BASE
   2589   |  test PC, FRAME_TYPE
   2590   |  jnz >3
   2591   |  movzx RB, PC_RA
   2592   |  not RBa				// Note: ~RB = -(RB+1)
   2593   |  lea BASE, [BASE+RB*8]		// base = base - (RB+1)*8
   2594   |  jmp ->vm_call_dispatch		// Resolve again for tailcall.
   2595   |3:  // Frame type bits set: the delta is PC with its low 3 bits masked off.
   2596   |  mov RB, PC
   2597   |  and RB, -8
   2598   |  sub BASE, RB
   2599   |  jmp ->vm_call_dispatch		// Resolve again for tailcall.
   2600   |
   2601   |5:  // Grow stack for fallback handler.
   2602   |  mov FCARG2, LUA_MINSTACK
   2603   |  mov FCARG1, L:RB
   2604   |  call extern lj_state_growstack@8	// (lua_State *L, int n)
   2605   |  mov BASE, L:RB->base
   2606   |  xor RD, RD				// Simulate a return 0.
   2607   |  jmp <1				// Dumb retry (goes through ff first).
   2608   |
   2609   |->fff_gcstep:			// Call GC step function.
   2610   |  // BASE = new base, RD = nargs+1
   2611   |  pop RBa				// Must keep stack at same level.
   2612   |  mov TMPa, RBa			// Save return address
   2613   |  mov L:RB, SAVE_L
   2614   |  mov SAVE_PC, PC			// Redundant (but a defined value).
   2615   |  mov L:RB->base, BASE
   2616   |  lea RD, [BASE+NARGS:RD*8-8]		// RD = BASE + nargs*8, the top of the arguments.
   2617   |  mov FCARG1, L:RB
   2618   |  mov L:RB->top, RD
   2619   |  call extern lj_gc_step@4		// (lua_State *L)
   2620   |  mov BASE, L:RB->base
   2621   |  mov RD, L:RB->top
   2622   |  sub RD, BASE
   2623   |  shr RD, 3
   2624   |  add NARGS:RD, 1			// Recompute RD = nargs+1 from base and top.
   2625   |  mov RBa, TMPa
   2626   |  push RBa				// Restore return address.
   2627   |  ret
   2628   |
   2629   |//-----------------------------------------------------------------------
   2630   |//-- Special dispatch targets -------------------------------------------
   2631   |//-----------------------------------------------------------------------
   2632   |
   2633   |->vm_record:				// Dispatch target for recording phase.
   2634   |.if JIT
   2635   |  movzx RD, byte [DISPATCH+DISPATCH_GL(hookmask)]
   2636   |  test RDL, HOOK_VMEVENT		// No recording while in vmevent.
   2637   |  jnz >5
   2638   |  // Decrement the hookcount for consistency, but always do the call.
   2639   |  test RDL, HOOK_ACTIVE
   2640   |  jnz >1
   2641   |  test RDL, LUA_MASKLINE|LUA_MASKCOUNT
   2642   |  jz >1
   2643   |  dec dword [DISPATCH+DISPATCH_GL(hookcount)]
   2644   |  jmp >1
   2645   |.endif
   2646   |
   2647   |->vm_rethook:			// Dispatch target for return hooks.
   2648   |  movzx RD, byte [DISPATCH+DISPATCH_GL(hookmask)]
   2649   |  test RDL, HOOK_ACTIVE		// Hook already active?
   2650   |  jnz >5
   2651   |  jmp >1
   2652   |
   2653   |->vm_inshook:			// Dispatch target for instr/line hooks.
   2654   |  movzx RD, byte [DISPATCH+DISPATCH_GL(hookmask)]
   2655   |  test RDL, HOOK_ACTIVE		// Hook already active?
   2656   |  jnz >5
   2657   |
   2658   |  test RDL, LUA_MASKLINE|LUA_MASKCOUNT
   2659   |  jz >5				// Neither line nor count hook set: just re-dispatch.
   2660   |  dec dword [DISPATCH+DISPATCH_GL(hookcount)]
   2661   |  jz >1				// hookcount reached zero: call the dispatcher.
   2662   |  test RDL, LUA_MASKLINE
   2663   |  jz >5				// No line hook: just re-dispatch.
   2664   |1:  // Call lj_dispatch_ins for the current instruction.
   2665   |  mov L:RB, SAVE_L
   2666   |  mov L:RB->base, BASE
   2667   |  mov FCARG2, PC			// Caveat: FCARG2 == BASE
   2668   |  mov FCARG1, L:RB
   2669   |  // SAVE_PC must hold the _previous_ PC. The callee updates it with PC.
   2670   |  call extern lj_dispatch_ins@8	// (lua_State *L, const BCIns *pc)
   2671   |3:  // Also entered from vm_hotloop.
   2672   |  mov BASE, L:RB->base
   2673   |4:  // Also entered from cont_hook.
   2674   |  movzx RA, PC_RA
   2675   |5:  // Decode the current instruction again and dispatch statically.
   2676   |  movzx OP, PC_OP
   2677   |  movzx RD, PC_RD
   2678   |.if X64
   2679   |  jmp aword [DISPATCH+OP*8+GG_DISP2STATIC]	// Re-dispatch to static ins.
   2680   |.else
   2681   |  jmp aword [DISPATCH+OP*4+GG_DISP2STATIC]	// Re-dispatch to static ins.
   2682   |.endif
   2683   |
   2684   |->cont_hook:				// Continue from hook yield.
   2685   |  add PC, 4
   2686   |  mov RA, [RB-24]
   2687   |  mov MULTRES, RA			// Restore MULTRES for *M ins.
   2688   |  jmp <4
   2689   |
   2690   |->vm_hotloop:			// Hot loop counter underflow.
   2691   |.if JIT
   2692   |  mov LFUNC:RB, [BASE-8]		// Same as curr_topL(L).
   2693   |  mov RB, LFUNC:RB->pc
   2694   |  movzx RD, byte [RB+PC2PROTO(framesize)]
   2695   |  lea RD, [BASE+RD*8]		// RD = BASE + framesize*8.
   2696   |  mov L:RB, SAVE_L
   2697   |  mov L:RB->base, BASE
   2698   |  mov L:RB->top, RD
   2699   |  mov FCARG2, PC
   2700   |  lea FCARG1, [DISPATCH+GG_DISP2J]
   2701   |  mov aword [DISPATCH+DISPATCH_J(L)], L:RBa
   2702   |  mov SAVE_PC, PC
   2703   |  call extern lj_trace_hot@8		// (jit_State *J, const BCIns *pc)
   2704   |  jmp <3				// Reload BASE and re-dispatch (label 3 in vm_inshook).
   2705   |.endif
   2706   |
   2707   |->vm_callhook:			// Dispatch target for call hooks.
   2708   |  mov SAVE_PC, PC
   2709   |.if JIT
   2710   |  jmp >1				// Skip the hot call marker.
   2711   |.endif
   2712   |
   2713   |->vm_hotcall:			// Hot call counter underflow.
   2714   |.if JIT
   2715   |  mov SAVE_PC, PC
   2716   |  or PC, 1				// Marker for hot call.
   2717   |1:
   2718   |.endif
   2719   |  lea RD, [BASE+NARGS:RD*8-8]		// RD = BASE + nargs*8, the top of the arguments.
   2720   |  mov L:RB, SAVE_L
   2721   |  mov L:RB->base, BASE
   2722   |  mov L:RB->top, RD
   2723   |  mov FCARG2, PC
   2724   |  mov FCARG1, L:RB
   2725   |  call extern lj_dispatch_call@8	// (lua_State *L, const BCIns *pc)
   2726   |  // ASMFunction returned in eax/rax (RDa).
   2727   |  mov SAVE_PC, 0			// Invalidate for subsequent line hook.
   2728   |.if JIT
   2729   |  and PC, -2				// Clear the hot call marker.
   2730   |.endif
   2731   |  mov BASE, L:RB->base
   2732   |  mov RAa, RDa			// Hold the returned function while RD is recomputed.
   2733   |  mov RD, L:RB->top
   2734   |  sub RD, BASE
   2735   |  mov RBa, RAa			// RB = returned function; RA is reloaded next.
   2736   |  movzx RA, PC_RA
   2737   |  shr RD, 3
   2738   |  add NARGS:RD, 1			// RD = nargs+1 again.
   2739   |  jmp RBa				// Jump to the returned ASMFunction.
   2740   |
   2741   |->cont_stitch:			// Trace stitching.
   2742   |.if JIT
   2743   |  // BASE = base, RC = result, RB = mbase
   2744   |  mov TRACE:RA, [RB-24]		// Save previous trace.
   2745   |  mov TMP1, TRACE:RA
   2746   |  mov TMP3, DISPATCH			// Need one more register.
   2747   |  mov DISPATCH, MULTRES		// DISPATCH is used as the result counter below.
   2748   |  movzx RA, PC_RA
   2749   |  lea RA, [BASE+RA*8]		// Call base.
   2750   |  sub DISPATCH, 1
   2751   |  jz >2				// No results to move?
   2752   |1:  // Move results down.
   2753   |.if X64
   2754   |  mov RBa, [RC]
   2755   |  mov [RA], RBa
   2756   |.else
   2757   |  mov RB, [RC]
   2758   |  mov [RA], RB
   2759   |  mov RB, [RC+4]
   2760   |  mov [RA+4], RB
   2761   |.endif
   2762   |  add RC, 8
   2763   |  add RA, 8
   2764   |  sub DISPATCH, 1
   2765   |  jnz <1
   2766   |2:
   2767   |  movzx RC, PC_RA
   2768   |  movzx RB, PC_RB
   2769   |  add RC, RB
   2770   |  lea RC, [BASE+RC*8-8]		// RC = BASE + (RA+RB-1)*8, the limit for the nil fill.
   2771   |3:
   2772   |  cmp RC, RA
   2773   |  ja >9				// More results wanted?
   2774   |
   2775   |  mov DISPATCH, TMP3			// Restore DISPATCH.
   2776   |  mov TRACE:RD, TMP1			// Get previous trace.
   2777   |  movzx RB, word TRACE:RD->traceno
   2778   |  movzx RD, word TRACE:RD->link
   2779   |  cmp RD, RB
   2780   |  je ->cont_nop			// Blacklisted.
   2781   |  test RD, RD
   2782   |  jne =>BC_JLOOP			// Jump to stitched trace.
   2783   |
   2784   |  // Stitch a new trace to the previous trace.
   2785   |  mov [DISPATCH+DISPATCH_J(exitno)], RB	// exitno = traceno of the previous trace.
   2786   |  mov L:RB, SAVE_L
   2787   |  mov L:RB->base, BASE
   2788   |  mov FCARG2, PC
   2789   |  lea FCARG1, [DISPATCH+GG_DISP2J]
   2790   |  mov aword [DISPATCH+DISPATCH_J(L)], L:RBa
   2791   |  call extern lj_dispatch_stitch@8	// (jit_State *J, const BCIns *pc)
   2792   |  mov BASE, L:RB->base
   2793   |  jmp ->cont_nop
   2794   |
   2795   |9:  // Fill up results with nil.
   2796   |  mov dword [RA+4], LJ_TNIL
   2797   |  add RA, 8
   2798   |  jmp <3
   2799   |.endif
   2800   |
   2801   |->vm_profhook:			// Dispatch target for profiler hook.
          |  // Body is only emitted when the C preprocessor symbol LJ_HASPROFILE is
          |  // non-zero; the label itself is always defined. Syncs L->base, calls
          |  // lj_dispatch_profile(L, pc), reloads BASE, steps PC back by one 4 byte
          |  // instruction and continues at ->cont_nop.
   2802 #if LJ_HASPROFILE
   2803   |  mov L:RB, SAVE_L
   2804   |  mov L:RB->base, BASE
   2805   |  mov FCARG2, PC			// Caveat: FCARG2 == BASE
   2806   |  mov FCARG1, L:RB
   2807   |  call extern lj_dispatch_profile@8	// (lua_State *L, const BCIns *pc)
   2808   |  mov BASE, L:RB->base
   2809   |  // HOOK_PROFILE is off again, so re-dispatch to dynamic instruction.
   2810   |  sub PC, 4
   2811   |  jmp ->cont_nop
   2812 #endif
   2813   |
   2814   |//-----------------------------------------------------------------------
   2815   |//-- Trace exit handler -------------------------------------------------
   2816   |//-----------------------------------------------------------------------
   2817   |
   2818   |// Called from an exit stub with the exit number on the stack.
   2819   |// The 16 bit exit number is stored with two (sign-extended) push imm8.
   2820   |->vm_exit_handler:
          |  // Spills the general purpose and SSE registers to the stack, records
          |  // the exit number and parent trace number in the jit_State, calls
          |  // lj_trace_exit(J, ex) with a pointer to the spilled state and then
          |  // continues in ->vm_exit_interp with RD = its return value, the stack
          |  // repositioned to the C frame (on X64WIN that is done at label 1 there).
   2821   |.if JIT
   2822   |.if X64
          |  // After nine pushes rsp+88 is 16 bytes above the rsp at entry, so
          |  // [rbp-8] and [rbp-16] are the two slots holding the exit number bytes.
   2823   |  push r13; push r12
   2824   |  push r11; push r10; push r9; push r8
   2825   |  push rdi; push rsi; push rbp; lea rbp, [rsp+88]; push rbp
   2826   |  push rbx; push rdx; push rcx; push rax
   2827   |  movzx RC, byte [rbp-8]		// Reconstruct exit number.
   2828   |  mov RCH, byte [rbp-16]
          |  // The two exit number slots are reused to save r15 and r14.
   2829   |  mov [rbp-8], r15; mov [rbp-16], r14
   2830   |.else
          |  // ebp = esp at entry + 8; [ebp-4] and [ebp-8] hold the exit number bytes
          |  // and are reused afterwards to save edi and esi.
   2831   |  push ebp; lea ebp, [esp+12]; push ebp
   2832   |  push ebx; push edx; push ecx; push eax
   2833   |  movzx RC, byte [ebp-4]		// Reconstruct exit number.
   2834   |  mov RCH, byte [ebp-8]
   2835   |  mov [ebp-4], edi; mov [ebp-8], esi
   2836   |.endif
   2837   |  // Caveat: DISPATCH is ebx.
   2838   |  mov DISPATCH, [ebp]
   2839   |  mov RA, [DISPATCH+DISPATCH_GL(vmstate)]	// Get trace number.
   2840   |  set_vmstate EXIT
   2841   |  mov [DISPATCH+DISPATCH_J(exitno)], RC
   2842   |  mov [DISPATCH+DISPATCH_J(parent)], RA
   2843   |.if X64
   2844   |.if X64WIN
   2845   |  sub rsp, 16*8+4*8			// Room for SSE regs + save area.
   2846   |.else
   2847   |  sub rsp, 16*8			// Room for SSE regs.
   2848   |.endif
          |  // Store the low 64 bits of xmm0-xmm15 at [rbp-128] .. [rbp-8].
   2849   |  add rbp, -128
   2850   |  movsd qword [rbp-8],   xmm15; movsd qword [rbp-16],  xmm14
   2851   |  movsd qword [rbp-24],  xmm13; movsd qword [rbp-32],  xmm12
   2852   |  movsd qword [rbp-40],  xmm11; movsd qword [rbp-48],  xmm10
   2853   |  movsd qword [rbp-56],  xmm9;  movsd qword [rbp-64],  xmm8
   2854   |  movsd qword [rbp-72],  xmm7;  movsd qword [rbp-80],  xmm6
   2855   |  movsd qword [rbp-88],  xmm5;  movsd qword [rbp-96],  xmm4
   2856   |  movsd qword [rbp-104], xmm3;  movsd qword [rbp-112], xmm2
   2857   |  movsd qword [rbp-120], xmm1;  movsd qword [rbp-128], xmm0
   2858   |.else
   2859   |  sub esp, 8*8+16			// Room for SSE regs + args.
   2860   |  movsd qword [ebp-40], xmm7; movsd qword [ebp-48], xmm6
   2861   |  movsd qword [ebp-56], xmm5; movsd qword [ebp-64], xmm4
   2862   |  movsd qword [ebp-72], xmm3; movsd qword [ebp-80], xmm2
   2863   |  movsd qword [ebp-88], xmm1; movsd qword [ebp-96], xmm0
   2864   |.endif
   2865   |  // Caveat: RB is ebp.
          |  // From here on ebp/rbp no longer addresses the spill area.
   2866   |  mov L:RB, [DISPATCH+DISPATCH_GL(cur_L)]
   2867   |  mov BASE, [DISPATCH+DISPATCH_GL(jit_base)]
   2868   |  mov aword [DISPATCH+DISPATCH_J(L)], L:RBa
   2869   |  mov L:RB->base, BASE
          |  // Second argument: address of the spilled register state. The offsets
          |  // skip the 4*8 byte save area (X64WIN) or the 16 byte args area (x86).
   2870   |.if X64WIN
   2871   |  lea CARG2, [rsp+4*8]
   2872   |.elif X64
   2873   |  mov CARG2, rsp
   2874   |.else
   2875   |  lea FCARG2, [esp+16]
   2876   |.endif
   2877   |  lea FCARG1, [DISPATCH+GG_DISP2J]
   2878   |  mov dword [DISPATCH+DISPATCH_GL(jit_base)], 0
   2879   |  call extern lj_trace_exit@8	// (jit_State *J, ExitState *ex)
   2880   |  // MULTRES or negated error code returned in eax (RD).
          |  // RAa = L->cframe masked with CFRAME_RAWMASK.
   2881   |  mov RAa, L:RB->cframe
   2882   |  and RAa, CFRAME_RAWMASK
   2883   |.if X64WIN
   2884   |  // Reposition stack later.
   2885   |.elif X64
   2886   |  mov rsp, RAa			// Reposition stack to C frame.
   2887   |.else
   2888   |  mov esp, RAa			// Reposition stack to C frame.
   2889   |.endif
   2890   |  mov [RAa+CFRAME_OFS_L], L:RB	// Set SAVE_L (on-trace resume/yield).
   2891   |  mov BASE, L:RB->base
   2892   |  mov PC, [RAa+CFRAME_OFS_PC]	// Get SAVE_PC.
   2893   |.if X64
          |  // Enter ->vm_exit_interp at its label 1, skipping the stack/RAa setup
          |  // that precedes that label.
   2894   |  jmp >1
   2895   |.endif
   2896   |.endif
   2897   |->vm_exit_interp:
   2898   |  // RD = MULTRES or negated error code, BASE, PC and DISPATCH set.
          |  // Resumes the interpreter at PC: restores callee-save registers (x64),
          |  // rethrows if RD is negative, otherwise stores MULTRES, reloads KBASE
          |  // from the function at [BASE-8], sets the vmstate to INTERP and
          |  // dispatches the instruction at PC.
   2899   |.if JIT
   2900   |.if X64
   2901   |  // Restore additional callee-save registers only used in compiled code.
          |  // Label 1 is the entry point used by ->vm_exit_handler, which arrives
          |  // with RAa already set (X64WIN) or rsp already repositioned (other x64).
   2902   |.if X64WIN
   2903   |  lea RAa, [rsp+9*16+4*8]
   2904   |1:
   2905   |  movdqa xmm15, [RAa-9*16]
   2906   |  movdqa xmm14, [RAa-8*16]
   2907   |  movdqa xmm13, [RAa-7*16]
   2908   |  movdqa xmm12, [RAa-6*16]
   2909   |  movdqa xmm11, [RAa-5*16]
   2910   |  movdqa xmm10, [RAa-4*16]
   2911   |  movdqa xmm9, [RAa-3*16]
   2912   |  movdqa xmm8, [RAa-2*16]
   2913   |  movdqa xmm7, [RAa-1*16]
   2914   |  mov rsp, RAa			// Reposition stack to C frame.
   2915   |  movdqa xmm6, [RAa]
   2916   |  mov r15, CSAVE_3
   2917   |  mov r14, CSAVE_4
   2918   |.else
   2919   |  add rsp, 16			// Reposition stack to C frame.
   2920   |1:
   2921   |.endif
   2922   |  mov r13, TMPa
   2923   |  mov r12, TMPQ
   2924   |.endif
   2925   |  test RD, RD; js >9			// Check for error from exit.
   2926   |  mov L:RB, SAVE_L
   2927   |  mov MULTRES, RD
   2928   |  mov LFUNC:KBASE, [BASE-8]
   2929   |  mov KBASE, LFUNC:KBASE->pc
   2930   |  mov KBASE, [KBASE+PC2PROTO(k)]
   2931   |  mov L:RB->base, BASE
   2932   |  mov dword [DISPATCH+DISPATCH_GL(jit_base)], 0
   2933   |  set_vmstate INTERP
   2934   |  // Modified copy of ins_next which handles function header dispatch, too.
          |  // Decode: OP = byte 0, RA = byte 1, RC = upper 16 bits of the
          |  // instruction. Opcodes below BC_FUNCF dispatch with that decoded RC;
          |  // function headers get RC = MULTRES instead (label 2).
   2935   |  mov RC, [PC]
   2936   |  movzx RA, RCH
   2937   |  movzx OP, RCL
   2938   |  add PC, 4
   2939   |  shr RC, 16
   2940   |  cmp OP, BC_FUNCF			// Function header?
   2941   |  jb >3
   2942   |  cmp OP, BC_FUNCC+2			// Fast function?
   2943   |  jae >4
   2944   |2:
   2945   |  mov RC, MULTRES			// RC/RD holds nres+1.
   2946   |3:
   2947   |.if X64
   2948   |  jmp aword [DISPATCH+OP*8]
   2949   |.else
   2950   |  jmp aword [DISPATCH+OP*4]
   2951   |.endif
   2952   |
   2953   |4:  // Check frame below fast function.
          |  // RC = dword at [BASE-4]. If any FRAME_TYPE bit is set, KBASE is left
          |  // as loaded above.
   2954   |  mov RC, [BASE-4]
   2955   |  test RC, FRAME_TYPE
   2956   |  jnz <2				// Trace stitching continuation?
   2957   |  // Otherwise set KBASE for Lua function below fast function.
          |  // RC is used as a bytecode pointer here: byte [RC-3] is byte 1 (the A
          |  // operand) of the instruction before it -- presumably the calling
          |  // instruction. After the NOT, RC = -(A+1), which indexes the function
          |  // slot of the frame below relative to BASE.
   2958   |  movzx RC, byte [RC-3]
   2959   |  not RCa
   2960   |  mov LFUNC:KBASE, [BASE+RC*8-8]
   2961   |  mov KBASE, LFUNC:KBASE->pc
   2962   |  mov KBASE, [KBASE+PC2PROTO(k)]
   2963   |  jmp <2
   2964   |
   2965   |9:  // Rethrow error from the right C frame.
          |  // RD holds the negated error code; negate it back for the call.
          |  // NOTE(review): no code follows the call, so lj_err_throw is presumably
          |  // not expected to return -- confirm.
   2966   |  neg RD
   2967   |  mov FCARG1, L:RB
   2968   |  mov FCARG2, RD
   2969   |  call extern lj_err_throw@8		// (lua_State *L, int errcode)
   2970   |.endif
   2971   |
   2972   |//-----------------------------------------------------------------------
   2973   |//-- Math helper functions ----------------------------------------------
   2974   |//-----------------------------------------------------------------------
   2975   |
   2976   |// FP value rounding. Called by math.floor/math.ceil fast functions
   2977   |// and from JIT code. arg/ret is xmm0. xmm0-xmm3 and RD (eax) modified.
   2978   |.macro vm_round, name, mode, cond
          |// Emits the rounding helper ->name plus its SSE variant ->name_sse.
          |//   name: label of the generated routine.
          |//   mode: 0 = floor, 1 = ceil, 2 = trunc.
          |//   cond: on non-x64 builds, when true, ->name becomes a wrapper that
          |//         takes the argument from [esp+4] and returns the result on the
          |//         x87 stack; otherwise ->name and ->name_sse are the same code.
          |// Values with |x| >= 2^52 (and unordered compares, since jbe is also
          |// taken then) are returned unchanged.
   2979   |->name:
   2980   |.if not X64 and cond
   2981   |  movsd xmm0, qword [esp+4]
   2982   |  call ->name .. _sse
   2983   |  movsd qword [esp+4], xmm0  // Overwrite callee-owned arg.
   2984   |  fld qword [esp+4]
   2985   |  ret
   2986   |.endif
   2987   |
   2988   |->name .. _sse:
   2989   |  sseconst_abs xmm2, RDa
   2990   |  sseconst_2p52 xmm3, RDa
   2991   |  movaps xmm1, xmm0
   2992   |  andpd xmm1, xmm2			// |x|
   2993   |  ucomisd xmm3, xmm1			// No truncation if 2^52 <= |x|.
   2994   |  jbe >1
   2995   |  andnpd xmm2, xmm0			// Isolate sign bit.
   2996   |.if mode == 2		// trunc(x)?
          |  // Works on |x|: round via the 2^52 trick, correct downwards if the
          |  // rounded value exceeds |x|, then put the sign back.
   2997   |  movaps xmm0, xmm1
   2998   |  addsd xmm1, xmm3			// (|x| + 2^52) - 2^52
   2999   |  subsd xmm1, xmm3
   3000   |  sseconst_1 xmm3, RDa
   3001   |  cmpsd xmm0, xmm1, 1		// |x| < result?
   3002   |  andpd xmm0, xmm3
   3003   |  subsd xmm1, xmm0			// If yes, subtract 1.0.
   3004   |  orpd xmm1, xmm2			// Merge sign bit back in.
   3005   |.else
          |  // Round |x| via the 2^52 trick, restore the sign, then correct by +-1
          |  // if the rounded value lies on the wrong side of x. The cmpsd result is
          |  // an all-ones/all-zeros mask that selects the correction constant.
   3006   |  addsd xmm1, xmm3			// (|x| + 2^52) - 2^52
   3007   |  subsd xmm1, xmm3
   3008   |  orpd xmm1, xmm2			// Merge sign bit back in.
   3009   |  .if mode == 1		// ceil(x)?
   3010   |    sseconst_m1 xmm2, RDa		// Must subtract -1 to preserve -0.
   3011   |    cmpsd xmm0, xmm1, 6		// x > result?
   3012   |  .else			// floor(x)?
   3013   |    sseconst_1 xmm2, RDa
   3014   |    cmpsd xmm0, xmm1, 1		// x < result?
   3015   |  .endif
   3016   |  andpd xmm0, xmm2
   3017   |  subsd xmm1, xmm0			// If yes, subtract +-1.
   3018   |.endif
   3019   |  movaps xmm0, xmm1
   3020   |1:
   3021   |  ret
   3022   |.endmacro
   3023   |
          |  // Instantiate the helpers: vm_floor always gets the x87-returning
          |  // wrapper on non-x64 builds, vm_ceil and vm_trunc only in JIT builds.
   3024   |  vm_round vm_floor, 0, 1
   3025   |  vm_round vm_ceil,  1, JIT
   3026   |  vm_round vm_trunc, 2, JIT
   3027   |
   3028   |// FP modulo x%y. Called by BC_MOD* and vm_arith.
   3029   |->vm_mod:
   3030   |// Args in xmm0/xmm1, return value in xmm0.
   3031   |// Caveat: xmm0-xmm5 and RC (eax) modified!
          |// Computes x - floor(x/y)*y with x = xmm0, y = xmm1. The floor step is the
          |// same 2^52 add/subtract sequence used by the floor mode of vm_round.
          |// x is kept in xmm5 across the division.
   3032   |  movaps xmm5, xmm0
   3033   |  divsd xmm0, xmm1
   3034   |  sseconst_abs xmm2, RDa
   3035   |  sseconst_2p52 xmm3, RDa
   3036   |  movaps xmm4, xmm0
   3037   |  andpd xmm4, xmm2			// |x/y|
   3038   |  ucomisd xmm3, xmm4			// No truncation if 2^52 <= |x/y|.
   3039   |  jbe >1
   3040   |  andnpd xmm2, xmm0			// Isolate sign bit.
   3041   |  addsd xmm4, xmm3			// (|x/y| + 2^52) - 2^52
   3042   |  subsd xmm4, xmm3
   3043   |  orpd xmm4, xmm2			// Merge sign bit back in.
   3044   |  sseconst_1 xmm2, RDa
   3045   |  cmpsd xmm0, xmm4, 1		// x/y < result?
   3046   |  andpd xmm0, xmm2
   3047   |  subsd xmm4, xmm0			// If yes, subtract 1.0.
          |  // xmm4 = floor(x/y); result = x - y*floor(x/y).
   3048   |  movaps xmm0, xmm5
   3049   |  mulsd xmm1, xmm4
   3050   |  subsd xmm0, xmm1
   3051   |  ret
   3052   |1:
          |  // Quotient used as is (|x/y| >= 2^52, or the compare was unordered):
          |  // result = x - y*(x/y).
   3053   |  mulsd xmm1, xmm0
   3054   |  movaps xmm0, xmm5
   3055   |  subsd xmm0, xmm1
   3056   |  ret
   3057   |
   3058   |// Args in xmm0/eax. Ret in xmm0. xmm0-xmm1 and eax modified.
   3059   |->vm_powi_sse:
          |  // Integer power x^i by repeated squaring: x in xmm0, i in eax (signed).
          |  // i > 1 runs the square/multiply loop; i == 1 returns x; i == 0 returns
          |  // 1; i < 0 computes x^(-i) by calling label 1 and returns its reciprocal.
   3060   |  cmp eax, 1; jle >6			// i<=1?
   3061   |  // Now 1 < (unsigned)i <= 0x80000000.
   3062   |1:  // Handle leading zeros.
          |  // Square x and shift i right while the low bit of i is clear.
   3063   |  test eax, 1; jnz >2
   3064   |  mulsd xmm0, xmm0
   3065   |  shr eax, 1
   3066   |  jmp <1
   3067   |2:
          |  // Low bit is set: drop it. If nothing remains the result is xmm0;
          |  // otherwise xmm1 becomes the accumulator.
   3068   |  shr eax, 1; jz >5
   3069   |  movaps xmm1, xmm0
   3070   |3:  // Handle trailing bits.
          |  // shr sets CF to the bit shifted out and ZF when no bits remain: jz
          |  // handles the final bit, jnc skips the multiply for a zero bit.
   3071   |  mulsd xmm0, xmm0
   3072   |  shr eax, 1; jz >4
   3073   |  jnc <3
   3074   |  mulsd xmm1, xmm0
   3075   |  jmp <3
   3076   |4:
   3077   |  mulsd xmm0, xmm1
   3078   |5:
   3079   |  ret
   3080   |6:
          |  // Flags are still those of "cmp eax, 1". jb is an unsigned test, so it
          |  // is only taken for i == 0; negative i falls through.
   3081   |  je <5				// x^1 ==> x
   3082   |  jb >7				// x^0 ==> 1
   3083   |  neg eax
   3084   |  call <1
   3085   |  sseconst_1 xmm1, RDa
   3086   |  divsd xmm1, xmm0
   3087   |  movaps xmm0, xmm1
   3088   |  ret
   3089   |7:
   3090   |  sseconst_1 xmm0, RDa
   3091   |  ret
   3092   |
   3093   |//-----------------------------------------------------------------------
   3094   |//-- Miscellaneous functions --------------------------------------------
   3095   |//-----------------------------------------------------------------------
   3096   |
   3097   |// int lj_vm_cpuid(uint32_t f, uint32_t res[4])
   3098   |->vm_cpuid:
          |  // Executes CPUID for function number f with ecx = 0 and stores
          |  // eax, ebx, ecx, edx to res[0..3]. eax is not written after the cpuid
          |  // instruction, so the return value is the eax result. The x86 variant
          |  // first probes for CPUID support and returns 0 without touching res if
          |  // it is missing.
   3099   |.if X64
   3100   |  mov eax, CARG1d
          |  // The result area is addressed via rsi; on X64WIN rsi is saved and
          |  // loaded from CARG2 first.
   3101   |  .if X64WIN; push rsi; mov rsi, CARG2; .endif
   3102   |  push rbx
   3103   |  xor ecx, ecx
   3104   |  cpuid
   3105   |  mov [rsi], eax
   3106   |  mov [rsi+4], ebx
   3107   |  mov [rsi+8], ecx
   3108   |  mov [rsi+12], edx
   3109   |  pop rbx
   3110   |  .if X64WIN; pop rsi; .endif
   3111   |  ret
   3112   |.else
          |  // Read EFLAGS, flip bit 21 (0x00200000), write it back and read it
          |  // again: if the bit did not change, CPUID is not available.
   3113   |  pushfd
   3114   |  pop edx
   3115   |  mov ecx, edx
   3116   |  xor edx, 0x00200000		// Toggle ID bit in flags.
   3117   |  push edx
   3118   |  popfd
   3119   |  pushfd
   3120   |  pop edx
   3121   |  xor eax, eax			// Zero means no features supported.
   3122   |  cmp ecx, edx
   3123   |  jz >1				// No ID toggle means no CPUID support.
   3124   |  mov eax, [esp+4]			// Argument 1 is function number.
   3125   |  push edi
   3126   |  push ebx
   3127   |  xor ecx, ecx
   3128   |  cpuid
          |  // Two registers were pushed since entry, so argument 2 is at [esp+16].
   3129   |  mov edi, [esp+16]			// Argument 2 is result area.
   3130   |  mov [edi], eax
   3131   |  mov [edi+4], ebx
   3132   |  mov [edi+8], ecx
   3133   |  mov [edi+12], edx
   3134   |  pop ebx
   3135   |  pop edi
   3136   |1:
   3137   |  ret
   3138   |.endif
   3139   |
   3140   |//-----------------------------------------------------------------------
   3141   |//-- Assertions ---------------------------------------------------------
   3142   |//-----------------------------------------------------------------------
   3143   |
   3144   |->assert_bad_for_arg_type:
          |  // Assertion target: consists only of int3 breakpoint instructions. One
          |  // int3 is always emitted; a second one precedes it when the C
          |  // preprocessor symbol LUA_USE_ASSERT is defined.
   3145 #ifdef LUA_USE_ASSERT
   3146   |  int3
   3147 #endif
   3148   |  int3
   3149   |
   3150   |//-----------------------------------------------------------------------
   3151   |//-- FFI helper functions -----------------------------------------------
   3152   |//-----------------------------------------------------------------------
   3153   |
   3154   |// Handler for callback functions. Callback slot number in ah/al.
   3155   |->vm_ffi_callback:
          |  // Entry for C -> Lua callbacks (FFI builds only). Saves the incoming
          |  // argument registers and the address of the stack arguments in
          |  // CTState->cb, calls lj_ccallback_enter(cts, cf) and then calls the Lua
          |  // function found at [BASE-8] of the returned lua_State via ins_callt.
   3156   |.if FFI
   3157   |.type CTSTATE, CTState, PC
   3158   |.if not X64
   3159   |  sub esp, 16			// Leave room for SAVE_ERRF etc.
   3160   |.endif
   3161   |  saveregs_	// ebp/rbp already saved. ebp now holds global_State *.
   3162   |  lea DISPATCH, [ebp+GG_G2DISP]
   3163   |  mov CTSTATE, GL:ebp->ctype_state
          |  // The slot number arrives in ax (ah/al); zero-extend before storing.
   3164   |  movzx eax, ax
   3165   |  mov CTSTATE->cb.slot, eax
   3166   |.if X64
   3167   |  mov CTSTATE->cb.gpr[0], CARG1
   3168   |  mov CTSTATE->cb.gpr[1], CARG2
   3169   |  mov CTSTATE->cb.gpr[2], CARG3
   3170   |  mov CTSTATE->cb.gpr[3], CARG4
   3171   |  movsd qword CTSTATE->cb.fpr[0], xmm0
   3172   |  movsd qword CTSTATE->cb.fpr[1], xmm1
   3173   |  movsd qword CTSTATE->cb.fpr[2], xmm2
   3174   |  movsd qword CTSTATE->cb.fpr[3], xmm3
          |  // rax = address above the C frame, stored as cb.stack below. X64WIN adds
          |  // 4*8 bytes; the other x64 ABI also saves CARG5/CARG6 and xmm4-xmm7.
   3175   |.if X64WIN
   3176   |  lea rax, [rsp+CFRAME_SIZE+4*8]
   3177   |.else
   3178   |  lea rax, [rsp+CFRAME_SIZE]
   3179   |  mov CTSTATE->cb.gpr[4], CARG5
   3180   |  mov CTSTATE->cb.gpr[5], CARG6
   3181   |  movsd qword CTSTATE->cb.fpr[4], xmm4
   3182   |  movsd qword CTSTATE->cb.fpr[5], xmm5
   3183   |  movsd qword CTSTATE->cb.fpr[6], xmm6
   3184   |  movsd qword CTSTATE->cb.fpr[7], xmm7
   3185   |.endif
   3186   |  mov CTSTATE->cb.stack, rax
   3187   |  mov CARG2, rsp
   3188   |.else
          |  // x86: FCARG1/FCARG2 are saved as gpr[0]/gpr[1] before being reused as
          |  // scratch registers for moving the return address and saved ebp.
   3189   |  lea eax, [esp+CFRAME_SIZE+16]
   3190   |  mov CTSTATE->cb.gpr[0], FCARG1
   3191   |  mov CTSTATE->cb.gpr[1], FCARG2
   3192   |  mov CTSTATE->cb.stack, eax
   3193   |  mov FCARG1, [esp+CFRAME_SIZE+12]	// Move around misplaced retaddr/ebp.
   3194   |  mov FCARG2, [esp+CFRAME_SIZE+8]
   3195   |  mov SAVE_RET, FCARG1
   3196   |  mov SAVE_R4, FCARG2
   3197   |  mov FCARG2, esp
   3198   |.endif
   3199   |  mov SAVE_PC, CTSTATE		// Any value outside of bytecode is ok.
   3200   |  mov FCARG1, CTSTATE
   3201   |  call extern lj_ccallback_enter@8	// (CTState *cts, void *cf)
   3202   |  // lua_State * returned in eax (RD).
   3203   |  set_vmstate INTERP
          |  // BASE = L->base, RD = (L->top - BASE)/8 + 1, RB = function at [BASE-8].
   3204   |  mov BASE, L:RD->base
   3205   |  mov RD, L:RD->top
   3206   |  sub RD, BASE
   3207   |  mov LFUNC:RB, [BASE-8]
   3208   |  shr RD, 3
   3209   |  add RD, 1
   3210   |  ins_callt
   3211   |.endif
   3212   |
   3213   |->cont_ffi_callback:			// Return from FFI callback.
          |  // Syncs L->base/L->top, calls lj_ccallback_leave(cts, o) with o = RC and
          |  // then loads the C return value from CTState->cb into the return
          |  // registers before leaving the C frame.
   3214   |.if FFI
   3215   |  mov L:RA, SAVE_L
   3216   |  mov CTSTATE, [DISPATCH+DISPATCH_GL(ctype_state)]
   3217   |  mov aword CTSTATE->L, L:RAa
   3218   |  mov L:RA->base, BASE
   3219   |  mov L:RA->top, RB
   3220   |  mov FCARG1, CTSTATE
   3221   |  mov FCARG2, RC
   3222   |  call extern lj_ccallback_leave@8	// (CTState *cts, TValue *o)
   3223   |.if X64
          |  // Return value: rax = cb.gpr[0], xmm0 = cb.fpr[0].
   3224   |  mov rax, CTSTATE->cb.gpr[0]
   3225   |  movsd xmm0, qword CTSTATE->cb.fpr[0]
   3226   |  jmp ->vm_leave_unw
   3227   |.else
   3228   |  mov L:RB, SAVE_L
   3229   |  mov eax, CTSTATE->cb.gpr[0]
   3230   |  mov edx, CTSTATE->cb.gpr[1]
          |  // cb.gpr[2] selects the x87 return: 0 = nothing is loaded, 1 = load
          |  // fpr[0] as float, anything larger = load fpr[0] as double.
   3231   |  cmp dword CTSTATE->cb.gpr[2], 1
   3232   |  jb >7
   3233   |  je >6
   3234   |  fld qword CTSTATE->cb.fpr[0].d
   3235   |  jmp >7
   3236   |6:
   3237   |  fld dword CTSTATE->cb.fpr[0].f
   3238   |7:
   3239   |  mov ecx, L:RB->top
   3240   |  movzx ecx, word [ecx+6]		// Get stack adjustment and copy up.
   3241   |  mov SAVE_L, ecx			// Must be one slot above SAVE_RET
   3242   |  restoreregs
          |  // After popping the return address, [esp] is the adjustment stored in
          |  // the SAVE_L slot above; add it plus 16 bytes to esp, then return
          |  // through the saved address.
   3243   |  pop ecx				// Move return addr from SAVE_RET.
   3244   |  add esp, [esp]			// Adjust stack.
   3245   |  add esp, 16
   3246   |  push ecx
   3247   |  ret
   3248   |.endif
   3249   |.endif
   3250   |
   3251   |->vm_ffi_call@4:			// Call C function via FFI.
   3252   |  // Caveat: needs special frame unwinding, see below.
          |  // Takes a CCallState pointer (CARG1 on x64, FCARG1 on x86), kept in
          |  // rbx/ebx for the whole routine. Lowers the stack pointer by spadj,
          |  // copies nsp stack slots, loads the argument registers from the state,
          |  // calls CCSTATE->func and writes the return registers back into it.
   3253   |.if FFI
   3254   |.if X64
   3255   |  .type CCSTATE, CCallState, rbx
   3256   |  push rbp; mov rbp, rsp; push rbx; mov CCSTATE, CARG1
   3257   |.else
   3258   |  .type CCSTATE, CCallState, ebx
   3259   |  push ebp; mov ebp, esp; push ebx; mov CCSTATE, FCARG1
   3260   |.endif
   3261   |
   3262   |  // Readjust stack.
          |  // x86 WIN: spadj is overwritten with esp as it is before the call; the
          |  // matching "sub CCSTATE->spadj, esp" after the call leaves the difference.
   3263   |.if X64
   3264   |  mov eax, CCSTATE->spadj
   3265   |  sub rsp, rax
   3266   |.else
   3267   |  sub esp, CCSTATE->spadj
   3268   |.if WIN
   3269   |  mov CCSTATE->spadj, esp
   3270   |.endif
   3271   |.endif
   3272   |
   3273   |  // Copy stack slots.
          |  // Copies slots nsp-1 down to 0; skipped entirely when nsp == 0. On x64
          |  // the destination is offset by CCALL_SPS_EXTRA slots.
   3274   |  movzx ecx, byte CCSTATE->nsp
   3275   |  sub ecx, 1
   3276   |  js >2
   3277   |1:
   3278   |.if X64
   3279   |  mov rax, [CCSTATE+rcx*8+offsetof(CCallState, stack)]
   3280   |  mov [rsp+rcx*8+CCALL_SPS_EXTRA*8], rax
   3281   |.else
   3282   |  mov eax, [CCSTATE+ecx*4+offsetof(CCallState, stack)]
   3283   |  mov [esp+ecx*4], eax
   3284   |.endif
   3285   |  sub ecx, 1
   3286   |  jns <1
   3287   |2:
   3288   |
   3289   |.if X64
          |  // eax = nfpr and is still set at the call. FP argument registers are
          |  // only loaded when nfpr != 0, and xmm4-xmm7 only when nfpr > 4.
          |  // NOTE(review): eax presumably doubles as the vector register count
          |  // expected in al for variadic calls -- confirm against the target ABI.
   3290   |  movzx eax, byte CCSTATE->nfpr
   3291   |  mov CARG1, CCSTATE->gpr[0]
   3292   |  mov CARG2, CCSTATE->gpr[1]
   3293   |  mov CARG3, CCSTATE->gpr[2]
   3294   |  mov CARG4, CCSTATE->gpr[3]
   3295   |.if not X64WIN
   3296   |  mov CARG5, CCSTATE->gpr[4]
   3297   |  mov CARG6, CCSTATE->gpr[5]
   3298   |.endif
   3299   |  test eax, eax; jz >5
   3300   |  movaps xmm0, CCSTATE->fpr[0]
   3301   |  movaps xmm1, CCSTATE->fpr[1]
   3302   |  movaps xmm2, CCSTATE->fpr[2]
   3303   |  movaps xmm3, CCSTATE->fpr[3]
   3304   |.if not X64WIN
   3305   |  cmp eax, 4; jbe >5
   3306   |  movaps xmm4, CCSTATE->fpr[4]
   3307   |  movaps xmm5, CCSTATE->fpr[5]
   3308   |  movaps xmm6, CCSTATE->fpr[6]
   3309   |  movaps xmm7, CCSTATE->fpr[7]
   3310   |.endif
   3311   |5:
   3312   |.else
   3313   |  mov FCARG1, CCSTATE->gpr[0]
   3314   |  mov FCARG2, CCSTATE->gpr[1]
   3315   |.endif
   3316   |
   3317   |  call aword CCSTATE->func
   3318   |
          |  // Store the return registers back into the call state.
   3319   |.if X64
   3320   |  mov CCSTATE->gpr[0], rax
   3321   |  movaps CCSTATE->fpr[0], xmm0
   3322   |.if not X64WIN
   3323   |  mov CCSTATE->gpr[1], rdx
   3324   |  movaps CCSTATE->fpr[1], xmm1
   3325   |.endif
   3326   |.else
   3327   |  mov CCSTATE->gpr[0], eax
   3328   |  mov CCSTATE->gpr[1], edx
          |  // resx87 selects the x87 result: 0 = nothing is popped, 1 = pop as
          |  // float, anything larger = pop as double into fpr[0].
   3329   |  cmp byte CCSTATE->resx87, 1
   3330   |  jb >7
   3331   |  je >6
   3332   |  fstp qword CCSTATE->fpr[0].d[0]
   3333   |  jmp >7
   3334   |6:
   3335   |  fstp dword CCSTATE->fpr[0].f[0]
   3336   |7:
   3337   |.if WIN
   3338   |  sub CCSTATE->spadj, esp
   3339   |.endif
   3340   |.endif
   3341   |
          |  // Restore rbx/ebx from its slot below the frame pointer and unwind.
   3342   |.if X64
   3343   |  mov rbx, [rbp-8]; leave; ret
   3344   |.else
   3345   |  mov ebx, [ebp-4]; leave; ret
   3346   |.endif
   3347   |.endif
   3348   |// Note: vm_ffi_call must be the last function in this object file!
   3349   |
   3350   |//-----------------------------------------------------------------------
   3351 }
   3352 
   3353 /* Generate the code for a single instruction. */
   3354 static void build_ins(BuildCtx *ctx, BCOp op, int defop)
   3355 {
   3356   int vk = 0;
   3357   |// Note: aligning all instructions does not pay off.
   3358   |=>defop:
   3359 
   3360   switch (op) {
   3361 
   3362   /* -- Comparison ops ---------------------------------------------------- */
   3363 
   3364   /* Remember: all ops branch for a true comparison, fall through otherwise. */
   3365 
   3366   |.macro jmp_comp, lt, ge, le, gt, target
          |// Emits exactly one conditional jump to target. The C switch on op runs
          |// at build time and picks the mnemonic: lt for BC_ISLT, ge for BC_ISGE,
          |// le for BC_ISLE, gt for BC_ISGT. Nothing is emitted for any other op.
   3367   ||switch (op) {
   3368   ||case BC_ISLT:
   3369   |   lt target
   3370   ||break;
   3371   ||case BC_ISGE:
   3372   |   ge target
   3373   ||break;
   3374   ||case BC_ISLE:
   3375   |   le target
   3376   ||break;
   3377   ||case BC_ISGT:
   3378   |   gt target
   3379   ||break;
   3380   ||default: break;  /* Shut up GCC. */
   3381   ||}
   3382   |.endmacro
   3383 
   3384   case BC_ISLT: case BC_ISGE: case BC_ISLE: case BC_ISGT:
            /* Ordered comparisons. The jmp_comp arguments are the jumps that SKIP
            ** the branch, i.e. the inverse of the tested relation; when none is
            ** taken, the jump target is read from the following instruction. */
   3385     |  // RA = src1, RD = src2, JMP with RD = target
   3386     |  ins_AD
   3387     |.if DUALNUM
   3388     |  checkint RA, >7
   3389     |  checkint RD, >8
            |  // Both operands are integers: signed 32 bit compare of RA with RD.
   3390     |  mov RB, dword [BASE+RA*8]
   3391     |  add PC, 4
   3392     |  cmp RB, dword [BASE+RD*8]
   3393     |  jmp_comp jge, jl, jg, jle, >9
   3394     |6:
   3395     |  movzx RD, PC_RD
   3396     |  branchPC RD
   3397     |9:
   3398     |  ins_next
   3399     |
   3400     |7:  // RA is not an integer.
   3401     |  ja ->vmeta_comp
   3402     |  // RA is a number.
   3403     |  cmp dword [BASE+RD*8+4], LJ_TISNUM; jb >1; jne ->vmeta_comp
   3404     |  // RA is a number, RD is an integer.
   3405     |  cvtsi2sd xmm0, dword [BASE+RD*8]
   3406     |  jmp >2
   3407     |
   3408     |8:  // RA is an integer, RD is not an integer.
   3409     |  ja ->vmeta_comp
   3410     |  // RA is an integer, RD is a number.
   3411     |  cvtsi2sd xmm1, dword [BASE+RA*8]
   3412     |  movsd xmm0, qword [BASE+RD*8]
   3413     |  add PC, 4
   3414     |  ucomisd xmm0, xmm1
   3415     |  jmp_comp jbe, ja, jb, jae, <9
   3416     |  jmp <6
   3417     |.else
   3418     |  checknum RA, ->vmeta_comp
   3419     |  checknum RD, ->vmeta_comp
   3420     |.endif
   3421     |1:
   3422     |  movsd xmm0, qword [BASE+RD*8]
   3423     |2:
            |  // xmm0 = src2 (RD) is compared against src1 (RA), so the operand
            |  // order is swapped relative to the integer path; the jump conditions
            |  // passed to jmp_comp below account for that.
   3424     |  add PC, 4
   3425     |  ucomisd xmm0, qword [BASE+RA*8]
   3426     |3:
   3427     |  // Unordered: all of ZF CF PF set, ordered: PF clear.
   3428     |  // To preserve NaN semantics GE/GT branch on unordered, but LT/LE don't.
   3429     |.if DUALNUM
   3430     |  jmp_comp jbe, ja, jb, jae, <9
   3431     |  jmp <6
   3432     |.else
   3433     |  jmp_comp jbe, ja, jb, jae, >1
   3434     |  movzx RD, PC_RD
   3435     |  branchPC RD
   3436     |1:
   3437     |  ins_next
   3438     |.endif
   3439     break;
   3440 
   3441   case BC_ISEQV: case BC_ISNEV:
            /* Equality ops. vk is non-zero for the EQ variant of each pair. The
            ** four case groups below share emitted code through the C labels
            ** iseqne_test, iseqne_fp and iseqne_end, which are reached by goto. */
   3442     vk = op == BC_ISEQV;
   3443     |  ins_AD	// RA = src1, RD = src2, JMP with RD = target
   3444     |  mov RB, [BASE+RD*8+4]
   3445     |  add PC, 4
   3446     |.if DUALNUM
   3447     |  cmp RB, LJ_TISNUM; jne >7
   3448     |  checkint RA, >8
            |  // Both operands are integers: compare the 32 bit payloads.
   3449     |  mov RB, dword [BASE+RD*8]
   3450     |  cmp RB, dword [BASE+RA*8]
   3451     if (vk) {
   3452       |  jne >9
   3453     } else {
   3454       |  je >9
   3455     }
   3456     |  movzx RD, PC_RD
   3457     |  branchPC RD
   3458     |9:
   3459     |  ins_next
   3460     |
   3461     |7:  // RD is not an integer.
   3462     |  ja >5
   3463     |  // RD is a number.
   3464     |  cmp dword [BASE+RA*8+4], LJ_TISNUM; jb >1; jne >5
   3465     |  // RD is a number, RA is an integer.
   3466     |  cvtsi2sd xmm0, dword [BASE+RA*8]
   3467     |  jmp >2
   3468     |
   3469     |8:  // RD is an integer, RA is not an integer.
   3470     |  ja >5
   3471     |  // RD is an integer, RA is a number.
   3472     |  cvtsi2sd xmm0, dword [BASE+RD*8]
   3473     |  ucomisd xmm0, qword [BASE+RA*8]
   3474     |  jmp >4
   3475     |
   3476     |.else
   3477     |  cmp RB, LJ_TISNUM; jae >5
   3478     |  checknum RA, >5
   3479     |.endif
   3480     |1:
   3481     |  movsd xmm0, qword [BASE+RA*8]
   3482     |2:
   3483     |  ucomisd xmm0, qword [BASE+RD*8]
   3484     |4:
          /* Shared FP tail (also entered from BC_ISEQN/BC_ISNEN): flags come from
          ** a ucomisd. Forward label 1 is the "equal" outcome and forward label 2
          ** the "not equal" outcome; both are defined in iseqne_end below. */
   3485   iseqne_fp:
   3486     if (vk) {
   3487       |  jp >2				// Unordered means not equal.
   3488       |  jne >2
   3489     } else {
   3490       |  jp >2				// Unordered means not equal.
   3491       |  je >1
   3492     }
          /* Shared branch tail. Label 3 is defined here only in non-FFI builds,
          ** where it coincides with label 2; FFI builds define label 3 in the
          ** per-op code instead. */
   3493   iseqne_end:
   3494     if (vk) {
   3495       |1:				// EQ: Branch to the target.
   3496       |  movzx RD, PC_RD
   3497       |  branchPC RD
   3498       |2:				// NE: Fallthrough to next instruction.
   3499       |.if not FFI
   3500       |3:
   3501       |.endif
   3502     } else {
   3503       |.if not FFI
   3504       |3:
   3505       |.endif
   3506       |2:				// NE: Branch to the target.
   3507       |  movzx RD, PC_RD
   3508       |  branchPC RD
   3509       |1:				// EQ: Fallthrough to next instruction.
   3510     }
            /* With LJ_DUALNUM the V and N variants already emitted an ins_next at
            ** their label 9, so they jump back to it instead of emitting another. */
   3511     if (LJ_DUALNUM && (op == BC_ISEQV || op == BC_ISNEV ||
   3512 		       op == BC_ISEQN || op == BC_ISNEN)) {
   3513       |  jmp <9
   3514     } else {
   3515       |  ins_next
   3516     }
   3517     |
   3518     if (op == BC_ISEQV || op == BC_ISNEV) {
   3519       |5:  // Either or both types are not numbers.
   3520       |.if FFI
   3521       |  cmp RB, LJ_TCDATA; je ->vmeta_equal_cd
   3522       |  checktp RA, LJ_TCDATA; je ->vmeta_equal_cd
   3523       |.endif
   3524       |  checktp RA, RB			// Compare types.
   3525       |  jne <2				// Not the same type?
   3526       |  cmp RB, LJ_TISPRI
   3527       |  jae <1				// Same type and primitive type?
   3528       |
   3529       |  // Same types and not a primitive type. Compare GCobj or pvalue.
   3530       |  mov RA, [BASE+RA*8]
   3531       |  mov RD, [BASE+RD*8]
   3532       |  cmp RA, RD
   3533       |  je <1				// Same GCobjs or pvalues?
   3534       |  cmp RB, LJ_TISTABUD
   3535       |  ja <2				// Different objects and not table/ud?
   3536       |.if X64
   3537       |  cmp RB, LJ_TUDATA		// And not 64 bit lightuserdata.
   3538       |  jb <2
   3539       |.endif
   3540       |
   3541       |  // Different tables or userdatas. Need to check __eq metamethod.
   3542       |  // Field metatable must be at same offset for GCtab and GCudata!
   3543       |  mov TAB:RB, TAB:RA->metatable
   3544       |  test TAB:RB, TAB:RB
   3545       |  jz <2				// No metatable?
   3546       |  test byte TAB:RB->nomm, 1<<MM_eq
   3547       |  jnz <2				// Or 'no __eq' flag set?
   3548       if (vk) {
   3549 	|  xor RB, RB			// ne = 0
   3550       } else {
   3551 	|  mov RB, 1			// ne = 1
   3552       }
   3553       |  jmp ->vmeta_equal		// Handle __eq metamethod.
   3554     } else {
              /* S and N variants in FFI builds: label 3 is their type-mismatch
              ** target. A cdata operand goes to ->vmeta_equal_cd; anything else
              ** counts as not equal. */
   3555       |.if FFI
   3556       |3:
   3557       |  cmp RB, LJ_TCDATA
   3558       if (LJ_DUALNUM && vk) {
   3559 	|  jne <9
   3560       } else {
   3561 	|  jne <2
   3562       }
   3563       |  jmp ->vmeta_equal_cd
   3564       |.endif
   3565     }
   3566     break;
   3567   case BC_ISEQS: case BC_ISNES:
   3568     vk = op == BC_ISEQS;
   3569     |  ins_AND	// RA = src, RD = str const, JMP with RD = target
   3570     |  mov RB, [BASE+RA*8+4]
   3571     |  add PC, 4
   3572     |  cmp RB, LJ_TSTR; jne >3
            |  // Compare the value dword in RA's slot with the constant at
            |  // [KBASE+RD*4].
   3573     |  mov RA, [BASE+RA*8]
   3574     |  cmp RA, [KBASE+RD*4]
          /* Shared test tail (also entered from BC_ISEQP/BC_ISNEP when LJ_HASFFI
          ** is off): flags come from an integer cmp. */
   3575   iseqne_test:
   3576     if (vk) {
   3577       |  jne >2
   3578     } else {
   3579       |  je >1
   3580     }
   3581     goto iseqne_end;
   3582   case BC_ISEQN: case BC_ISNEN:
   3583     vk = op == BC_ISEQN;
   3584     |  ins_AD	// RA = src, RD = num const, JMP with RD = target
   3585     |  mov RB, [BASE+RA*8+4]
   3586     |  add PC, 4
   3587     |.if DUALNUM
   3588     |  cmp RB, LJ_TISNUM; jne >7
   3589     |  cmp dword [KBASE+RD*8+4], LJ_TISNUM; jne >8
            |  // Slot and constant are both integers: compare the 32 bit payloads.
   3590     |  mov RB, dword [KBASE+RD*8]
   3591     |  cmp RB, dword [BASE+RA*8]
   3592     if (vk) {
   3593       |  jne >9
   3594     } else {
   3595       |  je >9
   3596     }
   3597     |  movzx RD, PC_RD
   3598     |  branchPC RD
   3599     |9:
   3600     |  ins_next
   3601     |
   3602     |7:  // RA is not an integer.
   3603     |  ja >3
   3604     |  // RA is a number.
   3605     |  cmp dword [KBASE+RD*8+4], LJ_TISNUM; jb >1
   3606     |  // RA is a number, RD is an integer.
   3607     |  cvtsi2sd xmm0, dword [KBASE+RD*8]
   3608     |  jmp >2
   3609     |
   3610     |8:  // RA is an integer, RD is a number.
   3611     |  cvtsi2sd xmm0, dword [BASE+RA*8]
   3612     |  ucomisd xmm0, qword [KBASE+RD*8]
   3613     |  jmp >4
   3614     |.else
   3615     |  cmp RB, LJ_TISNUM; jae >3
   3616     |.endif
   3617     |1:
   3618     |  movsd xmm0, qword [KBASE+RD*8]
   3619     |2:
   3620     |  ucomisd xmm0, qword [BASE+RA*8]
   3621     |4:
   3622     goto iseqne_fp;
   3623   case BC_ISEQP: case BC_ISNEP:
   3624     vk = op == BC_ISEQP;
   3625     |  ins_AND	// RA = src, RD = primitive type (~), JMP with RD = target
   3626     |  mov RB, [BASE+RA*8+4]
   3627     |  add PC, 4
   3628     |  cmp RB, RD
            /* Without FFI the shared tails suffice. With FFI the code below adds
            ** a check that sends cdata operands to ->vmeta_equal_cd. */
   3629     if (!LJ_HASFFI) goto iseqne_test;
   3630     if (vk) {
   3631       |  jne >3
   3632       |  movzx RD, PC_RD
   3633       |  branchPC RD
   3634       |2:
   3635       |  ins_next
   3636       |3:
   3637       |  cmp RB, LJ_TCDATA; jne <2
   3638       |  jmp ->vmeta_equal_cd
   3639     } else {
   3640       |  je >2
   3641       |  cmp RB, LJ_TCDATA; je ->vmeta_equal_cd
   3642       |  movzx RD, PC_RD
   3643       |  branchPC RD
   3644       |2:
   3645       |  ins_next
   3646     }
   3647     break;
   3648 
   3649   /* -- Unary test and copy ops ------------------------------------------- */
   3650 
   3651   case BC_ISTC: case BC_ISFC: case BC_IST: case BC_ISF:
            /* Truthiness tests. The type tag of RD is compared with
            ** LJ_TISTRUECOND: IST/ISTC skip the branch when the tag is >= that
            ** bound, ISF/ISFC when it is below. The C variants also copy the RD
            ** slot to RA before branching.
            ** NOTE(review): tags >= LJ_TISTRUECOND are presumably the false
            ** values (nil/false) -- confirm against the tag definitions. */
   3652     |  ins_AD	// RA = dst or unused, RD = src, JMP with RD = target
   3653     |  mov RB, [BASE+RD*8+4]
   3654     |  add PC, 4
   3655     |  cmp RB, LJ_TISTRUECOND
   3656     if (op == BC_IST || op == BC_ISTC) {
   3657       |  jae >1
   3658     } else {
   3659       |  jb >1
   3660     }
   3661     if (op == BC_ISTC || op == BC_ISFC) {
              |  // Copy tag (already in RB) and value dword from RD's slot to RA's.
   3662       |  mov [BASE+RA*8+4], RB
   3663       |  mov RB, [BASE+RD*8]
   3664       |  mov [BASE+RA*8], RB
   3665     }
   3666     |  movzx RD, PC_RD
   3667     |  branchPC RD
   3668     |1:					// Fallthrough to the next instruction.
   3669     |  ins_next
   3670     break;
   3671 
   3672   case BC_ISTYPE:  // Type assertion on slot RA. Any mismatch goes to ->vmeta_istype.
   3673     |  ins_AD	// RA = src, RD = -type
   3674     |  add RD, [BASE+RA*8+4]  // RD holds the negated type, so the sum is zero only when the slot's tag matches.
   3675     |  jne ->vmeta_istype
   3676     |  ins_next
   3677     break;
   3678   case BC_ISNUM:  // Number assertion on slot RA. Failures go to ->vmeta_istype.
   3679     |  ins_AD	// RA = src, RD = -(TISNUM-1)
   3680     |  checknum RA, ->vmeta_istype  // Shares the fallback target with BC_ISTYPE.
   3681     |  ins_next
   3682     break;
   3683 
   3684   /* -- Unary ops --------------------------------------------------------- */
   3685 
   3686   case BC_MOV:  // Copy one 8-byte stack slot: slot RD -> slot RA.
   3687     |  ins_AD	// RA = dst, RD = src
   3688     |.if X64
   3689     |  mov RBa, [BASE+RD*8]  // RBa is presumably the pointer-width alias of RB, so one move covers the whole slot.
   3690     |  mov [BASE+RA*8], RBa
   3691     |.else
   3692     |  mov RB, [BASE+RD*8+4]  // Load the tag first: the next line overwrites the index in RD.
   3693     |  mov RD, [BASE+RD*8]  // RD now holds the payload dword instead of the slot index.
   3694     |  mov [BASE+RA*8+4], RB
   3695     |  mov [BASE+RA*8], RD
   3696     |.endif
   3697     |  ins_next_
   3698     break;
   3699   case BC_NOT:  // Store a type tag into slot RA that is derived from the type tag of slot RD.
   3700     |  ins_AD	// RA = dst, RD = src
   3701     |  xor RB, RB  // RB = 0, so the adc below yields LJ_TTRUE plus the carry.
   3702     |  checktp RD, LJ_TISTRUECOND  // Presumably a compare of RD's tag that sets the carry flag (macro defined elsewhere).
   3703     |  adc RB, LJ_TTRUE  // RB = LJ_TTRUE + carry.
   3704     |  mov [BASE+RA*8+4], RB  // Only the tag dword of the destination is written.
   3705     |  ins_next
   3706     break;
   3707   case BC_UNM:  // Arithmetic negation of slot RD into slot RA. Unsupported operands go to ->vmeta_unm.
   3708     |  ins_AD	// RA = dst, RD = src
   3709     |.if DUALNUM
   3710     |  checkint RD, >5  // Not an integer: continue at 5, which reuses the flags from this check.
   3711     |  mov RB, [BASE+RD*8]
   3712     |  neg RB
   3713     |  jo >4  // Integer negation overflowed: store the result as a double at 4.
   3714     |  mov dword [BASE+RA*8+4], LJ_TISNUM
   3715     |  mov dword [BASE+RA*8], RB
   3716     |9:  // Common exit for all DUALNUM paths.
   3717     |  ins_next
   3718     |4:  // Overflow case: write the two dwords of a double constant.
   3719     |  mov dword [BASE+RA*8+4], 0x41e00000  // 2^31.
   3720     |  mov dword [BASE+RA*8], 0
   3721     |  jmp <9
   3722     |5:
   3723     |  ja ->vmeta_unm  // Relies on the flags left by checkint above.
   3724     |.else
   3725     |  checknum RD, ->vmeta_unm
   3726     |.endif
   3727     |  movsd xmm0, qword [BASE+RD*8]
   3728     |  sseconst_sign xmm1, RDa  // Presumably builds the sign-bit mask in xmm1 using RDa as scratch (macro defined elsewhere).
   3729     |  xorps xmm0, xmm1  // XOR the operand with that constant.
   3730     |  movsd qword [BASE+RA*8], xmm0
   3731     |.if DUALNUM
   3732     |  jmp <9
   3733     |.else
   3734     |  ins_next
   3735     |.endif
   3736     break;
   3737   case BC_LEN:  // Length of slot RD into slot RA: inline for strings, lj_tab_len for tables, ->vmeta_len otherwise.
   3738     |  ins_AD	// RA = dst, RD = src
   3739     |  checkstr RD, >2  // Not a string: try the table path at 2.
   3740     |  mov STR:RD, [BASE+RD*8]
   3741     |.if DUALNUM
   3742     |  mov RD, dword STR:RD->len
   3743     |1:  // Store the integer length held in RD. The table path also jumps back here.
   3744     |  mov dword [BASE+RA*8+4], LJ_TISNUM
   3745     |  mov dword [BASE+RA*8], RD
   3746     |.else
   3747     |  xorps xmm0, xmm0  // Zero xmm0 before the integer-to-double conversion.
   3748     |  cvtsi2sd xmm0, dword STR:RD->len
   3749     |1:  // Store the double length held in xmm0. The table path also jumps back here.
   3750     |  movsd qword [BASE+RA*8], xmm0
   3751     |.endif
   3752     |  ins_next
   3753     |2:
   3754     |  checktab RD, ->vmeta_len  // Neither string nor table: metamethod fallback.
   3755     |  mov TAB:FCARG1, [BASE+RD*8]  // The table pointer becomes the argument of lj_tab_len.
   3756     |  mov TAB:RB, TAB:FCARG1->metatable
   3757     |  cmp TAB:RB, 0
   3758     |  jnz >9  // Has a metatable: inspect its cached __len bit at 9.
   3759     |3:
   3760     |->BC_LEN_Z:  // Global label, so this tail can be entered from outside this case.
   3761     |  mov RB, BASE			// Save BASE.
   3762     |  call extern lj_tab_len@4		// (GCtab *t)
   3763     |  // Length of table returned in eax (RD).
   3764     |.if DUALNUM
   3765     |  // Nothing to do.
   3766     |.else
   3767     |  cvtsi2sd xmm0, RD
   3768     |.endif
   3769     |  mov BASE, RB			// Restore BASE.
   3770     |  movzx RA, PC_RA  // Reload RA: it was overwritten by the argument setup (FCARG1 == RA, see caveats elsewhere in this file).
   3771     |  jmp <1
   3772     |9:  // Check for __len.
   3773     |  test byte TAB:RB->nomm, 1<<MM_len
   3774     |  jnz <3  // Cache bit set: take the plain table-length path.
   3775     |  jmp ->vmeta_len			// 'no __len' flag NOT set: check.
   3776     break;
   3777 #if LJ_53
   3778   case BC_BNOT:  // No inline fast path is emitted: always continues at ->vmeta_unm.
   3779     |  ins_AD	// RA = dst, RD = src
   3780     |  jmp ->vmeta_unm
   3781     break;
   3782 
   3783   /* -- Binary ops -------------------------------------------------------- */
   3784   case BC_IDIV:  // These six opcodes share one body with no inline fast path.
   3785   case BC_BAND:
   3786   case BC_BOR:
   3787   case BC_BXOR:
   3788   case BC_SHL:
   3789   case BC_SHR:
   3790     |  ins_ABC
   3791     |  jmp ->vmeta_arith_vv  // Always take the generic arithmetic fallback.
   3792     break;
   3793 #endif
   3794     |.macro ins_arithpre, sseins, ssereg  // Check both operands are numbers, load the first into xmm0, then apply sseins to ssereg with the second.
   3795     |  ins_ABC
   3796     ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN);
   3797     ||switch (vk) {
   3798     ||case 0:
   3799     |   checknum RB, ->vmeta_arith_vn  // vk == 0: first operand from the stack, second from the constants.
   3800     |   .if DUALNUM
   3801     |     cmp dword [KBASE+RC*8+4], LJ_TISNUM; jae ->vmeta_arith_vn  // The constant must have a tag below LJ_TISNUM, else fall back.
   3802     |   .endif
   3803     |   movsd xmm0, qword [BASE+RB*8]
   3804     |   sseins ssereg, qword [KBASE+RC*8]
   3805     ||  break;
   3806     ||case 1:
   3807     |   checknum RB, ->vmeta_arith_nv  // vk == 1: first operand from the constants, second from the stack.
   3808     |   .if DUALNUM
   3809     |     cmp dword [KBASE+RC*8+4], LJ_TISNUM; jae ->vmeta_arith_nv  // The constant must have a tag below LJ_TISNUM, else fall back.
   3810     |   .endif
   3811     |   movsd xmm0, qword [KBASE+RC*8]
   3812     |   sseins ssereg, qword [BASE+RB*8]
   3813     ||  break;
   3814     ||default:
   3815     |   checknum RB, ->vmeta_arith_vv  // Otherwise: both operands from the stack.
   3816     |   checknum RC, ->vmeta_arith_vv
   3817     |   movsd xmm0, qword [BASE+RB*8]
   3818     |   sseins ssereg, qword [BASE+RC*8]
   3819     ||  break;
   3820     ||}
   3821     |.endmacro
   3822     |
   3823     |.macro ins_arithdn, intins  // Integer-only variant: both operands must be integers. Overflow goes to the ->vmeta_arith_*o labels.
   3824     |  ins_ABC
   3825     ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN);
   3826     ||switch (vk) {
   3827     ||case 0:
   3828     |   checkint RB, ->vmeta_arith_vn
   3829     |   cmp dword [KBASE+RC*8+4], LJ_TISNUM; jne ->vmeta_arith_vn  // The constant's tag must equal LJ_TISNUM.
   3830     |   mov RB, [BASE+RB*8]
   3831     |   intins RB, [KBASE+RC*8]; jo ->vmeta_arith_vno
   3832     ||  break;
   3833     ||case 1:
   3834     |   checkint RB, ->vmeta_arith_nv
   3835     |   cmp dword [KBASE+RC*8+4], LJ_TISNUM; jne ->vmeta_arith_nv  // The constant's tag must equal LJ_TISNUM.
   3836     |   mov RC, [KBASE+RC*8]  // The constant is the left operand here, so the result ends up in RC.
   3837     |   intins RC, [BASE+RB*8]; jo ->vmeta_arith_nvo
   3838     ||  break;
   3839     ||default:
   3840     |   checkint RB, ->vmeta_arith_vv
   3841     |   checkint RC, ->vmeta_arith_vv
   3842     |   mov RB, [BASE+RB*8]
   3843     |   intins RB, [BASE+RC*8]; jo ->vmeta_arith_vvo
   3844     ||  break;
   3845     ||}
   3846     |  mov dword [BASE+RA*8+4], LJ_TISNUM  // Tag the result slot as an integer.
   3847     ||if (vk == 1) {
   3848     |   mov dword [BASE+RA*8], RC
   3849     ||} else {
   3850     |   mov dword [BASE+RA*8], RB
   3851     ||}
   3852     |  ins_next
   3853     |.endmacro
   3854     |
   3855     |.macro ins_arithpost  // Store the double result in xmm0 into slot RA.
   3856     |  movsd qword [BASE+RA*8], xmm0
   3857     |.endmacro
   3858     |
   3859     |.macro ins_arith, sseins  // One-argument form: SSE path only.
   3860     |  ins_arithpre sseins, xmm0
   3861     |  ins_arithpost
   3862     |  ins_next
   3863     |.endmacro
   3864     |
   3865     |.macro ins_arith, intins, sseins  // Two-argument form: integer path under DUALNUM, otherwise the one-argument SSE form.
   3866     |.if DUALNUM
   3867     |  ins_arithdn intins
   3868     |.else
   3869     |  ins_arith, sseins
   3870     |.endif
   3871     |.endmacro
   3872 
   3873     |  // RA = dst, RB = src1 or num const, RC = src2 or num const
   3874   case BC_ADDVN: case BC_ADDNV: case BC_ADDVV:  // Addition, all three operand layouts.
   3875     |  ins_arith add, addsd  // Two-argument form: integer add under DUALNUM, addsd otherwise.
   3876     break;
   3877   case BC_SUBVN: case BC_SUBNV: case BC_SUBVV:  // Subtraction, all three operand layouts.
   3878     |  ins_arith sub, subsd  // Two-argument form: integer sub under DUALNUM, subsd otherwise.
   3879     break;
   3880   case BC_MULVN: case BC_MULNV: case BC_MULVV:  // Multiplication, all three operand layouts.
   3881     |  ins_arith imul, mulsd  // Two-argument form: integer imul under DUALNUM, mulsd otherwise.
   3882     break;
   3883   case BC_DIVVN: case BC_DIVNV: case BC_DIVVV:  // Division, all three operand layouts.
   3884     |  ins_arith divsd  // One-argument form: always the SSE path, even under DUALNUM.
   3885     break;
   3886   case BC_MODVN:  // Modulo: operands are placed in xmm0/xmm1, then ->vm_mod is called and xmm0 is stored.
   3887     |  ins_arithpre movsd, xmm1  // Passing movsd as the operation just loads the second operand into xmm1.
   3888     |->BC_MODVN_Z:  // Shared tail, also entered from BC_MODNV/BC_MODVV below.
   3889     |  call ->vm_mod
   3890     |  ins_arithpost
   3891     |  ins_next
   3892     break;
   3893   case BC_MODNV: case BC_MODVV:  // Same operand setup, then reuse the BC_MODVN tail.
   3894     |  ins_arithpre movsd, xmm1
   3895     |  jmp ->BC_MODVN_Z			// Avoid 3 copies. It's slow anyway.
   3896     break;
   3897   case BC_POW:  // Power: calls the external pow on the two operands and stores the result in slot RA.
   3898     |  ins_arithpre movsd, xmm1  // First operand in xmm0, second in xmm1.
   3899     |  mov RB, BASE  // Save BASE across the call.
   3900     |.if not X64
   3901     |  movsd FPARG1, xmm0  // Non-x64: pass both doubles through the FPARG1/FPARG3 slots.
   3902     |  movsd FPARG3, xmm1
   3903     |.endif
   3904     |  call extern pow
   3905     |  movzx RA, PC_RA  // Reload RA after the call.
   3906     |  mov BASE, RB  // Restore BASE.
   3907     |.if X64
   3908     |  ins_arithpost
   3909     |.else
   3910     |  fstp qword [BASE+RA*8]  // Non-x64: pop the x87 result into the destination slot.
   3911     |.endif
   3912     |  ins_next
   3913     break;
   3914 
   3915   case BC_CAT:  // Concatenation via lj_meta_cat. A non-NULL return continues at ->vmeta_binop.
   3916     |  ins_ABC	// RA = dst, RB = src_start, RC = src_end
   3917     |.if X64
   3918     |  mov L:CARG1d, SAVE_L
   3919     |  mov L:CARG1d->base, BASE
   3920     |  lea CARG2d, [BASE+RC*8]  // top = address of the last source slot.
   3921     |  mov CARG3d, RC
   3922     |  sub CARG3d, RB  // left = RC - RB.
   3923     |->BC_CAT_Z:  // Global entry label: all three argument registers are already set here.
   3924     |  mov L:RB, L:CARG1d  // Keep L in RB for use after the call.
   3925     |.else
   3926     |  lea RA, [BASE+RC*8]  // top = address of the last source slot.
   3927     |  sub RC, RB  // left = RC - RB.
   3928     |  mov ARG2, RA
   3929     |  mov ARG3, RC
   3930     |->BC_CAT_Z:  // Global entry label: ARG2/ARG3 are set, ARG1 and L->base are filled in below.
   3931     |  mov L:RB, SAVE_L
   3932     |  mov ARG1, L:RB
   3933     |  mov L:RB->base, BASE
   3934     |.endif
   3935     |  mov SAVE_PC, PC
   3936     |  call extern lj_meta_cat		// (lua_State *L, TValue *top, int left)
   3937     |  // NULL (finished) or TValue * (metamethod) returned in eax (RC).
   3938     |  mov BASE, L:RB->base  // Reload BASE from L after the call.
   3939     |  test RC, RC
   3940     |  jnz ->vmeta_binop
   3941     |  movzx RB, PC_RB			// Copy result to Stk[RA] from Stk[RB].
   3942     |  movzx RA, PC_RA
   3943     |.if X64
   3944     |  mov RCa, [BASE+RB*8]
   3945     |  mov [BASE+RA*8], RCa
   3946     |.else
   3947     |  mov RC, [BASE+RB*8+4]  // Load the tag first: the next line overwrites the index in RB.
   3948     |  mov RB, [BASE+RB*8]
   3949     |  mov [BASE+RA*8+4], RC
   3950     |  mov [BASE+RA*8], RB
   3951     |.endif
   3952     |  ins_next
   3953     break;
   3954 
   3955   /* -- Constant ops ------------------------------------------------------ */
   3956 
   3957   case BC_KSTR:  // Load a string constant into slot RA.
   3958     |  ins_AND	// RA = dst, RD = str const (~)
   3959     |  mov RD, [KBASE+RD*4]  // Fetch the 4-byte constant reference indexed by RD.
   3960     |  mov dword [BASE+RA*8+4], LJ_TSTR
   3961     |  mov [BASE+RA*8], RD
   3962     |  ins_next
   3963     break;
   3964   case BC_KCDATA:  // Load a cdata constant into slot RA. No code is emitted unless FFI is enabled.
   3965     |.if FFI
   3966     |  ins_AND	// RA = dst, RD = cdata const (~)
   3967     |  mov RD, [KBASE+RD*4]  // Fetch the 4-byte constant reference indexed by RD.
   3968     |  mov dword [BASE+RA*8+4], LJ_TCDATA
   3969     |  mov [BASE+RA*8], RD
   3970     |  ins_next
   3971     |.endif
   3972     break;
   3973   case BC_KSHORT:  // Load a signed 16-bit literal into slot RA, as an integer (DUALNUM) or as a double.
   3974     |  ins_AD	// RA = dst, RD = signed int16 literal
   3975     |.if DUALNUM
   3976     |  movsx RD, RDW  // Sign-extend the 16-bit literal to 32 bits.
   3977     |  mov dword [BASE+RA*8+4], LJ_TISNUM
   3978     |  mov dword [BASE+RA*8], RD
   3979     |.else
   3980     |  movsx RD, RDW			// Sign-extend literal.
   3981     |  cvtsi2sd xmm0, RD  // Convert the integer to a double.
   3982     |  movsd qword [BASE+RA*8], xmm0
   3983     |.endif
   3984     |  ins_next
   3985     break;
   3986   case BC_KNUM:  // Copy an 8-byte number constant into slot RA.
   3987     |  ins_AD	// RA = dst, RD = num const
   3988     |  movsd xmm0, qword [KBASE+RD*8]  // Number constants are indexed in 8-byte units from KBASE.
   3989     |  movsd qword [BASE+RA*8], xmm0
   3990     |  ins_next
   3991     break;
   3992   case BC_KPRI:  // Set slot RA to a primitive value by writing only its type tag.
   3993     |  ins_AND	// RA = dst, RD = primitive type (~)
   3994     |  mov [BASE+RA*8+4], RD  // The payload dword is left untouched.
   3995     |  ins_next
   3996     break;
   3997   case BC_KNIL:  // Write LJ_TNIL into the type tags of slots RA through RD.
   3998     |  ins_AD	// RA = dst_start, RD = dst_end
   3999     |  lea RA, [BASE+RA*8+12]  // RA -> tag dword of the second slot (the first slot's tag is at RA-8).
   4000     |  lea RD, [BASE+RD*8+4]  // RD -> tag dword of the last slot.
   4001     |  mov RB, LJ_TNIL
   4002     |  mov [RA-8], RB			// Sets minimum 2 slots.
   4003     |1:
   4004     |  mov [RA], RB
   4005     |  add RA, 8  // Step to the next slot's tag.
   4006     |  cmp RA, RD
   4007     |  jbe <1  // Loop while the cursor has not passed the last tag.
   4008     |  ins_next
   4009     break;
   4010 
   4011   /* -- Upvalue and function ops ------------------------------------------ */
   4012 
   4013   case BC_UGET:  // Copy the value of upvalue #RD of the current function into slot RA.
   4014     |  ins_AD	// RA = dst, RD = upvalue #
   4015     |  mov LFUNC:RB, [BASE-8]  // The function object is read from the slot just below BASE.
   4016     |  mov UPVAL:RB, [LFUNC:RB+RD*4+offsetof(GCfuncL, uvptr)]  // Pick the RD-th entry of uvptr (4 bytes each).
   4017     |  mov RB, UPVAL:RB->v  // RB -> the TValue the upvalue refers to.
   4018     |.if X64
   4019     |  mov RDa, [RB]
   4020     |  mov [BASE+RA*8], RDa
   4021     |.else
   4022     |  mov RD, [RB+4]  // Load the tag first: the next line overwrites the pointer in RB.
   4023     |  mov RB, [RB]
   4024     |  mov [BASE+RA*8+4], RD
   4025     |  mov [BASE+RA*8], RB
   4026     |.endif
   4027     |  ins_next
   4028     break;
   4029   case BC_ESETV: // Set ->env of the current function and of every function linked through next_ENV to the table in slot RD. TBD: NOBARRIER?
   4030     |  ins_AD
   4031     |  checktp RD, LJ_TTAB  // Flags from this check are consumed by the jne two lines below.
   4032     |  mov RD, [BASE+RD*8] // val to assign
   4033     |  mov LFUNC:RB, [BASE-8] // lfunc
   4034     |  jne >2 // silently skip if not table (the movs above leave the flags intact)
   4035     |1:
   4036     |  mov LFUNC:RB->env, RD // set env
   4037     |  mov LFUNC:RB, LFUNC:RB->next_ENV  // Follow the link to the next function in the chain.
   4038     |  cmp LFUNC:RB, [BASE-8] // set for all members sharing this _ENV uv
   4039     |  jnz <1  // Stop once the walk is back at the starting function.
   4040     |2:
   4041     |  ins_refetch  // NOTE(review): macro defined elsewhere, presumably dispatches with refetched operands. Confirm.
   4042     break;
   4043   case BC_USETV:  // Store slot RD into upvalue #RA, with a GC write barrier check for closed upvalues.
   4044 #define TV2MARKOFS \
   4045  ((int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv))
   4046     |  ins_AD	// RA = upvalue #, RD = src
   4047     |  mov LFUNC:RB, [BASE-8]
   4048     |  mov UPVAL:RB, [LFUNC:RB+RA*4+offsetof(GCfuncL, uvptr)]
   4049     |  cmp byte UPVAL:RB->closed, 0  // Sets ZF for the jz below. The intervening movs leave the flags intact.
   4050     |  mov RB, UPVAL:RB->v  // RB -> the TValue the upvalue refers to.
   4051     |  mov RA, [BASE+RD*8]  // RA = payload of the source slot.
   4052     |  mov RD, [BASE+RD*8+4]  // RD = tag of the source slot.
   4053     |  mov [RB], RA
   4054     |  mov [RB+4], RD
   4055     |  jz >1  // Upvalue not closed: no barrier check.
   4056     |  // Check barrier for closed upvalue.
   4057     |  test byte [RB+TV2MARKOFS], LJ_GC_BLACK		// isblack(uv)
   4058     |  jnz >2
   4059     |1:
   4060     |  ins_next
   4061     |
   4062     |2:  // Upvalue is black. Check if new value is collectable and white.
   4063     |  sub RD, LJ_TISGCV
   4064     |  cmp RD, LJ_TNUMX - LJ_TISGCV			// tvisgcv(v)
   4065     |  jbe <1
   4066     |  test byte GCOBJ:RA->gch.marked, LJ_GC_WHITES	// iswhite(v)
   4067     |  jz <1
   4068     |  // Crossed a write barrier. Move the barrier forward.
   4069     |.if X64 and not X64WIN
   4070     |  mov FCARG2, RB  // tv argument = pointer to the stored TValue.
   4071     |  mov RB, BASE			// Save BASE.
   4072     |.else
   4073     |  xchg FCARG2, RB			// Save BASE (FCARG2 == BASE).
   4074     |.endif
   4075     |  lea GL:FCARG1, [DISPATCH+GG_DISP2G]  // g argument, computed from DISPATCH.
   4076     |  call extern lj_gc_barrieruv@8	// (global_State *g, TValue *tv)
   4077     |  mov BASE, RB			// Restore BASE.
   4078     |  jmp <1
   4079     break;
   4080 #undef TV2MARKOFS
   4081   case BC_USETS:  // Store a string constant into upvalue #RA, with a GC write barrier check.
   4082     |  ins_AND	// RA = upvalue #, RD = str const (~)
   4083     |  mov LFUNC:RB, [BASE-8]
   4084     |  mov UPVAL:RB, [LFUNC:RB+RA*4+offsetof(GCfuncL, uvptr)]
   4085     |  mov GCOBJ:RA, [KBASE+RD*4]  // RA = the string constant (replaces the upvalue index).
   4086     |  mov RD, UPVAL:RB->v  // RD -> the TValue the upvalue refers to.
   4087     |  mov [RD], GCOBJ:RA
   4088     |  mov dword [RD+4], LJ_TSTR
   4089     |  test byte UPVAL:RB->marked, LJ_GC_BLACK		// isblack(uv)
   4090     |  jnz >2
   4091     |1:
   4092     |  ins_next
   4093     |
   4094     |2:  // Check if string is white and ensure upvalue is closed.
   4095     |  test byte GCOBJ:RA->gch.marked, LJ_GC_WHITES	// iswhite(str)
   4096     |  jz <1
   4097     |  cmp byte UPVAL:RB->closed, 0
   4098     |  jz <1
   4099     |  // Crossed a write barrier. Move the barrier forward.
   4100     |  mov RB, BASE			// Save BASE (FCARG2 == BASE).
   4101     |  mov FCARG2, RD  // tv argument = pointer to the stored TValue.
   4102     |  lea GL:FCARG1, [DISPATCH+GG_DISP2G]  // g argument, computed from DISPATCH.
   4103     |  call extern lj_gc_barrieruv@8	// (global_State *g, TValue *tv)
   4104     |  mov BASE, RB			// Restore BASE.
   4105     |  jmp <1
   4106     break;
   4107   case BC_USETN:  // Store a number constant into upvalue #RA. No barrier check is emitted here.
   4108     |  ins_AD	// RA = upvalue #, RD = num const
   4109     |  mov LFUNC:RB, [BASE-8]
   4110     |  movsd xmm0, qword [KBASE+RD*8]  // Load the 8-byte constant.
   4111     |  mov UPVAL:RB, [LFUNC:RB+RA*4+offsetof(GCfuncL, uvptr)]
   4112     |  mov RA, UPVAL:RB->v  // RA -> the TValue the upvalue refers to.
   4113     |  movsd qword [RA], xmm0
   4114     |  ins_next
   4115     break;
   4116   case BC_USETP:  // Set upvalue #RA to a primitive value by writing only its type tag. No barrier check is emitted here.
   4117     |  ins_AND	// RA = upvalue #, RD = primitive type (~)
   4118     |  mov LFUNC:RB, [BASE-8]
   4119     |  mov UPVAL:RB, [LFUNC:RB+RA*4+offsetof(GCfuncL, uvptr)]
   4120     |  mov RA, UPVAL:RB->v  // RA -> the TValue the upvalue refers to.
   4121     |  mov [RA+4], RD
   4122     |  ins_next
   4123     break;
   4124   case BC_UCLO:  // Branch to the target and, if the thread has open upvalues, call lj_func_closeuv for the level at slot RA.
   4125     |  ins_AD	// RA = level, RD = target
   4126     |  branchPC RD			// Do this first to free RD.
   4127     |  mov L:RB, SAVE_L
   4128     |  cmp dword L:RB->openupval, 0
   4129     |  je >1  // No open upvalues: skip the call.
   4130     |  mov L:RB->base, BASE
   4131     |  lea FCARG2, [BASE+RA*8]		// Caveat: FCARG2 == BASE
   4132     |  mov L:FCARG1, L:RB		// Caveat: FCARG1 == RA
   4133     |  call extern lj_func_closeuv@8	// (lua_State *L, TValue *level)
   4134     |  mov BASE, L:RB->base  // Reload BASE from L after the call.
   4135     |1:
   4136     |  ins_next
   4137     break;
   4138 
   4139   case BC_FNEW:  // Create a new function via lj_func_newL_gc from a prototype constant and store it in slot RA.
   4140     |  ins_AND	// RA = dst, RD = proto const (~) (holding function prototype)
   4141     |.if X64
   4142     |  mov L:RB, SAVE_L
   4143     |  mov L:RB->base, BASE		// Caveat: CARG2d/CARG3d may be BASE.
   4144     |  mov CARG3d, [BASE-8]  // parent = the function object just below BASE.
   4145     |  mov CARG2d, [KBASE+RD*4]		// Fetch GCproto *.
   4146     |  mov CARG1d, L:RB
   4147     |.else
   4148     |  mov LFUNC:RA, [BASE-8]  // parent = the function object just below BASE.
   4149     |  mov PROTO:RD, [KBASE+RD*4]	// Fetch GCproto *.
   4150     |  mov L:RB, SAVE_L
   4151     |  mov ARG3, LFUNC:RA
   4152     |  mov ARG2, PROTO:RD
   4153     |  mov ARG1, L:RB
   4154     |  mov L:RB->base, BASE
   4155     |.endif
   4156     |  mov SAVE_PC, PC
   4157     |  // (lua_State *L, GCproto *pt, GCfuncL *parent)
   4158     |  call extern lj_func_newL_gc
   4159     |  // GCfuncL * returned in eax (RC).
   4160     |  mov BASE, L:RB->base  // Reload BASE from L after the call.
   4161     |  movzx RA, PC_RA  // Reload the destination index after the call.
   4162     |  mov [BASE+RA*8], LFUNC:RC
   4163     |  mov dword [BASE+RA*8+4], LJ_TFUNC
   4164     |  ins_next
   4165     break;
   4166 
   4167   /* -- Table ops --------------------------------------------------------- */
   4168 
   4169   case BC_TNEW:  // Create a new table via lj_tab_new and store it in slot RA. Runs a GC step first when gc.total >= gc.threshold.
   4170     |  ins_AD	// RA = dst, RD = hbits|asize
   4171     |  mov L:RB, SAVE_L
   4172     |  mov L:RB->base, BASE
   4173     |  mov RA, [DISPATCH+DISPATCH_GL(gc.total)]
   4174     |  cmp RA, [DISPATCH+DISPATCH_GL(gc.threshold)]
   4175     |  mov SAVE_PC, PC  // Does not alter the flags from the compare above.
   4176     |  jae >5  // Threshold reached: run a GC step at 5, then come back to 1.
   4177     |1:
   4178     |.if X64
   4179     |  mov CARG3d, RD
   4180     |  and RD, 0x7ff  // asize = low 11 bits.
   4181     |  shr CARG3d, 11  // hbits = remaining upper bits.
   4182     |.else
   4183     |  mov RA, RD
   4184     |  and RD, 0x7ff  // asize = low 11 bits.
   4185     |  shr RA, 11  // hbits = remaining upper bits.
   4186     |  mov ARG3, RA
   4187     |.endif
   4188     |  cmp RD, 0x7ff
   4189     |  je >3  // The all-ones asize encoding is remapped at 3.
   4190     |2:
   4191     |.if X64
   4192     |  mov L:CARG1d, L:RB
   4193     |  mov CARG2d, RD
   4194     |.else
   4195     |  mov ARG1, L:RB
   4196     |  mov ARG2, RD
   4197     |.endif
   4198     |  call extern lj_tab_new  // (lua_State *L, int32_t asize, uint32_t hbits)
   4199     |  // Table * returned in eax (RC).
   4200     |  mov BASE, L:RB->base  // Reload BASE from L after the call.
   4201     |  movzx RA, PC_RA  // Reload the destination index (RA was used as scratch above).
   4202     |  mov [BASE+RA*8], TAB:RC
   4203     |  mov dword [BASE+RA*8+4], LJ_TTAB
   4204     |  ins_next
   4205     |3:  // Turn 0x7ff into 0x801.
   4206     |  mov RD, 0x801
   4207     |  jmp <2
   4208     |5:  // GC step path.
   4209     |  mov L:FCARG1, L:RB
   4210     |  call extern lj_gc_step_fixtop@4	// (lua_State *L)
   4211     |  movzx RD, PC_RD  // Reload the operand after the call.
   4212     |  jmp <1
   4213     break;
   4214   case BC_TDUP:  // Duplicate a template table via lj_tab_dup and store the copy in slot RA. Runs a GC step first when gc.total >= gc.threshold.
   4215     |  ins_AND	// RA = dst, RD = table const (~) (holding template table)
   4216     |  mov L:RB, SAVE_L
   4217     |  mov RA, [DISPATCH+DISPATCH_GL(gc.total)]
   4218     |  mov SAVE_PC, PC
   4219     |  cmp RA, [DISPATCH+DISPATCH_GL(gc.threshold)]
   4220     |  mov L:RB->base, BASE  // Does not alter the flags from the compare above.
   4221     |  jae >3  // Threshold reached: run a GC step at 3, then come back to 2.
   4222     |2:
   4223     |  mov TAB:FCARG2, [KBASE+RD*4]	// Caveat: FCARG2 == BASE
   4224     |  mov L:FCARG1, L:RB		// Caveat: FCARG1 == RA
   4225     |  call extern lj_tab_dup@8		// (lua_State *L, Table *kt)
   4226     |  // Table * returned in eax (RC).
   4227     |  mov BASE, L:RB->base  // Reload BASE from L after the call.
   4228     |  movzx RA, PC_RA  // Reload the destination index after the call.
   4229     |  mov [BASE+RA*8], TAB:RC
   4230     |  mov dword [BASE+RA*8+4], LJ_TTAB
   4231     |  ins_next
   4232     |3:  // GC step path.
   4233     |  mov L:FCARG1, L:RB
   4234     |  call extern lj_gc_step_fixtop@4	// (lua_State *L)
   4235     |  movzx RD, PC_RD			// Need to reload RD.
   4236     |  not RDa  // Redo the complement that ins_AND presumably applied to the freshly reloaded operand.
   4237     |  jmp <2
   4238     break;
   4239 
   4240   case BC_GGET:  // Read from the current function's env table by string constant, reusing the BC_TGETS tail.
   4241     |  ins_AND	// RA = dst, RD = str const (~)
   4242     |  mov LFUNC:RB, [BASE-8]
   4243     |  mov TAB:RB, LFUNC:RB->env  // RB = env table, as expected by BC_TGETS_Z.
   4244     |  mov STR:RC, [KBASE+RD*4]  // RC = key string, as expected by BC_TGETS_Z.
   4245     |  jmp ->BC_TGETS_Z
   4246     break;
   4247   case BC_GSET:  // Write into the current function's env table by string constant, reusing the BC_TSETS tail.
   4248     |  ins_AND	// RA = src, RD = str const (~)
   4249     |  mov LFUNC:RB, [BASE-8]
   4250     |  mov TAB:RB, LFUNC:RB->env  // RB = env table, as expected by BC_TSETS_Z.
   4251     |  mov STR:RC, [KBASE+RD*4]  // RC = key string, as expected by BC_TSETS_Z.
   4252     |  jmp ->BC_TSETS_Z
   4253     break;
   4254 
   4255   case BC_TGETV:  // Table read with a variable key: inline array-part lookup for integer keys, BC_TGETS tail for string keys, ->vmeta_tgetv otherwise.
   4256     |  ins_ABC	// RA = dst, RB = table, RC = key
   4257     |  checktab RB, ->vmeta_tgetv
   4258     |  mov TAB:RB, [BASE+RB*8]
   4259     |
   4260     |  // Integer key?
   4261     |.if DUALNUM
   4262     |  checkint RC, >5
   4263     |  mov RC, dword [BASE+RC*8]
   4264     |.else
   4265     |  // Convert number to int and back and compare.
   4266     |  checknum RC, >5
   4267     |  movsd xmm0, qword [BASE+RC*8]
   4268     |  cvttsd2si RC, xmm0
   4269     |  cvtsi2sd xmm1, RC
   4270     |  ucomisd xmm0, xmm1
   4271     |  jne ->vmeta_tgetv		// Generic numeric key? Use fallback.
   4272     |.endif
   4273     |  cmp RC, TAB:RB->asize	// Takes care of unordered, too.
   4274     |  jae ->vmeta_tgetv		// Not in array part? Use fallback.
   4275     |  shl RC, 3  // Index -> byte offset (8 bytes per slot).
   4276     |  add RC, TAB:RB->array  // RC -> the array slot.
   4277     |  cmp dword [RC+4], LJ_TNIL	// Avoid overwriting RB in fastpath.
   4278     |  je >2
   4279     |  // Get array slot.
   4280     |.if X64
   4281     |  mov RBa, [RC]
   4282     |  mov [BASE+RA*8], RBa
   4283     |.else
   4284     |  mov RB, [RC]
   4285     |  mov RC, [RC+4]
   4286     |  mov [BASE+RA*8], RB
   4287     |  mov [BASE+RA*8+4], RC
   4288     |.endif
   4289     |1:
   4290     |  ins_next
   4291     |
   4292     |2:  // Check for __index if table value is nil.
   4293     |  cmp dword TAB:RB->metatable, 0	// Shouldn't overwrite RA for fastpath.
   4294     |  jz >3
   4295     |  mov TAB:RA, TAB:RB->metatable
   4296     |  test byte TAB:RA->nomm, 1<<MM_index
   4297     |  jz ->vmeta_tgetv			// 'no __index' flag NOT set: check.
   4298     |  movzx RA, PC_RA			// Restore RA.
   4299     |3:  // Result is nil: only the tag dword is written.
   4300     |  mov dword [BASE+RA*8+4], LJ_TNIL
   4301     |  jmp <1
   4302     |
   4303     |5:  // String key?
   4304     |  checkstr RC, ->vmeta_tgetv
   4305     |  mov STR:RC, [BASE+RC*8]
   4306     |  jmp ->BC_TGETS_Z
   4307     break;
   4308   case BC_TGETS:  // Table read with a string-constant key: inline walk of the hash chain, ->vmeta_tgets when __index may apply.
   4309     |  ins_ABC	// RA = dst, RB = table, RC = str const (~)
   4310     |  not RCa  // Complement the operand to get the constant index.
   4311     |  mov STR:RC, [KBASE+RC*4]
   4312     |  checktab RB, ->vmeta_tgets
   4313     |  mov TAB:RB, [BASE+RB*8]
   4314     |->BC_TGETS_Z:	// RB = GCtab *, RC = GCstr *, refetches PC_RA.
   4315     |  mov RA, TAB:RB->hmask
   4316     |  and RA, STR:RC->hash  // Node index = hash & hmask.
   4317     |  imul RA, #NODE  // Scale the index by the size of the NODE type.
   4318     |  add NODE:RA, TAB:RB->node  // RA -> first node of the chain.
   4319     |1:
   4320     |  cmp dword NODE:RA->key.it, LJ_TSTR
   4321     |  jne >4
   4322     |  cmp dword NODE:RA->key.gcr, STR:RC  // Same string object?
   4323     |  jne >4
   4324     |  // Ok, key found. Assumes: offsetof(Node, val) == 0
   4325     |  cmp dword [RA+4], LJ_TNIL	// Avoid overwriting RB in fastpath.
   4326     |  je >5				// Key found, but nil value?
   4327     |  movzx RC, PC_RA  // RC = destination index (RA is in use as the node pointer).
   4328     |  // Get node value.
   4329     |.if X64
   4330     |  mov RBa, [RA]
   4331     |  mov [BASE+RC*8], RBa
   4332     |.else
   4333     |  mov RB, [RA]
   4334     |  mov RA, [RA+4]
   4335     |  mov [BASE+RC*8], RB
   4336     |  mov [BASE+RC*8+4], RA
   4337     |.endif
   4338     |2:
   4339     |  ins_next
   4340     |
   4341     |3:  // Result is nil: only the tag dword is written.
   4342     |  movzx RC, PC_RA
   4343     |  mov dword [BASE+RC*8+4], LJ_TNIL
   4344     |  jmp <2
   4345     |
   4346     |4:  // Follow hash chain.
   4347     |  mov NODE:RA, NODE:RA->next
   4348     |  test NODE:RA, NODE:RA
   4349     |  jnz <1
   4350     |  // End of hash chain: key not found, nil result.
   4351     |
   4352     |5:  // Check for __index if table value is nil.
   4353     |  mov TAB:RA, TAB:RB->metatable
   4354     |  test TAB:RA, TAB:RA
   4355     |  jz <3				// No metatable: done.
   4356     |  test byte TAB:RA->nomm, 1<<MM_index
   4357     |  jnz <3				// 'no __index' flag set: done.
   4358     |  jmp ->vmeta_tgets		// Caveat: preserve STR:RC.
   4359     break;
   4360   case BC_TGETB:  // Table read with a byte-literal key: inline array-part lookup, ->vmeta_tgetb otherwise.
   4361     |  ins_ABC	// RA = dst, RB = table, RC = byte literal
   4362     |  checktab RB, ->vmeta_tgetb
   4363     |  mov TAB:RB, [BASE+RB*8]
   4364     |  cmp RC, TAB:RB->asize
   4365     |  jae ->vmeta_tgetb  // Not in the array part: use the fallback.
   4366     |  shl RC, 3  // Index -> byte offset (8 bytes per slot).
   4367     |  add RC, TAB:RB->array  // RC -> the array slot.
   4368     |  cmp dword [RC+4], LJ_TNIL	// Avoid overwriting RB in fastpath.
   4369     |  je >2
   4370     |  // Get array slot.
   4371     |.if X64
   4372     |  mov RBa, [RC]
   4373     |  mov [BASE+RA*8], RBa
   4374     |.else
   4375     |  mov RB, [RC]
   4376     |  mov RC, [RC+4]
   4377     |  mov [BASE+RA*8], RB
   4378     |  mov [BASE+RA*8+4], RC
   4379     |.endif
   4380     |1:
   4381     |  ins_next
   4382     |
   4383     |2:  // Check for __index if table value is nil.
   4384     |  cmp dword TAB:RB->metatable, 0	// Shouldn't overwrite RA for fastpath.
   4385     |  jz >3
   4386     |  mov TAB:RA, TAB:RB->metatable
   4387     |  test byte TAB:RA->nomm, 1<<MM_index
   4388     |  jz ->vmeta_tgetb			// 'no __index' flag NOT set: check.
   4389     |  movzx RA, PC_RA			// Restore RA.
   4390     |3:  // Result is nil: only the tag dword is written.
   4391     |  mov dword [BASE+RA*8+4], LJ_TNIL
   4392     |  jmp <1
   4393     break;
   4394   case BC_TGETR:  // Raw array read: no type checks on table or key are emitted in this case, and no metatable lookup.
   4395     |  ins_ABC	// RA = dst, RB = table, RC = key
   4396     |  mov TAB:RB, [BASE+RB*8]
   4397     |.if DUALNUM
   4398     |  mov RC, dword [BASE+RC*8]
   4399     |.else
   4400     |  cvttsd2si RC, qword [BASE+RC*8]  // Truncating double-to-integer conversion of the key.
   4401     |.endif
   4402     |  cmp RC, TAB:RB->asize
   4403     |  jae ->vmeta_tgetr		// Not in array part? Use fallback.
   4404     |  shl RC, 3  // Index -> byte offset (8 bytes per slot).
   4405     |  add RC, TAB:RB->array
   4406     |  // Get array slot.
   4407     |->BC_TGETR_Z:  // Global entry label: expects RC to point at the source TValue.
   4408     |.if X64
   4409     |  mov RBa, [RC]
   4410     |  mov [BASE+RA*8], RBa
   4411     |.else
   4412     |  mov RB, [RC]
   4413     |  mov RC, [RC+4]
   4414     |  mov [BASE+RA*8], RB
   4415     |  mov [BASE+RA*8+4], RC
   4416     |.endif
   4417     |->BC_TGETR2_Z:  // Global entry label that only dispatches the next instruction.
   4418     |  ins_next
   4419     break;
   4420 
   4421   case BC_TSETV:  // Table write with a variable key: inline array-part store for integer keys, BC_TSETS tail for string keys, ->vmeta_tsetv otherwise.
   4422     |  ins_ABC	// RA = src, RB = table, RC = key
   4423     |  checktab RB, ->vmeta_tsetv
   4424     |  mov TAB:RB, [BASE+RB*8]
   4425     |
   4426     |  // Integer key?
   4427     |.if DUALNUM
   4428     |  checkint RC, >5
   4429     |  mov RC, dword [BASE+RC*8]
   4430     |.else
   4431     |  // Convert number to int and back and compare.
   4432     |  checknum RC, >5
   4433     |  movsd xmm0, qword [BASE+RC*8]
   4434     |  cvttsd2si RC, xmm0
   4435     |  cvtsi2sd xmm1, RC
   4436     |  ucomisd xmm0, xmm1
   4437     |  jne ->vmeta_tsetv		// Generic numeric key? Use fallback.
   4438     |.endif
   4439     |  cmp RC, TAB:RB->asize		// Takes care of unordered, too.
   4440     |  jae ->vmeta_tsetv  // Not in the array part: use the fallback.
   4441     |  shl RC, 3  // Index -> byte offset (8 bytes per slot).
   4442     |  add RC, TAB:RB->array  // RC -> the array slot.
   4443     |  cmp dword [RC+4], LJ_TNIL
   4444     |  je >3				// Previous value is nil?
   4445     |1:
   4446     |  test byte TAB:RB->marked, LJ_GC_BLACK	// isblack(table)
   4447     |  jnz >7
   4448     |2:  // Set array slot.
   4449     |.if X64
   4450     |  mov RBa, [BASE+RA*8]
   4451     |  mov [RC], RBa
   4452     |.else
   4453     |  mov RB, [BASE+RA*8+4]  // Load the tag first: the next line overwrites the index in RA.
   4454     |  mov RA, [BASE+RA*8]
   4455     |  mov [RC+4], RB
   4456     |  mov [RC], RA
   4457     |.endif
   4458     |  ins_next
   4459     |
   4460     |3:  // Check for __newindex if previous value is nil.
   4461     |  cmp dword TAB:RB->metatable, 0	// Shouldn't overwrite RA for fastpath.
   4462     |  jz <1
   4463     |  mov TAB:RA, TAB:RB->metatable
   4464     |  test byte TAB:RA->nomm, 1<<MM_newindex
   4465     |  jz ->vmeta_tsetv			// 'no __newindex' flag NOT set: check.
   4466     |  movzx RA, PC_RA			// Restore RA.
   4467     |  jmp <1
   4468     |
   4469     |5:  // String key?
   4470     |  checkstr RC, ->vmeta_tsetv
   4471     |  mov STR:RC, [BASE+RC*8]
   4472     |  jmp ->BC_TSETS_Z
   4473     |
   4474     |7:  // Possible table write barrier for the value. Skip valiswhite check.
   4475     |  barrierback TAB:RB, RA  // RA is passed as the second macro argument and is reloaded right after.
   4476     |  movzx RA, PC_RA			// Restore RA.
   4477     |  jmp <2
   4478     break;
   4479   case BC_TSETS:  // Table write with a string-constant key: inline hash-chain store, lj_tab_newkey for missing keys, ->vmeta_tsets when __newindex may apply.
   4480     |  ins_ABC	// RA = src, RB = table, RC = str const (~)
   4481     |  not RCa  // Complement the operand to get the constant index.
   4482     |  mov STR:RC, [KBASE+RC*4]
   4483     |  checktab RB, ->vmeta_tsets
   4484     |  mov TAB:RB, [BASE+RB*8]
   4485     |->BC_TSETS_Z:	// RB = GCtab *, RC = GCstr *, refetches PC_RA.
   4486     |  mov RA, TAB:RB->hmask
   4487     |  and RA, STR:RC->hash  // Node index = hash & hmask.
   4488     |  imul RA, #NODE  // Scale the index by the size of the NODE type.
   4489     |  mov byte TAB:RB->nomm, 0		// Clear metamethod cache.
   4490     |  add NODE:RA, TAB:RB->node  // RA -> first node of the chain.
   4491     |1:
   4492     |  cmp dword NODE:RA->key.it, LJ_TSTR
   4493     |  jne >5
   4494     |  cmp dword NODE:RA->key.gcr, STR:RC  // Same string object?
   4495     |  jne >5
   4496     |  // Ok, key found. Assumes: offsetof(Node, val) == 0
   4497     |  cmp dword [RA+4], LJ_TNIL
   4498     |  je >4				// Previous value is nil?
   4499     |2:
   4500     |  test byte TAB:RB->marked, LJ_GC_BLACK	// isblack(table)
   4501     |  jnz >7
   4502     |3:  // Set node value.
   4503     |  movzx RC, PC_RA  // RC = source index (RA is in use as the node pointer).
   4504     |.if X64
   4505     |  mov RBa, [BASE+RC*8]
   4506     |  mov [RA], RBa
   4507     |.else
   4508     |  mov RB, [BASE+RC*8+4]  // Load the tag first: the next line overwrites the index in RC.
   4509     |  mov RC, [BASE+RC*8]
   4510     |  mov [RA+4], RB
   4511     |  mov [RA], RC
   4512     |.endif
   4513     |  ins_next
   4514     |
   4515     |4:  // Check for __newindex if previous value is nil.
   4516     |  cmp dword TAB:RB->metatable, 0	// Shouldn't overwrite RA for fastpath.
   4517     |  jz <2
   4518     |  mov TMP1, RA			// Save RA.
   4519     |  mov TAB:RA, TAB:RB->metatable
   4520     |  test byte TAB:RA->nomm, 1<<MM_newindex
   4521     |  jz ->vmeta_tsets			// 'no __newindex' flag NOT set: check.
   4522     |  mov RA, TMP1			// Restore RA.
   4523     |  jmp <2
   4524     |
   4525     |5:  // Follow hash chain.
   4526     |  mov NODE:RA, NODE:RA->next
   4527     |  test NODE:RA, NODE:RA
   4528     |  jnz <1
   4529     |  // End of hash chain: key not found, add a new one.
   4530     |
   4531     |  // But check for __newindex first.
   4532     |  mov TAB:RA, TAB:RB->metatable
   4533     |  test TAB:RA, TAB:RA
   4534     |  jz >6				// No metatable: continue.
   4535     |  test byte TAB:RA->nomm, 1<<MM_newindex
   4536     |  jz ->vmeta_tsets			// 'no __newindex' flag NOT set: check.
   4537     |6:  // Build a temporary string TValue for the key in TMP1/TMP2 and call lj_tab_newkey.
   4538     |  mov TMP1, STR:RC
   4539     |  mov TMP2, LJ_TSTR
   4540     |  mov TMP3, TAB:RB			// Save TAB:RB for us.
   4541     |.if X64
   4542     |  mov L:CARG1d, SAVE_L
   4543     |  mov L:CARG1d->base, BASE
   4544     |  lea CARG3, TMP1  // k = address of the temporary TValue.
   4545     |  mov CARG2d, TAB:RB
   4546     |  mov L:RB, L:CARG1d  // Keep L in RB for use after the call.
   4547     |.else
   4548     |  lea RC, TMP1			// Store temp. TValue in TMP1/TMP2.
   4549     |  mov ARG2, TAB:RB
   4550     |  mov L:RB, SAVE_L
   4551     |  mov ARG3, RC
   4552     |  mov ARG1, L:RB
   4553     |  mov L:RB->base, BASE
   4554     |.endif
   4555     |  mov SAVE_PC, PC
   4556     |  call extern lj_tab_newkey	// (lua_State *L, GCtab *t, TValue *k)
   4557     |  // Handles write barrier for the new key. TValue * returned in eax (RC).
   4558     |  mov BASE, L:RB->base  // Reload BASE from L after the call.
   4559     |  mov TAB:RB, TMP3			// Need TAB:RB for barrier.
   4560     |  mov RA, eax  // RA -> the new value slot, matching what label 2 expects.
   4561     |  jmp <2				// Must check write barrier for value.
   4562     |
   4563     |7:  // Possible table write barrier for the value. Skip valiswhite check.
   4564     |  barrierback TAB:RB, RC		// Destroys STR:RC.
   4565     |  jmp <3
   4566     break;
   4567   case BC_TSETB:
   4568     |  ins_ABC	// RA = src, RB = table, RC = byte literal
   4569     |  checktab RB, ->vmeta_tsetb
   4570     |  mov TAB:RB, [BASE+RB*8]
   4571     |  cmp RC, TAB:RB->asize
   4572     |  jae ->vmeta_tsetb
   4573     |  shl RC, 3
   4574     |  add RC, TAB:RB->array
   4575     |  cmp dword [RC+4], LJ_TNIL
   4576     |  je >3				// Previous value is nil?
   4577     |1:
   4578     |  test byte TAB:RB->marked, LJ_GC_BLACK	// isblack(table)
   4579     |  jnz >7
   4580     |2:	 // Set array slot.
   4581     |.if X64
   4582     |  mov RAa, [BASE+RA*8]
   4583     |  mov [RC], RAa
   4584     |.else
   4585     |  mov RB, [BASE+RA*8+4]
   4586     |  mov RA, [BASE+RA*8]
   4587     |  mov [RC+4], RB
   4588     |  mov [RC], RA
   4589     |.endif
   4590     |  ins_next
   4591     |
   4592     |3:  // Check for __newindex if previous value is nil.
   4593     |  cmp dword TAB:RB->metatable, 0	// Shouldn't overwrite RA for fastpath.
   4594     |  jz <1
   4595     |  mov TAB:RA, TAB:RB->metatable
   4596     |  test byte TAB:RA->nomm, 1<<MM_newindex
   4597     |  jz ->vmeta_tsetb			// 'no __newindex' flag NOT set: check.
   4598     |  movzx RA, PC_RA			// Restore RA.
   4599     |  jmp <1
   4600     |
   4601     |7:  // Possible table write barrier for the value. Skip valiswhite check.
   4602     |  barrierback TAB:RB, RA
   4603     |  movzx RA, PC_RA			// Restore RA.
   4604     |  jmp <2
   4605     break;
   4606   case BC_TSETR:
   4607     |  ins_ABC	// RA = src, RB = table, RC = key
   4608     |  mov TAB:RB, [BASE+RB*8]
   4609     |.if DUALNUM
   4610     |  mov RC, dword [BASE+RC*8]
   4611     |.else
   4612     |  cvttsd2si RC, qword [BASE+RC*8]
   4613     |.endif
   4614     |  test byte TAB:RB->marked, LJ_GC_BLACK	// isblack(table)
   4615     |  jnz >7
   4616     |2:
   4617     |  cmp RC, TAB:RB->asize
   4618     |  jae ->vmeta_tsetr
   4619     |  shl RC, 3
   4620     |  add RC, TAB:RB->array
   4621     |  // Set array slot.
   4622     |->BC_TSETR_Z:
   4623     |.if X64
   4624     |  mov RBa, [BASE+RA*8]
   4625     |  mov [RC], RBa
   4626     |.else
   4627     |  mov RB, [BASE+RA*8+4]
   4628     |  mov RA, [BASE+RA*8]
   4629     |  mov [RC+4], RB
   4630     |  mov [RC], RA
   4631     |.endif
   4632     |  ins_next
   4633     |
   4634     |7:  // Possible table write barrier for the value. Skip valiswhite check.
   4635     |  barrierback TAB:RB, RA
   4636     |  movzx RA, PC_RA			// Restore RA.
   4637     |  jmp <2
   4638     break;
   4639 
   4640   case BC_TSETM:
   4641     |  ins_AD	// RA = base (table at base-1), RD = num const (start index)
   4642     |  mov TMP1, KBASE			// Need one more free register.
   4643     |  mov KBASE, dword [KBASE+RD*8]	// Integer constant is in lo-word.
   4644     |1:
   4645     |  lea RA, [BASE+RA*8]
   4646     |  mov TAB:RB, [RA-8]		// Guaranteed to be a table.
   4647     |  test byte TAB:RB->marked, LJ_GC_BLACK	// isblack(table)
   4648     |  jnz >7
   4649     |2:
   4650     |  mov RD, MULTRES
   4651     |  sub RD, 1
   4652     |  jz >4				// Nothing to copy?
   4653     |  add RD, KBASE			// Compute needed size.
   4654     |  cmp RD, TAB:RB->asize
   4655     |  ja >5				// Doesn't fit into array part?
   4656     |  sub RD, KBASE
   4657     |  shl KBASE, 3
   4658     |  add KBASE, TAB:RB->array
   4659     |3:  // Copy result slots to table.
   4660     |.if X64
   4661     |  mov RBa, [RA]
   4662     |  add RA, 8
   4663     |  mov [KBASE], RBa
   4664     |.else
   4665     |  mov RB, [RA]
   4666     |  mov [KBASE], RB
   4667     |  mov RB, [RA+4]
   4668     |  add RA, 8
   4669     |  mov [KBASE+4], RB
   4670     |.endif
   4671     |  add KBASE, 8
   4672     |  sub RD, 1
   4673     |  jnz <3
   4674     |4:
   4675     |  mov KBASE, TMP1
   4676     |  ins_next
   4677     |
   4678     |5:  // Need to resize array part.
   4679     |.if X64
   4680     |  mov L:CARG1d, SAVE_L
   4681     |  mov L:CARG1d->base, BASE		// Caveat: CARG2d/CARG3d may be BASE.
   4682     |  mov CARG2d, TAB:RB
   4683     |  mov CARG3d, RD
   4684     |  mov L:RB, L:CARG1d
   4685     |.else
   4686     |  mov ARG2, TAB:RB
   4687     |  mov L:RB, SAVE_L
   4688     |  mov L:RB->base, BASE
   4689     |  mov ARG3, RD
   4690     |  mov ARG1, L:RB
   4691     |.endif
   4692     |  mov SAVE_PC, PC
   4693     |  call extern lj_tab_reasize	// (lua_State *L, GCtab *t, int nasize)
   4694     |  mov BASE, L:RB->base
   4695     |  movzx RA, PC_RA			// Restore RA.
   4696     |  jmp <1				// Retry.
   4697     |
   4698     |7:  // Possible table write barrier for any value. Skip valiswhite check.
   4699     |  barrierback TAB:RB, RD
   4700     |  jmp <2
   4701     break;
   4702 
   4703   /* -- Calls and vararg handling ----------------------------------------- */
   4704 
   4705   case BC_CALL: case BC_CALLM:
   4706     |  ins_A_C	// RA = base, (RB = nresults+1,) RC = nargs+1 | extra_nargs
   4707     if (op == BC_CALLM) {
   4708       |  add NARGS:RD, MULTRES
   4709     }
   4710     |  cmp dword [BASE+RA*8+4], LJ_TFUNC
   4711     |  mov LFUNC:RB, [BASE+RA*8]
   4712     |  jne ->vmeta_call_ra
   4713     |  lea BASE, [BASE+RA*8+8]
   4714     |  ins_call
   4715     break;
   4716 
   4717   case BC_CALLMT:
   4718     |  ins_AD	// RA = base, RD = extra_nargs
   4719     |  add NARGS:RD, MULTRES
   4720     |  // Fall through. Assumes BC_CALLT follows and ins_AD is a no-op.
   4721     break;
   4722   case BC_CALLT:
   4723     |  ins_AD	// RA = base, RD = nargs+1
   4724     |  lea RA, [BASE+RA*8+8]
   4725     |  mov KBASE, BASE			// Use KBASE for move + vmeta_call hint.
   4726     |  mov LFUNC:RB, [RA-8]
   4727     |  cmp dword [RA-4], LJ_TFUNC
   4728     |  jne ->vmeta_call
   4729     |->BC_CALLT_Z:
   4730     |  mov PC, [BASE-4]
   4731     |  test PC, FRAME_TYPE
   4732     |  jnz >7
   4733     |1:
   4734     |  mov [BASE-8], LFUNC:RB		// Copy function down, reloaded below.
   4735     |  mov MULTRES, NARGS:RD
   4736     |  sub NARGS:RD, 1
   4737     |  jz >3
   4738     |2:  // Move args down.
   4739     |.if X64
   4740     |  mov RBa, [RA]
   4741     |  add RA, 8
   4742     |  mov [KBASE], RBa
   4743     |.else
   4744     |  mov RB, [RA]
   4745     |  mov [KBASE], RB
   4746     |  mov RB, [RA+4]
   4747     |  add RA, 8
   4748     |  mov [KBASE+4], RB
   4749     |.endif
   4750     |  add KBASE, 8
   4751     |  sub NARGS:RD, 1
   4752     |  jnz <2
   4753     |
   4754     |  mov LFUNC:RB, [BASE-8]
   4755     |3:
   4756     |  mov NARGS:RD, MULTRES
   4757     |  cmp byte LFUNC:RB->ffid, 1	// (> FF_C) Calling a fast function?
   4758     |  ja >5
   4759     |4:
   4760     |  ins_callt
   4761     |
   4762     |5:  // Tailcall to a fast function.
   4763     |  test PC, FRAME_TYPE		// Lua frame below?
   4764     |  jnz <4
   4765     |  movzx RA, PC_RA
   4766     |  not RAa
   4767     |  mov LFUNC:KBASE, [BASE+RA*8-8]	// Need to prepare KBASE.
   4768     |  mov KBASE, LFUNC:KBASE->pc
   4769     |  mov KBASE, [KBASE+PC2PROTO(k)]
   4770     |  jmp <4
   4771     |
   4772     |7:  // Tailcall from a vararg function.
   4773     |  sub PC, FRAME_VARG
   4774     |  test PC, FRAME_TYPEP
   4775     |  jnz >8				// Vararg frame below?
   4776     |  sub BASE, PC			// Need to relocate BASE/KBASE down.
   4777     |  mov KBASE, BASE
   4778     |  mov PC, [BASE-4]
   4779     |  jmp <1
   4780     |8:
   4781     |  add PC, FRAME_VARG
   4782     |  jmp <1
   4783     break;
   4784 
   4785   case BC_ITERC:
   4786     |  ins_A	// RA = base, (RB = nresults+1,) RC = nargs+1 (2+1)
   4787     |  lea RA, [BASE+RA*8+8]		// fb = base+1
   4788     |.if X64
   4789     |  mov RBa, [RA-24]			// Copy state. fb[0] = fb[-3].
   4790     |  mov RCa, [RA-16]			// Copy control var. fb[1] = fb[-2].
   4791     |  mov [RA], RBa
   4792     |  mov [RA+8], RCa
   4793     |.else
   4794     |  mov RB, [RA-24]			// Copy state. fb[0] = fb[-3].
   4795     |  mov RC, [RA-20]
   4796     |  mov [RA], RB
   4797     |  mov [RA+4], RC
   4798     |  mov RB, [RA-16]			// Copy control var. fb[1] = fb[-2].
   4799     |  mov RC, [RA-12]
   4800     |  mov [RA+8], RB
   4801     |  mov [RA+12], RC
   4802     |.endif
   4803     |  mov LFUNC:RB, [RA-32]		// Copy callable. fb[-1] = fb[-4]
   4804     |  mov RC, [RA-28]
   4805     |  mov [RA-8], LFUNC:RB
   4806     |  mov [RA-4], RC
   4807     |  cmp RC, LJ_TFUNC			// Handle like a regular 2-arg call.
   4808     |  mov NARGS:RD, 2+1
   4809     |  jne ->vmeta_call
   4810     |  mov BASE, RA
   4811     |  ins_call
   4812     break;
   4813 
   4814   case BC_ITERN:
   4815     |  ins_A	// RA = base, (RB = nresults+1, RC = nargs+1 (2+1))
   4816     |.if JIT
   4817     |  // NYI: add hotloop, record BC_ITERN.
   4818     |.endif
   4819     |  mov TMP1, KBASE			// Need two more free registers.
   4820     |  mov TMP2, DISPATCH
   4821     |  mov TAB:RB, [BASE+RA*8-16]
   4822     |  mov RC, [BASE+RA*8-8]		// Get index from control var.
   4823     |  mov DISPATCH, TAB:RB->asize
   4824     |  add PC, 4
   4825     |  mov KBASE, TAB:RB->array
   4826     |1:  // Traverse array part.
   4827     |  cmp RC, DISPATCH; jae >5		// Index points after array part?
   4828     |  cmp dword [KBASE+RC*8+4], LJ_TNIL; je >4
   4829     |.if DUALNUM
   4830     |  mov dword [BASE+RA*8+4], LJ_TISNUM
   4831     |  mov dword [BASE+RA*8], RC
   4832     |.else
   4833     |  cvtsi2sd xmm0, RC
   4834     |.endif
   4835     |  // Copy array slot to returned value.
   4836     |.if X64
   4837     |  mov RBa, [KBASE+RC*8]
   4838     |  mov [BASE+RA*8+8], RBa
   4839     |.else
   4840     |  mov RB, [KBASE+RC*8+4]
   4841     |  mov [BASE+RA*8+12], RB
   4842     |  mov RB, [KBASE+RC*8]
   4843     |  mov [BASE+RA*8+8], RB
   4844     |.endif
   4845     |  add RC, 1
   4846     |  // Return array index as a numeric key.
   4847     |.if DUALNUM
   4848     |  // See above.
   4849     |.else
   4850     |  movsd qword [BASE+RA*8], xmm0
   4851     |.endif
   4852     |  mov [BASE+RA*8-8], RC		// Update control var.
   4853     |2:
   4854     |  movzx RD, PC_RD			// Get target from ITERL.
   4855     |  branchPC RD
   4856     |3:
   4857     |  mov DISPATCH, TMP2
   4858     |  mov KBASE, TMP1
   4859     |  ins_next
   4860     |
   4861     |4:  // Skip holes in array part.
   4862     |  add RC, 1
   4863     |  jmp <1
   4864     |
   4865     |5:  // Traverse hash part.
   4866     |  sub RC, DISPATCH
   4867     |6:
   4868     |  cmp RC, TAB:RB->hmask; ja <3	// End of iteration? Branch to ITERL+1.
   4869     |  imul KBASE, RC, #NODE
   4870     |  add NODE:KBASE, TAB:RB->node
   4871     |  cmp dword NODE:KBASE->val.it, LJ_TNIL; je >7
   4872     |  lea DISPATCH, [RC+DISPATCH+1]
   4873     |  // Copy key and value from hash slot.
   4874     |.if X64
   4875     |  mov RBa, NODE:KBASE->key
   4876     |  mov RCa, NODE:KBASE->val
   4877     |  mov [BASE+RA*8], RBa
   4878     |  mov [BASE+RA*8+8], RCa
   4879     |.else
   4880     |  mov RB, NODE:KBASE->key.gcr
   4881     |  mov RC, NODE:KBASE->key.it
   4882     |  mov [BASE+RA*8], RB
   4883     |  mov [BASE+RA*8+4], RC
   4884     |  mov RB, NODE:KBASE->val.gcr
   4885     |  mov RC, NODE:KBASE->val.it
   4886     |  mov [BASE+RA*8+8], RB
   4887     |  mov [BASE+RA*8+12], RC
   4888     |.endif
   4889     |  mov [BASE+RA*8-8], DISPATCH
   4890     |  jmp <2
   4891     |
   4892     |7:  // Skip holes in hash part.
   4893     |  add RC, 1
   4894     |  jmp <6
   4895     break;
   4896 
   4897   case BC_ISNEXT:
   4898     |  ins_AD	// RA = base, RD = target (points to ITERN)
   4899     |  cmp dword [BASE+RA*8-20], LJ_TFUNC; jne >5
   4900     |  mov CFUNC:RB, [BASE+RA*8-24]
   4901     |  cmp dword [BASE+RA*8-12], LJ_TTAB; jne >5
   4902     |  cmp dword [BASE+RA*8-4], LJ_TNIL; jne >5
   4903     |  cmp byte CFUNC:RB->ffid, FF_next_N; jne >5
   4904     |  branchPC RD
   4905     |  mov dword [BASE+RA*8-8], 0	// Initialize control var.
   4906     |  mov dword [BASE+RA*8-4], 0xfffe7fff
   4907     |1:
   4908     |  ins_next
   4909     |5:  // Despecialize bytecode if any of the checks fail.
   4910     |  mov PC_OP, BC_JMP
   4911     |  branchPC RD
   4912     |  mov byte [PC], BC_ITERC
   4913     |  jmp <1
   4914     break;
   4915 
   4916   case BC_VARG:
   4917     |  ins_ABC	// RA = base, RB = nresults+1, RC = numparams
   4918     |  mov TMP1, KBASE			// Need one more free register.
   4919     |  lea KBASE, [BASE+RC*8+(8+FRAME_VARG)]
   4920     |  lea RA, [BASE+RA*8]
   4921     |  sub KBASE, [BASE-4]
   4922     |  // Note: KBASE may now be even _above_ BASE if nargs was < numparams.
   4923     |  test RB, RB
   4924     |  jz >5				// Copy all varargs?
   4925     |  lea RB, [RA+RB*8-8]
   4926     |  cmp KBASE, BASE			// No vararg slots?
   4927     |  jnb >2
   4928     |1:  // Copy vararg slots to destination slots.
   4929     |.if X64
   4930     |  mov RCa, [KBASE-8]
   4931     |  add KBASE, 8
   4932     |  mov [RA], RCa
   4933     |.else
   4934     |  mov RC, [KBASE-8]
   4935     |  mov [RA], RC
   4936     |  mov RC, [KBASE-4]
   4937     |  add KBASE, 8
   4938     |  mov [RA+4], RC
   4939     |.endif
   4940     |  add RA, 8
   4941     |  cmp RA, RB			// All destination slots filled?
   4942     |  jnb >3
   4943     |  cmp KBASE, BASE			// No more vararg slots?
   4944     |  jb <1
   4945     |2:  // Fill up remainder with nil.
   4946     |  mov dword [RA+4], LJ_TNIL
   4947     |  add RA, 8
   4948     |  cmp RA, RB
   4949     |  jb <2
   4950     |3:
   4951     |  mov KBASE, TMP1
   4952     |  ins_next
   4953     |
   4954     |5:  // Copy all varargs.
   4955     |  mov MULTRES, 1			// MULTRES = 0+1
   4956     |  mov RC, BASE
   4957     |  sub RC, KBASE
   4958     |  jbe <3				// No vararg slots?
   4959     |  mov RB, RC
   4960     |  shr RB, 3
   4961     |  add RB, 1
   4962     |  mov MULTRES, RB			// MULTRES = #varargs+1
   4963     |  mov L:RB, SAVE_L
   4964     |  add RC, RA
   4965     |  cmp RC, L:RB->maxstack
   4966     |  ja >7				// Need to grow stack?
   4967     |6:  // Copy all vararg slots.
   4968     |.if X64
   4969     |  mov RCa, [KBASE-8]
   4970     |  add KBASE, 8
   4971     |  mov [RA], RCa
   4972     |.else
   4973     |  mov RC, [KBASE-8]
   4974     |  mov [RA], RC
   4975     |  mov RC, [KBASE-4]
   4976     |  add KBASE, 8
   4977     |  mov [RA+4], RC
   4978     |.endif
   4979     |  add RA, 8
   4980     |  cmp KBASE, BASE			// No more vararg slots?
   4981     |  jb <6
   4982     |  jmp <3
   4983     |
   4984     |7:  // Grow stack for varargs.
   4985     |  mov L:RB->base, BASE
   4986     |  mov L:RB->top, RA
   4987     |  mov SAVE_PC, PC
   4988     |  sub KBASE, BASE			// Need delta, because BASE may change.
   4989     |  mov FCARG2, MULTRES
   4990     |  sub FCARG2, 1
   4991     |  mov FCARG1, L:RB
   4992     |  call extern lj_state_growstack@8	// (lua_State *L, int n)
   4993     |  mov BASE, L:RB->base
   4994     |  mov RA, L:RB->top
   4995     |  add KBASE, BASE
   4996     |  jmp <6
   4997     break;
   4998 
   4999   /* -- Returns ----------------------------------------------------------- */
   5000 
   5001   case BC_RETM:
   5002     |  ins_AD	// RA = results, RD = extra_nresults
   5003     |  add RD, MULTRES			// MULTRES >=1, so RD >=1.
   5004     |  // Fall through. Assumes BC_RET follows and ins_AD is a no-op.
   5005     break;
   5006 
   5007   case BC_RET: case BC_RET0: case BC_RET1:
   5008     |  ins_AD	// RA = results, RD = nresults+1
   5009     if (op != BC_RET0) {
   5010       |  shl RA, 3
   5011     }
   5012     |1:
   5013     |  mov PC, [BASE-4]
   5014     |  mov MULTRES, RD			// Save nresults+1.
   5015     |  test PC, FRAME_TYPE		// Check frame type marker.
   5016     |  jnz >7				// Not returning to a fixarg Lua func?
   5017     switch (op) {
   5018     case BC_RET:
   5019       |->BC_RET_Z:
   5020       |  mov KBASE, BASE		// Use KBASE for result move.
   5021       |  sub RD, 1
   5022       |  jz >3
   5023       |2:  // Move results down.
   5024       |.if X64
   5025       |  mov RBa, [KBASE+RA]
   5026       |  mov [KBASE-8], RBa
   5027       |.else
   5028       |  mov RB, [KBASE+RA]
   5029       |  mov [KBASE-8], RB
   5030       |  mov RB, [KBASE+RA+4]
   5031       |  mov [KBASE-4], RB
   5032       |.endif
   5033       |  add KBASE, 8
   5034       |  sub RD, 1
   5035       |  jnz <2
   5036       |3:
   5037       |  mov RD, MULTRES		// Note: MULTRES may be >255.
   5038       |  movzx RB, PC_RB		// So cannot compare with RDL!
   5039       |5:
   5040       |  cmp RB, RD			// More results expected?
   5041       |  ja >6
   5042       break;
   5043     case BC_RET1:
   5044       |.if X64
   5045       |  mov RBa, [BASE+RA]
   5046       |  mov [BASE-8], RBa
   5047       |.else
   5048       |  mov RB, [BASE+RA+4]
   5049       |  mov [BASE-4], RB
   5050       |  mov RB, [BASE+RA]
   5051       |  mov [BASE-8], RB
   5052       |.endif
   5053       /* fallthrough */
   5054     case BC_RET0:
   5055       |5:
   5056       |  cmp PC_RB, RDL			// More results expected?
   5057       |  ja >6
   5058     default:
   5059       break;
   5060     }
   5061     |  movzx RA, PC_RA
   5062     |  not RAa				// Note: ~RA = -(RA+1)
   5063     |  lea BASE, [BASE+RA*8]		// base = base - (RA+1)*8
   5064     |  mov LFUNC:KBASE, [BASE-8]
   5065     |  mov KBASE, LFUNC:KBASE->pc
   5066     |  mov KBASE, [KBASE+PC2PROTO(k)]
   5067     |  ins_next
   5068     |
   5069     |6:  // Fill up results with nil.
   5070     if (op == BC_RET) {
   5071       |  mov dword [KBASE-4], LJ_TNIL	// Note: relies on shifted base.
   5072       |  add KBASE, 8
   5073     } else {
   5074       |  mov dword [BASE+RD*8-12], LJ_TNIL
   5075     }
   5076     |  add RD, 1
   5077     |  jmp <5
   5078     |
   5079     |7:  // Non-standard return case.
   5080     |  lea RB, [PC-FRAME_VARG]
   5081     |  test RB, FRAME_TYPEP
   5082     |  jnz ->vm_return
   5083     |  // Return from vararg function: relocate BASE down and RA up.
   5084     |  sub BASE, RB
   5085     if (op != BC_RET0) {
   5086       |  add RA, RB
   5087     }
   5088     |  jmp <1
   5089     break;
   5090 
   5091   /* -- Loops and branches ------------------------------------------------ */
   5092 
   5093   |.define FOR_IDX,  [RA];    .define FOR_TIDX,  dword [RA+4]
   5094   |.define FOR_STOP, [RA+8];  .define FOR_TSTOP, dword [RA+12]
   5095   |.define FOR_STEP, [RA+16]; .define FOR_TSTEP, dword [RA+20]
   5096   |.define FOR_EXT,  [RA+24]; .define FOR_TEXT,  dword [RA+28]
   5097 
   5098   case BC_FORL:
   5099     |.if JIT
   5100     |  hotloop RB
   5101     |.endif
   5102     | // Fall through. Assumes BC_IFORL follows and ins_AJ is a no-op.
   5103     break;
   5104 
   5105   case BC_JFORI:
   5106   case BC_JFORL:
   5107 #if !LJ_HASJIT
   5108     break;
   5109 #endif
   5110   case BC_FORI:
   5111   case BC_IFORL:
   5112     vk = (op == BC_IFORL || op == BC_JFORL);
   5113     |  ins_AJ	// RA = base, RD = target (after end of loop or start of loop)
   5114     |  lea RA, [BASE+RA*8]
   5115     if (LJ_DUALNUM) {
   5116       |  cmp FOR_TIDX, LJ_TISNUM; jne >9
   5117       if (!vk) {
   5118 	|  cmp FOR_TSTOP, LJ_TISNUM; jne ->vmeta_for
   5119 	|  cmp FOR_TSTEP, LJ_TISNUM; jne ->vmeta_for
   5120 	|  mov RB, dword FOR_IDX
   5121 	|  cmp dword FOR_STEP, 0; jl >5
   5122       } else {
   5123 #ifdef LUA_USE_ASSERT
   5124 	|  cmp FOR_TSTOP, LJ_TISNUM; jne ->assert_bad_for_arg_type
   5125 	|  cmp FOR_TSTEP, LJ_TISNUM; jne ->assert_bad_for_arg_type
   5126 #endif
   5127 	|  mov RB, dword FOR_STEP
   5128 	|  test RB, RB; js >5
   5129 	|  add RB, dword FOR_IDX; jo >1
   5130 	|  mov dword FOR_IDX, RB
   5131       }
   5132       |  cmp RB, dword FOR_STOP
   5133       |  mov FOR_TEXT, LJ_TISNUM
   5134       |  mov dword FOR_EXT, RB
   5135       if (op == BC_FORI) {
   5136 	|  jle >7
   5137 	|1:
   5138 	|6:
   5139 	|  branchPC RD
   5140       } else if (op == BC_JFORI) {
   5141 	|  branchPC RD
   5142 	|  movzx RD, PC_RD
   5143 	|  jle =>BC_JLOOP
   5144 	|1:
   5145 	|6:
   5146       } else if (op == BC_IFORL) {
   5147 	|  jg >7
   5148 	|6:
   5149 	|  branchPC RD
   5150 	|1:
   5151       } else {
   5152 	|  jle =>BC_JLOOP
   5153 	|1:
   5154 	|6:
   5155       }
   5156       |7:
   5157       |  ins_next
   5158       |
   5159       |5:  // Invert check for negative step.
   5160       if (vk) {
   5161 	|  add RB, dword FOR_IDX; jo <1
   5162 	|  mov dword FOR_IDX, RB
   5163       }
   5164       |  cmp RB, dword FOR_STOP
   5165       |  mov FOR_TEXT, LJ_TISNUM
   5166       |  mov dword FOR_EXT, RB
   5167       if (op == BC_FORI) {
   5168 	|  jge <7
   5169       } else if (op == BC_JFORI) {
   5170 	|  branchPC RD
   5171 	|  movzx RD, PC_RD
   5172 	|  jge =>BC_JLOOP
   5173       } else if (op == BC_IFORL) {
   5174 	|  jl <7
   5175       } else {
   5176 	|  jge =>BC_JLOOP
   5177       }
   5178       |  jmp <6
   5179       |9:  // Fallback to FP variant.
   5180     } else if (!vk) {
   5181       |  cmp FOR_TIDX, LJ_TISNUM
   5182     }
   5183     if (!vk) {
   5184       |  jae ->vmeta_for
   5185       |  cmp FOR_TSTOP, LJ_TISNUM; jae ->vmeta_for
   5186     } else {
   5187 #ifdef LUA_USE_ASSERT
   5188       |  cmp FOR_TSTOP, LJ_TISNUM; jae ->assert_bad_for_arg_type
   5189       |  cmp FOR_TSTEP, LJ_TISNUM; jae ->assert_bad_for_arg_type
   5190 #endif
   5191     }
   5192     |  mov RB, FOR_TSTEP		// Load type/hiword of for step.
   5193     if (!vk) {
   5194       |  cmp RB, LJ_TISNUM; jae ->vmeta_for
   5195     }
   5196     |  movsd xmm0, qword FOR_IDX
   5197     |  movsd xmm1, qword FOR_STOP
   5198     if (vk) {
   5199       |  addsd xmm0, qword FOR_STEP
   5200       |  movsd qword FOR_IDX, xmm0
   5201       |  test RB, RB; js >3
   5202     } else {
   5203       |  jl >3
   5204     }
   5205     |  ucomisd xmm1, xmm0
   5206     |1:
   5207     |  movsd qword FOR_EXT, xmm0
   5208     if (op == BC_FORI) {
   5209       |.if DUALNUM
   5210       |  jnb <7
   5211       |.else
   5212       |  jnb >2
   5213       |  branchPC RD
   5214       |.endif
   5215     } else if (op == BC_JFORI) {
   5216       |  branchPC RD
   5217       |  movzx RD, PC_RD
   5218       |  jnb =>BC_JLOOP
   5219     } else if (op == BC_IFORL) {
   5220       |.if DUALNUM
   5221       |  jb <7
   5222       |.else
   5223       |  jb >2
   5224       |  branchPC RD
   5225       |.endif
   5226     } else {
   5227       |  jnb =>BC_JLOOP
   5228     }
   5229     |.if DUALNUM
   5230     |  jmp <6
   5231     |.else
   5232     |2:
   5233     |  ins_next
   5234     |.endif
   5235     |
   5236     |3:  // Invert comparison if step is negative.
   5237     |  ucomisd xmm0, xmm1
   5238     |  jmp <1
   5239     break;
   5240 
   5241   case BC_ITERL:
   5242     |.if JIT
   5243     |  hotloop RB
   5244     |.endif
   5245     | // Fall through. Assumes BC_IITERL follows and ins_AJ is a no-op.
   5246     break;
   5247 
   5248   case BC_JITERL:
   5249 #if !LJ_HASJIT
   5250     break;
   5251 #endif
   5252   case BC_IITERL:
   5253     |  ins_AJ	// RA = base, RD = target
   5254     |  lea RA, [BASE+RA*8]
   5255     |  mov RB, [RA+4]
   5256     |  cmp RB, LJ_TNIL; je >1		// Stop if iterator returned nil.
   5257     if (op == BC_JITERL) {
   5258       |  mov [RA-4], RB
   5259       |  mov RB, [RA]
   5260       |  mov [RA-8], RB
   5261       |  jmp =>BC_JLOOP
   5262     } else {
   5263       |  branchPC RD			// Otherwise save control var + branch.
   5264       |  mov RD, [RA]
   5265       |  mov [RA-4], RB
   5266       |  mov [RA-8], RD
   5267     }
   5268     |1:
   5269     |  ins_next
   5270     break;
   5271 
   5272   case BC_LOOP:
   5273     |  ins_A	// RA = base, RD = target (loop extent)
   5274     |  // Note: RA/RD is only used by the trace recorder to determine scope/extent.
   5275     |  // This opcode does NOT jump; its only purpose is to detect a hot loop.
   5276     |.if JIT
   5277     |  hotloop RB
   5278     |.endif
   5279     | // Fall through. Assumes BC_ILOOP follows and ins_A is a no-op.
   5280     break;
   5281 
   5282   case BC_ILOOP:
   5283     |  ins_A	// RA = base, RD = target (loop extent)
   5284     |  ins_next
   5285     break;
   5286 
   5287   case BC_JLOOP:
   5288     |.if JIT
   5289     |  ins_AD	// RA = base (ignored), RD = traceno
   5290     |  mov RA, [DISPATCH+DISPATCH_J(trace)]
   5291     |  mov TRACE:RD, [RA+RD*4]
   5292     |  mov RDa, TRACE:RD->mcode
   5293     |  mov L:RB, SAVE_L
   5294     |  mov [DISPATCH+DISPATCH_GL(jit_base)], BASE
   5295     |  mov [DISPATCH+DISPATCH_GL(tmpbuf.L)], L:RB
   5296     |  // Save additional callee-save registers only used in compiled code.
   5297     |.if X64WIN
   5298     |  mov TMPQ, r12
   5299     |  mov TMPa, r13
   5300     |  mov CSAVE_4, r14
   5301     |  mov CSAVE_3, r15
   5302     |  mov RAa, rsp
   5303     |  sub rsp, 9*16+4*8
   5304     |  movdqa [RAa], xmm6
   5305     |  movdqa [RAa-1*16], xmm7
   5306     |  movdqa [RAa-2*16], xmm8
   5307     |  movdqa [RAa-3*16], xmm9
   5308     |  movdqa [RAa-4*16], xmm10
   5309     |  movdqa [RAa-5*16], xmm11
   5310     |  movdqa [RAa-6*16], xmm12
   5311     |  movdqa [RAa-7*16], xmm13
   5312     |  movdqa [RAa-8*16], xmm14
   5313     |  movdqa [RAa-9*16], xmm15
   5314     |.elif X64
   5315     |  mov TMPQ, r12
   5316     |  mov TMPa, r13
   5317     |  sub rsp, 16
   5318     |.endif
   5319     |  jmp RDa
   5320     |.endif
   5321     break;
   5322 
   5323   case BC_JMP:
   5324     |  ins_AJ	// RA = unused, RD = target
   5325     |  branchPC RD
   5326     |  ins_next
   5327     break;
   5328 
   5329   /* -- Function headers -------------------------------------------------- */
   5330 
   5331    /*
   5332    ** Reminder: A function may be called with func/args above L->maxstack,
   5333    ** i.e. occupying EXTRA_STACK slots. And vmeta_call may add one extra slot,
   5334    ** too. This means all FUNC* ops (including fast functions) must check
   5335    ** for stack overflow _before_ adding more slots!
   5336    */
   5337 
   5338   case BC_FUNCF:
   5339     |.if JIT
   5340     |  hotcall RB
   5341     |.endif
   5342   case BC_FUNCV:  /* NYI: compiled vararg functions. */
   5343     | // Fall through. Assumes BC_IFUNCF/BC_IFUNCV follow and ins_AD is a no-op.
   5344     break;
   5345 
   5346   case BC_JFUNCF:
   5347 #if !LJ_HASJIT
   5348     break;
   5349 #endif
   5350   case BC_IFUNCF:
   5351     |  ins_AD  // BASE = new base, RA = framesize, RD = nargs+1
   5352     |  mov KBASE, [PC-4+PC2PROTO(k)]
   5353     |  mov L:RB, SAVE_L
   5354     |  lea RA, [BASE+RA*8]		// Top of frame.
   5355     |  cmp RA, L:RB->maxstack
   5356     |  ja ->vm_growstack_f
   5357     |  movzx RA, byte [PC-4+PC2PROTO(numparams)]
   5358     |  cmp NARGS:RD, RA			// Check for missing parameters.
   5359     |  jbe >3
   5360     |2:
   5361     if (op == BC_JFUNCF) {
   5362       |  movzx RD, PC_RD
   5363       |  jmp =>BC_JLOOP
   5364     } else {
   5365       |  ins_next
   5366     }
   5367     |
   5368     |3:  // Clear missing parameters.
   5369     |  mov dword [BASE+NARGS:RD*8-4], LJ_TNIL
   5370     |  add NARGS:RD, 1
   5371     |  cmp NARGS:RD, RA
   5372     |  jbe <3
   5373     |  jmp <2
   5374     break;
   5375 
   5376   case BC_JFUNCV:
   5377 #if !LJ_HASJIT
   5378     break;
   5379 #endif
   5380     | int3  // NYI: compiled vararg functions
   5381     break;  /* NYI: compiled vararg functions. */
   5382 
   5383   case BC_IFUNCV:
   5384     |  ins_AD  // BASE = new base, RA = framesize, RD = nargs+1
   5385     |  lea RB, [NARGS:RD*8+FRAME_VARG]
   5386     |  lea RD, [BASE+NARGS:RD*8]
   5387     |  mov LFUNC:KBASE, [BASE-8]
   5388     |  mov [RD-4], RB			// Store delta + FRAME_VARG.
   5389     |  mov [RD-8], LFUNC:KBASE		// Store copy of LFUNC.
   5390     |  mov L:RB, SAVE_L
   5391     |  lea RA, [RD+RA*8]
   5392     |  cmp RA, L:RB->maxstack
   5393     |  ja ->vm_growstack_v		// Need to grow stack.
   5394     |  mov RA, BASE
   5395     |  mov BASE, RD
   5396     |  movzx RB, byte [PC-4+PC2PROTO(numparams)]
   5397     |  test RB, RB
   5398     |  jz >2
   5399     |1:  // Copy fixarg slots up to new frame.
   5400     |  add RA, 8
   5401     |  cmp RA, BASE
   5402     |  jnb >3				// Less args than parameters?
   5403     |  mov KBASE, [RA-8]
   5404     |  mov [RD], KBASE
   5405     |  mov KBASE, [RA-4]
   5406     |  mov [RD+4], KBASE
   5407     |  add RD, 8
   5408     |  mov dword [RA-4], LJ_TNIL	// Clear old fixarg slot (help the GC).
   5409     |  sub RB, 1
   5410     |  jnz <1
   5411     |2:
   5412     if (op == BC_JFUNCV) {
   5413       |  movzx RD, PC_RD
   5414       |  jmp =>BC_JLOOP
   5415     } else {
   5416       |  mov KBASE, [PC-4+PC2PROTO(k)]
   5417       |  ins_next
   5418     }
   5419     |
   5420     |3:  // Clear missing parameters.
   5421     |  mov dword [RD+4], LJ_TNIL
   5422     |  add RD, 8
   5423     |  sub RB, 1
   5424     |  jnz <3
   5425     |  jmp <2
   5426     break;
   5427 
   5428   case BC_FUNCC:
   5429   case BC_FUNCCW:
   5430     |  ins_AD  // BASE = new base, RA = ins RA|RD (unused), RD = nargs+1
   5431     |  mov CFUNC:RB, [BASE-8]
   5432     |  mov KBASEa, CFUNC:RB->f
   5433     |  mov L:RB, SAVE_L
   5434     |  lea RD, [BASE+NARGS:RD*8-8]
   5435     |  mov L:RB->base, BASE
   5436     |  lea RA, [RD+8*LUA_MINSTACK]
   5437     |  cmp RA, L:RB->maxstack
   5438     |  mov L:RB->top, RD
   5439     if (op == BC_FUNCC) {
   5440       |.if X64
   5441       |  mov CARG1d, L:RB			// Caveat: CARG1d may be RA.
   5442       |.else
   5443       |  mov ARG1, L:RB
   5444       |.endif
   5445     } else {
   5446       |.if X64
   5447       |  mov CARG2, KBASEa
   5448       |  mov CARG1d, L:RB			// Caveat: CARG1d may be RA.
   5449       |.else
   5450       |  mov ARG2, KBASEa
   5451       |  mov ARG1, L:RB
   5452       |.endif
   5453     }
   5454     |  ja ->vm_growstack_c		// Need to grow stack.
   5455     |  set_vmstate C
   5456     if (op == BC_FUNCC) {
   5457       |  call KBASEa			// (lua_State *L)
   5458     } else {
   5459       |  // (lua_State *L, lua_CFunction f)
   5460       |  call aword [DISPATCH+DISPATCH_GL(wrapf)]
   5461     }
   5462     |  // nresults returned in eax (RD).
   5463     |  mov BASE, L:RB->base
   5464     |  mov [DISPATCH+DISPATCH_GL(cur_L)], L:RB
   5465     |  set_vmstate INTERP
   5466     |  lea RA, [BASE+RD*8]
   5467     |  neg RA
   5468     |  add RA, L:RB->top		// RA = (L->top-(L->base+nresults))*8
   5469     |  mov PC, [BASE-4]			// Fetch PC of caller.
   5470     |  jmp ->vm_returnc
   5471     break;
   5472 
   5473   /* ---------------------------------------------------------------------- */
   5474 
   5475   default:
   5476     fprintf(stderr, "Error: undefined opcode BC_%s\n", bc_names[op]);
   5477     exit(2);
   5478     break;
   5479   }
   5480 }
   5481 
   5482 static int build_backend(BuildCtx *ctx)  /* Emit the VM code: build_subroutines() first, then build_ins() for every opcode. */
   5483 {
   5484   int op;  /* Numeric bytecode opcode currently being emitted. */
   5485   dasm_growpc(Dst, BC__MAX);  /* Size the DynASM pc-label table to BC__MAX entries (presumably one per opcode, cf. =>BC_JLOOP). */
   5486   build_subroutines(ctx);  /* Defined elsewhere in this file; called once, before any opcode is emitted. */
   5487   |.code_op  // Switch output to the code_op section (declared by `.section code_op, code_sub`).
   5488   for (op = 0; op < BC__MAX; op++)  /* Walk opcodes in ascending numeric order, 0 .. BC__MAX-1. */
   5489     build_ins(ctx, (BCOp)op, op);  /* The opcode is passed twice: once as BCOp, once as a plain int. */
   5490   return BC__MAX;  /* NOTE(review): presumably the pc-label/opcode count the caller expects -- confirm. */
   5491 }
   5492 
   5493 /* Emit pseudo frame-info for all assembler functions.
        **
        ** Writes hand-encoded DWARF call-frame information (CIEs and FDEs) as
        ** assembler text to ctx->fp. What is written depends on ctx->mode:
        **   BUILD_elfasm:  a .debug_frame section and, unless LJ_NO_UNWIND is set,
        **                  an .eh_frame section.
        **   BUILD_machasm: a __TEXT,__eh_frame section (this case is only compiled
        **                  in when LJ_NO_UNWIND is not set).
        **   other modes:   nothing.
        ** With LJ_HASFFI, lj_vm_ffi_call gets its own FDE with a different frame
        ** description than the rest of the code.
        */
   5494 static void emit_asm_debug(BuildCtx *ctx)
   5495 {
          /* Byte offset of the vm_ffi_call global label from the start of the
          ** generated code. Used below as the length of the range starting at
          ** .Lbegin; the remainder (ctx->codesz - fcofs) is used as the length of
          ** the lj_vm_ffi_call range.
          */
   5496   int fcofs = (int)((uint8_t *)ctx->glob[GLOB_vm_ffi_call] - ctx->code);
        /* String fragments pasted into the emitted assembler text:
        **   SZPTR   pointer size in bytes (also the magnitude of the CIE data
        **           alignment factor),
        **   BSZPTR  log2 of the pointer size (the Mach-O branch passes it to .align),
        **   REG_SP  register number used as the CFA base register in the CIEs,
        **   REG_RA  register number used as the return-address column.
        */
   5497 #if LJ_64
   5498 #define SZPTR	"8"
   5499 #define BSZPTR	"3"
   5500 #define REG_SP	"0x7"
   5501 #define REG_RA	"0x10"
   5502 #else
   5503 #define SZPTR	"4"
   5504 #define BSZPTR	"2"
   5505 #define REG_SP	"0x4"
   5506 #define REG_RA	"0x8"
   5507 #endif
   5508   switch (ctx->mode) {
   5509   case BUILD_elfasm:
   5510     fprintf(ctx->fp, "\t.section .debug_frame,\"\",@progbits\n");
            /* CIE0 (.debug_frame): id 0xffffffff, version 1, empty augmentation
            ** string, code alignment 1, data alignment -SZPTR, return-address
            ** column REG_RA. Initial rules: 0xc (def_cfa) sets CFA = REG_SP + SZPTR;
            ** 0x80+REG_RA (offset) with factored offset 1 places the return address
            ** one data-alignment unit away from the CFA.
            */
   5511     fprintf(ctx->fp,
   5512 	".Lframe0:\n"
   5513 	"\t.long .LECIE0-.LSCIE0\n"
   5514 	".LSCIE0:\n"
   5515 	"\t.long 0xffffffff\n"
   5516 	"\t.byte 0x1\n"
   5517 	"\t.string \"\"\n"
   5518 	"\t.uleb128 0x1\n"
   5519 	"\t.sleb128 -" SZPTR "\n"
   5520 	"\t.byte " REG_RA "\n"
   5521 	"\t.byte 0xc\n\t.uleb128 " REG_SP "\n\t.uleb128 " SZPTR "\n"
   5522 	"\t.byte 0x80+" REG_RA "\n\t.uleb128 0x1\n"
   5523 	"\t.align " SZPTR "\n"
   5524 	".LECIE0:\n\n");
            /* FDE0: covers fcofs bytes starting at .Lbegin. The two %d fields are
            ** filled with fcofs (range length) and CFRAME_SIZE (CFA offset). The
            ** register "offset" entries use factored offsets, i.e. slot numbers in
            ** units of the data alignment.
            */
   5525     fprintf(ctx->fp,
   5526 	".LSFDE0:\n"
   5527 	"\t.long .LEFDE0-.LASFDE0\n"
   5528 	".LASFDE0:\n"
   5529 	"\t.long .Lframe0\n"
   5530 #if LJ_64
   5531 	"\t.quad .Lbegin\n"
   5532 	"\t.quad %d\n"
   5533 	"\t.byte 0xe\n\t.uleb128 %d\n"		/* def_cfa_offset */
   5534 	"\t.byte 0x86\n\t.uleb128 0x2\n"	/* offset rbp */
   5535 	"\t.byte 0x83\n\t.uleb128 0x3\n"	/* offset rbx */
   5536 	"\t.byte 0x8f\n\t.uleb128 0x4\n"	/* offset r15 */
   5537 	"\t.byte 0x8e\n\t.uleb128 0x5\n"	/* offset r14 */
        /* r13/r12 slots are only described here when LJ_NO_UNWIND is set. */
   5538 #if LJ_NO_UNWIND
   5539 	"\t.byte 0x8d\n\t.uleb128 0x6\n"	/* offset r13 */
   5540 	"\t.byte 0x8c\n\t.uleb128 0x7\n"	/* offset r12 */
   5541 #endif
   5542 #else
   5543 	"\t.long .Lbegin\n"
   5544 	"\t.long %d\n"
   5545 	"\t.byte 0xe\n\t.uleb128 %d\n"		/* def_cfa_offset */
   5546 	"\t.byte 0x85\n\t.uleb128 0x2\n"	/* offset ebp */
   5547 	"\t.byte 0x87\n\t.uleb128 0x3\n"	/* offset edi */
   5548 	"\t.byte 0x86\n\t.uleb128 0x4\n"	/* offset esi */
   5549 	"\t.byte 0x83\n\t.uleb128 0x5\n"	/* offset ebx */
   5550 #endif
   5551 	"\t.align " SZPTR "\n"
   5552 	".LEFDE0:\n\n", fcofs, CFRAME_SIZE);
   5553 #if LJ_HASFFI
            /* FDE1: separate description for lj_vm_ffi_call, whose CFA is switched
            ** to the frame pointer register (def_cfa_register). Its length is the
            ** rest of the code after fcofs.
            */
   5554     fprintf(ctx->fp,
   5555 	".LSFDE1:\n"
   5556 	"\t.long .LEFDE1-.LASFDE1\n"
   5557 	".LASFDE1:\n"
   5558 	"\t.long .Lframe0\n"
   5559 #if LJ_64
   5560 	"\t.quad lj_vm_ffi_call\n"
   5561 	"\t.quad %d\n"
   5562 	"\t.byte 0xe\n\t.uleb128 16\n"		/* def_cfa_offset */
   5563 	"\t.byte 0x86\n\t.uleb128 0x2\n"	/* offset rbp */
   5564 	"\t.byte 0xd\n\t.uleb128 0x6\n"		/* def_cfa_register rbp */
   5565 	"\t.byte 0x83\n\t.uleb128 0x3\n"	/* offset rbx */
   5566 #else
   5567 	"\t.long lj_vm_ffi_call\n"
   5568 	"\t.long %d\n"
   5569 	"\t.byte 0xe\n\t.uleb128 8\n"		/* def_cfa_offset */
   5570 	"\t.byte 0x85\n\t.uleb128 0x2\n"	/* offset ebp */
   5571 	"\t.byte 0xd\n\t.uleb128 0x5\n"		/* def_cfa_register ebp */
   5572 	"\t.byte 0x83\n\t.uleb128 0x3\n"	/* offset ebx */
   5573 #endif
   5574 	"\t.align " SZPTR "\n"
   5575 	".LEFDE1:\n\n", (int)ctx->codesz - fcofs);
   5576 #endif
        /* The .eh_frame variant is only emitted when unwinding is not disabled.
        ** Solaris builds (__sun__ && __svr4__) use different section flags/type
        ** for .eh_frame than all other ELF targets.
        */
   5577 #if !LJ_NO_UNWIND
   5578 #if (defined(__sun__) && defined(__svr4__))
   5579 #if LJ_64
   5580     fprintf(ctx->fp, "\t.section .eh_frame,\"a\",@unwind\n");
   5581 #else
   5582     fprintf(ctx->fp, "\t.section .eh_frame,\"aw\",@progbits\n");
   5583 #endif
   5584 #else
   5585     fprintf(ctx->fp, "\t.section .eh_frame,\"a\",@progbits\n");
   5586 #endif
            /* CIE1 (.eh_frame): id 0, augmentation "zPR". The 6 bytes of
            ** augmentation data are: personality pointer encoding (1 byte), the
            ** pc-relative 4-byte reference to lj_err_unwind_dwarf, and the FDE
            ** pointer encoding (1 byte). Initial rules match CIE0.
            */
   5587     fprintf(ctx->fp,
   5588 	".Lframe1:\n"
   5589 	"\t.long .LECIE1-.LSCIE1\n"
   5590 	".LSCIE1:\n"
   5591 	"\t.long 0\n"
   5592 	"\t.byte 0x1\n"
   5593 	"\t.string \"zPR\"\n"
   5594 	"\t.uleb128 0x1\n"
   5595 	"\t.sleb128 -" SZPTR "\n"
   5596 	"\t.byte " REG_RA "\n"
   5597 	"\t.uleb128 6\n"			/* augmentation length */
   5598 	"\t.byte 0x1b\n"			/* pcrel|sdata4 */
   5599 	"\t.long lj_err_unwind_dwarf-.\n"
   5600 	"\t.byte 0x1b\n"			/* pcrel|sdata4 */
   5601 	"\t.byte 0xc\n\t.uleb128 " REG_SP "\n\t.uleb128 " SZPTR "\n"
   5602 	"\t.byte 0x80+" REG_RA "\n\t.uleb128 0x1\n"
   5603 	"\t.align " SZPTR "\n"
   5604 	".LECIE1:\n\n");
            /* FDE2: same range and register rules as FDE0 (without the r13/r12
            ** entries), but with .eh_frame conventions: the CIE reference is the
            ** distance back to .Lframe1 and the start address is pc-relative.
            ** Format arguments: fcofs (range length), CFRAME_SIZE (CFA offset).
            */
   5605     fprintf(ctx->fp,
   5606 	".LSFDE2:\n"
   5607 	"\t.long .LEFDE2-.LASFDE2\n"
   5608 	".LASFDE2:\n"
   5609 	"\t.long .LASFDE2-.Lframe1\n"
   5610 	"\t.long .Lbegin-.\n"
   5611 	"\t.long %d\n"
   5612 	"\t.uleb128 0\n"			/* augmentation length */
   5613 	"\t.byte 0xe\n\t.uleb128 %d\n"		/* def_cfa_offset */
   5614 #if LJ_64
   5615 	"\t.byte 0x86\n\t.uleb128 0x2\n"	/* offset rbp */
   5616 	"\t.byte 0x83\n\t.uleb128 0x3\n"	/* offset rbx */
   5617 	"\t.byte 0x8f\n\t.uleb128 0x4\n"	/* offset r15 */
   5618 	"\t.byte 0x8e\n\t.uleb128 0x5\n"	/* offset r14 */
   5619 #else
   5620 	"\t.byte 0x85\n\t.uleb128 0x2\n"	/* offset ebp */
   5621 	"\t.byte 0x87\n\t.uleb128 0x3\n"	/* offset edi */
   5622 	"\t.byte 0x86\n\t.uleb128 0x4\n"	/* offset esi */
   5623 	"\t.byte 0x83\n\t.uleb128 0x5\n"	/* offset ebx */
   5624 #endif
   5625 	"\t.align " SZPTR "\n"
   5626 	".LEFDE2:\n\n", fcofs, CFRAME_SIZE);
   5627 #if LJ_HASFFI
            /* CIE2: like CIE1 but with augmentation "zR", i.e. no personality
            ** routine; the single augmentation byte is the FDE pointer encoding.
            */
   5628     fprintf(ctx->fp,
   5629 	".Lframe2:\n"
   5630 	"\t.long .LECIE2-.LSCIE2\n"
   5631 	".LSCIE2:\n"
   5632 	"\t.long 0\n"
   5633 	"\t.byte 0x1\n"
   5634 	"\t.string \"zR\"\n"
   5635 	"\t.uleb128 0x1\n"
   5636 	"\t.sleb128 -" SZPTR "\n"
   5637 	"\t.byte " REG_RA "\n"
   5638 	"\t.uleb128 1\n"			/* augmentation length */
   5639 	"\t.byte 0x1b\n"			/* pcrel|sdata4 */
   5640 	"\t.byte 0xc\n\t.uleb128 " REG_SP "\n\t.uleb128 " SZPTR "\n"
   5641 	"\t.byte 0x80+" REG_RA "\n\t.uleb128 0x1\n"
   5642 	"\t.align " SZPTR "\n"
   5643 	".LECIE2:\n\n");
            /* FDE3: lj_vm_ffi_call under CIE2; same rules as FDE1, length is the
            ** code remaining after fcofs.
            */
   5644     fprintf(ctx->fp,
   5645 	".LSFDE3:\n"
   5646 	"\t.long .LEFDE3-.LASFDE3\n"
   5647 	".LASFDE3:\n"
   5648 	"\t.long .LASFDE3-.Lframe2\n"
   5649 	"\t.long lj_vm_ffi_call-.\n"
   5650 	"\t.long %d\n"
   5651 	"\t.uleb128 0\n"			/* augmentation length */
   5652 #if LJ_64
   5653 	"\t.byte 0xe\n\t.uleb128 16\n"		/* def_cfa_offset */
   5654 	"\t.byte 0x86\n\t.uleb128 0x2\n"	/* offset rbp */
   5655 	"\t.byte 0xd\n\t.uleb128 0x6\n"		/* def_cfa_register rbp */
   5656 	"\t.byte 0x83\n\t.uleb128 0x3\n"	/* offset rbx */
   5657 #else
   5658 	"\t.byte 0xe\n\t.uleb128 8\n"		/* def_cfa_offset */
   5659 	"\t.byte 0x85\n\t.uleb128 0x2\n"	/* offset ebp */
   5660 	"\t.byte 0xd\n\t.uleb128 0x5\n"		/* def_cfa_register ebp */
   5661 	"\t.byte 0x83\n\t.uleb128 0x3\n"	/* offset ebx */
   5662 #endif
   5663 	"\t.align " SZPTR "\n"
   5664 	".LEFDE3:\n\n", (int)ctx->codesz - fcofs);
   5665 #endif
   5666 #endif
   5667     break;
   5668 #if !LJ_NO_UNWIND
   5669   /* Mental note: never let Apple design an assembler.
   5670   ** Or a linker. Or a plastic case. But I digress.
   5671   */
          /* Mach-O variant. Differences from the ELF branch visible below: all
          ** LEB128 fields are written as single .byte values (so each value must
          ** stay below 128 to remain a valid one-byte encoding), section-relative
          ** lengths go through ".set" symbols, .align takes BSZPTR (log2), and
          ** every symbol gets its own FDE named "<symbol>.eh".
          */
   5672   case BUILD_machasm: {
   5673 #if LJ_HASFFI
            /* Size of _lj_vm_ffi_call, recorded by the loop below; 0 = not seen. */
   5674     int fcsize = 0;
   5675 #endif
   5676     int i;
   5677     fprintf(ctx->fp, "\t.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support\n");
            /* CIE "X" with augmentation "zPR". "128-SZPTR" is the one-byte SLEB128
            ** encoding of -SZPTR (the data alignment factor). The personality
            ** routine is referenced indirectly: through the GOT on 64 bit, through
            ** a non-lazy symbol pointer (emitted further down) on 32 bit.
            */
   5678     fprintf(ctx->fp,
   5679 	"EH_frame1:\n"
   5680 	"\t.set L$set$x,LECIEX-LSCIEX\n"
   5681 	"\t.long L$set$x\n"
   5682 	"LSCIEX:\n"
   5683 	"\t.long 0\n"
   5684 	"\t.byte 0x1\n"
   5685 	"\t.ascii \"zPR\\0\"\n"
   5686 	"\t.byte 0x1\n"
   5687 	"\t.byte 128-" SZPTR "\n"
   5688 	"\t.byte " REG_RA "\n"
   5689 	"\t.byte 6\n"				/* augmentation length */
   5690 	"\t.byte 0x9b\n"			/* indirect|pcrel|sdata4 */
   5691 #if LJ_64
   5692 	"\t.long _lj_err_unwind_dwarf+4@GOTPCREL\n"
   5693 	"\t.byte 0x1b\n"			/* pcrel|sdata4 */
   5694 	"\t.byte 0xc\n\t.byte " REG_SP "\n\t.byte " SZPTR "\n"
   5695 #else
   5696 	"\t.long L_lj_err_unwind_dwarf$non_lazy_ptr-.\n"
   5697 	"\t.byte 0x1b\n"			/* pcrel|sdata4 */
   5698 	"\t.byte 0xc\n\t.byte 0x5\n\t.byte 0x4\n"  /* esp=5 on 32 bit MACH-O. */
   5699 #endif
   5700 	"\t.byte 0x80+" REG_RA "\n\t.byte 0x1\n"
   5701 	"\t.align " BSZPTR "\n"
   5702 	"LECIEX:\n\n");
            /* One FDE per symbol. The symbol size is the distance to the next
            ** symbol's offset, so ctx->sym[i+1] is read even for i == nsym-1.
            ** NOTE(review): presumably the symbol table carries a terminating
            ** entry at index nsym -- confirm in the buildvm code that fills it.
            ** Zero-sized symbols are skipped; _lj_vm_ffi_call is only measured
            ** here and gets its own CIE/FDE after the loop.
            */
   5703     for (i = 0; i < ctx->nsym; i++) {
   5704       const char *name = ctx->sym[i].name;
   5705       int32_t size = ctx->sym[i+1].ofs - ctx->sym[i].ofs;
   5706       if (size == 0) continue;
   5707 #if LJ_HASFFI
   5708       if (!strcmp(name, "_lj_vm_ffi_call")) { fcsize = size; continue; }
   5709 #endif
              /* The symbol index i makes the per-FDE labels unique. Format
              ** arguments in order: name, seven times i, name, size,
              ** CFRAME_SIZE, i.
              */
   5710       fprintf(ctx->fp,
   5711 	  "%s.eh:\n"
   5712 	  "LSFDE%d:\n"
   5713 	  "\t.set L$set$%d,LEFDE%d-LASFDE%d\n"
   5714 	  "\t.long L$set$%d\n"
   5715 	  "LASFDE%d:\n"
   5716 	  "\t.long LASFDE%d-EH_frame1\n"
   5717 	  "\t.long %s-.\n"
   5718 	  "\t.long %d\n"
   5719 	  "\t.byte 0\n"				/* augmentation length */
   5720 	  "\t.byte 0xe\n\t.byte %d\n"		/* def_cfa_offset */
   5721 #if LJ_64
   5722 	  "\t.byte 0x86\n\t.byte 0x2\n"		/* offset rbp */
   5723 	  "\t.byte 0x83\n\t.byte 0x3\n"		/* offset rbx */
   5724 	  "\t.byte 0x8f\n\t.byte 0x4\n"		/* offset r15 */
   5725 	  "\t.byte 0x8e\n\t.byte 0x5\n"		/* offset r14 */
   5726 #else
   5727 	  "\t.byte 0x84\n\t.byte 0x2\n"		/* offset ebp (4 for MACH-O)*/
   5728 	  "\t.byte 0x87\n\t.byte 0x3\n"		/* offset edi */
   5729 	  "\t.byte 0x86\n\t.byte 0x4\n"		/* offset esi */
   5730 	  "\t.byte 0x83\n\t.byte 0x5\n"		/* offset ebx */
   5731 #endif
   5732 	  "\t.align " BSZPTR "\n"
   5733 	  "LEFDE%d:\n\n",
   5734 	  name, i, i, i, i, i, i, i, name, size, CFRAME_SIZE, i);
   5735     }
   5736 #if LJ_HASFFI
            /* Only emitted if the loop above found a non-empty _lj_vm_ffi_call. */
   5737     if (fcsize) {
              /* CIE "Y": augmentation "zR", i.e. no personality routine. */
   5738       fprintf(ctx->fp,
   5739 	  "EH_frame2:\n"
   5740 	  "\t.set L$set$y,LECIEY-LSCIEY\n"
   5741 	  "\t.long L$set$y\n"
   5742 	  "LSCIEY:\n"
   5743 	  "\t.long 0\n"
   5744 	  "\t.byte 0x1\n"
   5745 	  "\t.ascii \"zR\\0\"\n"
   5746 	  "\t.byte 0x1\n"
   5747 	  "\t.byte 128-" SZPTR "\n"
   5748 	  "\t.byte " REG_RA "\n"
   5749 	  "\t.byte 1\n"				/* augmentation length */
   5750 #if LJ_64
   5751 	  "\t.byte 0x1b\n"			/* pcrel|sdata4 */
   5752 	  "\t.byte 0xc\n\t.byte " REG_SP "\n\t.byte " SZPTR "\n"
   5753 #else
   5754 	  "\t.byte 0x1b\n"			/* pcrel|sdata4 */
   5755 	  "\t.byte 0xc\n\t.byte 0x5\n\t.byte 0x4\n"  /* esp=5 on 32 bit MACH. */
   5756 #endif
   5757 	  "\t.byte 0x80+" REG_RA "\n\t.byte 0x1\n"
   5758 	  "\t.align " BSZPTR "\n"
   5759 	  "LECIEY:\n\n");
              /* FDE "Y" for _lj_vm_ffi_call; the single %d is its size (fcsize). */
   5760       fprintf(ctx->fp,
   5761 	  "_lj_vm_ffi_call.eh:\n"
   5762 	  "LSFDEY:\n"
   5763 	  "\t.set L$set$yy,LEFDEY-LASFDEY\n"
   5764 	  "\t.long L$set$yy\n"
   5765 	  "LASFDEY:\n"
   5766 	  "\t.long LASFDEY-EH_frame2\n"
   5767 	  "\t.long _lj_vm_ffi_call-.\n"
   5768 	  "\t.long %d\n"
   5769 	  "\t.byte 0\n"				/* augmentation length */
   5770 #if LJ_64
   5771 	  "\t.byte 0xe\n\t.byte 16\n"		/* def_cfa_offset */
   5772 	  "\t.byte 0x86\n\t.byte 0x2\n"		/* offset rbp */
   5773 	  "\t.byte 0xd\n\t.byte 0x6\n"		/* def_cfa_register rbp */
   5774 	  "\t.byte 0x83\n\t.byte 0x3\n"		/* offset rbx */
   5775 #else
   5776 	  "\t.byte 0xe\n\t.byte 8\n"		/* def_cfa_offset */
   5777 	  "\t.byte 0x84\n\t.byte 0x2\n"		/* offset ebp (4 for MACH-O)*/
   5778 	  "\t.byte 0xd\n\t.byte 0x4\n"		/* def_cfa_register ebp */
   5779 	  "\t.byte 0x83\n\t.byte 0x3\n"		/* offset ebx */
   5780 #endif
   5781 	  "\t.align " BSZPTR "\n"
   5782 	  "LEFDEY:\n\n", fcsize);
   5783     }
   5784 #endif
   5785 #if !LJ_64
            /* 32 bit only: define the non-lazy pointer slot that the CIE above
            ** references for the personality routine.
            */
   5786     fprintf(ctx->fp,
   5787       "\t.non_lazy_symbol_pointer\n"
   5788       "L_lj_err_unwind_dwarf$non_lazy_ptr:\n"
   5789       ".indirect_symbol _lj_err_unwind_dwarf\n"
   5790       ".long 0\n\n");
            /* 32 bit only: emit one 5-byte stub (five 0xf4 bytes, octal 364) per
            ** external name into the __IMPORT,__jump_table section. Names that
            ** start with LABEL_PREFIX are skipped (strncmp() returns 0 for them).
            */
   5791     fprintf(ctx->fp, "\t.section __IMPORT,__jump_table,symbol_stubs,pure_instructions+self_modifying_code,5\n");
   5792     {
   5793       const char *const *xn;
   5794       for (xn = ctx->extnames; *xn; xn++)
   5795 	if (strncmp(*xn, LABEL_PREFIX, sizeof(LABEL_PREFIX)-1))
   5796 	  fprintf(ctx->fp, "L_%s$stub:\n\t.indirect_symbol _%s\n\t.ascii \"\\364\\364\\364\\364\\364\"\n", *xn, *xn);
   5797     }
   5798 #endif
   5799     fprintf(ctx->fp, ".subsections_via_symbols\n");
   5800     }
   5801     break;
   5802 #endif
   5803   default:  /* Difficult for other modes. */
   5804     break;
   5805   }
   5806 }
   5807