lj_asm_x86.h (103039B)
/*
** x86/x64 IR assembler (SSA IR -> machine code).
** Copyright (C) 2005-2016 Mike Pall. See Copyright Notice in luajit.h
*/

/* -- Guard handling ------------------------------------------------------ */

/* Generate an exit stub group at the bottom of the reserved MCode memory. */
static MCode *asm_exitstub_gen(ASMState *as, ExitNo group)
{
  ExitNo i, groupofs = (group*EXITSTUBS_PER_GROUP) & 0xff;
  MCode *mxp = as->mcbot;
  MCode *mxpstart = mxp;
  if (mxp + (2+2)*EXITSTUBS_PER_GROUP+8+5 >= as->mctop)
    asm_mclimit(as);
  /* Push low byte of exitno for each exit stub. */
  *mxp++ = XI_PUSHi8; *mxp++ = (MCode)groupofs;
  for (i = 1; i < EXITSTUBS_PER_GROUP; i++) {
    *mxp++ = XI_JMPs; *mxp++ = (MCode)((2+2)*(EXITSTUBS_PER_GROUP - i) - 2);
    *mxp++ = XI_PUSHi8; *mxp++ = (MCode)(groupofs + i);
  }
  /* Push the high byte of the exitno for each exit stub group. */
  *mxp++ = XI_PUSHi8; *mxp++ = (MCode)((group*EXITSTUBS_PER_GROUP)>>8);
#if !LJ_GC64
  /* Store DISPATCH at original stack slot 0. Account for the two push ops. */
  *mxp++ = XI_MOVmi;
  *mxp++ = MODRM(XM_OFS8, 0, RID_ESP);
  *mxp++ = MODRM(XM_SCALE1, RID_ESP, RID_ESP);
  *mxp++ = 2*sizeof(void *);
  *(int32_t *)mxp = ptr2addr(J2GG(as->J)->dispatch); mxp += 4;
#endif
  /* Jump to exit handler which fills in the ExitState. */
  *mxp++ = XI_JMP; mxp += 4;
  *((int32_t *)(mxp-4)) = jmprel(mxp, (MCode *)(void *)lj_vm_exit_handler);
  /* Commit the code for this group (even if assembly fails later on). */
  lj_mcode_commitbot(as->J, mxp);
  as->mcbot = mxp;
  as->mclim = as->mcbot + MCLIM_REDZONE;
  return mxpstart;
}

/* Setup all needed exit stubs. */
static void asm_exitstub_setup(ASMState *as, ExitNo nexits)
{
  ExitNo i;
  if (nexits >= EXITSTUBS_PER_GROUP*LJ_MAX_EXITSTUBGR)
    lj_trace_err(as->J, LJ_TRERR_SNAPOV);
  for (i = 0; i < (nexits+EXITSTUBS_PER_GROUP-1)/EXITSTUBS_PER_GROUP; i++)
    if (as->J->exitstubgroup[i] == NULL)
      as->J->exitstubgroup[i] = asm_exitstub_gen(as, i);
}

/* Emit conditional branch to exit for guard.
** It's important to emit this *after* all registers have been allocated,
** because rematerializations may invalidate the flags.
*/
static void asm_guardcc(ASMState *as, int cc)
{
  MCode *target = exitstub_addr(as->J, as->snapno);
  MCode *p = as->mcp;
  if (LJ_UNLIKELY(p == as->invmcp)) {
    as->loopinv = 1;
    *(int32_t *)(p+1) = jmprel(p+5, target);
    target = p;
    cc ^= 1;
    if (as->realign) {
      if (LJ_GC64 && LJ_UNLIKELY(as->mrm.base == RID_RIP))
        as->mrm.ofs += 2;  /* Fixup RIP offset for pending fused load. */
      emit_sjcc(as, cc, target);
      return;
    }
  }
  if (LJ_GC64 && LJ_UNLIKELY(as->mrm.base == RID_RIP))
    as->mrm.ofs += 6;  /* Fixup RIP offset for pending fused load. */
  emit_jcc(as, cc, target);
}

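/* Editorial sketch (not part of the original source): a rough picture of one
** exit stub group as emitted by asm_exitstub_gen() above. Each exit entry
** pushes the low byte of its exit number and branches to a shared tail,
** which pushes the high byte of the group and jumps to the exit handler:
**
**   push groupofs+0          ; entry for exit 0
**   jmp  tail
**   push groupofs+1          ; entry for exit 1
**   jmp  tail
**   ...
**   push groupofs+N-1        ; entry for exit N-1, falls through
** tail:
**   push <high byte of group*EXITSTUBS_PER_GROUP>
**   mov  [esp+2*sizeof(void *)], DISPATCH    ; !LJ_GC64 builds only
**   jmp  lj_vm_exit_handler
**
** The handler can then reassemble the full exit number from the two bytes
** left on the stack.
*/
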
/* -- Memory operand fusion ----------------------------------------------- */

/* Limit linear search to this distance. Avoids O(n^2) behavior. */
#define CONFLICT_SEARCH_LIM	31

/* Check if a reference is a signed 32 bit constant. */
static int asm_isk32(ASMState *as, IRRef ref, int32_t *k)
{
  if (irref_isk(ref)) {
    IRIns *ir = IR(ref);
#if LJ_GC64
    if (ir->o == IR_KNULL || !irt_is64(ir->t)) {
      *k = ir->i;
      return 1;
    } else if (checki32((int64_t)ir_k64(ir)->u64)) {
      *k = (int32_t)ir_k64(ir)->u64;
      return 1;
    }
#else
    if (ir->o != IR_KINT64) {
      *k = ir->i;
      return 1;
    } else if (checki32((int64_t)ir_kint64(ir)->u64)) {
      *k = (int32_t)ir_kint64(ir)->u64;
      return 1;
    }
#endif
  }
  return 0;
}

/* Check if there's no conflicting instruction between curins and ref.
** Also avoid fusing loads if there are multiple references.
*/
static int noconflict(ASMState *as, IRRef ref, IROp conflict, int noload)
{
  IRIns *ir = as->ir;
  IRRef i = as->curins;
  if (i > ref + CONFLICT_SEARCH_LIM)
    return 0;  /* Give up, ref is too far away. */
  while (--i > ref) {
    if (ir[i].o == conflict)
      return 0;  /* Conflict found. */
    else if (!noload && (ir[i].op1 == ref || ir[i].op2 == ref))
      return 0;
  }
  return 1;  /* Ok, no conflict. */
}

/* Fuse array base into memory operand. */
static IRRef asm_fuseabase(ASMState *as, IRRef ref)
{
  IRIns *irb = IR(ref);
  as->mrm.ofs = 0;
  if (irb->o == IR_FLOAD) {
    IRIns *ira = IR(irb->op1);
    lua_assert(irb->op2 == IRFL_TAB_ARRAY);
    /* We can avoid the FLOAD of t->array for colocated arrays. */
    if (ira->o == IR_TNEW && ira->op1 <= LJ_MAX_COLOSIZE &&
        !neverfuse(as) && noconflict(as, irb->op1, IR_NEWREF, 1)) {
      as->mrm.ofs = (int32_t)sizeof(GCtab);  /* Ofs to colocated array. */
      return irb->op1;  /* Table obj. */
    }
  } else if (irb->o == IR_ADD && irref_isk(irb->op2)) {
    /* Fuse base offset (vararg load). */
    as->mrm.ofs = IR(irb->op2)->i;
    return irb->op1;
  }
  return ref;  /* Otherwise use the given array base. */
}

/* Fuse array reference into memory operand. */
static void asm_fusearef(ASMState *as, IRIns *ir, RegSet allow)
{
  IRIns *irx;
  lua_assert(ir->o == IR_AREF);
  as->mrm.base = (uint8_t)ra_alloc1(as, asm_fuseabase(as, ir->op1), allow);
  irx = IR(ir->op2);
  if (irref_isk(ir->op2)) {
    as->mrm.ofs += 8*irx->i;
    as->mrm.idx = RID_NONE;
  } else {
    rset_clear(allow, as->mrm.base);
    as->mrm.scale = XM_SCALE8;
    /* Fuse a constant ADD (e.g. t[i+1]) into the offset.
    ** Doesn't help much without ABCelim, but reduces register pressure.
    */
    if (!LJ_64 &&  /* Has bad effects with negative index on x64. */
        mayfuse(as, ir->op2) && ra_noreg(irx->r) &&
        irx->o == IR_ADD && irref_isk(irx->op2)) {
      as->mrm.ofs += 8*IR(irx->op2)->i;
      as->mrm.idx = (uint8_t)ra_alloc1(as, irx->op1, allow);
    } else {
      as->mrm.idx = (uint8_t)ra_alloc1(as, ir->op2, allow);
    }
  }
}

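/* Editorial sketch (not part of the original source): the kind of x86 mod/rm
** operand asm_fusearef() above typically produces for an array access, so
** the later load/store needs no separate address computation:
**
**   t[i]    ->  [array + i*8]                ; index register fused, scale 8
**   t[i+1]  ->  [array + i*8 + 8]            ; constant ADD folded (x86 only)
**   t[k]    ->  [tab + sizeof(GCtab) + 8*k]  ; colocated array, constant key
**
** where "array" is the register holding t->array and "tab" the table itself
** for the colocated case handled by asm_fuseabase().
*/
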
/* Fuse array/hash/upvalue reference into memory operand.
** Caveat: this may allocate GPRs for the base/idx registers. Be sure to
** pass the final allow mask, excluding any GPRs used for other inputs.
** In particular: 2-operand GPR instructions need to call ra_dest() first!
*/
static void asm_fuseahuref(ASMState *as, IRRef ref, RegSet allow)
{
  IRIns *ir = IR(ref);
  if (ra_noreg(ir->r)) {
    switch ((IROp)ir->o) {
    case IR_AREF:
      if (mayfuse(as, ref)) {
        asm_fusearef(as, ir, allow);
        return;
      }
      break;
    case IR_HREFK:
      if (mayfuse(as, ref)) {
        as->mrm.base = (uint8_t)ra_alloc1(as, ir->op1, allow);
        as->mrm.ofs = (int32_t)(IR(ir->op2)->op2 * sizeof(Node));
        as->mrm.idx = RID_NONE;
        return;
      }
      break;
    case IR_UREFC:
      if (irref_isk(ir->op1)) {
        GCfunc *fn = ir_kfunc(IR(ir->op1));
        GCupval *uv = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv;
#if LJ_GC64
        int64_t ofs = dispofs(as, &uv->tv);
        if (checki32(ofs) && checki32(ofs+4)) {
          as->mrm.ofs = (int32_t)ofs;
          as->mrm.base = RID_DISPATCH;
          as->mrm.idx = RID_NONE;
          return;
        }
#else
        as->mrm.ofs = ptr2addr(&uv->tv);
        as->mrm.base = as->mrm.idx = RID_NONE;
        return;
#endif
      }
      break;
    default:
      lua_assert(ir->o == IR_HREF || ir->o == IR_NEWREF || ir->o == IR_UREFO ||
                 ir->o == IR_KKPTR);
      break;
    }
  }
  as->mrm.base = (uint8_t)ra_alloc1(as, ref, allow);
  as->mrm.ofs = 0;
  as->mrm.idx = RID_NONE;
}

/* Fuse FLOAD/FREF reference into memory operand. */
static void asm_fusefref(ASMState *as, IRIns *ir, RegSet allow)
{
  lua_assert(ir->o == IR_FLOAD || ir->o == IR_FREF);
  as->mrm.idx = RID_NONE;
  if (ir->op1 == REF_NIL) {
#if LJ_GC64
    as->mrm.ofs = (int32_t)ir->op2 - GG_OFS(dispatch);
    as->mrm.base = RID_DISPATCH;
#else
    as->mrm.ofs = (int32_t)ir->op2 + ptr2addr(J2GG(as->J));
    as->mrm.base = RID_NONE;
#endif
    return;
  }
  as->mrm.ofs = field_ofs[ir->op2];
  if (irref_isk(ir->op1)) {
    IRIns *op1 = IR(ir->op1);
#if LJ_GC64
    if (ir->op1 == REF_NIL) {
      as->mrm.ofs -= GG_OFS(dispatch);
      as->mrm.base = RID_DISPATCH;
      return;
    } else if (op1->o == IR_KPTR || op1->o == IR_KKPTR) {
      intptr_t ofs = dispofs(as, ir_kptr(op1));
      if (checki32(as->mrm.ofs + ofs)) {
        as->mrm.ofs += (int32_t)ofs;
        as->mrm.base = RID_DISPATCH;
        return;
      }
    }
#else
    as->mrm.ofs += op1->i;
    as->mrm.base = RID_NONE;
    return;
#endif
  }
  as->mrm.base = (uint8_t)ra_alloc1(as, ir->op1, allow);
}

/* Fuse string reference into memory operand. */
static void asm_fusestrref(ASMState *as, IRIns *ir, RegSet allow)
{
  IRIns *irr;
  lua_assert(ir->o == IR_STRREF);
  as->mrm.base = as->mrm.idx = RID_NONE;
  as->mrm.scale = XM_SCALE1;
  as->mrm.ofs = sizeof(GCstr);
  if (!LJ_GC64 && irref_isk(ir->op1)) {
    as->mrm.ofs += IR(ir->op1)->i;
  } else {
    Reg r = ra_alloc1(as, ir->op1, allow);
    rset_clear(allow, r);
    as->mrm.base = (uint8_t)r;
  }
  irr = IR(ir->op2);
  if (irref_isk(ir->op2)) {
    as->mrm.ofs += irr->i;
  } else {
    Reg r;
    /* Fuse a constant add into the offset, e.g. string.sub(s, i+10). */
    if (!LJ_64 &&  /* Has bad effects with negative index on x64. */
        mayfuse(as, ir->op2) && irr->o == IR_ADD && irref_isk(irr->op2)) {
      as->mrm.ofs += IR(irr->op2)->i;
      r = ra_alloc1(as, irr->op1, allow);
    } else {
      r = ra_alloc1(as, ir->op2, allow);
    }
    if (as->mrm.base == RID_NONE)
      as->mrm.base = (uint8_t)r;
    else
      as->mrm.idx = (uint8_t)r;
  }
}

static void asm_fusexref(ASMState *as, IRRef ref, RegSet allow)
{
  IRIns *ir = IR(ref);
  as->mrm.idx = RID_NONE;
  if (ir->o == IR_KPTR || ir->o == IR_KKPTR) {
#if LJ_GC64
    intptr_t ofs = dispofs(as, ir_kptr(ir));
    if (checki32(ofs)) {
      as->mrm.ofs = (int32_t)ofs;
      as->mrm.base = RID_DISPATCH;
      return;
    }
  } if (0) {
#else
    as->mrm.ofs = ir->i;
    as->mrm.base = RID_NONE;
  } else if (ir->o == IR_STRREF) {
    asm_fusestrref(as, ir, allow);
#endif
  } else {
    as->mrm.ofs = 0;
    if (canfuse(as, ir) && ir->o == IR_ADD && ra_noreg(ir->r)) {
      /* Gather (base+idx*sz)+ofs as emitted by cdata ptr/array indexing. */
      IRIns *irx;
      IRRef idx;
      Reg r;
      if (asm_isk32(as, ir->op2, &as->mrm.ofs)) {  /* Recognize x+ofs. */
        ref = ir->op1;
        ir = IR(ref);
        if (!(ir->o == IR_ADD && canfuse(as, ir) && ra_noreg(ir->r)))
          goto noadd;
      }
      as->mrm.scale = XM_SCALE1;
      idx = ir->op1;
      ref = ir->op2;
      irx = IR(idx);
      if (!(irx->o == IR_BSHL || irx->o == IR_ADD)) {  /* Try other operand. */
        idx = ir->op2;
        ref = ir->op1;
        irx = IR(idx);
      }
      if (canfuse(as, irx) && ra_noreg(irx->r)) {
        if (irx->o == IR_BSHL && irref_isk(irx->op2) && IR(irx->op2)->i <= 3) {
          /* Recognize idx<<b with b = 0-3, corresponding to sz = (1),2,4,8. */
          idx = irx->op1;
          as->mrm.scale = (uint8_t)(IR(irx->op2)->i << 6);
        } else if (irx->o == IR_ADD && irx->op1 == irx->op2) {
          /* FOLD does idx*2 ==> idx<<1 ==> idx+idx. */
          idx = irx->op1;
          as->mrm.scale = XM_SCALE2;
        }
      }
      r = ra_alloc1(as, idx, allow);
      rset_clear(allow, r);
      as->mrm.idx = (uint8_t)r;
    }
  noadd:
    as->mrm.base = (uint8_t)ra_alloc1(as, ref, allow);
  }
}

/* Fuse load of 64 bit IR constant into memory operand. */
static Reg asm_fuseloadk64(ASMState *as, IRIns *ir)
{
  const uint64_t *k = &ir_k64(ir)->u64;
  if (!LJ_GC64 || checki32((intptr_t)k)) {
    as->mrm.ofs = ptr2addr(k);
    as->mrm.base = RID_NONE;
#if LJ_GC64
  } else if (checki32(dispofs(as, k))) {
    as->mrm.ofs = (int32_t)dispofs(as, k);
    as->mrm.base = RID_DISPATCH;
  } else if (checki32(mcpofs(as, k)) && checki32(mcpofs(as, k+1)) &&
             checki32(mctopofs(as, k)) && checki32(mctopofs(as, k+1))) {
    as->mrm.ofs = (int32_t)mcpofs(as, k);
    as->mrm.base = RID_RIP;
  } else {
    if (ir->i) {
      lua_assert(*k == *(uint64_t*)(as->mctop - ir->i));
    } else {
      while ((uintptr_t)as->mcbot & 7) *as->mcbot++ = XI_INT3;
      *(uint64_t*)as->mcbot = *k;
      ir->i = (int32_t)(as->mctop - as->mcbot);
      as->mcbot += 8;
      as->mclim = as->mcbot + MCLIM_REDZONE;
    }
    as->mrm.ofs = (int32_t)mcpofs(as, as->mctop - ir->i);
    as->mrm.base = RID_RIP;
#endif
  }
  as->mrm.idx = RID_NONE;
  return RID_MRM;
}

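/* Editorial note (not part of the original source): asm_fuseloadk64() above
** tries progressively weaker ways to address a 64 bit constant, roughly:
**
**   mov reg, [abs32]          ; constant sits in the low 2GB of memory
**   mov reg, [DISPATCH+ofs]   ; constant reachable from the dispatch table
**   mov reg, [rip+ofs]        ; constant itself within +-2GB of the code
**   mov reg, [rip+ofs']       ; else: 8 byte copy stored below as->mctop,
**                             ;       distance cached in ir->i
*/
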
404 */ 405 static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow) 406 { 407 IRIns *ir = IR(ref); 408 if (ra_hasreg(ir->r)) { 409 if (allow != RSET_EMPTY) { /* Fast path. */ 410 ra_noweak(as, ir->r); 411 return ir->r; 412 } 413 fusespill: 414 /* Force a spill if only memory operands are allowed (asm_x87load). */ 415 as->mrm.base = RID_ESP; 416 as->mrm.ofs = ra_spill(as, ir); 417 as->mrm.idx = RID_NONE; 418 return RID_MRM; 419 } 420 if (ir->o == IR_KNUM) { 421 RegSet avail = as->freeset & ~as->modset & RSET_FPR; 422 lua_assert(allow != RSET_EMPTY); 423 if (!(avail & (avail-1))) /* Fuse if less than two regs available. */ 424 return asm_fuseloadk64(as, ir); 425 } else if (ref == REF_BASE || ir->o == IR_KINT64) { 426 RegSet avail = as->freeset & ~as->modset & RSET_GPR; 427 lua_assert(allow != RSET_EMPTY); 428 if (!(avail & (avail-1))) { /* Fuse if less than two regs available. */ 429 if (ref == REF_BASE) { 430 #if LJ_GC64 431 as->mrm.ofs = (int32_t)dispofs(as, &J2G(as->J)->jit_base); 432 as->mrm.base = RID_DISPATCH; 433 #else 434 as->mrm.ofs = ptr2addr(&J2G(as->J)->jit_base); 435 as->mrm.base = RID_NONE; 436 #endif 437 as->mrm.idx = RID_NONE; 438 return RID_MRM; 439 } else { 440 return asm_fuseloadk64(as, ir); 441 } 442 } 443 } else if (mayfuse(as, ref)) { 444 RegSet xallow = (allow & RSET_GPR) ? allow : RSET_GPR; 445 if (ir->o == IR_SLOAD) { 446 if (!(ir->op2 & (IRSLOAD_PARENT|IRSLOAD_CONVERT)) && 447 noconflict(as, ref, IR_RETF, 0) && 448 !(LJ_GC64 && irt_isaddr(ir->t))) { 449 as->mrm.base = (uint8_t)ra_alloc1(as, REF_BASE, xallow); 450 as->mrm.ofs = 8*((int32_t)ir->op1-1-LJ_FR2) + 451 (!LJ_FR2 && (ir->op2 & IRSLOAD_FRAME) ? 4 : 0); 452 as->mrm.idx = RID_NONE; 453 return RID_MRM; 454 } 455 } else if (ir->o == IR_FLOAD) { 456 /* Generic fusion is only ok for 32 bit operand (but see asm_comp). */ 457 if ((irt_isint(ir->t) || irt_isu32(ir->t) || irt_isaddr(ir->t)) && 458 noconflict(as, ref, IR_FSTORE, 0)) { 459 asm_fusefref(as, ir, xallow); 460 return RID_MRM; 461 } 462 } else if (ir->o == IR_ALOAD || ir->o == IR_HLOAD || ir->o == IR_ULOAD) { 463 if (noconflict(as, ref, ir->o + IRDELTA_L2S, 0) && 464 !(LJ_GC64 && irt_isaddr(ir->t))) { 465 asm_fuseahuref(as, ir->op1, xallow); 466 return RID_MRM; 467 } 468 } else if (ir->o == IR_XLOAD) { 469 /* Generic fusion is not ok for 8/16 bit operands (but see asm_comp). 470 ** Fusing unaligned memory operands is ok on x86 (except for SIMD types). 471 */ 472 if ((!irt_typerange(ir->t, IRT_I8, IRT_U16)) && 473 noconflict(as, ref, IR_XSTORE, 0)) { 474 asm_fusexref(as, ir->op1, xallow); 475 return RID_MRM; 476 } 477 } else if (ir->o == IR_VLOAD && !(LJ_GC64 && irt_isaddr(ir->t))) { 478 asm_fuseahuref(as, ir->op1, xallow); 479 return RID_MRM; 480 } 481 } 482 if (ir->o == IR_FLOAD && ir->op1 == REF_NIL) { 483 asm_fusefref(as, ir, RSET_EMPTY); 484 return RID_MRM; 485 } 486 if (!(as->freeset & allow) && !emit_canremat(ref) && 487 (allow == RSET_EMPTY || ra_hasspill(ir->s) || iscrossref(as, ref))) 488 goto fusespill; 489 return ra_allocref(as, ref, allow); 490 } 491 492 #if LJ_64 493 /* Don't fuse a 32 bit load into a 64 bit operation. 
#if LJ_64
/* Don't fuse a 32 bit load into a 64 bit operation. */
static Reg asm_fuseloadm(ASMState *as, IRRef ref, RegSet allow, int is64)
{
  if (is64 && !irt_is64(IR(ref)->t))
    return ra_alloc1(as, ref, allow);
  return asm_fuseload(as, ref, allow);
}
#else
#define asm_fuseloadm(as, ref, allow, is64)  asm_fuseload(as, (ref), (allow))
#endif

/* -- Calls --------------------------------------------------------------- */

/* Count the required number of stack slots for a call. */
static int asm_count_call_slots(ASMState *as, const CCallInfo *ci, IRRef *args)
{
  uint32_t i, nargs = CCI_XNARGS(ci);
  int nslots = 0;
#if LJ_64
  if (LJ_ABI_WIN) {
    nslots = (int)(nargs*2);  /* Only matters for more than four args. */
  } else {
    int ngpr = REGARG_NUMGPR, nfpr = REGARG_NUMFPR;
    for (i = 0; i < nargs; i++)
      if (args[i] && irt_isfp(IR(args[i])->t)) {
        if (nfpr > 0) nfpr--; else nslots += 2;
      } else {
        if (ngpr > 0) ngpr--; else nslots += 2;
      }
  }
#else
  int ngpr = 0;
  if ((ci->flags & CCI_CC_MASK) == CCI_CC_FASTCALL)
    ngpr = 2;
  else if ((ci->flags & CCI_CC_MASK) == CCI_CC_THISCALL)
    ngpr = 1;
  for (i = 0; i < nargs; i++)
    if (args[i] && irt_isfp(IR(args[i])->t)) {
      nslots += irt_isnum(IR(args[i])->t) ? 2 : 1;
    } else {
      if (ngpr > 0) ngpr--; else nslots++;
    }
#endif
  return nslots;
}

/* Generate a call to a C function. */
static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
{
  uint32_t n, nargs = CCI_XNARGS(ci);
  int32_t ofs = STACKARG_OFS;
#if LJ_64
  uint32_t gprs = REGARG_GPRS;
  Reg fpr = REGARG_FIRSTFPR;
#if !LJ_ABI_WIN
  MCode *patchnfpr = NULL;
#endif
#else
  uint32_t gprs = 0;
  if ((ci->flags & CCI_CC_MASK) != CCI_CC_CDECL) {
    if ((ci->flags & CCI_CC_MASK) == CCI_CC_THISCALL)
      gprs = (REGARG_GPRS & 31);
    else if ((ci->flags & CCI_CC_MASK) == CCI_CC_FASTCALL)
      gprs = REGARG_GPRS;
  }
#endif
  if ((void *)ci->func)
    emit_call(as, ci->func);
#if LJ_64
  if ((ci->flags & CCI_VARARG)) {  /* Special handling for vararg calls. */
#if LJ_ABI_WIN
    for (n = 0; n < 4 && n < nargs; n++) {
      IRIns *ir = IR(args[n]);
      if (irt_isfp(ir->t))  /* Duplicate FPRs in GPRs. */
        emit_rr(as, XO_MOVDto, (irt_isnum(ir->t) ? REX_64 : 0) | (fpr+n),
                ((gprs >> (n*5)) & 31));  /* Either MOVD or MOVQ. */
    }
#else
    patchnfpr = --as->mcp;  /* Indicate number of used FPRs in register al. */
    *--as->mcp = XI_MOVrib | RID_EAX;
#endif
  }
#endif
  for (n = 0; n < nargs; n++) {  /* Setup args. */
    IRRef ref = args[n];
    IRIns *ir = IR(ref);
    Reg r;
#if LJ_64 && LJ_ABI_WIN
    /* Windows/x64 argument registers are strictly positional. */
    r = irt_isfp(ir->t) ? (fpr <= REGARG_LASTFPR ? fpr : 0) : (gprs & 31);
    fpr++; gprs >>= 5;
#elif LJ_64
    /* POSIX/x64 argument registers are used in order of appearance. */
    if (irt_isfp(ir->t)) {
      r = fpr <= REGARG_LASTFPR ? fpr++ : 0;
    } else {
      r = gprs & 31; gprs >>= 5;
    }
#else
    if (ref && irt_isfp(ir->t)) {
      r = 0;
    } else {
      r = gprs & 31; gprs >>= 5;
      if (!ref) continue;
    }
#endif
    if (r) {  /* Argument is in a register. */
      if (r < RID_MAX_GPR && ref < ASMREF_TMP1) {
#if LJ_64
        if (LJ_GC64 ? !(ir->o == IR_KINT || ir->o == IR_KNULL) : ir->o == IR_KINT64)
          emit_loadu64(as, r, ir_k64(ir)->u64);
        else
#endif
          emit_loadi(as, r, ir->i);
      } else {
        lua_assert(rset_test(as->freeset, r));  /* Must have been evicted. */
        if (ra_hasreg(ir->r)) {
          ra_noweak(as, ir->r);
          emit_movrr(as, ir, r, ir->r);
        } else {
          ra_allocref(as, ref, RID2RSET(r));
        }
      }
    } else if (irt_isfp(ir->t)) {  /* FP argument is on stack. */
      lua_assert(!(irt_isfloat(ir->t) && irref_isk(ref)));  /* No float k. */
      if (LJ_32 && (ofs & 4) && irref_isk(ref)) {
        /* Split stores for unaligned FP consts. */
        emit_movmroi(as, RID_ESP, ofs, (int32_t)ir_knum(ir)->u32.lo);
        emit_movmroi(as, RID_ESP, ofs+4, (int32_t)ir_knum(ir)->u32.hi);
      } else {
        r = ra_alloc1(as, ref, RSET_FPR);
        emit_rmro(as, irt_isnum(ir->t) ? XO_MOVSDto : XO_MOVSSto,
                  r, RID_ESP, ofs);
      }
      ofs += (LJ_32 && irt_isfloat(ir->t)) ? 4 : 8;
    } else {  /* Non-FP argument is on stack. */
      if (LJ_32 && ref < ASMREF_TMP1) {
        emit_movmroi(as, RID_ESP, ofs, ir->i);
      } else {
        r = ra_alloc1(as, ref, RSET_GPR);
        emit_movtomro(as, REX_64 + r, RID_ESP, ofs);
      }
      ofs += sizeof(intptr_t);
    }
    checkmclim(as);
  }
#if LJ_64 && !LJ_ABI_WIN
  if (patchnfpr) *patchnfpr = fpr - REGARG_FIRSTFPR;
#endif
}

/* Setup result reg/sp for call. Evict scratch regs. */
static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci)
{
  RegSet drop = RSET_SCRATCH;
  int hiop = (LJ_32 && (ir+1)->o == IR_HIOP && !irt_isnil((ir+1)->t));
  if ((ci->flags & CCI_NOFPRCLOBBER))
    drop &= ~RSET_FPR;
  if (ra_hasreg(ir->r))
    rset_clear(drop, ir->r);  /* Dest reg handled below. */
  if (hiop && ra_hasreg((ir+1)->r))
    rset_clear(drop, (ir+1)->r);  /* Dest reg handled below. */
  ra_evictset(as, drop);  /* Evictions must be performed first. */
  if (ra_used(ir)) {
    if (irt_isfp(ir->t)) {
      int32_t ofs = sps_scale(ir->s);  /* Use spill slot or temp slots. */
#if LJ_64
      if ((ci->flags & CCI_CASTU64)) {
        Reg dest = ir->r;
        if (ra_hasreg(dest)) {
          ra_free(as, dest);
          ra_modified(as, dest);
          emit_rr(as, XO_MOVD, dest|REX_64, RID_RET);  /* Really MOVQ. */
        }
        if (ofs) emit_movtomro(as, RID_RET|REX_64, RID_ESP, ofs);
      } else {
        ra_destreg(as, ir, RID_FPRET);
      }
#else
      /* Number result is in x87 st0 for x86 calling convention. */
      Reg dest = ir->r;
      if (ra_hasreg(dest)) {
        ra_free(as, dest);
        ra_modified(as, dest);
        emit_rmro(as, irt_isnum(ir->t) ? XO_MOVSD : XO_MOVSS,
                  dest, RID_ESP, ofs);
      }
      if ((ci->flags & CCI_CASTU64)) {
        emit_movtomro(as, RID_RETLO, RID_ESP, ofs);
        emit_movtomro(as, RID_RETHI, RID_ESP, ofs+4);
      } else {
        emit_rmro(as, irt_isnum(ir->t) ? XO_FSTPq : XO_FSTPd,
                  irt_isnum(ir->t) ? XOg_FSTPq : XOg_FSTPd, RID_ESP, ofs);
      }
#endif
#if LJ_32
    } else if (hiop) {
      ra_destpair(as, ir);
#endif
    } else {
      lua_assert(!irt_ispri(ir->t));
      ra_destreg(as, ir, RID_RET);
    }
  } else if (LJ_32 && irt_isfp(ir->t) && !(ci->flags & CCI_CASTU64)) {
    emit_x87op(as, XI_FPOP);  /* Pop unused result from x87 st0. */
  }
}

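/* Editorial note (not part of the original source): for vararg calls on
** POSIX/x64 (see asm_gencall() above), the SysV ABI expects AL to hold an
** upper bound on the number of vector registers used for arguments. That is
** why a "mov al, imm8" placeholder is emitted before the call and its
** immediate is patched via patchnfpr once the actual FPR count is known.
*/
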
/* Return a constant function pointer or NULL for indirect calls. */
static void *asm_callx_func(ASMState *as, IRIns *irf, IRRef func)
{
#if LJ_32
  UNUSED(as);
  if (irref_isk(func))
    return (void *)irf->i;
#else
  if (irref_isk(func)) {
    MCode *p;
    if (irf->o == IR_KINT64)
      p = (MCode *)(void *)ir_k64(irf)->u64;
    else
      p = (MCode *)(void *)(uintptr_t)(uint32_t)irf->i;
    if (p - as->mcp == (int32_t)(p - as->mcp))
      return p;  /* Call target is still in +-2GB range. */
    /* Avoid the indirect case of emit_call(). Try to hoist func addr. */
  }
#endif
  return NULL;
}

static void asm_callx(ASMState *as, IRIns *ir)
{
  IRRef args[CCI_NARGS_MAX*2];
  CCallInfo ci;
  IRRef func;
  IRIns *irf;
  int32_t spadj = 0;
  ci.flags = asm_callx_flags(as, ir);
  asm_collectargs(as, ir, &ci, args);
  asm_setupresult(as, ir, &ci);
#if LJ_32
  /* Have to readjust stack after non-cdecl calls due to callee cleanup. */
  if ((ci.flags & CCI_CC_MASK) != CCI_CC_CDECL)
    spadj = 4 * asm_count_call_slots(as, &ci, args);
#endif
  func = ir->op2; irf = IR(func);
  if (irf->o == IR_CARG) { func = irf->op1; irf = IR(func); }
  ci.func = (ASMFunction)asm_callx_func(as, irf, func);
  if (!(void *)ci.func) {
    /* Use a (hoistable) non-scratch register for indirect calls. */
    RegSet allow = (RSET_GPR & ~RSET_SCRATCH);
    Reg r = ra_alloc1(as, func, allow);
    if (LJ_32) emit_spsub(as, spadj);  /* Above code may cause restores! */
    emit_rr(as, XO_GROUP5, XOg_CALL, r);
  } else if (LJ_32) {
    emit_spsub(as, spadj);
  }
  asm_gencall(as, &ci, args);
}

/* -- Returns ------------------------------------------------------------- */

/* Return to lower frame. Guard that it goes to the right spot. */
static void asm_retf(ASMState *as, IRIns *ir)
{
  Reg base = ra_alloc1(as, REF_BASE, RSET_GPR);
#if LJ_FR2
  Reg rpc = ra_scratch(as, rset_exclude(RSET_GPR, base));
#endif
  void *pc = ir_kptr(IR(ir->op2));
  int32_t delta = 1+LJ_FR2+bc_a(*((const BCIns *)pc - 1));
  as->topslot -= (BCReg)delta;
  if ((int32_t)as->topslot < 0) as->topslot = 0;
  irt_setmark(IR(REF_BASE)->t);  /* Children must not coalesce with BASE reg. */
  emit_setgl(as, base, jit_base);
  emit_addptr(as, base, -8*delta);
  asm_guardcc(as, CC_NE);
#if LJ_FR2
  emit_rmro(as, XO_CMP, rpc|REX_GC64, base, -8);
  emit_loadu64(as, rpc, u64ptr(pc));
#else
  emit_gmroi(as, XG_ARITHi(XOg_CMP), base, -4, ptr2addr(pc));
#endif
}

/* -- Type conversions ---------------------------------------------------- */

static void asm_tointg(ASMState *as, IRIns *ir, Reg left)
{
  Reg tmp = ra_scratch(as, rset_exclude(RSET_FPR, left));
  Reg dest = ra_dest(as, ir, RSET_GPR);
  asm_guardcc(as, CC_P);
  asm_guardcc(as, CC_NE);
  emit_rr(as, XO_UCOMISD, left, tmp);
  emit_rr(as, XO_CVTSI2SD, tmp, dest);
  emit_rr(as, XO_XORPS, tmp, tmp);  /* Avoid partial register stall. */
  emit_rr(as, XO_CVTTSD2SI, dest, left);
  /* Can't fuse since left is needed twice. */
}

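#if 0
/* Editorial sketch (not part of the original source): a rough C model of the
** checked number-to-integer conversion emitted by asm_tointg() above
** (cvttsd2si + cvtsi2sd + ucomisd, guarding on CC_NE and CC_P). The helper
** name is illustrative only.
*/
static int sketch_tointg(double n, int32_t *out)
{
  int32_t i = (int32_t)n;  /* cvttsd2si */
  if ((double)i != n)      /* cvtsi2sd + ucomisd; also rejects NaN. */
    return 0;              /* Would take the guard exit. */
  *out = i;
  return 1;
}
#endif
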
static void asm_tobit(ASMState *as, IRIns *ir)
{
  Reg dest = ra_dest(as, ir, RSET_GPR);
  Reg tmp = ra_noreg(IR(ir->op1)->r) ?
              ra_alloc1(as, ir->op1, RSET_FPR) :
              ra_scratch(as, RSET_FPR);
  Reg right;
  emit_rr(as, XO_MOVDto, tmp, dest);
  right = asm_fuseload(as, ir->op2, rset_exclude(RSET_FPR, tmp));
  emit_mrm(as, XO_ADDSD, tmp, right);
  ra_left(as, tmp, ir->op1);
}

static void asm_conv(ASMState *as, IRIns *ir)
{
  IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK);
  int st64 = (st == IRT_I64 || st == IRT_U64 || (LJ_64 && st == IRT_P64));
  int stfp = (st == IRT_NUM || st == IRT_FLOAT);
  IRRef lref = ir->op1;
  lua_assert(irt_type(ir->t) != st);
  lua_assert(!(LJ_32 && (irt_isint64(ir->t) || st64)));  /* Handled by SPLIT. */
  if (irt_isfp(ir->t)) {
    Reg dest = ra_dest(as, ir, RSET_FPR);
    if (stfp) {  /* FP to FP conversion. */
      Reg left = asm_fuseload(as, lref, RSET_FPR);
      emit_mrm(as, st == IRT_NUM ? XO_CVTSD2SS : XO_CVTSS2SD, dest, left);
      if (left == dest) return;  /* Avoid the XO_XORPS. */
    } else if (LJ_32 && st == IRT_U32) {  /* U32 to FP conversion on x86. */
      /* number = (2^52+2^51 .. u32) - (2^52+2^51) */
      cTValue *k = &as->J->k64[LJ_K64_TOBIT];
      Reg bias = ra_scratch(as, rset_exclude(RSET_FPR, dest));
      if (irt_isfloat(ir->t))
        emit_rr(as, XO_CVTSD2SS, dest, dest);
      emit_rr(as, XO_SUBSD, dest, bias);  /* Subtract 2^52+2^51 bias. */
      emit_rr(as, XO_XORPS, dest, bias);  /* Merge bias and integer. */
      emit_rma(as, XO_MOVSD, bias, k);
      emit_mrm(as, XO_MOVD, dest, asm_fuseload(as, lref, RSET_GPR));
      return;
    } else {  /* Integer to FP conversion. */
      Reg left = (LJ_64 && (st == IRT_U32 || st == IRT_U64)) ?
                 ra_alloc1(as, lref, RSET_GPR) :
                 asm_fuseloadm(as, lref, RSET_GPR, st64);
      if (LJ_64 && st == IRT_U64) {
        MCLabel l_end = emit_label(as);
        cTValue *k = &as->J->k64[LJ_K64_2P64];
        emit_rma(as, XO_ADDSD, dest, k);  /* Add 2^64 to compensate. */
        emit_sjcc(as, CC_NS, l_end);
        emit_rr(as, XO_TEST, left|REX_64, left);  /* Check if u64 >= 2^63. */
      }
      emit_mrm(as, irt_isnum(ir->t) ? XO_CVTSI2SD : XO_CVTSI2SS,
               dest|((LJ_64 && (st64 || st == IRT_U32)) ? REX_64 : 0), left);
    }
    emit_rr(as, XO_XORPS, dest, dest);  /* Avoid partial register stall. */
  } else if (stfp) {  /* FP to integer conversion. */
    if (irt_isguard(ir->t)) {
      /* Checked conversions are only supported from number to int. */
      lua_assert(irt_isint(ir->t) && st == IRT_NUM);
      asm_tointg(as, ir, ra_alloc1(as, lref, RSET_FPR));
    } else {
      Reg dest = ra_dest(as, ir, RSET_GPR);
      x86Op op = st == IRT_NUM ? XO_CVTTSD2SI : XO_CVTTSS2SI;
      if (LJ_64 ? irt_isu64(ir->t) : irt_isu32(ir->t)) {
        /* LJ_64: For inputs >= 2^63 add -2^64, convert again. */
        /* LJ_32: For inputs >= 2^31 add -2^31, convert again and add 2^31. */
        Reg tmp = ra_noreg(IR(lref)->r) ? ra_alloc1(as, lref, RSET_FPR) :
                                          ra_scratch(as, RSET_FPR);
        MCLabel l_end = emit_label(as);
        if (LJ_32)
          emit_gri(as, XG_ARITHi(XOg_ADD), dest, (int32_t)0x80000000);
        emit_rr(as, op, dest|REX_64, tmp);
        if (st == IRT_NUM)
          emit_rma(as, XO_ADDSD, tmp, &as->J->k64[LJ_K64_M2P64_31]);
        else
          emit_rma(as, XO_ADDSS, tmp, &as->J->k32[LJ_K32_M2P64_31]);
        emit_sjcc(as, CC_NS, l_end);
        emit_rr(as, XO_TEST, dest|REX_64, dest);  /* Check if dest negative. */
        emit_rr(as, op, dest|REX_64, tmp);
        ra_left(as, tmp, lref);
      } else {
        if (LJ_64 && irt_isu32(ir->t))
          emit_rr(as, XO_MOV, dest, dest);  /* Zero hiword. */
        emit_mrm(as, op,
                 dest|((LJ_64 &&
                        (irt_is64(ir->t) || irt_isu32(ir->t))) ? REX_64 : 0),
                 asm_fuseload(as, lref, RSET_FPR));
      }
    }
  } else if (st >= IRT_I8 && st <= IRT_U16) {  /* Extend to 32 bit integer. */
    Reg left, dest = ra_dest(as, ir, RSET_GPR);
    RegSet allow = RSET_GPR;
    x86Op op;
    lua_assert(irt_isint(ir->t) || irt_isu32(ir->t));
    if (st == IRT_I8) {
      op = XO_MOVSXb; allow = RSET_GPR8; dest |= FORCE_REX;
    } else if (st == IRT_U8) {
      op = XO_MOVZXb; allow = RSET_GPR8; dest |= FORCE_REX;
    } else if (st == IRT_I16) {
      op = XO_MOVSXw;
    } else {
      op = XO_MOVZXw;
    }
    left = asm_fuseload(as, lref, allow);
    /* Add extra MOV if source is already in wrong register. */
    if (!LJ_64 && left != RID_MRM && !rset_test(allow, left)) {
      Reg tmp = ra_scratch(as, allow);
      emit_rr(as, op, dest, tmp);
      emit_rr(as, XO_MOV, tmp, left);
    } else {
      emit_mrm(as, op, dest, left);
    }
  } else {  /* 32/64 bit integer conversions. */
    if (LJ_32) {  /* Only need to handle 32/32 bit no-op (cast) on x86. */
      Reg dest = ra_dest(as, ir, RSET_GPR);
      ra_left(as, dest, lref);  /* Do nothing, but may need to move regs. */
    } else if (irt_is64(ir->t)) {
      Reg dest = ra_dest(as, ir, RSET_GPR);
      if (st64 || !(ir->op2 & IRCONV_SEXT)) {
        /* 64/64 bit no-op (cast) or 32 to 64 bit zero extension. */
        ra_left(as, dest, lref);  /* Do nothing, but may need to move regs. */
      } else {  /* 32 to 64 bit sign extension. */
        Reg left = asm_fuseload(as, lref, RSET_GPR);
        emit_mrm(as, XO_MOVSXd, dest|REX_64, left);
      }
    } else {
      Reg dest = ra_dest(as, ir, RSET_GPR);
      if (st64) {
        Reg left = asm_fuseload(as, lref, RSET_GPR);
        /* This is either a 32 bit reg/reg mov which zeroes the hiword
        ** or a load of the loword from a 64 bit address.
        */
        emit_mrm(as, XO_MOV, dest, left);
      } else {  /* 32/32 bit no-op (cast). */
        ra_left(as, dest, lref);  /* Do nothing, but may need to move regs. */
      }
    }
  }
}

#if LJ_32 && LJ_HASFFI
/* No SSE conversions to/from 64 bit on x86, so resort to ugly x87 code. */

/* 64 bit integer to FP conversion in 32 bit mode. */
static void asm_conv_fp_int64(ASMState *as, IRIns *ir)
{
  Reg hi = ra_alloc1(as, ir->op1, RSET_GPR);
  Reg lo = ra_alloc1(as, (ir-1)->op1, rset_exclude(RSET_GPR, hi));
  int32_t ofs = sps_scale(ir->s);  /* Use spill slot or temp slots. */
  Reg dest = ir->r;
  if (ra_hasreg(dest)) {
    ra_free(as, dest);
    ra_modified(as, dest);
    emit_rmro(as, irt_isnum(ir->t) ? XO_MOVSD : XO_MOVSS, dest, RID_ESP, ofs);
  }
  emit_rmro(as, irt_isnum(ir->t) ? XO_FSTPq : XO_FSTPd,
            irt_isnum(ir->t) ? XOg_FSTPq : XOg_FSTPd, RID_ESP, ofs);
  if (((ir-1)->op2 & IRCONV_SRCMASK) == IRT_U64) {
    /* For inputs in [2^63,2^64-1] add 2^64 to compensate. */
    MCLabel l_end = emit_label(as);
    emit_rma(as, XO_FADDq, XOg_FADDq, &as->J->k64[LJ_K64_2P64]);
    emit_sjcc(as, CC_NS, l_end);
    emit_rr(as, XO_TEST, hi, hi);  /* Check if u64 >= 2^63. */
  } else {
    lua_assert(((ir-1)->op2 & IRCONV_SRCMASK) == IRT_I64);
  }
  emit_rmro(as, XO_FILDq, XOg_FILDq, RID_ESP, 0);
  /* NYI: Avoid narrow-to-wide store-to-load forwarding stall. */
  emit_rmro(as, XO_MOVto, hi, RID_ESP, 4);
  emit_rmro(as, XO_MOVto, lo, RID_ESP, 0);
}

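#if 0
/* Editorial sketch (not part of the original source): the unsigned 64 bit
** to FP strategy used by asm_conv_fp_int64() above (and by asm_conv() on
** x64): convert as a signed integer, then add 2^64 if bit 63 was set. The
** helper name is illustrative only.
*/
static double sketch_u64_to_num(uint64_t u)
{
  double d = (double)(int64_t)u;  /* fild qword / cvtsi2sd (signed). */
  if ((int64_t)u < 0)             /* Input was >= 2^63. */
    d += 18446744073709551616.0;  /* Add 2^64 to compensate. */
  return d;
}
#endif
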
/* FP to 64 bit integer conversion in 32 bit mode. */
static void asm_conv_int64_fp(ASMState *as, IRIns *ir)
{
  IRType st = (IRType)((ir-1)->op2 & IRCONV_SRCMASK);
  IRType dt = (((ir-1)->op2 & IRCONV_DSTMASK) >> IRCONV_DSH);
  Reg lo, hi;
  lua_assert(st == IRT_NUM || st == IRT_FLOAT);
  lua_assert(dt == IRT_I64 || dt == IRT_U64);
  hi = ra_dest(as, ir, RSET_GPR);
  lo = ra_dest(as, ir-1, rset_exclude(RSET_GPR, hi));
  if (ra_used(ir-1)) emit_rmro(as, XO_MOV, lo, RID_ESP, 0);
  /* NYI: Avoid wide-to-narrow store-to-load forwarding stall. */
  if (!(as->flags & JIT_F_SSE3)) {  /* Set FPU rounding mode to default. */
    emit_rmro(as, XO_FLDCW, XOg_FLDCW, RID_ESP, 4);
    emit_rmro(as, XO_MOVto, lo, RID_ESP, 4);
    emit_gri(as, XG_ARITHi(XOg_AND), lo, 0xf3ff);
  }
  if (dt == IRT_U64) {
    /* For inputs in [2^63,2^64-1] add -2^64 and convert again. */
    MCLabel l_pop, l_end = emit_label(as);
    emit_x87op(as, XI_FPOP);
    l_pop = emit_label(as);
    emit_sjmp(as, l_end);
    emit_rmro(as, XO_MOV, hi, RID_ESP, 4);
    if ((as->flags & JIT_F_SSE3))
      emit_rmro(as, XO_FISTTPq, XOg_FISTTPq, RID_ESP, 0);
    else
      emit_rmro(as, XO_FISTPq, XOg_FISTPq, RID_ESP, 0);
    emit_rma(as, XO_FADDq, XOg_FADDq, &as->J->k64[LJ_K64_M2P64]);
    emit_sjcc(as, CC_NS, l_pop);
    emit_rr(as, XO_TEST, hi, hi);  /* Check if out-of-range (2^63). */
  }
  emit_rmro(as, XO_MOV, hi, RID_ESP, 4);
  if ((as->flags & JIT_F_SSE3)) {  /* Truncation is easy with SSE3. */
    emit_rmro(as, XO_FISTTPq, XOg_FISTTPq, RID_ESP, 0);
  } else {  /* Otherwise set FPU rounding mode to truncate before the store. */
    emit_rmro(as, XO_FISTPq, XOg_FISTPq, RID_ESP, 0);
    emit_rmro(as, XO_FLDCW, XOg_FLDCW, RID_ESP, 0);
    emit_rmro(as, XO_MOVtow, lo, RID_ESP, 0);
    emit_rmro(as, XO_ARITHw(XOg_OR), lo, RID_ESP, 0);
    emit_loadi(as, lo, 0xc00);
    emit_rmro(as, XO_FNSTCW, XOg_FNSTCW, RID_ESP, 0);
  }
  if (dt == IRT_U64)
    emit_x87op(as, XI_FDUP);
  emit_mrm(as, st == IRT_NUM ? XO_FLDq : XO_FLDd,
           st == IRT_NUM ? XOg_FLDq: XOg_FLDd,
           asm_fuseload(as, ir->op1, RSET_EMPTY));
}

static void asm_conv64(ASMState *as, IRIns *ir)
{
  if (irt_isfp(ir->t))
    asm_conv_fp_int64(as, ir);
  else
    asm_conv_int64_fp(as, ir);
}
#endif

static void asm_strto(ASMState *as, IRIns *ir)
{
  /* Force a spill slot for the destination register (if any). */
  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_strscan_num];
  IRRef args[2];
  RegSet drop = RSET_SCRATCH;
  if ((drop & RSET_FPR) != RSET_FPR && ra_hasreg(ir->r))
    rset_set(drop, ir->r);  /* WIN64 doesn't spill all FPRs. */
  ra_evictset(as, drop);
  asm_guardcc(as, CC_E);
  emit_rr(as, XO_TEST, RID_RET, RID_RET);  /* Test return status. */
  args[0] = ir->op1;      /* GCstr *str */
  args[1] = ASMREF_TMP1;  /* TValue *n  */
  asm_gencall(as, ci, args);
  /* Store the result to the spill slot or temp slots. */
  emit_rmro(as, XO_LEA, ra_releasetmp(as, ASMREF_TMP1)|REX_64,
            RID_ESP, sps_scale(ir->s));
}

/* -- Memory references --------------------------------------------------- */

/* Get pointer to TValue. */
static void asm_tvptr(ASMState *as, Reg dest, IRRef ref)
{
  IRIns *ir = IR(ref);
  if (irt_isnum(ir->t)) {
    /* For numbers use the constant itself or a spill slot as a TValue. */
    if (irref_isk(ref))
      emit_loada(as, dest, ir_knum(ir));
    else
      emit_rmro(as, XO_LEA, dest|REX_64, RID_ESP, ra_spill(as, ir));
  } else {
    /* Otherwise use g->tmptv to hold the TValue. */
#if LJ_GC64
    if (irref_isk(ref)) {
      TValue k;
      lj_ir_kvalue(as->J->L, &k, ir);
      emit_movmroi(as, dest, 4, k.u32.hi);
      emit_movmroi(as, dest, 0, k.u32.lo);
    } else {
      /* TODO: 64 bit store + 32 bit load-modify-store is suboptimal. */
      Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, dest));
      if (irt_is64(ir->t)) {
        emit_u32(as, irt_toitype(ir->t) << 15);
        emit_rmro(as, XO_ARITHi, XOg_OR, dest, 4);
      } else {
        emit_movmroi(as, dest, 4, (irt_toitype(ir->t) << 15) | 0x7fff);
      }
      emit_movtomro(as, REX_64IR(ir, src), dest, 0);
    }
#else
    if (!irref_isk(ref)) {
      Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, dest));
      emit_movtomro(as, REX_64IR(ir, src), dest, 0);
    } else if (!irt_ispri(ir->t)) {
      emit_movmroi(as, dest, 0, ir->i);
    }
    if (!(LJ_64 && irt_islightud(ir->t)))
      emit_movmroi(as, dest, 4, irt_toitype(ir->t));
#endif
    emit_loada(as, dest, &J2G(as->J)->tmptv);
  }
}

static void asm_aref(ASMState *as, IRIns *ir)
{
  Reg dest = ra_dest(as, ir, RSET_GPR);
  asm_fusearef(as, ir, RSET_GPR);
  if (!(as->mrm.idx == RID_NONE && as->mrm.ofs == 0))
    emit_mrm(as, XO_LEA, dest|REX_GC64, RID_MRM);
  else if (as->mrm.base != dest)
    emit_rr(as, XO_MOV, dest|REX_GC64, as->mrm.base);
}

/* Inlined hash lookup. Specialized for key type and for const keys.
** The equivalent C code is:
**   Node *n = hashkey(t, key);
**   do {
**     if (lj_obj_equal(&n->key, key)) return &n->val;
**   } while ((n = nextnode(n)));
**   return niltv(L);
*/
static void asm_href(ASMState *as, IRIns *ir, IROp merge)
{
  RegSet allow = RSET_GPR;
  int destused = ra_used(ir);
  Reg dest = ra_dest(as, ir, allow);
  Reg tab = ra_alloc1(as, ir->op1, rset_clear(allow, dest));
  Reg key = RID_NONE, tmp = RID_NONE;
  IRIns *irkey = IR(ir->op2);
  int isk = irref_isk(ir->op2);
  IRType1 kt = irkey->t;
  uint32_t khash;
  MCLabel l_end, l_loop, l_next;

  if (!isk) {
    rset_clear(allow, tab);
    key = ra_alloc1(as, ir->op2, irt_isnum(kt) ? RSET_FPR : allow);
    if (LJ_GC64 || !irt_isstr(kt))
      tmp = ra_scratch(as, rset_exclude(allow, key));
  }

  /* Key not found in chain: jump to exit (if merged) or load niltv. */
  l_end = emit_label(as);
  if (merge == IR_NE)
    asm_guardcc(as, CC_E);  /* XI_JMP is not found by lj_asm_patchexit. */
  else if (destused)
    emit_loada(as, dest, niltvg(J2G(as->J)));

  /* Follow hash chain until the end. */
  l_loop = emit_sjcc_label(as, CC_NZ);
  emit_rr(as, XO_TEST, dest|REX_GC64, dest);
  emit_rmro(as, XO_MOV, dest|REX_GC64, dest, offsetof(Node, next));
  l_next = emit_label(as);

  /* Type and value comparison. */
  if (merge == IR_EQ)
    asm_guardcc(as, CC_E);
  else
    emit_sjcc(as, CC_E, l_end);
  if (irt_isnum(kt)) {
    if (isk) {
      /* Assumes -0.0 is already canonicalized to +0.0. */
      emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.u32.lo),
                 (int32_t)ir_knum(irkey)->u32.lo);
      emit_sjcc(as, CC_NE, l_next);
      emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.u32.hi),
                 (int32_t)ir_knum(irkey)->u32.hi);
    } else {
      emit_sjcc(as, CC_P, l_next);
      emit_rmro(as, XO_UCOMISD, key, dest, offsetof(Node, key.n));
      emit_sjcc(as, CC_AE, l_next);
      /* The type check avoids NaN penalties and complaints from Valgrind. */
#if LJ_64 && !LJ_GC64
      emit_u32(as, LJ_TISNUM);
      emit_rmro(as, XO_ARITHi, XOg_CMP, dest, offsetof(Node, key.it));
#else
      emit_i8(as, LJ_TISNUM);
      emit_rmro(as, XO_ARITHi8, XOg_CMP, dest, offsetof(Node, key.it));
#endif
    }
#if LJ_64 && !LJ_GC64
  } else if (irt_islightud(kt)) {
    emit_rmro(as, XO_CMP, key|REX_64, dest, offsetof(Node, key.u64));
#endif
#if LJ_GC64
  } else if (irt_isaddr(kt)) {
    if (isk) {
      TValue k;
      k.u64 = ((uint64_t)irt_toitype(irkey->t) << 47) | irkey[1].tv.u64;
      emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.u32.lo),
                 k.u32.lo);
      emit_sjcc(as, CC_NE, l_next);
      emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.u32.hi),
                 k.u32.hi);
    } else {
      emit_rmro(as, XO_CMP, tmp|REX_64, dest, offsetof(Node, key.u64));
    }
  } else {
    lua_assert(irt_ispri(kt) && !irt_isnil(kt));
    emit_u32(as, (irt_toitype(kt)<<15)|0x7fff);
    emit_rmro(as, XO_ARITHi, XOg_CMP, dest, offsetof(Node, key.it));
#else
  } else {
    if (!irt_ispri(kt)) {
      lua_assert(irt_isaddr(kt));
      if (isk)
        emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.gcr),
                   ptr2addr(ir_kgc(irkey)));
      else
        emit_rmro(as, XO_CMP, key, dest, offsetof(Node, key.gcr));
      emit_sjcc(as, CC_NE, l_next);
    }
    lua_assert(!irt_isnil(kt));
    emit_i8(as, irt_toitype(kt));
    emit_rmro(as, XO_ARITHi8, XOg_CMP, dest, offsetof(Node, key.it));
#endif
  }
  emit_sfixup(as, l_loop);
  checkmclim(as);
#if LJ_GC64
  if (!isk && irt_isaddr(kt)) {
    emit_rr(as, XO_OR, tmp|REX_64, key);
    emit_loadu64(as, tmp, (uint64_t)irt_toitype(kt) << 47);
  }
#endif

  /* Load main position relative to tab->node into dest. */
  khash = isk ? ir_khash(irkey) : 1;
  if (khash == 0) {
    emit_rmro(as, XO_MOV, dest|REX_GC64, tab, offsetof(GCtab, node));
  } else {
    emit_rmro(as, XO_ARITH(XOg_ADD), dest|REX_GC64, tab, offsetof(GCtab,node));
    if ((as->flags & JIT_F_PREFER_IMUL)) {
      emit_i8(as, sizeof(Node));
      emit_rr(as, XO_IMULi8, dest, dest);
    } else {
      emit_shifti(as, XOg_SHL, dest, 3);
      emit_rmrxo(as, XO_LEA, dest, dest, dest, XM_SCALE2, 0);
    }
    if (isk) {
      emit_gri(as, XG_ARITHi(XOg_AND), dest, (int32_t)khash);
      emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, hmask));
    } else if (irt_isstr(kt)) {
      emit_rmro(as, XO_ARITH(XOg_AND), dest, key, offsetof(GCstr, hash));
      emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, hmask));
    } else {  /* Must match with hashrot() in lj_tab.c. */
      emit_rmro(as, XO_ARITH(XOg_AND), dest, tab, offsetof(GCtab, hmask));
      emit_rr(as, XO_ARITH(XOg_SUB), dest, tmp);
      emit_shifti(as, XOg_ROL, tmp, HASH_ROT3);
      emit_rr(as, XO_ARITH(XOg_XOR), dest, tmp);
      emit_shifti(as, XOg_ROL, dest, HASH_ROT2);
      emit_rr(as, XO_ARITH(XOg_SUB), tmp, dest);
      emit_shifti(as, XOg_ROL, dest, HASH_ROT1);
      emit_rr(as, XO_ARITH(XOg_XOR), tmp, dest);
      if (irt_isnum(kt)) {
        emit_rr(as, XO_ARITH(XOg_ADD), dest, dest);
#if LJ_64
        emit_shifti(as, XOg_SHR|REX_64, dest, 32);
        emit_rr(as, XO_MOV, tmp, dest);
        emit_rr(as, XO_MOVDto, key|REX_64, dest);
#else
        emit_rmro(as, XO_MOV, dest, RID_ESP, ra_spill(as, irkey)+4);
        emit_rr(as, XO_MOVDto, key, tmp);
#endif
      } else {
        emit_rr(as, XO_MOV, tmp, key);
        emit_rmro(as, XO_LEA, dest, key, HASH_BIAS);
      }
    }
  }
}

static void asm_hrefk(ASMState *as, IRIns *ir)
{
  IRIns *kslot = IR(ir->op2);
  IRIns *irkey = IR(kslot->op1);
  int32_t ofs = (int32_t)(kslot->op2 * sizeof(Node));
  Reg dest = ra_used(ir) ? ra_dest(as, ir, RSET_GPR) : RID_NONE;
  Reg node = ra_alloc1(as, ir->op1, RSET_GPR);
#if !LJ_64
  MCLabel l_exit;
#endif
  lua_assert(ofs % sizeof(Node) == 0);
  if (ra_hasreg(dest)) {
    if (ofs != 0) {
      if (dest == node && !(as->flags & JIT_F_LEA_AGU))
        emit_gri(as, XG_ARITHi(XOg_ADD), dest|REX_GC64, ofs);
      else
        emit_rmro(as, XO_LEA, dest|REX_GC64, node, ofs);
    } else if (dest != node) {
      emit_rr(as, XO_MOV, dest|REX_GC64, node);
    }
  }
  asm_guardcc(as, CC_NE);
#if LJ_64
  if (!irt_ispri(irkey->t)) {
    Reg key = ra_scratch(as, rset_exclude(RSET_GPR, node));
    emit_rmro(as, XO_CMP, key|REX_64, node,
              ofs + (int32_t)offsetof(Node, key.u64));
    lua_assert(irt_isnum(irkey->t) || irt_isgcv(irkey->t));
    /* Assumes -0.0 is already canonicalized to +0.0. */
    emit_loadu64(as, key, irt_isnum(irkey->t) ? ir_knum(irkey)->u64 :
#if LJ_GC64
                          ((uint64_t)irt_toitype(irkey->t) << 47) |
                          (uint64_t)ir_kgc(irkey));
#else
                          ((uint64_t)irt_toitype(irkey->t) << 32) |
                          (uint64_t)(uint32_t)ptr2addr(ir_kgc(irkey)));
#endif
  } else {
    lua_assert(!irt_isnil(irkey->t));
#if LJ_GC64
    emit_i32(as, (irt_toitype(irkey->t)<<15)|0x7fff);
    emit_rmro(as, XO_ARITHi, XOg_CMP, node,
              ofs + (int32_t)offsetof(Node, key.it));
#else
    emit_i8(as, irt_toitype(irkey->t));
    emit_rmro(as, XO_ARITHi8, XOg_CMP, node,
              ofs + (int32_t)offsetof(Node, key.it));
#endif
  }
#else
  l_exit = emit_label(as);
  if (irt_isnum(irkey->t)) {
    /* Assumes -0.0 is already canonicalized to +0.0. */
    emit_gmroi(as, XG_ARITHi(XOg_CMP), node,
               ofs + (int32_t)offsetof(Node, key.u32.lo),
               (int32_t)ir_knum(irkey)->u32.lo);
    emit_sjcc(as, CC_NE, l_exit);
    emit_gmroi(as, XG_ARITHi(XOg_CMP), node,
               ofs + (int32_t)offsetof(Node, key.u32.hi),
               (int32_t)ir_knum(irkey)->u32.hi);
  } else {
    if (!irt_ispri(irkey->t)) {
      lua_assert(irt_isgcv(irkey->t));
      emit_gmroi(as, XG_ARITHi(XOg_CMP), node,
                 ofs + (int32_t)offsetof(Node, key.gcr),
                 ptr2addr(ir_kgc(irkey)));
      emit_sjcc(as, CC_NE, l_exit);
    }
    lua_assert(!irt_isnil(irkey->t));
    emit_i8(as, irt_toitype(irkey->t));
    emit_rmro(as, XO_ARITHi8, XOg_CMP, node,
              ofs + (int32_t)offsetof(Node, key.it));
  }
#endif
}

static void asm_uref(ASMState *as, IRIns *ir)
{
  Reg dest = ra_dest(as, ir, RSET_GPR);
  if (irref_isk(ir->op1)) {
    GCfunc *fn = ir_kfunc(IR(ir->op1));
    MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v;
    emit_rma(as, XO_MOV, dest|REX_GC64, v);
  } else {
    Reg uv = ra_scratch(as, RSET_GPR);
    Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
    if (ir->o == IR_UREFC) {
      emit_rmro(as, XO_LEA, dest|REX_GC64, uv, offsetof(GCupval, tv));
      asm_guardcc(as, CC_NE);
      emit_i8(as, 1);
      emit_rmro(as, XO_ARITHib, XOg_CMP, uv, offsetof(GCupval, closed));
    } else {
      emit_rmro(as, XO_MOV, dest|REX_GC64, uv, offsetof(GCupval, v));
    }
    emit_rmro(as, XO_MOV, uv|REX_GC64, func,
              (int32_t)offsetof(GCfuncL, uvptr) +
              (int32_t)sizeof(MRef) * (int32_t)(ir->op2 >> 8));
  }
}

static void asm_fref(ASMState *as, IRIns *ir)
{
  Reg dest = ra_dest(as, ir, RSET_GPR);
  asm_fusefref(as, ir, RSET_GPR);
  emit_mrm(as, XO_LEA, dest, RID_MRM);
}

static void asm_strref(ASMState *as, IRIns *ir)
{
  Reg dest = ra_dest(as, ir, RSET_GPR);
  asm_fusestrref(as, ir, RSET_GPR);
  if (as->mrm.base == RID_NONE)
    emit_loadi(as, dest, as->mrm.ofs);
  else if (as->mrm.base == dest && as->mrm.idx == RID_NONE)
    emit_gri(as, XG_ARITHi(XOg_ADD), dest|REX_GC64, as->mrm.ofs);
  else
    emit_mrm(as, XO_LEA, dest|REX_GC64, RID_MRM);
}

/* -- Loads and stores ---------------------------------------------------- */

static void asm_fxload(ASMState *as, IRIns *ir)
{
  Reg dest = ra_dest(as, ir, irt_isfp(ir->t) ? RSET_FPR : RSET_GPR);
  x86Op xo;
  if (ir->o == IR_FLOAD)
    asm_fusefref(as, ir, RSET_GPR);
  else
    asm_fusexref(as, ir->op1, RSET_GPR);
  /* ir->op2 is ignored -- unaligned loads are ok on x86. */
  switch (irt_type(ir->t)) {
  case IRT_I8: xo = XO_MOVSXb; break;
  case IRT_U8: xo = XO_MOVZXb; break;
  case IRT_I16: xo = XO_MOVSXw; break;
  case IRT_U16: xo = XO_MOVZXw; break;
  case IRT_NUM: xo = XO_MOVSD; break;
  case IRT_FLOAT: xo = XO_MOVSS; break;
  default:
    if (LJ_64 && irt_is64(ir->t))
      dest |= REX_64;
    else
      lua_assert(irt_isint(ir->t) || irt_isu32(ir->t) || irt_isaddr(ir->t));
    xo = XO_MOV;
    break;
  }
  emit_mrm(as, xo, dest, RID_MRM);
}

#define asm_fload(as, ir)	asm_fxload(as, ir)
#define asm_xload(as, ir)	asm_fxload(as, ir)

static void asm_fxstore(ASMState *as, IRIns *ir)
{
  RegSet allow = RSET_GPR;
  Reg src = RID_NONE, osrc = RID_NONE;
  int32_t k = 0;
  if (ir->r == RID_SINK)
    return;
  /* The IRT_I16/IRT_U16 stores should never be simplified for constant
  ** values since mov word [mem], imm16 has a length-changing prefix.
  */
  if (irt_isi16(ir->t) || irt_isu16(ir->t) || irt_isfp(ir->t) ||
      !asm_isk32(as, ir->op2, &k)) {
    RegSet allow8 = irt_isfp(ir->t) ? RSET_FPR :
                    (irt_isi8(ir->t) || irt_isu8(ir->t)) ? RSET_GPR8 : RSET_GPR;
    src = osrc = ra_alloc1(as, ir->op2, allow8);
    if (!LJ_64 && !rset_test(allow8, src)) {  /* Already in wrong register. */
      rset_clear(allow, osrc);
      src = ra_scratch(as, allow8);
    }
    rset_clear(allow, src);
  }
  if (ir->o == IR_FSTORE) {
    asm_fusefref(as, IR(ir->op1), allow);
  } else {
    asm_fusexref(as, ir->op1, allow);
    if (LJ_32 && ir->o == IR_HIOP) as->mrm.ofs += 4;
  }
  if (ra_hasreg(src)) {
    x86Op xo;
    switch (irt_type(ir->t)) {
    case IRT_I8: case IRT_U8: xo = XO_MOVtob; src |= FORCE_REX; break;
    case IRT_I16: case IRT_U16: xo = XO_MOVtow; break;
    case IRT_NUM: xo = XO_MOVSDto; break;
    case IRT_FLOAT: xo = XO_MOVSSto; break;
#if LJ_64 && !LJ_GC64
    case IRT_LIGHTUD: lua_assert(0);  /* NYI: mask 64 bit lightuserdata. */
#endif
    default:
      if (LJ_64 && irt_is64(ir->t))
        src |= REX_64;
      else
        lua_assert(irt_isint(ir->t) || irt_isu32(ir->t) || irt_isaddr(ir->t));
      xo = XO_MOVto;
      break;
    }
    emit_mrm(as, xo, src, RID_MRM);
    if (!LJ_64 && src != osrc) {
      ra_noweak(as, osrc);
      emit_rr(as, XO_MOV, src, osrc);
    }
  } else {
    if (irt_isi8(ir->t) || irt_isu8(ir->t)) {
      emit_i8(as, k);
      emit_mrm(as, XO_MOVmib, 0, RID_MRM);
    } else {
      lua_assert(irt_is64(ir->t) || irt_isint(ir->t) || irt_isu32(ir->t) ||
                 irt_isaddr(ir->t));
      emit_i32(as, k);
      emit_mrm(as, XO_MOVmi, REX_64IR(ir, 0), RID_MRM);
    }
  }
}

#define asm_fstore(as, ir)	asm_fxstore(as, ir)
#define asm_xstore(as, ir)	asm_fxstore(as, ir)

#if LJ_64 && !LJ_GC64
static Reg asm_load_lightud64(ASMState *as, IRIns *ir, int typecheck)
{
  if (ra_used(ir) || typecheck) {
    Reg dest = ra_dest(as, ir, RSET_GPR);
    if (typecheck) {
      Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, dest));
      asm_guardcc(as, CC_NE);
      emit_i8(as, -2);
      emit_rr(as, XO_ARITHi8, XOg_CMP, tmp);
      emit_shifti(as, XOg_SAR|REX_64, tmp, 47);
      emit_rr(as, XO_MOV, tmp|REX_64, dest);
    }
    return dest;
  } else {
    return RID_NONE;
  }
}
#endif

static void asm_ahuvload(ASMState *as, IRIns *ir)
{
#if LJ_GC64
  Reg tmp = RID_NONE;
#endif
  lua_assert(irt_isnum(ir->t) || irt_ispri(ir->t) || irt_isaddr(ir->t) ||
             (LJ_DUALNUM && irt_isint(ir->t)));
#if LJ_64 && !LJ_GC64
  if (irt_islightud(ir->t)) {
    Reg dest = asm_load_lightud64(as, ir, 1);
    if (ra_hasreg(dest)) {
      asm_fuseahuref(as, ir->op1, RSET_GPR);
      emit_mrm(as, XO_MOV, dest|REX_64, RID_MRM);
    }
    return;
  } else
#endif
  if (ra_used(ir)) {
    RegSet allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR;
    Reg dest = ra_dest(as, ir, allow);
    asm_fuseahuref(as, ir->op1, RSET_GPR);
#if LJ_GC64
    if (irt_isaddr(ir->t)) {
      emit_shifti(as, XOg_SHR|REX_64, dest, 17);
      asm_guardcc(as, CC_NE);
      emit_i8(as, irt_toitype(ir->t));
      emit_rr(as, XO_ARITHi8, XOg_CMP, dest);
      emit_i8(as, XI_O16);
      if ((as->flags & JIT_F_BMI2)) {
        emit_i8(as, 47);
        emit_mrm(as, XV_RORX|VEX_64, dest, RID_MRM);
      } else {
        emit_shifti(as, XOg_ROR|REX_64, dest, 47);
        emit_mrm(as, XO_MOV, dest|REX_64, RID_MRM);
      }
      return;
    } else
#endif
    emit_mrm(as, dest < RID_MAX_GPR ? XO_MOV : XO_MOVSD, dest, RID_MRM);
  } else {
    RegSet gpr = RSET_GPR;
#if LJ_GC64
    if (irt_isaddr(ir->t)) {
      tmp = ra_scratch(as, RSET_GPR);
      gpr = rset_exclude(gpr, tmp);
    }
#endif
    asm_fuseahuref(as, ir->op1, gpr);
  }
  /* Always do the type check, even if the load result is unused. */
  as->mrm.ofs += 4;
  asm_guardcc(as, irt_isnum(ir->t) ? CC_AE : CC_NE);
  if (LJ_64 && irt_type(ir->t) >= IRT_NUM) {
    lua_assert(irt_isinteger(ir->t) || irt_isnum(ir->t));
#if LJ_GC64
    emit_u32(as, LJ_TISNUM << 15);
#else
    emit_u32(as, LJ_TISNUM);
#endif
    emit_mrm(as, XO_ARITHi, XOg_CMP, RID_MRM);
#if LJ_GC64
  } else if (irt_isaddr(ir->t)) {
    as->mrm.ofs -= 4;
    emit_i8(as, irt_toitype(ir->t));
    emit_mrm(as, XO_ARITHi8, XOg_CMP, tmp);
    emit_shifti(as, XOg_SAR|REX_64, tmp, 47);
    emit_mrm(as, XO_MOV, tmp|REX_64, RID_MRM);
  } else if (irt_isnil(ir->t)) {
    as->mrm.ofs -= 4;
    emit_i8(as, -1);
    emit_mrm(as, XO_ARITHi8, XOg_CMP|REX_64, RID_MRM);
  } else {
    emit_u32(as, (irt_toitype(ir->t) << 15) | 0x7fff);
    emit_mrm(as, XO_ARITHi, XOg_CMP, RID_MRM);
#else
  } else {
    emit_i8(as, irt_toitype(ir->t));
    emit_mrm(as, XO_ARITHi8, XOg_CMP, RID_MRM);
#endif
  }
}

static void asm_ahustore(ASMState *as, IRIns *ir)
{
  if (ir->r == RID_SINK)
    return;
  if (irt_isnum(ir->t)) {
    Reg src = ra_alloc1(as, ir->op2, RSET_FPR);
    asm_fuseahuref(as, ir->op1, RSET_GPR);
    emit_mrm(as, XO_MOVSDto, src, RID_MRM);
#if LJ_64 && !LJ_GC64
  } else if (irt_islightud(ir->t)) {
    Reg src = ra_alloc1(as, ir->op2, RSET_GPR);
    asm_fuseahuref(as, ir->op1, rset_exclude(RSET_GPR, src));
    emit_mrm(as, XO_MOVto, src|REX_64, RID_MRM);
#endif
#if LJ_GC64
  } else if (irref_isk(ir->op2)) {
    TValue k;
    lj_ir_kvalue(as->J->L, &k, IR(ir->op2));
    asm_fuseahuref(as, ir->op1, RSET_GPR);
    if (tvisnil(&k)) {
      emit_i32(as, -1);
      emit_mrm(as, XO_MOVmi, REX_64, RID_MRM);
    } else {
      emit_u32(as, k.u32.lo);
      emit_mrm(as, XO_MOVmi, 0, RID_MRM);
      as->mrm.ofs += 4;
      emit_u32(as, k.u32.hi);
      emit_mrm(as, XO_MOVmi, 0, RID_MRM);
    }
#endif
  } else {
    IRIns *irr = IR(ir->op2);
    RegSet allow = RSET_GPR;
    Reg src = RID_NONE;
    if (!irref_isk(ir->op2)) {
      src = ra_alloc1(as, ir->op2, allow);
      rset_clear(allow, src);
    }
    asm_fuseahuref(as, ir->op1, allow);
    if (ra_hasreg(src)) {
#if LJ_GC64
      if (!(LJ_DUALNUM && irt_isinteger(ir->t))) {
        /* TODO: 64 bit store + 32 bit load-modify-store is suboptimal. */
        as->mrm.ofs += 4;
        emit_u32(as, irt_toitype(ir->t) << 15);
        emit_mrm(as, XO_ARITHi, XOg_OR, RID_MRM);
        as->mrm.ofs -= 4;
        emit_mrm(as, XO_MOVto, src|REX_64, RID_MRM);
        return;
      }
#endif
      emit_mrm(as, XO_MOVto, src, RID_MRM);
    } else if (!irt_ispri(irr->t)) {
      lua_assert(irt_isaddr(ir->t) || (LJ_DUALNUM && irt_isinteger(ir->t)));
      emit_i32(as, irr->i);
      emit_mrm(as, XO_MOVmi, 0, RID_MRM);
    }
    as->mrm.ofs += 4;
#if LJ_GC64
    lua_assert(LJ_DUALNUM && irt_isinteger(ir->t));
    emit_i32(as, LJ_TNUMX << 15);
#else
    emit_i32(as, (int32_t)irt_toitype(ir->t));
#endif
    emit_mrm(as, XO_MOVmi, 0, RID_MRM);
  }
}

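#if 0
/* Editorial sketch (not part of the original source): the LJ_GC64 tagged
** value layout assumed by the type checks in asm_ahuvload()/asm_ahustore()
** above and asm_sload() below: the itype lives in bits 47..63 and the
** payload in the low 47 bits, which is what the sar/ror-by-47 sequences
** compare against irt_toitype(). The helper name is illustrative only.
*/
static void sketch_gc64_decode(uint64_t tv, int32_t *itype, uint64_t *payload)
{
  *itype = (int32_t)((int64_t)tv >> 47);        /* sar r64, 47 */
  *payload = tv & (((uint64_t)1 << 47) - 1);    /* mask off the tag */
}
#endif
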
static void asm_sload(ASMState *as, IRIns *ir)
{
  int32_t ofs = 8*((int32_t)ir->op1-1-LJ_FR2) +
                (!LJ_FR2 && (ir->op2 & IRSLOAD_FRAME) ? 4 : 0);
  IRType1 t = ir->t;
  Reg base;
  lua_assert(!(ir->op2 & IRSLOAD_PARENT));  /* Handled by asm_head_side(). */
  lua_assert(irt_isguard(t) || !(ir->op2 & IRSLOAD_TYPECHECK));
  lua_assert(LJ_DUALNUM ||
             !irt_isint(t) || (ir->op2 & (IRSLOAD_CONVERT|IRSLOAD_FRAME)));
  if ((ir->op2 & IRSLOAD_CONVERT) && irt_isguard(t) && irt_isint(t)) {
    Reg left = ra_scratch(as, RSET_FPR);
    asm_tointg(as, ir, left);  /* Frees dest reg. Do this before base alloc. */
    base = ra_alloc1(as, REF_BASE, RSET_GPR);
    emit_rmro(as, XO_MOVSD, left, base, ofs);
    t.irt = IRT_NUM;  /* Continue with a regular number type check. */
#if LJ_64 && !LJ_GC64
  } else if (irt_islightud(t)) {
    Reg dest = asm_load_lightud64(as, ir, (ir->op2 & IRSLOAD_TYPECHECK));
    if (ra_hasreg(dest)) {
      base = ra_alloc1(as, REF_BASE, RSET_GPR);
      emit_rmro(as, XO_MOV, dest|REX_64, base, ofs);
    }
    return;
#endif
  } else if (ra_used(ir)) {
    RegSet allow = irt_isnum(t) ? RSET_FPR : RSET_GPR;
    Reg dest = ra_dest(as, ir, allow);
    base = ra_alloc1(as, REF_BASE, RSET_GPR);
    lua_assert(irt_isnum(t) || irt_isint(t) || irt_isaddr(t));
    if ((ir->op2 & IRSLOAD_CONVERT)) {
      t.irt = irt_isint(t) ? IRT_NUM : IRT_INT;  /* Check for original type. */
      emit_rmro(as, irt_isint(t) ? XO_CVTSI2SD : XO_CVTTSD2SI, dest, base, ofs);
    } else {
#if LJ_GC64
      if (irt_isaddr(t)) {
        /* LJ_GC64 type check + tag removal without BMI2 and with BMI2:
        **
        **   mov r64, [addr]    rorx r64, [addr], 47
        **   ror r64, 47
        **   cmp r16, itype     cmp r16, itype
        **   jne ->exit         jne ->exit
        **   shr r64, 16        shr r64, 16
        */
        emit_shifti(as, XOg_SHR|REX_64, dest, 17);
        if ((ir->op2 & IRSLOAD_TYPECHECK)) {
          asm_guardcc(as, CC_NE);
          emit_i8(as, irt_toitype(t));
          emit_rr(as, XO_ARITHi8, XOg_CMP, dest);
          emit_i8(as, XI_O16);
        }
        if ((as->flags & JIT_F_BMI2)) {
          emit_i8(as, 47);
          emit_rmro(as, XV_RORX|VEX_64, dest, base, ofs);
        } else {
          if ((ir->op2 & IRSLOAD_TYPECHECK))
            emit_shifti(as, XOg_ROR|REX_64, dest, 47);
          else
            emit_shifti(as, XOg_SHL|REX_64, dest, 17);
          emit_rmro(as, XO_MOV, dest|REX_64, base, ofs);
        }
        return;
      } else
#endif
      emit_rmro(as, irt_isnum(t) ? XO_MOVSD : XO_MOV, dest, base, ofs);
    }
  } else {
    if (!(ir->op2 & IRSLOAD_TYPECHECK))
      return;  /* No type check: avoid base alloc. */
    base = ra_alloc1(as, REF_BASE, RSET_GPR);
  }
  if ((ir->op2 & IRSLOAD_TYPECHECK)) {
    /* Need type check, even if the load result is unused. */
    asm_guardcc(as, irt_isnum(t) ? CC_AE : CC_NE);
CC_AE : CC_NE); 1716 if (LJ_64 && irt_type(t) >= IRT_NUM) { 1717 lua_assert(irt_isinteger(t) || irt_isnum(t)); 1718 #if LJ_GC64 1719 emit_u32(as, LJ_TISNUM << 15); 1720 #else 1721 emit_u32(as, LJ_TISNUM); 1722 #endif 1723 emit_rmro(as, XO_ARITHi, XOg_CMP, base, ofs+4); 1724 #if LJ_GC64 1725 } else if (irt_isnil(t)) { 1726 /* LJ_GC64 type check for nil: 1727 ** 1728 ** cmp qword [addr], -1 1729 ** jne ->exit 1730 */ 1731 emit_i8(as, -1); 1732 emit_rmro(as, XO_ARITHi8, XOg_CMP|REX_64, base, ofs); 1733 } else if (irt_ispri(t)) { 1734 emit_u32(as, (irt_toitype(t) << 15) | 0x7fff); 1735 emit_rmro(as, XO_ARITHi, XOg_CMP, base, ofs+4); 1736 } else { 1737 /* LJ_GC64 type check only: 1738 ** 1739 ** mov r64, [addr] 1740 ** sar r64, 47 1741 ** cmp r32, itype 1742 ** jne ->exit 1743 */ 1744 Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, base)); 1745 emit_i8(as, irt_toitype(t)); 1746 emit_rr(as, XO_ARITHi8, XOg_CMP, tmp); 1747 emit_shifti(as, XOg_SAR|REX_64, tmp, 47); 1748 emit_rmro(as, XO_MOV, tmp|REX_64, base, ofs+4); 1749 #else 1750 } else { 1751 emit_i8(as, irt_toitype(t)); 1752 emit_rmro(as, XO_ARITHi8, XOg_CMP, base, ofs+4); 1753 #endif 1754 } 1755 } 1756 } 1757 1758 /* -- Allocations --------------------------------------------------------- */ 1759 1760 #if LJ_HASFFI 1761 static void asm_cnew(ASMState *as, IRIns *ir) 1762 { 1763 CTState *cts = ctype_ctsG(J2G(as->J)); 1764 CTypeID id = (CTypeID)IR(ir->op1)->i; 1765 CTSize sz; 1766 CTInfo info = lj_ctype_info(cts, id, &sz); 1767 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_mem_newgco]; 1768 IRRef args[4]; 1769 lua_assert(sz != CTSIZE_INVALID || (ir->o == IR_CNEW && ir->op2 != REF_NIL)); 1770 1771 as->gcsteps++; 1772 asm_setupresult(as, ir, ci); /* GCcdata * */ 1773 1774 /* Initialize immutable cdata object. */ 1775 if (ir->o == IR_CNEWI) { 1776 RegSet allow = (RSET_GPR & ~RSET_SCRATCH); 1777 #if LJ_64 1778 Reg r64 = sz == 8 ? REX_64 : 0; 1779 if (irref_isk(ir->op2)) { 1780 IRIns *irk = IR(ir->op2); 1781 uint64_t k = irk->o == IR_KINT64 ? ir_k64(irk)->u64 : 1782 (uint64_t)(uint32_t)irk->i; 1783 if (sz == 4 || checki32((int64_t)k)) { 1784 emit_i32(as, (int32_t)k); 1785 emit_rmro(as, XO_MOVmi, r64, RID_RET, sizeof(GCcdata)); 1786 } else { 1787 emit_movtomro(as, RID_ECX + r64, RID_RET, sizeof(GCcdata)); 1788 emit_loadu64(as, RID_ECX, k); 1789 } 1790 } else { 1791 Reg r = ra_alloc1(as, ir->op2, allow); 1792 emit_movtomro(as, r + r64, RID_RET, sizeof(GCcdata)); 1793 } 1794 #else 1795 int32_t ofs = sizeof(GCcdata); 1796 if (sz == 8) { 1797 ofs += 4; ir++; 1798 lua_assert(ir->o == IR_HIOP); 1799 } 1800 do { 1801 if (irref_isk(ir->op2)) { 1802 emit_movmroi(as, RID_RET, ofs, IR(ir->op2)->i); 1803 } else { 1804 Reg r = ra_alloc1(as, ir->op2, allow); 1805 emit_movtomro(as, r, RID_RET, ofs); 1806 rset_clear(allow, r); 1807 } 1808 if (ofs == sizeof(GCcdata)) break; 1809 ofs -= 4; ir--; 1810 } while (1); 1811 #endif 1812 lua_assert(sz == 4 || sz == 8); 1813 } else if (ir->op2 != REF_NIL) { /* Create VLA/VLS/aligned cdata. */ 1814 ci = &lj_ir_callinfo[IRCALL_lj_cdata_newv]; 1815 args[0] = ASMREF_L; /* lua_State *L */ 1816 args[1] = ir->op1; /* CTypeID id */ 1817 args[2] = ir->op2; /* CTSize sz */ 1818 args[3] = ASMREF_TMP1; /* CTSize align */ 1819 asm_gencall(as, ci, args); 1820 emit_loadi(as, ra_releasetmp(as, ASMREF_TMP1), (int32_t)ctype_align(info)); 1821 return; 1822 } 1823 1824 /* Combine initialization of marked, gct and ctypeid. 
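** A single 32 bit store at offsetof(GCcdata, marked) then covers all three
** fields in one go: byte 0 holds the current white mark
** (currentwhite & LJ_GC_WHITES), byte 1 holds ~LJ_TCDATA for gct, and
** bytes 2-3 hold the ctypeid from the id << 16 term below (sketch of the
** combined dword, derived from the emitted ops).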
*/ 1825 emit_movtomro(as, RID_ECX, RID_RET, offsetof(GCcdata, marked)); 1826 emit_gri(as, XG_ARITHi(XOg_OR), RID_ECX, 1827 (int32_t)((~LJ_TCDATA<<8)+(id<<16))); 1828 emit_gri(as, XG_ARITHi(XOg_AND), RID_ECX, LJ_GC_WHITES); 1829 emit_opgl(as, XO_MOVZXb, RID_ECX, gc.currentwhite); 1830 1831 args[0] = ASMREF_L; /* lua_State *L */ 1832 args[1] = ASMREF_TMP1; /* MSize size */ 1833 asm_gencall(as, ci, args); 1834 emit_loadi(as, ra_releasetmp(as, ASMREF_TMP1), (int32_t)(sz+sizeof(GCcdata))); 1835 } 1836 #else 1837 #define asm_cnew(as, ir) ((void)0) 1838 #endif 1839 1840 /* -- Write barriers ------------------------------------------------------ */ 1841 1842 static void asm_tbar(ASMState *as, IRIns *ir) 1843 { 1844 Reg tab = ra_alloc1(as, ir->op1, RSET_GPR); 1845 Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, tab)); 1846 MCLabel l_end = emit_label(as); 1847 emit_movtomro(as, tmp|REX_GC64, tab, offsetof(GCtab, gclist)); 1848 emit_setgl(as, tab, gc.grayagain); 1849 emit_getgl(as, tmp, gc.grayagain); 1850 emit_i8(as, ~LJ_GC_BLACK); 1851 emit_rmro(as, XO_ARITHib, XOg_AND, tab, offsetof(GCtab, marked)); 1852 emit_sjcc(as, CC_Z, l_end); 1853 emit_i8(as, LJ_GC_BLACK); 1854 emit_rmro(as, XO_GROUP3b, XOg_TEST, tab, offsetof(GCtab, marked)); 1855 } 1856 1857 static void asm_obar(ASMState *as, IRIns *ir) 1858 { 1859 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_barrieruv]; 1860 IRRef args[2]; 1861 MCLabel l_end; 1862 Reg obj; 1863 /* No need for other object barriers (yet). */ 1864 lua_assert(IR(ir->op1)->o == IR_UREFC); 1865 ra_evictset(as, RSET_SCRATCH); 1866 l_end = emit_label(as); 1867 args[0] = ASMREF_TMP1; /* global_State *g */ 1868 args[1] = ir->op1; /* TValue *tv */ 1869 asm_gencall(as, ci, args); 1870 emit_loada(as, ra_releasetmp(as, ASMREF_TMP1), J2G(as->J)); 1871 obj = IR(ir->op1)->r; 1872 emit_sjcc(as, CC_Z, l_end); 1873 emit_i8(as, LJ_GC_WHITES); 1874 if (irref_isk(ir->op2)) { 1875 GCobj *vp = ir_kgc(IR(ir->op2)); 1876 emit_rma(as, XO_GROUP3b, XOg_TEST, &vp->gch.marked); 1877 } else { 1878 Reg val = ra_alloc1(as, ir->op2, rset_exclude(RSET_SCRATCH&RSET_GPR, obj)); 1879 emit_rmro(as, XO_GROUP3b, XOg_TEST, val, (int32_t)offsetof(GChead, marked)); 1880 } 1881 emit_sjcc(as, CC_Z, l_end); 1882 emit_i8(as, LJ_GC_BLACK); 1883 emit_rmro(as, XO_GROUP3b, XOg_TEST, obj, 1884 (int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv)); 1885 } 1886 1887 /* -- FP/int arithmetic and logic operations ------------------------------ */ 1888 1889 /* Load reference onto x87 stack. Force a spill to memory if needed. */ 1890 static void asm_x87load(ASMState *as, IRRef ref) 1891 { 1892 IRIns *ir = IR(ref); 1893 if (ir->o == IR_KNUM) { 1894 cTValue *tv = ir_knum(ir); 1895 if (tvispzero(tv)) /* Use fldz only for +0. 
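** (fldz always loads +0.0; a -0.0 constant must keep its sign bit and is
** therefore loaded from memory by the fallback case below.)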
*/ 1896 emit_x87op(as, XI_FLDZ); 1897 else if (tvispone(tv)) 1898 emit_x87op(as, XI_FLD1); 1899 else 1900 emit_rma(as, XO_FLDq, XOg_FLDq, tv); 1901 } else if (ir->o == IR_CONV && ir->op2 == IRCONV_NUM_INT && !ra_used(ir) && 1902 !irref_isk(ir->op1) && mayfuse(as, ir->op1)) { 1903 IRIns *iri = IR(ir->op1); 1904 emit_rmro(as, XO_FILDd, XOg_FILDd, RID_ESP, ra_spill(as, iri)); 1905 } else { 1906 emit_mrm(as, XO_FLDq, XOg_FLDq, asm_fuseload(as, ref, RSET_EMPTY)); 1907 } 1908 } 1909 1910 static void asm_fpmath(ASMState *as, IRIns *ir) 1911 { 1912 IRFPMathOp fpm = (IRFPMathOp)ir->op2; 1913 if (fpm == IRFPM_SQRT) { 1914 Reg dest = ra_dest(as, ir, RSET_FPR); 1915 Reg left = asm_fuseload(as, ir->op1, RSET_FPR); 1916 emit_mrm(as, XO_SQRTSD, dest, left); 1917 } else if (fpm <= IRFPM_TRUNC) { 1918 if (as->flags & JIT_F_SSE4_1) { /* SSE4.1 has a rounding instruction. */ 1919 Reg dest = ra_dest(as, ir, RSET_FPR); 1920 Reg left = asm_fuseload(as, ir->op1, RSET_FPR); 1921 /* ROUNDSD has a 4-byte opcode which doesn't fit in x86Op. 1922 ** Let's pretend it's a 3-byte opcode, and compensate afterwards. 1923 ** This is atrocious, but the alternatives are much worse. 1924 */ 1925 /* Round down/up/trunc == 1001/1010/1011. */ 1926 emit_i8(as, 0x09 + fpm); 1927 emit_mrm(as, XO_ROUNDSD, dest, left); 1928 if (LJ_64 && as->mcp[1] != (MCode)(XO_ROUNDSD >> 16)) { 1929 as->mcp[0] = as->mcp[1]; as->mcp[1] = 0x0f; /* Swap 0F and REX. */ 1930 } 1931 *--as->mcp = 0x66; /* 1st byte of ROUNDSD opcode. */ 1932 } else { /* Call helper functions for SSE2 variant. */ 1933 /* The modified regs must match with the *.dasc implementation. */ 1934 RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM3+1)|RID2RSET(RID_EAX); 1935 if (ra_hasreg(ir->r)) 1936 rset_clear(drop, ir->r); /* Dest reg handled below. */ 1937 ra_evictset(as, drop); 1938 ra_destreg(as, ir, RID_XMM0); 1939 emit_call(as, fpm == IRFPM_FLOOR ? lj_vm_floor_sse : 1940 fpm == IRFPM_CEIL ? lj_vm_ceil_sse : lj_vm_trunc_sse); 1941 ra_left(as, RID_XMM0, ir->op1); 1942 } 1943 } else if (fpm == IRFPM_EXP2 && asm_fpjoin_pow(as, ir)) { 1944 /* Rejoined to pow(). */ 1945 } else { 1946 asm_callid(as, ir, IRCALL_lj_vm_floor + fpm); 1947 } 1948 } 1949 1950 #define asm_atan2(as, ir) asm_callid(as, ir, IRCALL_atan2) 1951 1952 static void asm_ldexp(ASMState *as, IRIns *ir) 1953 { 1954 int32_t ofs = sps_scale(ir->s); /* Use spill slot or temp slots. */ 1955 Reg dest = ir->r; 1956 if (ra_hasreg(dest)) { 1957 ra_free(as, dest); 1958 ra_modified(as, dest); 1959 emit_rmro(as, XO_MOVSD, dest, RID_ESP, ofs); 1960 } 1961 emit_rmro(as, XO_FSTPq, XOg_FSTPq, RID_ESP, ofs); 1962 emit_x87op(as, XI_FPOP1); 1963 emit_x87op(as, XI_FSCALE); 1964 asm_x87load(as, ir->op1); 1965 asm_x87load(as, ir->op2); 1966 } 1967 1968 static void asm_fppowi(ASMState *as, IRIns *ir) 1969 { 1970 /* The modified regs must match with the *.dasc implementation. */ 1971 RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX); 1972 if (ra_hasreg(ir->r)) 1973 rset_clear(drop, ir->r); /* Dest reg handled below. */ 1974 ra_evictset(as, drop); 1975 ra_destreg(as, ir, RID_XMM0); 1976 emit_call(as, lj_vm_powi_sse); 1977 ra_left(as, RID_XMM0, ir->op1); 1978 ra_left(as, RID_EAX, ir->op2); 1979 } 1980 1981 static void asm_pow(ASMState *as, IRIns *ir) 1982 { 1983 #if LJ_64 && LJ_HASFFI 1984 if (!irt_isnum(ir->t)) 1985 asm_callid(as, ir, irt_isi64(ir->t) ? 
IRCALL_lj_carith_powi64 : 1986 IRCALL_lj_carith_powu64); 1987 else 1988 #endif 1989 asm_fppowi(as, ir); 1990 } 1991 1992 static int asm_swapops(ASMState *as, IRIns *ir) 1993 { 1994 IRIns *irl = IR(ir->op1); 1995 IRIns *irr = IR(ir->op2); 1996 lua_assert(ra_noreg(irr->r)); 1997 if (!irm_iscomm(lj_ir_mode[ir->o])) 1998 return 0; /* Can't swap non-commutative operations. */ 1999 if (irref_isk(ir->op2)) 2000 return 0; /* Don't swap constants to the left. */ 2001 if (ra_hasreg(irl->r)) 2002 return 1; /* Swap if left already has a register. */ 2003 if (ra_samehint(ir->r, irr->r)) 2004 return 1; /* Swap if dest and right have matching hints. */ 2005 if (as->curins > as->loopref) { /* In variant part? */ 2006 if (ir->op2 < as->loopref && !irt_isphi(irr->t)) 2007 return 0; /* Keep invariants on the right. */ 2008 if (ir->op1 < as->loopref && !irt_isphi(irl->t)) 2009 return 1; /* Swap invariants to the right. */ 2010 } 2011 if (opisfusableload(irl->o)) 2012 return 1; /* Swap fusable loads to the right. */ 2013 return 0; /* Otherwise don't swap. */ 2014 } 2015 2016 static void asm_fparith(ASMState *as, IRIns *ir, x86Op xo) 2017 { 2018 IRRef lref = ir->op1; 2019 IRRef rref = ir->op2; 2020 RegSet allow = RSET_FPR; 2021 Reg dest; 2022 Reg right = IR(rref)->r; 2023 if (ra_hasreg(right)) { 2024 rset_clear(allow, right); 2025 ra_noweak(as, right); 2026 } 2027 dest = ra_dest(as, ir, allow); 2028 if (lref == rref) { 2029 right = dest; 2030 } else if (ra_noreg(right)) { 2031 if (asm_swapops(as, ir)) { 2032 IRRef tmp = lref; lref = rref; rref = tmp; 2033 } 2034 right = asm_fuseload(as, rref, rset_clear(allow, dest)); 2035 } 2036 emit_mrm(as, xo, dest, right); 2037 ra_left(as, dest, lref); 2038 } 2039 2040 static void asm_intarith(ASMState *as, IRIns *ir, x86Arith xa) 2041 { 2042 IRRef lref = ir->op1; 2043 IRRef rref = ir->op2; 2044 RegSet allow = RSET_GPR; 2045 Reg dest, right; 2046 int32_t k = 0; 2047 if (as->flagmcp == as->mcp) { /* Drop test r,r instruction. */ 2048 MCode *p = as->mcp + ((LJ_64 && *as->mcp < XI_TESTb) ? 3 : 2); 2049 if ((p[1] & 15) < 14) { 2050 if ((p[1] & 15) >= 12) p[1] -= 4; /* L <->S, NL <-> NS */ 2051 as->flagmcp = NULL; 2052 as->mcp = p; 2053 } /* else: cannot transform LE/NLE to cc without use of OF. */ 2054 } 2055 right = IR(rref)->r; 2056 if (ra_hasreg(right)) { 2057 rset_clear(allow, right); 2058 ra_noweak(as, right); 2059 } 2060 dest = ra_dest(as, ir, allow); 2061 if (lref == rref) { 2062 right = dest; 2063 } else if (ra_noreg(right) && !asm_isk32(as, rref, &k)) { 2064 if (asm_swapops(as, ir)) { 2065 IRRef tmp = lref; lref = rref; rref = tmp; 2066 } 2067 right = asm_fuseloadm(as, rref, rset_clear(allow, dest), irt_is64(ir->t)); 2068 } 2069 if (irt_isguard(ir->t)) /* For IR_ADDOV etc. */ 2070 asm_guardcc(as, CC_O); 2071 if (xa != XOg_X_IMUL) { 2072 if (ra_hasreg(right)) 2073 emit_mrm(as, XO_ARITH(xa), REX_64IR(ir, dest), right); 2074 else 2075 emit_gri(as, XG_ARITHi(xa), REX_64IR(ir, dest), k); 2076 } else if (ra_hasreg(right)) { /* IMUL r, mrm. */ 2077 emit_mrm(as, XO_IMUL, REX_64IR(ir, dest), right); 2078 } else { /* IMUL r, r, k. */ 2079 /* NYI: use lea/shl/add/sub (FOLD only does 2^k) depending on CPU. 
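** E.g. x*3 could use lea r,[r+r*2], x*5 lea r,[r+r*4], x*9 lea r,[r+r*8]
** and x*8 a simple shl r,3 (illustrative sketch only, not implemented here).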
*/ 2080 Reg left = asm_fuseloadm(as, lref, RSET_GPR, irt_is64(ir->t)); 2081 x86Op xo; 2082 if (checki8(k)) { emit_i8(as, k); xo = XO_IMULi8; 2083 } else { emit_i32(as, k); xo = XO_IMULi; } 2084 emit_mrm(as, xo, REX_64IR(ir, dest), left); 2085 return; 2086 } 2087 ra_left(as, dest, lref); 2088 } 2089 2090 /* LEA is really a 4-operand ADD with an independent destination register, 2091 ** up to two source registers and an immediate. One register can be scaled 2092 ** by 1, 2, 4 or 8. This can be used to avoid moves or to fuse several 2093 ** instructions. 2094 ** 2095 ** Currently only a few common cases are supported: 2096 ** - 3-operand ADD: y = a+b; y = a+k with a and b already allocated 2097 ** - Left ADD fusion: y = (a+b)+k; y = (a+k)+b 2098 ** - Right ADD fusion: y = a+(b+k) 2099 ** The omitted variants have already been reduced by FOLD. 2100 ** 2101 ** There are more fusion opportunities, like gathering shifts or joining 2102 ** common references. But these are probably not worth the trouble, since 2103 ** array indexing is not decomposed and already makes use of all fields 2104 ** of the ModRM operand. 2105 */ 2106 static int asm_lea(ASMState *as, IRIns *ir) 2107 { 2108 IRIns *irl = IR(ir->op1); 2109 IRIns *irr = IR(ir->op2); 2110 RegSet allow = RSET_GPR; 2111 Reg dest; 2112 as->mrm.base = as->mrm.idx = RID_NONE; 2113 as->mrm.scale = XM_SCALE1; 2114 as->mrm.ofs = 0; 2115 if (ra_hasreg(irl->r)) { 2116 rset_clear(allow, irl->r); 2117 ra_noweak(as, irl->r); 2118 as->mrm.base = irl->r; 2119 if (irref_isk(ir->op2) || ra_hasreg(irr->r)) { 2120 /* The PHI renaming logic does a better job in some cases. */ 2121 if (ra_hasreg(ir->r) && 2122 ((irt_isphi(irl->t) && as->phireg[ir->r] == ir->op1) || 2123 (irt_isphi(irr->t) && as->phireg[ir->r] == ir->op2))) 2124 return 0; 2125 if (irref_isk(ir->op2)) { 2126 as->mrm.ofs = irr->i; 2127 } else { 2128 rset_clear(allow, irr->r); 2129 ra_noweak(as, irr->r); 2130 as->mrm.idx = irr->r; 2131 } 2132 } else if (irr->o == IR_ADD && mayfuse(as, ir->op2) && 2133 irref_isk(irr->op2)) { 2134 Reg idx = ra_alloc1(as, irr->op1, allow); 2135 rset_clear(allow, idx); 2136 as->mrm.idx = (uint8_t)idx; 2137 as->mrm.ofs = IR(irr->op2)->i; 2138 } else { 2139 return 0; 2140 } 2141 } else if (ir->op1 != ir->op2 && irl->o == IR_ADD && mayfuse(as, ir->op1) && 2142 (irref_isk(ir->op2) || irref_isk(irl->op2))) { 2143 Reg idx, base = ra_alloc1(as, irl->op1, allow); 2144 rset_clear(allow, base); 2145 as->mrm.base = (uint8_t)base; 2146 if (irref_isk(ir->op2)) { 2147 as->mrm.ofs = irr->i; 2148 idx = ra_alloc1(as, irl->op2, allow); 2149 } else { 2150 as->mrm.ofs = IR(irl->op2)->i; 2151 idx = ra_alloc1(as, ir->op2, allow); 2152 } 2153 rset_clear(allow, idx); 2154 as->mrm.idx = (uint8_t)idx; 2155 } else { 2156 return 0; 2157 } 2158 dest = ra_dest(as, ir, allow); 2159 emit_mrm(as, XO_LEA, dest, RID_MRM); 2160 return 1; /* Success. */ 2161 } 2162 2163 static void asm_add(ASMState *as, IRIns *ir) 2164 { 2165 if (irt_isnum(ir->t)) 2166 asm_fparith(as, ir, XO_ADDSD); 2167 else if ((as->flags & JIT_F_LEA_AGU) || as->flagmcp == as->mcp || 2168 irt_is64(ir->t) || !asm_lea(as, ir)) 2169 asm_intarith(as, ir, XOg_ADD); 2170 } 2171 2172 static void asm_sub(ASMState *as, IRIns *ir) 2173 { 2174 if (irt_isnum(ir->t)) 2175 asm_fparith(as, ir, XO_SUBSD); 2176 else /* Note: no need for LEA trick here. i-k is encoded as i+(-k).
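** (FOLD normally rewrites i-k into i+(-k), so a constant right operand is
** turned into an ADD and handled by the LEA path in asm_add above.)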
*/ 2177 asm_intarith(as, ir, XOg_SUB); 2178 } 2179 2180 static void asm_mul(ASMState *as, IRIns *ir) 2181 { 2182 if (irt_isnum(ir->t)) 2183 asm_fparith(as, ir, XO_MULSD); 2184 else 2185 asm_intarith(as, ir, XOg_X_IMUL); 2186 } 2187 2188 static void asm_div(ASMState *as, IRIns *ir) 2189 { 2190 #if LJ_64 && LJ_HASFFI 2191 if (!irt_isnum(ir->t)) 2192 asm_callid(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_divi64 : 2193 IRCALL_lj_carith_divu64); 2194 else 2195 #endif 2196 asm_fparith(as, ir, XO_DIVSD); 2197 } 2198 2199 static void asm_mod(ASMState *as, IRIns *ir) 2200 { 2201 #if LJ_64 && LJ_HASFFI 2202 if (!irt_isint(ir->t)) 2203 asm_callid(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_modi64 : 2204 IRCALL_lj_carith_modu64); 2205 else 2206 #endif 2207 asm_callid(as, ir, IRCALL_lj_vm_modi); 2208 } 2209 2210 static void asm_neg_not(ASMState *as, IRIns *ir, x86Group3 xg) 2211 { 2212 Reg dest = ra_dest(as, ir, RSET_GPR); 2213 emit_rr(as, XO_GROUP3, REX_64IR(ir, xg), dest); 2214 ra_left(as, dest, ir->op1); 2215 } 2216 2217 static void asm_neg(ASMState *as, IRIns *ir) 2218 { 2219 if (irt_isnum(ir->t)) 2220 asm_fparith(as, ir, XO_XORPS); 2221 else 2222 asm_neg_not(as, ir, XOg_NEG); 2223 } 2224 2225 #define asm_abs(as, ir) asm_fparith(as, ir, XO_ANDPS) 2226 2227 static void asm_intmin_max(ASMState *as, IRIns *ir, int cc) 2228 { 2229 Reg right, dest = ra_dest(as, ir, RSET_GPR); 2230 IRRef lref = ir->op1, rref = ir->op2; 2231 if (irref_isk(rref)) { lref = rref; rref = ir->op1; } 2232 right = ra_alloc1(as, rref, rset_exclude(RSET_GPR, dest)); 2233 emit_rr(as, XO_CMOV + (cc<<24), REX_64IR(ir, dest), right); 2234 emit_rr(as, XO_CMP, REX_64IR(ir, dest), right); 2235 ra_left(as, dest, lref); 2236 } 2237 2238 static void asm_min(ASMState *as, IRIns *ir) 2239 { 2240 if (irt_isnum(ir->t)) 2241 asm_fparith(as, ir, XO_MINSD); 2242 else 2243 asm_intmin_max(as, ir, CC_G); 2244 } 2245 2246 static void asm_max(ASMState *as, IRIns *ir) 2247 { 2248 if (irt_isnum(ir->t)) 2249 asm_fparith(as, ir, XO_MAXSD); 2250 else 2251 asm_intmin_max(as, ir, CC_L); 2252 } 2253 2254 /* Note: don't use LEA for overflow-checking arithmetic! */ 2255 #define asm_addov(as, ir) asm_intarith(as, ir, XOg_ADD) 2256 #define asm_subov(as, ir) asm_intarith(as, ir, XOg_SUB) 2257 #define asm_mulov(as, ir) asm_intarith(as, ir, XOg_X_IMUL) 2258 2259 #define asm_bnot(as, ir) asm_neg_not(as, ir, XOg_NOT) 2260 2261 static void asm_bswap(ASMState *as, IRIns *ir) 2262 { 2263 Reg dest = ra_dest(as, ir, RSET_GPR); 2264 as->mcp = emit_op(XO_BSWAP + ((dest&7) << 24), 2265 REX_64IR(ir, 0), dest, 0, as->mcp, 1); 2266 ra_left(as, dest, ir->op1); 2267 } 2268 2269 #define asm_band(as, ir) asm_intarith(as, ir, XOg_AND) 2270 #define asm_bor(as, ir) asm_intarith(as, ir, XOg_OR) 2271 #define asm_bxor(as, ir) asm_intarith(as, ir, XOg_XOR) 2272 2273 static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs, x86Op xv) 2274 { 2275 IRRef rref = ir->op2; 2276 IRIns *irr = IR(rref); 2277 Reg dest; 2278 if (irref_isk(rref)) { /* Constant shifts. */ 2279 int shift; 2280 dest = ra_dest(as, ir, RSET_GPR); 2281 shift = irr->i & (irt_is64(ir->t) ? 63 : 31); 2282 if (!xv && shift && (as->flags & JIT_F_BMI2)) { 2283 Reg left = asm_fuseloadm(as, ir->op1, RSET_GPR, irt_is64(ir->t)); 2284 if (left != dest) { /* BMI2 rotate right by constant. */ 2285 emit_i8(as, xs == XOg_ROL ? 
-shift : shift); 2286 emit_mrm(as, VEX_64IR(ir, XV_RORX), dest, left); 2287 return; 2288 } 2289 } 2290 switch (shift) { 2291 case 0: break; 2292 case 1: emit_rr(as, XO_SHIFT1, REX_64IR(ir, xs), dest); break; 2293 default: emit_shifti(as, REX_64IR(ir, xs), dest, shift); break; 2294 } 2295 } else if ((as->flags & JIT_F_BMI2) && xv) { /* BMI2 variable shifts. */ 2296 Reg left, right; 2297 dest = ra_dest(as, ir, RSET_GPR); 2298 right = ra_alloc1(as, rref, RSET_GPR); 2299 left = asm_fuseloadm(as, ir->op1, rset_exclude(RSET_GPR, right), 2300 irt_is64(ir->t)); 2301 emit_mrm(as, VEX_64IR(ir, xv) ^ (right << 19), dest, left); 2302 return; 2303 } else { /* Variable shifts implicitly use register cl (i.e. ecx). */ 2304 Reg right; 2305 dest = ra_dest(as, ir, rset_exclude(RSET_GPR, RID_ECX)); 2306 if (dest == RID_ECX) { 2307 dest = ra_scratch(as, rset_exclude(RSET_GPR, RID_ECX)); 2308 emit_rr(as, XO_MOV, RID_ECX, dest); 2309 } 2310 right = irr->r; 2311 if (ra_noreg(right)) 2312 right = ra_allocref(as, rref, RID2RSET(RID_ECX)); 2313 else if (right != RID_ECX) 2314 ra_scratch(as, RID2RSET(RID_ECX)); 2315 emit_rr(as, XO_SHIFTcl, REX_64IR(ir, xs), dest); 2316 ra_noweak(as, right); 2317 if (right != RID_ECX) 2318 emit_rr(as, XO_MOV, RID_ECX, right); 2319 } 2320 ra_left(as, dest, ir->op1); 2321 /* 2322 ** Note: avoid using the flags resulting from a shift or rotate! 2323 ** All of them cause a partial flag stall, except for r,1 shifts 2324 ** (but not rotates). And a shift count of 0 leaves the flags unmodified. 2325 */ 2326 } 2327 2328 #define asm_bshl(as, ir) asm_bitshift(as, ir, XOg_SHL, XV_SHLX) 2329 #define asm_bshr(as, ir) asm_bitshift(as, ir, XOg_SHR, XV_SHRX) 2330 #define asm_bsar(as, ir) asm_bitshift(as, ir, XOg_SAR, XV_SARX) 2331 #define asm_brol(as, ir) asm_bitshift(as, ir, XOg_ROL, 0) 2332 #define asm_bror(as, ir) asm_bitshift(as, ir, XOg_ROR, 0) 2333 2334 /* -- Comparisons --------------------------------------------------------- */ 2335 2336 /* Virtual flags for unordered FP comparisons. */ 2337 #define VCC_U 0x1000 /* Unordered. */ 2338 #define VCC_P 0x2000 /* Needs extra CC_P branch. */ 2339 #define VCC_S 0x4000 /* Swap avoids CC_P branch. */ 2340 #define VCC_PS (VCC_P|VCC_S) 2341 2342 /* Map of comparisons to flags. ORDER IR. */ 2343 #define COMPFLAGS(ci, cin, cu, cf) ((ci)+((cu)<<4)+((cin)<<8)+(cf)) 2344 static const uint16_t asm_compmap[IR_ABC+1] = { 2345 /* signed non-eq unsigned flags */ 2346 /* LT */ COMPFLAGS(CC_GE, CC_G, CC_AE, VCC_PS), 2347 /* GE */ COMPFLAGS(CC_L, CC_L, CC_B, 0), 2348 /* LE */ COMPFLAGS(CC_G, CC_G, CC_A, VCC_PS), 2349 /* GT */ COMPFLAGS(CC_LE, CC_L, CC_BE, 0), 2350 /* ULT */ COMPFLAGS(CC_AE, CC_A, CC_AE, VCC_U), 2351 /* UGE */ COMPFLAGS(CC_B, CC_B, CC_B, VCC_U|VCC_PS), 2352 /* ULE */ COMPFLAGS(CC_A, CC_A, CC_A, VCC_U), 2353 /* UGT */ COMPFLAGS(CC_BE, CC_B, CC_BE, VCC_U|VCC_PS), 2354 /* EQ */ COMPFLAGS(CC_NE, CC_NE, CC_NE, VCC_P), 2355 /* NE */ COMPFLAGS(CC_E, CC_E, CC_E, VCC_U|VCC_P), 2356 /* ABC */ COMPFLAGS(CC_BE, CC_B, CC_BE, VCC_U|VCC_PS) /* Same as UGT. */ 2357 }; 2358 2359 /* FP and integer comparisons. */ 2360 static void asm_comp(ASMState *as, IRIns *ir) 2361 { 2362 uint32_t cc = asm_compmap[ir->o]; 2363 if (irt_isnum(ir->t)) { 2364 IRRef lref = ir->op1; 2365 IRRef rref = ir->op2; 2366 Reg left, right; 2367 MCLabel l_around; 2368 /* 2369 ** An extra CC_P branch is required to preserve ordered/unordered 2370 ** semantics for FP comparisons. This can be avoided by swapping 2371 ** the operands and inverting the condition (except for EQ and UNE). 
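** E.g. the exit check for a < b needs jae plus an extra jp after
** ucomisd a, b, but with the operands swapped to ucomisd b, a a single
** jbe suffices, since an unordered result sets both ZF and CF.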
2372 ** So always try to swap if possible. 2373 ** 2374 ** Another option would be to swap operands to achieve better memory 2375 ** operand fusion. But it's unlikely that this outweighs the cost 2376 ** of the extra branches. 2377 */ 2378 if (cc & VCC_S) { /* Swap? */ 2379 IRRef tmp = lref; lref = rref; rref = tmp; 2380 cc ^= (VCC_PS|(5<<4)); /* A <-> B, AE <-> BE, PS <-> none */ 2381 } 2382 left = ra_alloc1(as, lref, RSET_FPR); 2383 l_around = emit_label(as); 2384 asm_guardcc(as, cc >> 4); 2385 if (cc & VCC_P) { /* Extra CC_P branch required? */ 2386 if (!(cc & VCC_U)) { 2387 asm_guardcc(as, CC_P); /* Branch to exit for ordered comparisons. */ 2388 } else if (l_around != as->invmcp) { 2389 emit_sjcc(as, CC_P, l_around); /* Branch around for unordered. */ 2390 } else { 2391 /* Patched to mcloop by asm_loop_fixup. */ 2392 as->loopinv = 2; 2393 if (as->realign) 2394 emit_sjcc(as, CC_P, as->mcp); 2395 else 2396 emit_jcc(as, CC_P, as->mcp); 2397 } 2398 } 2399 right = asm_fuseload(as, rref, rset_exclude(RSET_FPR, left)); 2400 emit_mrm(as, XO_UCOMISD, left, right); 2401 } else { 2402 IRRef lref = ir->op1, rref = ir->op2; 2403 IROp leftop = (IROp)(IR(lref)->o); 2404 Reg r64 = REX_64IR(ir, 0); 2405 int32_t imm = 0; 2406 lua_assert(irt_is64(ir->t) || irt_isint(ir->t) || 2407 irt_isu32(ir->t) || irt_isaddr(ir->t) || irt_isu8(ir->t)); 2408 /* Swap constants (only for ABC) and fusable loads to the right. */ 2409 if (irref_isk(lref) || (!irref_isk(rref) && opisfusableload(leftop))) { 2410 if ((cc & 0xc) == 0xc) cc ^= 0x53; /* L <-> G, LE <-> GE */ 2411 else if ((cc & 0xa) == 0x2) cc ^= 0x55; /* A <-> B, AE <-> BE */ 2412 lref = ir->op2; rref = ir->op1; 2413 } 2414 if (asm_isk32(as, rref, &imm)) { 2415 IRIns *irl = IR(lref); 2416 /* Check whether we can use test ins. Not for unsigned, since CF=0. */ 2417 int usetest = (imm == 0 && (cc & 0xa) != 0x2); 2418 if (usetest && irl->o == IR_BAND && irl+1 == ir && !ra_used(irl)) { 2419 /* Combine comp(BAND(ref, r/imm), 0) into test mrm, r/imm. */ 2420 Reg right, left = RID_NONE; 2421 RegSet allow = RSET_GPR; 2422 if (!asm_isk32(as, irl->op2, &imm)) { 2423 left = ra_alloc1(as, irl->op2, allow); 2424 rset_clear(allow, left); 2425 } else { /* Try to fuse IRT_I8/IRT_U8 loads, too. See below. */ 2426 IRIns *irll = IR(irl->op1); 2427 if (opisfusableload((IROp)irll->o) && 2428 (irt_isi8(irll->t) || irt_isu8(irll->t))) { 2429 IRType1 origt = irll->t; /* Temporarily flip types. */ 2430 irll->t.irt = (irll->t.irt & ~IRT_TYPE) | IRT_INT; 2431 as->curins--; /* Skip to BAND to avoid failing in noconflict(). */ 2432 right = asm_fuseload(as, irl->op1, RSET_GPR); 2433 as->curins++; 2434 irll->t = origt; 2435 if (right != RID_MRM) goto test_nofuse; 2436 /* Fusion succeeded, emit test byte mrm, imm8. */ 2437 asm_guardcc(as, cc); 2438 emit_i8(as, (imm & 0xff)); 2439 emit_mrm(as, XO_GROUP3b, XOg_TEST, RID_MRM); 2440 return; 2441 } 2442 } 2443 as->curins--; /* Skip to BAND to avoid failing in noconflict(). */ 2444 right = asm_fuseloadm(as, irl->op1, allow, r64); 2445 as->curins++; /* Undo the above.
*/ 2446 test_nofuse: 2447 asm_guardcc(as, cc); 2448 if (ra_noreg(left)) { 2449 emit_i32(as, imm); 2450 emit_mrm(as, XO_GROUP3, r64 + XOg_TEST, right); 2451 } else { 2452 emit_mrm(as, XO_TEST, r64 + left, right); 2453 } 2454 } else { 2455 Reg left; 2456 if (opisfusableload((IROp)irl->o) && 2457 ((irt_isu8(irl->t) && checku8(imm)) || 2458 ((irt_isi8(irl->t) || irt_isi16(irl->t)) && checki8(imm)) || 2459 (irt_isu16(irl->t) && checku16(imm) && checki8((int16_t)imm)))) { 2460 /* Only the IRT_INT case is fused by asm_fuseload. 2461 ** The IRT_I8/IRT_U8 loads and some IRT_I16/IRT_U16 loads 2462 ** are handled here. 2463 ** Note that cmp word [mem], imm16 should not be generated, 2464 ** since it has a length-changing prefix. Compares of a word 2465 ** against a sign-extended imm8 are ok, however. 2466 */ 2467 IRType1 origt = irl->t; /* Temporarily flip types. */ 2468 irl->t.irt = (irl->t.irt & ~IRT_TYPE) | IRT_INT; 2469 left = asm_fuseload(as, lref, RSET_GPR); 2470 irl->t = origt; 2471 if (left == RID_MRM) { /* Fusion succeeded? */ 2472 if (irt_isu8(irl->t) || irt_isu16(irl->t)) 2473 cc >>= 4; /* Need unsigned compare. */ 2474 asm_guardcc(as, cc); 2475 emit_i8(as, imm); 2476 emit_mrm(as, (irt_isi8(origt) || irt_isu8(origt)) ? 2477 XO_ARITHib : XO_ARITHiw8, r64 + XOg_CMP, RID_MRM); 2478 return; 2479 } /* Otherwise handle register case as usual. */ 2480 } else { 2481 left = asm_fuseloadm(as, lref, 2482 irt_isu8(ir->t) ? RSET_GPR8 : RSET_GPR, r64); 2483 } 2484 asm_guardcc(as, cc); 2485 if (usetest && left != RID_MRM) { 2486 /* Use test r,r instead of cmp r,0. */ 2487 x86Op xo = XO_TEST; 2488 if (irt_isu8(ir->t)) { 2489 lua_assert(ir->o == IR_EQ || ir->o == IR_NE); 2490 xo = XO_TESTb; 2491 if (!rset_test(RSET_RANGE(RID_EAX, RID_EBX+1), left)) { 2492 if (LJ_64) { 2493 left |= FORCE_REX; 2494 } else { 2495 emit_i32(as, 0xff); 2496 emit_mrm(as, XO_GROUP3, XOg_TEST, left); 2497 return; 2498 } 2499 } 2500 } 2501 emit_rr(as, xo, r64 + left, left); 2502 if (irl+1 == ir) /* Referencing previous ins? */ 2503 as->flagmcp = as->mcp; /* Set flag to drop test r,r if possible. */ 2504 } else { 2505 emit_gmrmi(as, XG_ARITHi(XOg_CMP), r64 + left, imm); 2506 } 2507 } 2508 } else { 2509 Reg left = ra_alloc1(as, lref, RSET_GPR); 2510 Reg right = asm_fuseloadm(as, rref, rset_exclude(RSET_GPR, left), r64); 2511 asm_guardcc(as, cc); 2512 emit_mrm(as, XO_CMP, r64 + left, right); 2513 } 2514 } 2515 } 2516 2517 #define asm_equal(as, ir) asm_comp(as, ir) 2518 2519 #if LJ_32 && LJ_HASFFI 2520 /* 64 bit integer comparisons in 32 bit mode. */ 2521 static void asm_comp_int64(ASMState *as, IRIns *ir) 2522 { 2523 uint32_t cc = asm_compmap[(ir-1)->o]; 2524 RegSet allow = RSET_GPR; 2525 Reg lefthi = RID_NONE, leftlo = RID_NONE; 2526 Reg righthi = RID_NONE, rightlo = RID_NONE; 2527 MCLabel l_around; 2528 x86ModRM mrm; 2529 2530 as->curins--; /* Skip loword ins. Avoids failing in noconflict(), too. */ 2531 2532 /* Allocate/fuse hiword operands. */ 2533 if (irref_isk(ir->op2)) { 2534 lefthi = asm_fuseload(as, ir->op1, allow); 2535 } else { 2536 lefthi = ra_alloc1(as, ir->op1, allow); 2537 rset_clear(allow, lefthi); 2538 righthi = asm_fuseload(as, ir->op2, allow); 2539 if (righthi == RID_MRM) { 2540 if (as->mrm.base != RID_NONE) rset_clear(allow, as->mrm.base); 2541 if (as->mrm.idx != RID_NONE) rset_clear(allow, as->mrm.idx); 2542 } else { 2543 rset_clear(allow, righthi); 2544 } 2545 } 2546 mrm = as->mrm; /* Save state for hiword instruction. */ 2547 2548 /* Allocate/fuse loword operands. 
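** (For e.g. a signed 64 bit a < b guard the combined check runs roughly as:
** cmp a.hi,b.hi; jg ->exit; jne around; cmp a.lo,b.lo; jae ->exit; around:
** where constants or fused loads may stand in for the registers.)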
*/ 2549 if (irref_isk((ir-1)->op2)) { 2550 leftlo = asm_fuseload(as, (ir-1)->op1, allow); 2551 } else { 2552 leftlo = ra_alloc1(as, (ir-1)->op1, allow); 2553 rset_clear(allow, leftlo); 2554 rightlo = asm_fuseload(as, (ir-1)->op2, allow); 2555 } 2556 2557 /* All register allocations must be performed _before_ this point. */ 2558 l_around = emit_label(as); 2559 as->invmcp = as->flagmcp = NULL; /* Cannot use these optimizations. */ 2560 2561 /* Loword comparison and branch. */ 2562 asm_guardcc(as, cc >> 4); /* Always use unsigned compare for loword. */ 2563 if (ra_noreg(rightlo)) { 2564 int32_t imm = IR((ir-1)->op2)->i; 2565 if (imm == 0 && ((cc >> 4) & 0xa) != 0x2 && leftlo != RID_MRM) 2566 emit_rr(as, XO_TEST, leftlo, leftlo); 2567 else 2568 emit_gmrmi(as, XG_ARITHi(XOg_CMP), leftlo, imm); 2569 } else { 2570 emit_mrm(as, XO_CMP, leftlo, rightlo); 2571 } 2572 2573 /* Hiword comparison and branches. */ 2574 if ((cc & 15) != CC_NE) 2575 emit_sjcc(as, CC_NE, l_around); /* Hiword unequal: skip loword compare. */ 2576 if ((cc & 15) != CC_E) 2577 asm_guardcc(as, cc >> 8); /* Hiword compare without equality check. */ 2578 as->mrm = mrm; /* Restore state. */ 2579 if (ra_noreg(righthi)) { 2580 int32_t imm = IR(ir->op2)->i; 2581 if (imm == 0 && (cc & 0xa) != 0x2 && lefthi != RID_MRM) 2582 emit_rr(as, XO_TEST, lefthi, lefthi); 2583 else 2584 emit_gmrmi(as, XG_ARITHi(XOg_CMP), lefthi, imm); 2585 } else { 2586 emit_mrm(as, XO_CMP, lefthi, righthi); 2587 } 2588 } 2589 #endif 2590 2591 /* -- Support for 64 bit ops in 32 bit mode ------------------------------- */ 2592 2593 /* Hiword op of a split 64 bit op. Previous op must be the loword op. */ 2594 static void asm_hiop(ASMState *as, IRIns *ir) 2595 { 2596 #if LJ_32 && LJ_HASFFI 2597 /* HIOP is marked as a store because it needs its own DCE logic. */ 2598 int uselo = ra_used(ir-1), usehi = ra_used(ir); /* Loword/hiword used? */ 2599 if (LJ_UNLIKELY(!(as->flags & JIT_F_OPT_DCE))) uselo = usehi = 1; 2600 if ((ir-1)->o == IR_CONV) { /* Conversions to/from 64 bit. */ 2601 as->curins--; /* Always skip the CONV. */ 2602 if (usehi || uselo) 2603 asm_conv64(as, ir); 2604 return; 2605 } else if ((ir-1)->o <= IR_NE) { /* 64 bit integer comparisons. ORDER IR. */ 2606 asm_comp_int64(as, ir); 2607 return; 2608 } else if ((ir-1)->o == IR_XSTORE) { 2609 if ((ir-1)->r != RID_SINK) 2610 asm_fxstore(as, ir); 2611 return; 2612 } 2613 if (!usehi) return; /* Skip unused hiword op for all remaining ops. */ 2614 switch ((ir-1)->o) { 2615 case IR_ADD: 2616 as->flagmcp = NULL; 2617 as->curins--; 2618 asm_intarith(as, ir, XOg_ADC); 2619 asm_intarith(as, ir-1, XOg_ADD); 2620 break; 2621 case IR_SUB: 2622 as->flagmcp = NULL; 2623 as->curins--; 2624 asm_intarith(as, ir, XOg_SBB); 2625 asm_intarith(as, ir-1, XOg_SUB); 2626 break; 2627 case IR_NEG: { 2628 Reg dest = ra_dest(as, ir, RSET_GPR); 2629 emit_rr(as, XO_GROUP3, XOg_NEG, dest); 2630 emit_i8(as, 0); 2631 emit_rr(as, XO_ARITHi8, XOg_ADC, dest); 2632 ra_left(as, dest, ir->op1); 2633 as->curins--; 2634 asm_neg_not(as, ir-1, XOg_NEG); 2635 break; 2636 } 2637 case IR_CALLN: 2638 case IR_CALLXS: 2639 if (!uselo) 2640 ra_allocref(as, ir->op1, RID2RSET(RID_RETLO)); /* Mark lo op as used. */ 2641 break; 2642 case IR_CNEWI: 2643 /* Nothing to do here. Handled by CNEWI itself. */ 2644 break; 2645 default: lua_assert(0); break; 2646 } 2647 #else 2648 UNUSED(as); UNUSED(ir); lua_assert(0); /* Unused on x64 or without FFI. 
*/ 2649 #endif 2650 } 2651 2652 /* -- Profiling ----------------------------------------------------------- */ 2653 2654 static void asm_prof(ASMState *as, IRIns *ir) 2655 { 2656 UNUSED(ir); 2657 asm_guardcc(as, CC_NE); 2658 emit_i8(as, HOOK_PROFILE); 2659 emit_rma(as, XO_GROUP3b, XOg_TEST, &J2G(as->J)->hookmask); 2660 } 2661 2662 /* -- Stack handling ------------------------------------------------------ */ 2663 2664 /* Check Lua stack size for overflow. Use exit handler as fallback. */ 2665 static void asm_stack_check(ASMState *as, BCReg topslot, 2666 IRIns *irp, RegSet allow, ExitNo exitno) 2667 { 2668 /* Try to get an unused temp. register, otherwise spill/restore eax. */ 2669 Reg pbase = irp ? irp->r : RID_BASE; 2670 Reg r = allow ? rset_pickbot(allow) : RID_EAX; 2671 emit_jcc(as, CC_B, exitstub_addr(as->J, exitno)); 2672 if (allow == RSET_EMPTY) /* Restore temp. register. */ 2673 emit_rmro(as, XO_MOV, r|REX_64, RID_ESP, 0); 2674 else 2675 ra_modified(as, r); 2676 emit_gri(as, XG_ARITHi(XOg_CMP), r|REX_GC64, (int32_t)(8*topslot)); 2677 if (ra_hasreg(pbase) && pbase != r) 2678 emit_rr(as, XO_ARITH(XOg_SUB), r|REX_GC64, pbase); 2679 else 2680 #if LJ_GC64 2681 emit_rmro(as, XO_ARITH(XOg_SUB), r|REX_64, RID_DISPATCH, 2682 (int32_t)dispofs(as, &J2G(as->J)->jit_base)); 2683 #else 2684 emit_rmro(as, XO_ARITH(XOg_SUB), r, RID_NONE, 2685 ptr2addr(&J2G(as->J)->jit_base)); 2686 #endif 2687 emit_rmro(as, XO_MOV, r|REX_GC64, r, offsetof(lua_State, maxstack)); 2688 emit_getgl(as, r, cur_L); 2689 if (allow == RSET_EMPTY) /* Spill temp. register. */ 2690 emit_rmro(as, XO_MOVto, r|REX_64, RID_ESP, 0); 2691 } 2692 2693 /* Restore Lua stack from on-trace state. */ 2694 static void asm_stack_restore(ASMState *as, SnapShot *snap) 2695 { 2696 SnapEntry *map = &as->T->snapmap[snap->mapofs]; 2697 #if !LJ_FR2 || defined(LUA_USE_ASSERT) 2698 SnapEntry *flinks = &as->T->snapmap[snap_nextofs(as->T, snap)-1-LJ_FR2]; 2699 #endif 2700 MSize n, nent = snap->nent; 2701 /* Store the value of all modified slots to the Lua stack. */ 2702 for (n = 0; n < nent; n++) { 2703 SnapEntry sn = map[n]; 2704 BCReg s = snap_slot(sn); 2705 int32_t ofs = 8*((int32_t)s-1-LJ_FR2); 2706 IRRef ref = snap_ref(sn); 2707 IRIns *ir = IR(ref); 2708 if ((sn & SNAP_NORESTORE)) 2709 continue; 2710 if (irt_isnum(ir->t)) { 2711 Reg src = ra_alloc1(as, ref, RSET_FPR); 2712 emit_rmro(as, XO_MOVSDto, src, RID_BASE, ofs); 2713 } else { 2714 lua_assert(irt_ispri(ir->t) || irt_isaddr(ir->t) || 2715 (LJ_DUALNUM && irt_isinteger(ir->t))); 2716 if (!irref_isk(ref)) { 2717 Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, RID_BASE)); 2718 #if LJ_GC64 2719 if (irt_is64(ir->t)) { 2720 /* TODO: 64 bit store + 32 bit load-modify-store is suboptimal. 
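** The 32 bit OR of irt_toitype(ir->t) << 15 at ofs+4 is equivalent to
** OR-ing itype << 47 into the 64 bit slot, i.e. the same tag position the
** type checks elsewhere in this file shift by.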
*/ 2721 emit_u32(as, irt_toitype(ir->t) << 15); 2722 emit_rmro(as, XO_ARITHi, XOg_OR, RID_BASE, ofs+4); 2723 } else if (LJ_DUALNUM && irt_isinteger(ir->t)) { 2724 emit_movmroi(as, RID_BASE, ofs+4, LJ_TISNUM << 15); 2725 } else { 2726 emit_movmroi(as, RID_BASE, ofs+4, (irt_toitype(ir->t)<<15)|0x7fff); 2727 } 2728 #endif 2729 emit_movtomro(as, REX_64IR(ir, src), RID_BASE, ofs); 2730 #if LJ_GC64 2731 } else { 2732 TValue k; 2733 lj_ir_kvalue(as->J->L, &k, ir); 2734 if (tvisnil(&k)) { 2735 emit_i32(as, -1); 2736 emit_rmro(as, XO_MOVmi, REX_64, RID_BASE, ofs); 2737 } else { 2738 emit_movmroi(as, RID_BASE, ofs+4, k.u32.hi); 2739 emit_movmroi(as, RID_BASE, ofs, k.u32.lo); 2740 } 2741 #else 2742 } else if (!irt_ispri(ir->t)) { 2743 emit_movmroi(as, RID_BASE, ofs, ir->i); 2744 #endif 2745 } 2746 if ((sn & (SNAP_CONT|SNAP_FRAME))) { 2747 #if !LJ_FR2 2748 if (s != 0) /* Do not overwrite link to previous frame. */ 2749 emit_movmroi(as, RID_BASE, ofs+4, (int32_t)(*flinks--)); 2750 #endif 2751 #if !LJ_GC64 2752 } else { 2753 if (!(LJ_64 && irt_islightud(ir->t))) 2754 emit_movmroi(as, RID_BASE, ofs+4, irt_toitype(ir->t)); 2755 #endif 2756 } 2757 } 2758 checkmclim(as); 2759 } 2760 lua_assert(map + nent == flinks); 2761 } 2762 2763 /* -- GC handling --------------------------------------------------------- */ 2764 /* Check GC threshold and do one or more GC steps. */ 2765 static void asm_gc_check(ASMState *as) 2766 { 2767 const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_step_jit]; 2768 IRRef args[2]; 2769 MCLabel l_end; 2770 Reg tmp; 2771 ra_evictset(as, RSET_SCRATCH); 2772 l_end = emit_label(as); 2773 /* Exit trace if in GCSatomic or GCSfinalize. Avoids syncing GC objects. */ 2774 asm_guardcc(as, CC_NE); /* Assumes asm_snap_prep() already done. */ 2775 emit_rr(as, XO_TEST, RID_RET, RID_RET); 2776 args[0] = ASMREF_TMP1; /* global_State *g */ 2777 args[1] = ASMREF_TMP2; /* MSize steps */ 2778 asm_gencall(as, ci, args); 2779 tmp = ra_releasetmp(as, ASMREF_TMP1); 2780 #if LJ_GC64 2781 emit_rmro(as, XO_LEA, tmp|REX_64, RID_DISPATCH, GG_DISP2G); 2782 #else 2783 emit_loada(as, tmp, J2G(as->J)); 2784 #endif 2785 emit_loadi(as, ra_releasetmp(as, ASMREF_TMP2), as->gcsteps); 2786 /* Jump around GC step if GC total < GC threshold. */ 2787 emit_sjcc(as, CC_B, l_end); 2788 emit_opgl(as, XO_ARITH(XOg_CMP), tmp|REX_GC64, gc.threshold); 2789 emit_getgl(as, tmp, gc.total); 2790 as->gcsteps = 0; 2791 checkmclim(as); 2792 } 2793 /* -- Loop handling ------------------------------------------------------- */ 2794 2795 /* Fixup the loop branch. */ 2796 static void asm_loop_fixup(ASMState *as) 2797 { 2798 MCode *p = as->mctop; 2799 MCode *target = as->mcp; 2800 if (as->realign) { /* Realigned loops use short jumps. */ 2801 as->realign = NULL; /* Stop another retry. */ 2802 lua_assert(((intptr_t)target & 15) == 0); 2803 if (as->loopinv) { /* Inverted loop branch? */ 2804 p -= 5; 2805 p[0] = XI_JMP; 2806 lua_assert(target - p >= -128); 2807 p[-1] = (MCode)(target - p); /* Patch sjcc. */ 2808 if (as->loopinv == 2) 2809 p[-3] = (MCode)(target - p + 2); /* Patch opt. short jp. */ 2810 } else { 2811 lua_assert(target - p >= -128); 2812 p[-1] = (MCode)(int8_t)(target - p); /* Patch short jmp. */ 2813 p[-2] = XI_JMPs; 2814 } 2815 } else { 2816 MCode *newloop; 2817 p[-5] = XI_JMP; 2818 if (as->loopinv) { /* Inverted loop branch? */ 2819 /* asm_guardcc already inverted the jcc and patched the jmp. */ 2820 p -= 5; 2821 newloop = target+4; 2822 *(int32_t *)(p-4) = (int32_t)(target - p); /* Patch jcc. 
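** (The 6 byte jcc ends at p, so its rel32 is target - p; the optional jp
** ends 6 bytes earlier, hence the +6 below.)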
*/ 2823 if (as->loopinv == 2) { 2824 *(int32_t *)(p-10) = (int32_t)(target - p + 6); /* Patch opt. jp. */ 2825 newloop = target+8; 2826 } 2827 } else { /* Otherwise just patch jmp. */ 2828 *(int32_t *)(p-4) = (int32_t)(target - p); 2829 newloop = target+3; 2830 } 2831 /* Realign small loops and shorten the loop branch. */ 2832 if (newloop >= p - 128) { 2833 as->realign = newloop; /* Force a retry and remember alignment. */ 2834 as->curins = as->stopins; /* Abort asm_trace now. */ 2835 as->T->nins = as->orignins; /* Remove any added renames. */ 2836 } 2837 } 2838 } 2839 2840 /* -- Head of trace ------------------------------------------------------- */ 2841 2842 /* Coalesce BASE register for a root trace. */ 2843 static void asm_head_root_base(ASMState *as) 2844 { 2845 IRIns *ir = IR(REF_BASE); 2846 Reg r = ir->r; 2847 if (ra_hasreg(r)) { 2848 ra_free(as, r); 2849 if (rset_test(as->modset, r) || irt_ismarked(ir->t)) 2850 ir->r = RID_INIT; /* No inheritance for modified BASE register. */ 2851 if (r != RID_BASE) 2852 emit_rr(as, XO_MOV, r|REX_GC64, RID_BASE); 2853 } 2854 } 2855 2856 /* Coalesce or reload BASE register for a side trace. */ 2857 static RegSet asm_head_side_base(ASMState *as, IRIns *irp, RegSet allow) 2858 { 2859 IRIns *ir = IR(REF_BASE); 2860 Reg r = ir->r; 2861 if (ra_hasreg(r)) { 2862 ra_free(as, r); 2863 if (rset_test(as->modset, r) || irt_ismarked(ir->t)) 2864 ir->r = RID_INIT; /* No inheritance for modified BASE register. */ 2865 if (irp->r == r) { 2866 rset_clear(allow, r); /* Mark same BASE register as coalesced. */ 2867 } else if (ra_hasreg(irp->r) && rset_test(as->freeset, irp->r)) { 2868 /* Move from coalesced parent reg. */ 2869 rset_clear(allow, irp->r); 2870 emit_rr(as, XO_MOV, r|REX_GC64, irp->r); 2871 } else { 2872 emit_getgl(as, r, jit_base); /* Otherwise reload BASE. */ 2873 } 2874 } 2875 return allow; 2876 } 2877 2878 /* -- Tail of trace ------------------------------------------------------- */ 2879 2880 /* Fixup the tail code. */ 2881 static void asm_tail_fixup(ASMState *as, TraceNo lnk) 2882 { 2883 /* Note: don't use as->mcp swap + emit_*: emit_op overwrites more bytes. */ 2884 MCode *p = as->mctop; 2885 MCode *target, *q; 2886 int32_t spadj = as->T->spadjust; 2887 if (spadj == 0) { 2888 p -= ((as->flags & JIT_F_LEA_AGU) ? 7 : 6) + (LJ_64 ? 1 : 0); 2889 } else { 2890 MCode *p1; 2891 /* Patch stack adjustment. */ 2892 if (checki8(spadj)) { 2893 p -= 3; 2894 p1 = p-6; 2895 *p1 = (MCode)spadj; 2896 } else { 2897 p1 = p-9; 2898 *(int32_t *)p1 = spadj; 2899 } 2900 if ((as->flags & JIT_F_LEA_AGU)) { 2901 #if LJ_64 2902 p1[-4] = 0x48; 2903 #endif 2904 p1[-3] = (MCode)XI_LEA; 2905 p1[-2] = MODRM(checki8(spadj) ? XM_OFS8 : XM_OFS32, RID_ESP, RID_ESP); 2906 p1[-1] = MODRM(XM_SCALE1, RID_ESP, RID_ESP); 2907 } else { 2908 #if LJ_64 2909 p1[-3] = 0x48; 2910 #endif 2911 p1[-2] = (MCode)(checki8(spadj) ? XI_ARITHi8 : XI_ARITHi); 2912 p1[-1] = MODRM(XM_REG, XOg_ADD, RID_ESP); 2913 } 2914 } 2915 /* Patch exit branch. */ 2916 target = lnk ? traceref(as->J, lnk)->mcode : (MCode *)lj_vm_exit_interp; 2917 *(int32_t *)(p-4) = jmprel(p, target); 2918 p[-5] = XI_JMP; 2919 /* Drop unused mcode tail. Fill with NOPs to make the prefetcher happy. */ 2920 for (q = as->mctop-1; q >= p; q--) 2921 *q = XI_NOP; 2922 as->mctop = p; 2923 } 2924 2925 /* Prepare tail of code. */ 2926 static void asm_tail_prep(ASMState *as) 2927 { 2928 MCode *p = as->mctop; 2929 /* Realign and leave room for backwards loop branch or exit branch. 
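** A near jmp needs 5 bytes and a short jmp 2. The non-loop case below also
** reserves room for the stack adjustment patched in by asm_tail_fixup:
** add esp, imm32 (6 bytes) or lea esp, [esp+imm32] (7 bytes), plus one more
** byte for the REX.W prefix on x64.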
*/ 2930 if (as->realign) { 2931 int i = ((int)(intptr_t)as->realign) & 15; 2932 /* Fill unused mcode tail with NOPs to make the prefetcher happy. */ 2933 while (i-- > 0) 2934 *--p = XI_NOP; 2935 as->mctop = p; 2936 p -= (as->loopinv ? 5 : 2); /* Space for short/near jmp. */ 2937 } else { 2938 p -= 5; /* Space for exit branch (near jmp). */ 2939 } 2940 if (as->loopref) { 2941 as->invmcp = as->mcp = p; 2942 } else { 2943 /* Leave room for ESP adjustment: add esp, imm or lea esp, [esp+imm] */ 2944 as->mcp = p - (((as->flags & JIT_F_LEA_AGU) ? 7 : 6) + (LJ_64 ? 1 : 0)); 2945 as->invmcp = NULL; 2946 } 2947 } 2948 2949 /* -- Trace setup --------------------------------------------------------- */ 2950 2951 /* Ensure there are enough stack slots for call arguments. */ 2952 static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci) 2953 { 2954 IRRef args[CCI_NARGS_MAX*2]; 2955 int nslots; 2956 asm_collectargs(as, ir, ci, args); 2957 nslots = asm_count_call_slots(as, ci, args); 2958 if (nslots > as->evenspill) /* Leave room for args in stack slots. */ 2959 as->evenspill = nslots; 2960 #if LJ_64 2961 return irt_isfp(ir->t) ? REGSP_HINT(RID_FPRET) : REGSP_HINT(RID_RET); 2962 #else 2963 return irt_isfp(ir->t) ? REGSP_INIT : REGSP_HINT(RID_RET); 2964 #endif 2965 } 2966 2967 /* Target-specific setup. */ 2968 static void asm_setup_target(ASMState *as) 2969 { 2970 asm_exitstub_setup(as, as->T->nsnap); 2971 as->mrm.base = 0; 2972 } 2973 2974 /* -- Trace patching ------------------------------------------------------ */ 2975 2976 static const uint8_t map_op1[256] = { 2977 0x92,0x92,0x92,0x92,0x52,0x45,0x51,0x51,0x92,0x92,0x92,0x92,0x52,0x45,0x51,0x20, 2978 0x92,0x92,0x92,0x92,0x52,0x45,0x51,0x51,0x92,0x92,0x92,0x92,0x52,0x45,0x51,0x51, 2979 0x92,0x92,0x92,0x92,0x52,0x45,0x10,0x51,0x92,0x92,0x92,0x92,0x52,0x45,0x10,0x51, 2980 0x92,0x92,0x92,0x92,0x52,0x45,0x10,0x51,0x92,0x92,0x92,0x92,0x52,0x45,0x10,0x51, 2981 #if LJ_64 2982 0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x14,0x14,0x14,0x14,0x14,0x14,0x14,0x14, 2983 #else 2984 0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51, 2985 #endif 2986 0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51, 2987 0x51,0x51,0x92,0x92,0x10,0x10,0x12,0x11,0x45,0x86,0x52,0x93,0x51,0x51,0x51,0x51, 2988 0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52, 2989 0x93,0x86,0x93,0x93,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92, 2990 0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x47,0x51,0x51,0x51,0x51,0x51, 2991 #if LJ_64 2992 0x59,0x59,0x59,0x59,0x51,0x51,0x51,0x51,0x52,0x45,0x51,0x51,0x51,0x51,0x51,0x51, 2993 #else 2994 0x55,0x55,0x55,0x55,0x51,0x51,0x51,0x51,0x52,0x45,0x51,0x51,0x51,0x51,0x51,0x51, 2995 #endif 2996 0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x05,0x05,0x05,0x05,0x05,0x05,0x05,0x05, 2997 0x93,0x93,0x53,0x51,0x70,0x71,0x93,0x86,0x54,0x51,0x53,0x51,0x51,0x52,0x51,0x51, 2998 0x92,0x92,0x92,0x92,0x52,0x52,0x51,0x51,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92, 2999 0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x45,0x45,0x47,0x52,0x51,0x51,0x51,0x51, 3000 0x10,0x51,0x10,0x10,0x51,0x51,0x63,0x66,0x51,0x51,0x51,0x51,0x51,0x51,0x92,0x92 3001 }; 3002 3003 static const uint8_t map_op2[256] = { 3004 0x93,0x93,0x93,0x93,0x52,0x52,0x52,0x52,0x52,0x52,0x51,0x52,0x51,0x93,0x52,0x94, 3005 0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93, 3006 0x53,0x53,0x53,0x53,0x53,0x53,0x53,0x53,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93, 3007 
0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x34,0x51,0x35,0x51,0x51,0x51,0x51,0x51, 3008 0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93, 3009 0x53,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93, 3010 0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93, 3011 0x94,0x54,0x54,0x54,0x93,0x93,0x93,0x52,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93, 3012 0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46, 3013 0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93, 3014 0x52,0x52,0x52,0x93,0x94,0x93,0x51,0x51,0x52,0x52,0x52,0x93,0x94,0x93,0x93,0x93, 3015 0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x94,0x93,0x93,0x93,0x93,0x93, 3016 0x93,0x93,0x94,0x93,0x94,0x94,0x94,0x93,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52, 3017 0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93, 3018 0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93, 3019 0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x52 3020 }; 3021 3022 static uint32_t asm_x86_inslen(const uint8_t* p) 3023 { 3024 uint32_t result = 0; 3025 uint32_t prefixes = 0; 3026 uint32_t x = map_op1[*p]; 3027 for (;;) { 3028 switch (x >> 4) { 3029 case 0: return result + x + (prefixes & 4); 3030 case 1: prefixes |= x; x = map_op1[*++p]; result++; break; 3031 case 2: x = map_op2[*++p]; break; 3032 case 3: p++; goto mrm; 3033 case 4: result -= (prefixes & 2); /* fallthrough */ 3034 case 5: return result + (x & 15); 3035 case 6: /* Group 3. */ 3036 if (p[1] & 0x38) x = 2; 3037 else if ((prefixes & 2) && (x == 0x66)) x = 4; 3038 goto mrm; 3039 case 7: /* VEX c4/c5. */ 3040 if (LJ_32 && p[1] < 0xc0) { 3041 x = 2; 3042 goto mrm; 3043 } 3044 if (x == 0x70) { 3045 x = *++p & 0x1f; 3046 result++; 3047 if (x >= 2) { 3048 p += 2; 3049 result += 2; 3050 goto mrm; 3051 } 3052 } 3053 p++; 3054 result++; 3055 x = map_op2[*++p]; 3056 break; 3057 case 8: result -= (prefixes & 2); /* fallthrough */ 3058 case 9: mrm: /* ModR/M and possibly SIB. */ 3059 result += (x & 15); 3060 x = *++p; 3061 switch (x >> 6) { 3062 case 0: if ((x & 7) == 5) return result + 4; break; 3063 case 1: result++; break; 3064 case 2: result += 4; break; 3065 case 3: return result; 3066 } 3067 if ((x & 7) == 4) { 3068 result++; 3069 if (x < 0x40 && (p[1] & 7) == 5) result += 4; 3070 } 3071 return result; 3072 } 3073 } 3074 } 3075 3076 /* Patch exit jumps of existing machine code to a new target. */ 3077 void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target) 3078 { 3079 MCode *p = T->mcode; 3080 MCode *mcarea = lj_mcode_patch(J, p, 0); 3081 MSize len = T->szmcode; 3082 MCode *px = exitstub_addr(J, exitno) - 6; 3083 MCode *pe = p+len-6; 3084 #if LJ_GC64 3085 uint32_t statei = (uint32_t)(GG_OFS(g.vmstate) - GG_OFS(dispatch)); 3086 #else 3087 uint32_t statei = u32ptr(&J2G(J)->vmstate); 3088 #endif 3089 if (len > 5 && p[len-5] == XI_JMP && p+len-6 + *(int32_t *)(p+len-4) == px) 3090 *(int32_t *)(p+len-4) = jmprel(p+len, target); 3091 /* Do not patch parent exit for a stack check. Skip beyond vmstate update. */ 3092 for (; p < pe; p += asm_x86_inslen(p)) { 3093 intptr_t ofs = LJ_GC64 ? 
(p[0] & 0xf0) == 0x40 : LJ_64; 3094 if (*(uint32_t *)(p+2+ofs) == statei && p[ofs+LJ_GC64-LJ_64] == XI_MOVmi) 3095 break; 3096 } 3097 lua_assert(p < pe); 3098 for (; p < pe; p += asm_x86_inslen(p)) 3099 if ((*(uint16_t *)p & 0xf0ff) == 0x800f && p + *(int32_t *)(p+2) == px) 3100 *(int32_t *)(p+2) = jmprel(p+6, target); 3101 lj_mcode_sync(T->mcode, T->mcode + T->szmcode); 3102 lj_mcode_patch(J, mcarea, 1); 3103 } 3104
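/* Note on the patching loop above: reading two bytes little-endian and
** masking with 0xf0ff matches the 0F 8x opcode of any near jcc; its rel32
** at p+2 is redirected only when it currently points at the exit stub
** (px is biased by -6 to account for the 6 byte instruction length).
*/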