lj_opt_loop.c
/*
** LOOP: Loop Optimizations.
** Copyright (C) 2005-2016 Mike Pall. See Copyright Notice in luajit.h
*/

#define lj_opt_loop_c
#define LUA_CORE

#include "lj_obj.h"

#if LJ_HASJIT

#include "lj_err.h"
#include "lj_buf.h"
#include "lj_ir.h"
#include "lj_jit.h"
#include "lj_iropt.h"
#include "lj_trace.h"
#include "lj_snap.h"
#include "lj_vm.h"

/* Loop optimization:
**
** Traditional Loop-Invariant Code Motion (LICM) splits the instructions
** of a loop into invariant and variant instructions. The invariant
** instructions are hoisted out of the loop and only the variant
** instructions remain inside the loop body.
**
** Unfortunately LICM is mostly useless for compiling dynamic languages.
** The IR has many guards and most of the subsequent instructions are
** control-dependent on them. The first non-hoistable guard would
** effectively prevent hoisting of all subsequent instructions.
**
** That's why we use a special form of unrolling using copy-substitution,
** combined with redundancy elimination:
**
** The recorded instruction stream is re-emitted to the compiler pipeline
** with substituted operands. The substitution table is filled with the
** refs returned by re-emitting each instruction. This can be done
** on-the-fly, because the IR is in strict SSA form, where every ref is
** defined before its use.
**
** This approach generates two code sections, separated by the LOOP
** instruction:
**
** 1. The recorded instructions form a kind of pre-roll for the loop. It
**    contains a mix of invariant and variant instructions and performs
**    exactly one loop iteration (but not necessarily the 1st iteration).
**
** 2. The loop body contains only the variant instructions and performs
**    all remaining loop iterations.
**
** At first sight that looks like a waste of space, because the variant
** instructions are present twice. But the key insight is that the
** pre-roll honors the control-dependencies for *both* the pre-roll itself
** *and* the loop body!
**
** It also means one doesn't have to explicitly model control-dependencies
** (which, BTW, wouldn't help LICM much). And it's much easier to
** integrate sparse snapshotting with this approach.
**
** One of the nicest aspects of this approach is that all of the
** optimizations of the compiler pipeline (FOLD, CSE, FWD, etc.) can be
** reused with only minor restrictions (e.g. one should not fold
** instructions across loop-carried dependencies).
**
** But in general all optimizations can be applied which only need to look
** backwards into the generated instruction stream. At any point in time
** during the copy-substitution process this contains both a static loop
** iteration (the pre-roll) and a dynamic one (from the to-be-copied
** instruction up to the end of the partial loop body).
**
** Since control-dependencies are implicitly kept, CSE also applies to all
** kinds of guards. The major advantage is that all invariant guards can
** be hoisted, too.
**
** Load/store forwarding works across loop iterations, too. This is
** important if loop-carried dependencies are kept in upvalues or tables.
** E.g. 'self.idx = self.idx + 1' deep down in some OO-style method may
** become a forwarded loop-recurrence after inlining.
**
** Since the IR is in SSA form, loop-carried dependencies have to be
** modeled with PHI instructions. The potential candidates for PHIs are
** collected on-the-fly during copy-substitution. After eliminating the
** redundant ones, PHI instructions are emitted *below* the loop body.
**
** Note that this departure from traditional SSA form doesn't change the
** semantics of the PHI instructions themselves. But it greatly simplifies
** on-the-fly generation of the IR and the machine code.
*/
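
/* Editor's note: the standalone sketch below is not part of LuaJIT. It is
** a minimal model of the copy-substitution scheme described above, using a
** made-up three-field IR (ToyIns, toy_emit, slotval etc. are invented for
** the example): every pre-roll instruction is re-emitted with its operands
** mapped through a substitution table, slot loads are forwarded to the
** last value stored in the slot, and a trivial CSE makes invariant
** instructions substitute to themselves. Refs whose substitution differs
** from the original form the variant part, i.e. the PHI candidates.
** Compile it separately, e.g. 'cc -o toy toy.c'.
*/
#if 0
#include <stdio.h>

typedef struct ToyIns { char op; int op1, op2; } ToyIns;  /* Negative operands are constants. */

static ToyIns ir[64];
static int nins;
static int slotval;  /* Last ref stored to the accumulator slot. */

/* Re-emit one instruction through a toy "pipeline": slot loads ('S') are
** forwarded to the last stored value, everything else goes through CSE.
*/
static int toy_emit(char op, int op1, int op2)
{
  int ref;
  if (op == 'S') return slotval;  /* Stand-in for store-to-load forwarding. */
  for (ref = 0; ref < nins; ref++)  /* Stand-in for CSE. */
    if (ir[ref].op == op && ir[ref].op1 == op1 && ir[ref].op2 == op2)
      return ref;
  ir[nins].op = op; ir[nins].op1 = op1; ir[nins].op2 = op2;
  return nins++;
}

int main(void)
{
  int subst[64], invar, i;
  /* Pre-roll: one recorded iteration of 's = s + t[k]'. */
  ir[0].op = 'S'; ir[0].op1 = ir[0].op2 = -1;    /* 0: load accumulator slot. */
  ir[1].op = 'L'; ir[1].op1 = ir[1].op2 = -1;    /* 1: invariant table load t[k]. */
  ir[2].op = '+'; ir[2].op1 = 0; ir[2].op2 = 1;  /* 2: s + t[k], stored back to the slot. */
  nins = 3; invar = nins; slotval = 2;
  /* Copy-substitute the pre-roll to build the loop body. */
  for (i = 0; i < invar; i++) {
    int a = ir[i].op1 < 0 ? ir[i].op1 : subst[ir[i].op1];
    int b = ir[i].op2 < 0 ? ir[i].op2 : subst[ir[i].op2];
    subst[i] = toy_emit(ir[i].op, a, b);
  }
  /* Prints: 0 -> 2 (variant), 1 -> 1 (invariant, hoisted), 2 -> 3 (variant). */
  for (i = 0; i < invar; i++)
    printf("%2d -> %2d  %s\n", i, subst[i],
           subst[i] == i ? "invariant" : "variant");
  return 0;
}
#endif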

/* Some local macros to save typing. Undef'd at the end. */
#define IR(ref)			(&J->cur.ir[(ref)])

/* Pass IR on to next optimization in chain (FOLD). */
#define emitir(ot, a, b)	(lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))

/* Emit raw IR without passing through optimizations. */
#define emitir_raw(ot, a, b)	(lj_ir_set(J, (ot), (a), (b)), lj_ir_emit(J))

/* -- PHI elimination ----------------------------------------------------- */

/* Emit or eliminate collected PHIs. */
static void loop_emit_phi(jit_State *J, IRRef1 *subst, IRRef1 *phi, IRRef nphi,
			  SnapNo onsnap)
{
  int passx = 0;
  IRRef i, j, nslots;
  IRRef invar = J->chain[IR_LOOP];
  /* Pass #1: mark redundant and potentially redundant PHIs. */
  for (i = 0, j = 0; i < nphi; i++) {
    IRRef lref = phi[i];
    IRRef rref = subst[lref];
    if (lref == rref || rref == REF_DROP) {  /* Invariants are redundant. */
      irt_clearphi(IR(lref)->t);
    } else {
      phi[j++] = (IRRef1)lref;
      if (!(IR(rref)->op1 == lref || IR(rref)->op2 == lref)) {
	/* Quick check for simple recurrences failed, need pass2. */
	irt_setmark(IR(lref)->t);
	passx = 1;
      }
    }
  }
  nphi = j;
  /* Pass #2: traverse variant part and clear marks of non-redundant PHIs. */
  if (passx) {
    SnapNo s;
    for (i = J->cur.nins-1; i > invar; i--) {
      IRIns *ir = IR(i);
      if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
      if (!irref_isk(ir->op1)) {
	irt_clearmark(IR(ir->op1)->t);
	if (ir->op1 < invar &&
	    ir->o >= IR_CALLN && ir->o <= IR_CARG) {  /* ORDER IR */
	  ir = IR(ir->op1);
	  while (ir->o == IR_CARG) {
	    if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
	    if (irref_isk(ir->op1)) break;
	    ir = IR(ir->op1);
	    irt_clearmark(ir->t);
	  }
	}
      }
    }
    for (s = J->cur.nsnap-1; s >= onsnap; s--) {
      SnapShot *snap = &J->cur.snap[s];
      SnapEntry *map = &J->cur.snapmap[snap->mapofs];
      MSize n, nent = snap->nent;
      for (n = 0; n < nent; n++) {
	IRRef ref = snap_ref(map[n]);
	if (!irref_isk(ref)) irt_clearmark(IR(ref)->t);
      }
    }
  }
  /* Pass #3: add PHIs for variant slots without a corresponding SLOAD. */
  nslots = J->baseslot+J->maxslot;
  for (i = 1; i < nslots; i++) {
    IRRef ref = tref_ref(J->slot[i]);
    while (!irref_isk(ref) && ref != subst[ref]) {
      IRIns *ir = IR(ref);
      irt_clearmark(ir->t);  /* Unmark potential uses, too. */
      if (irt_isphi(ir->t) || irt_ispri(ir->t))
	break;
      irt_setphi(ir->t);
      if (nphi >= LJ_MAX_PHI)
	lj_trace_err(J, LJ_TRERR_PHIOV);
      phi[nphi++] = (IRRef1)ref;
      ref = subst[ref];
      if (ref > invar)
	break;
    }
  }
  /* Pass #4: propagate non-redundant PHIs. */
  while (passx) {
    passx = 0;
    for (i = 0; i < nphi; i++) {
      IRRef lref = phi[i];
      IRIns *ir = IR(lref);
      if (!irt_ismarked(ir->t)) {  /* Propagate only from unmarked PHIs. */
	IRIns *irr = IR(subst[lref]);
	if (irt_ismarked(irr->t)) {  /* Right ref points to other PHI? */
	  irt_clearmark(irr->t);  /* Mark that PHI as non-redundant. */
	  passx = 1;  /* Retry. */
	}
      }
    }
  }
  /* Pass #5: emit PHI instructions or eliminate PHIs. */
  for (i = 0; i < nphi; i++) {
    IRRef lref = phi[i];
    IRIns *ir = IR(lref);
    if (!irt_ismarked(ir->t)) {  /* Emit PHI if not marked. */
      IRRef rref = subst[lref];
      if (rref > invar)
	irt_setphi(IR(rref)->t);
      emitir_raw(IRT(IR_PHI, irt_type(ir->t)), lref, rref);
    } else {  /* Otherwise eliminate PHI. */
      irt_clearmark(ir->t);
      irt_clearphi(ir->t);
    }
  }
}
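
/* Editor's note (illustration, not original code): continuing the toy
** example near the top of the file, the slot load 0 substituting to the
** pre-roll ADD 2 is what makes ref 2 a PHI candidate in loop_unroll();
** loop_emit_phi() then pairs it with subst[2] = 3 and emits PHI(2, 3)
** below the loop body, meaning "on entry use the pre-roll value 2, on
** each back-edge use the loop-body value 3". A candidate whose left and
** right refs coincide carries no loop-varying value and is dropped in
** pass #1; a candidate whose pre-roll value is referenced neither by the
** variant part, nor by a snapshot taken after the loop, nor by another
** surviving PHI is eliminated by the mark-and-propagate passes #2 and #4.
*/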

/* -- Loop unrolling using copy-substitution ------------------------------ */

/* Copy-substitute snapshot. */
static void loop_subst_snap(jit_State *J, SnapShot *osnap,
			    SnapEntry *loopmap, IRRef1 *subst)
{
  SnapEntry *nmap, *omap = &J->cur.snapmap[osnap->mapofs];
  SnapEntry *nextmap = &J->cur.snapmap[snap_nextofs(&J->cur, osnap)];
  MSize nmapofs;
  MSize on, ln, nn, onent = osnap->nent;
  BCReg nslots = osnap->nslots;
  SnapShot *snap = &J->cur.snap[J->cur.nsnap];
  if (irt_isguard(J->guardemit)) {  /* Guard in between? */
    nmapofs = J->cur.nsnapmap;
    J->cur.nsnap++;  /* Add new snapshot. */
  } else {  /* Otherwise overwrite previous snapshot. */
    snap--;
    nmapofs = snap->mapofs;
  }
  J->guardemit.irt = 0;
  /* Set up new snapshot. */
  snap->mapofs = (uint16_t)nmapofs;
  snap->ref = (IRRef1)J->cur.nins;
  snap->nslots = nslots;
  snap->topslot = osnap->topslot;
  snap->count = 0;
  nmap = &J->cur.snapmap[nmapofs];
  /* Substitute snapshot slots. */
  on = ln = nn = 0;
  while (on < onent) {
    SnapEntry osn = omap[on], lsn = loopmap[ln];
    if (snap_slot(lsn) < snap_slot(osn)) {  /* Copy slot from loop map. */
      nmap[nn++] = lsn;
      ln++;
    } else {  /* Copy substituted slot from snapshot map. */
      if (snap_slot(lsn) == snap_slot(osn)) ln++;  /* Shadowed loop slot. */
      if (!irref_isk(snap_ref(osn)))
	osn = snap_setref(osn, subst[snap_ref(osn)]);
      nmap[nn++] = osn;
      on++;
    }
  }
  while (snap_slot(loopmap[ln]) < nslots)  /* Copy remaining loop slots. */
    nmap[nn++] = loopmap[ln++];
  snap->nent = (uint8_t)nn;
  omap += onent;
  nmap += nn;
  while (omap < nextmap)  /* Copy PC + frame links. */
    *nmap++ = *omap++;
  J->cur.nsnapmap = (uint16_t)(nmap - J->cur.snapmap);
}
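
/* Editor's note (illustration, not original code): the merge above walks
** two slot-sorted entry lists in parallel. Assume, hypothetically, the
** loop snapshot maps slots {1, 3} and the old snapshot being substituted
** maps slots {2, 3}. Slot 1 only exists in the loop map and is copied as
** a fallback; slot 2 comes from the old snapshot with its ref rewritten
** through subst[]; slot 3 appears in both, so the loop entry is skipped
** (shadowed) and the substituted old entry wins. The result again lists
** slots {1, 2, 3} in order, followed by the copied PC and frame links.
*/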

typedef struct LoopState {
  jit_State *J;
  IRRef1 *subst;
  MSize sizesubst;
} LoopState;

/* Unroll loop. */
static void loop_unroll(LoopState *lps)
{
  jit_State *J = lps->J;
  IRRef1 phi[LJ_MAX_PHI];
  uint32_t nphi = 0;
  IRRef1 *subst;
  SnapNo onsnap;
  SnapShot *osnap, *loopsnap;
  SnapEntry *loopmap, *psentinel;
  IRRef ins, invar;

  /* Allocate substitution table.
  ** Only non-constant refs in [REF_BIAS,invar) are valid indexes.
  */
  invar = J->cur.nins;
  lps->sizesubst = invar - REF_BIAS;
  lps->subst = lj_mem_newvec(J->L, lps->sizesubst, IRRef1);
  subst = lps->subst - REF_BIAS;
  subst[REF_BASE] = REF_BASE;

  /* LOOP separates the pre-roll from the loop body. */
  emitir_raw(IRTG(IR_LOOP, IRT_NIL), 0, 0);

  /* Grow snapshot buffer and map for copy-substituted snapshots.
  ** Need up to twice the number of snapshots minus #0 and loop snapshot.
  ** Need up to twice the number of entries plus fallback substitutions
  ** from the loop snapshot entries for each new snapshot.
  ** Caveat: both calls may reallocate J->cur.snap and J->cur.snapmap!
  */
  onsnap = J->cur.nsnap;
  lj_snap_grow_buf(J, 2*onsnap-2);
  lj_snap_grow_map(J, J->cur.nsnapmap*2+(onsnap-2)*J->cur.snap[onsnap-1].nent);

  /* The loop snapshot is used for fallback substitutions. */
  loopsnap = &J->cur.snap[onsnap-1];
  loopmap = &J->cur.snapmap[loopsnap->mapofs];
  /* The PC of snapshot #0 and the loop snapshot must match. */
  psentinel = &loopmap[loopsnap->nent];
  lua_assert(*psentinel == J->cur.snapmap[J->cur.snap[0].nent]);
  *psentinel = SNAP(255, 0, 0);  /* Replace PC with temporary sentinel. */

  /* Start substitution with snapshot #1 (#0 is empty for root traces). */
  osnap = &J->cur.snap[1];

  /* Copy and substitute all recorded instructions and snapshots. */
  for (ins = REF_FIRST; ins < invar; ins++) {
    IRIns *ir;
    IRRef op1, op2;

    if (ins >= osnap->ref)  /* Instruction belongs to next snapshot? */
      loop_subst_snap(J, osnap++, loopmap, subst);  /* Copy-substitute it. */

    /* Substitute instruction operands. */
    ir = IR(ins);
    op1 = ir->op1;
    if (!irref_isk(op1)) op1 = subst[op1];
    op2 = ir->op2;
    if (!irref_isk(op2)) op2 = subst[op2];
    if (irm_kind(lj_ir_mode[ir->o]) == IRM_N &&
	op1 == ir->op1 && op2 == ir->op2) {  /* Regular invariant ins? */
      subst[ins] = (IRRef1)ins;  /* Shortcut. */
    } else {
      /* Re-emit substituted instruction to the FOLD/CSE/etc. pipeline. */
      IRType1 t = ir->t;  /* Get this first, since emitir may invalidate ir. */
      IRRef ref = tref_ref(emitir(ir->ot & ~IRT_ISPHI, op1, op2));
      subst[ins] = (IRRef1)ref;
      if (ref != ins) {
	IRIns *irr = IR(ref);
	if (ref < invar) {  /* Loop-carried dependency? */
	  /* Potential PHI? */
	  if (!irref_isk(ref) && !irt_isphi(irr->t) && !irt_ispri(irr->t)) {
	    irt_setphi(irr->t);
	    if (nphi >= LJ_MAX_PHI)
	      lj_trace_err(J, LJ_TRERR_PHIOV);
	    phi[nphi++] = (IRRef1)ref;
	  }
	  /* Check all loop-carried dependencies for type instability. */
	  if (!irt_sametype(t, irr->t)) {
	    if (irt_isinteger(t) && irt_isinteger(irr->t))
	      continue;
	    else if (irt_isnum(t) && irt_isinteger(irr->t))  /* Fix int->num. */
	      ref = tref_ref(emitir(IRTN(IR_CONV), ref, IRCONV_NUM_INT));
	    else if (irt_isnum(irr->t) && irt_isinteger(t))  /* Fix num->int. */
	      ref = tref_ref(emitir(IRTGI(IR_CONV), ref,
				    IRCONV_INT_NUM|IRCONV_CHECK));
	    else
	      lj_trace_err(J, LJ_TRERR_TYPEINS);
	    subst[ins] = (IRRef1)ref;
	    irr = IR(ref);
	    goto phiconv;
	  }
	} else if (ref != REF_DROP && irr->o == IR_CONV &&
		   ref > invar && irr->op1 < invar) {
	  /* May need an extra PHI for a CONV. */
	  ref = irr->op1;
	  irr = IR(ref);
	phiconv:
	  if (ref < invar && !irref_isk(ref) && !irt_isphi(irr->t)) {
	    irt_setphi(irr->t);
	    if (nphi >= LJ_MAX_PHI)
	      lj_trace_err(J, LJ_TRERR_PHIOV);
	    phi[nphi++] = (IRRef1)ref;
	  }
	}
      }
    }
  }
  if (!irt_isguard(J->guardemit))  /* Drop redundant snapshot. */
    J->cur.nsnapmap = (uint16_t)J->cur.snap[--J->cur.nsnap].mapofs;
  lua_assert(J->cur.nsnapmap <= J->sizesnapmap);
  *psentinel = J->cur.snapmap[J->cur.snap[0].nent];  /* Restore PC. */

  loop_emit_phi(J, subst, phi, nphi, onsnap);
}
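
/* Editor's note (illustration, not original code): the type-instability
** handling above deals with a loop-carried value whose pre-roll type and
** loop-body type disagree. An int/num mismatch is papered over by
** emitting a CONV (widening int->num unconditionally, narrowing num->int
** only with a checked conversion), and the CONV or its pre-roll operand
** is registered as an extra PHI candidate via the phiconv path. Any other
** mismatch (say, a slot that holds false in the first iteration and a
** number afterwards) throws LJ_TRERR_TYPEINS, which lj_opt_loop() below
** turns into "undo the unroll and keep recording".
*/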

/* Undo any partial changes made by the loop optimization. */
static void loop_undo(jit_State *J, IRRef ins, SnapNo nsnap, MSize nsnapmap)
{
  ptrdiff_t i;
  SnapShot *snap = &J->cur.snap[nsnap-1];
  SnapEntry *map = J->cur.snapmap;
  map[snap->mapofs + snap->nent] = map[J->cur.snap[0].nent];  /* Restore PC. */
  J->cur.nsnapmap = (uint16_t)nsnapmap;
  J->cur.nsnap = nsnap;
  J->guardemit.irt = 0;
  lj_ir_rollback(J, ins);
  for (i = 0; i < BPROP_SLOTS; i++) {  /* Remove backprop. cache entries. */
    BPropEntry *bp = &J->bpropcache[i];
    if (bp->val >= ins)
      bp->key = 0;
  }
  for (ins--; ins >= REF_FIRST; ins--) {  /* Remove flags. */
    IRIns *ir = IR(ins);
    irt_clearphi(ir->t);
    irt_clearmark(ir->t);
  }
}

/* Protected callback for loop optimization. */
static TValue *cploop_opt(lua_State *L, lua_CFunction dummy, void *ud)
{
  UNUSED(L); UNUSED(dummy);
  loop_unroll((LoopState *)ud);
  return NULL;
}

/* Loop optimization. */
int lj_opt_loop(jit_State *J)
{
  IRRef nins = J->cur.nins;
  SnapNo nsnap = J->cur.nsnap;
  MSize nsnapmap = J->cur.nsnapmap;
  LoopState lps;
  int errcode;
  lps.J = J;
  lps.subst = NULL;
  lps.sizesubst = 0;
  errcode = lj_vm_cpcall(J->L, NULL, &lps, cploop_opt);
  lj_mem_freevec(J2G(J), lps.subst, lps.sizesubst, IRRef1);
  if (LJ_UNLIKELY(errcode)) {
    lua_State *L = J->L;
    if (errcode == LUA_ERRRUN && tvisnumber(L->top-1)) {  /* Trace error? */
      int32_t e = numberVint(L->top-1);
      switch ((TraceError)e) {
      case LJ_TRERR_TYPEINS:  /* Type instability. */
      case LJ_TRERR_GFAIL:  /* Guard would always fail. */
	/* Unrolling via recording fixes many cases, e.g. a flipped boolean. */
	if (--J->instunroll < 0)  /* But do not unroll forever. */
	  break;
	L->top--;  /* Remove error object. */
	loop_undo(J, nins, nsnap, nsnapmap);
	return 1;  /* Loop optimization failed, continue recording. */
      default:
	break;
      }
    }
    lj_err_throw(L, errcode);  /* Propagate all other errors. */
  }
  return 0;  /* Loop optimization is ok. */
}

#undef IR
#undef emitir
#undef emitir_raw

#endif
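
/* Editor's note: a rough, hedged sketch of how this file is driven. When
** the +loop optimization is enabled, the trace compiler calls
** lj_opt_loop() after recording a trace that links back to itself. A
** return value of 0 means the unrolled IR is ready for the assembler; a
** return value of 1 means the unroll was undone (see loop_undo) and the
** recorder should simply record another iteration, which often settles
** the offending types so that a later unroll attempt succeeds. The exact
** call site lives outside this file (in lj_trace.c in the versions the
** editor checked) and is paraphrased here, not quoted.
*/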