lj_opt_loop.c
/*
** LOOP: Loop Optimizations.
** Copyright (C) 2005-2016 Mike Pall. See Copyright Notice in luajit.h
*/

#define lj_opt_loop_c
#define LUA_CORE

#include "lj_obj.h"

#if LJ_HASJIT

#include "lj_err.h"
#include "lj_buf.h"
#include "lj_ir.h"
#include "lj_jit.h"
#include "lj_iropt.h"
#include "lj_trace.h"
#include "lj_snap.h"
#include "lj_vm.h"

/* Loop optimization:
**
** Traditional Loop-Invariant Code Motion (LICM) splits the instructions
** of a loop into invariant and variant instructions. The invariant
** instructions are hoisted out of the loop and only the variant
** instructions remain inside the loop body.
**
** Unfortunately LICM is mostly useless for compiling dynamic languages.
** The IR has many guards and most of the subsequent instructions are
** control-dependent on them. The first non-hoistable guard would
** effectively prevent hoisting of all subsequent instructions.
**
** That's why we use a special form of unrolling using copy-substitution,
** combined with redundancy elimination:
**
** The recorded instruction stream is re-emitted to the compiler pipeline
** with substituted operands. The substitution table is filled with the
** refs returned by re-emitting each instruction. This can be done
** on-the-fly, because the IR is in strict SSA form, where every ref is
** defined before its use.
**
** This approach generates two code sections, separated by the LOOP
** instruction:
**
** 1. The recorded instructions form a kind of pre-roll for the loop. It
**    contains a mix of invariant and variant instructions and performs
**    exactly one loop iteration (but not necessarily the 1st iteration).
**
** 2. The loop body contains only the variant instructions and performs
**    all remaining loop iterations.
**
** At first sight that looks like a waste of space, because the variant
** instructions are present twice. But the key insight is that the
** pre-roll honors the control-dependencies for *both* the pre-roll itself
** *and* the loop body!
**
** It also means one doesn't have to explicitly model control-dependencies
** (which, BTW, wouldn't help LICM much). And it's much easier to
** integrate sparse snapshotting with this approach.
**
** One of the nicest aspects of this approach is that all of the
** optimizations of the compiler pipeline (FOLD, CSE, FWD, etc.) can be
** reused with only minor restrictions (e.g. one should not fold
** instructions across loop-carried dependencies).
**
** But in general all optimizations can be applied which only need to look
** backwards into the generated instruction stream. At any point in time
** during the copy-substitution process this contains both a static loop
** iteration (the pre-roll) and a dynamic one (from the to-be-copied
** instruction up to the end of the partial loop body).
**
** Since control-dependencies are implicitly kept, CSE also applies to all
** kinds of guards. The major advantage is that all invariant guards can
** be hoisted, too.
**
** Load/store forwarding works across loop iterations, too. This is
** important if loop-carried dependencies are kept in upvalues or tables.
** E.g. 'self.idx = self.idx + 1' deep down in some OO-style method may
** become a forwarded loop-recurrence after inlining.
**
** Since the IR is in SSA form, loop-carried dependencies have to be
** modeled with PHI instructions. The potential candidates for PHIs are
** collected on-the-fly during copy-substitution. After eliminating the
** redundant ones, PHI instructions are emitted *below* the loop body.
**
** Note that this departure from traditional SSA form doesn't change the
** semantics of the PHI instructions themselves. But it greatly simplifies
** on-the-fly generation of the IR and the machine code.
*/
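
/* Editor's note: the standalone sketch below is not part of LuaJIT. It is
** a minimal model of the copy-substitution scheme described above, using a
** made-up three-field IR (ToyIns, toy_emit, slotval etc. are invented for
** the example): every pre-roll instruction is re-emitted with its operands
** mapped through a substitution table, slot loads are forwarded to the
** last value stored in the slot, and a trivial CSE makes invariant
** instructions substitute to themselves. Refs whose substitution differs
** from the original form the variant part, i.e. the PHI candidates.
** Compile it separately, e.g. 'cc -o toy toy.c'.
*/
#if 0
#include <stdio.h>

typedef struct ToyIns { char op; int op1, op2; } ToyIns;  /* Negative operands are constants. */

static ToyIns ir[64];
static int nins;
static int slotval;  /* Last ref stored to the accumulator slot. */

/* Re-emit one instruction through a toy "pipeline": slot loads ('S') are
** forwarded to the last stored value, everything else goes through CSE.
*/
static int toy_emit(char op, int op1, int op2)
{
  int ref;
  if (op == 'S') return slotval;  /* Stand-in for store-to-load forwarding. */
  for (ref = 0; ref < nins; ref++)  /* Stand-in for CSE. */
    if (ir[ref].op == op && ir[ref].op1 == op1 && ir[ref].op2 == op2)
      return ref;
  ir[nins].op = op; ir[nins].op1 = op1; ir[nins].op2 = op2;
  return nins++;
}

int main(void)
{
  int subst[64], invar, i;
  /* Pre-roll: one recorded iteration of 's = s + t[k]'. */
  ir[0].op = 'S'; ir[0].op1 = ir[0].op2 = -1;    /* 0: load accumulator slot. */
  ir[1].op = 'L'; ir[1].op1 = ir[1].op2 = -1;    /* 1: invariant table load t[k]. */
  ir[2].op = '+'; ir[2].op1 = 0; ir[2].op2 = 1;  /* 2: s + t[k], stored back to the slot. */
  nins = 3; invar = nins; slotval = 2;
  /* Copy-substitute the pre-roll to build the loop body. */
  for (i = 0; i < invar; i++) {
    int a = ir[i].op1 < 0 ? ir[i].op1 : subst[ir[i].op1];
    int b = ir[i].op2 < 0 ? ir[i].op2 : subst[ir[i].op2];
    subst[i] = toy_emit(ir[i].op, a, b);
  }
  /* Prints: 0 -> 2 (variant), 1 -> 1 (invariant, hoisted), 2 -> 3 (variant). */
  for (i = 0; i < invar; i++)
    printf("%2d -> %2d  %s\n", i, subst[i],
           subst[i] == i ? "invariant" : "variant");
  return 0;
}
#endif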

/* Some local macros to save typing. Undef'd at the end. */
#define IR(ref)			(&J->cur.ir[(ref)])

/* Pass IR on to next optimization in chain (FOLD). */
#define emitir(ot, a, b)	(lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))

/* Emit raw IR without passing through optimizations. */
#define emitir_raw(ot, a, b)	(lj_ir_set(J, (ot), (a), (b)), lj_ir_emit(J))

/* -- PHI elimination ----------------------------------------------------- */

/* Emit or eliminate collected PHIs. */
static void loop_emit_phi(jit_State *J, IRRef1 *subst, IRRef1 *phi, IRRef nphi,
			  SnapNo onsnap)
{
  int passx = 0;
  IRRef i, j, nslots;
  IRRef invar = J->chain[IR_LOOP];
  /* Pass #1: mark redundant and potentially redundant PHIs. */
  for (i = 0, j = 0; i < nphi; i++) {
    IRRef lref = phi[i];
    IRRef rref = subst[lref];
    if (lref == rref || rref == REF_DROP) {  /* Invariants are redundant. */
      irt_clearphi(IR(lref)->t);
    } else {
      phi[j++] = (IRRef1)lref;
      if (!(IR(rref)->op1 == lref || IR(rref)->op2 == lref)) {
	/* Quick check for simple recurrences failed, need pass2. */
	irt_setmark(IR(lref)->t);
	passx = 1;
      }
    }
  }
  nphi = j;
  /* Pass #2: traverse variant part and clear marks of non-redundant PHIs. */
  if (passx) {
    SnapNo s;
    for (i = J->cur.nins-1; i > invar; i--) {
      IRIns *ir = IR(i);
      if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
      if (!irref_isk(ir->op1)) {
	irt_clearmark(IR(ir->op1)->t);
	if (ir->op1 < invar &&
	    ir->o >= IR_CALLN && ir->o <= IR_CARG) {  /* ORDER IR */
	  ir = IR(ir->op1);
	  while (ir->o == IR_CARG) {
	    if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
	    if (irref_isk(ir->op1)) break;
	    ir = IR(ir->op1);
	    irt_clearmark(ir->t);
	  }
	}
      }
    }
    for (s = J->cur.nsnap-1; s >= onsnap; s--) {
      SnapShot *snap = &J->cur.snap[s];
      SnapEntry *map = &J->cur.snapmap[snap->mapofs];
      MSize n, nent = snap->nent;
      for (n = 0; n < nent; n++) {
	IRRef ref = snap_ref(map[n]);
	if (!irref_isk(ref)) irt_clearmark(IR(ref)->t);
      }
    }
  }
  /* Pass #3: add PHIs for variant slots without a corresponding SLOAD. */
  nslots = J->baseslot+J->maxslot;
  for (i = 1; i < nslots; i++) {
    IRRef ref = tref_ref(J->slot[i]);
    while (!irref_isk(ref) && ref != subst[ref]) {
      IRIns *ir = IR(ref);
      irt_clearmark(ir->t);  /* Unmark potential uses, too. */
      if (irt_isphi(ir->t) || irt_ispri(ir->t))
	break;
      irt_setphi(ir->t);
      if (nphi >= LJ_MAX_PHI)
	lj_trace_err(J, LJ_TRERR_PHIOV);
      phi[nphi++] = (IRRef1)ref;
      ref = subst[ref];
      if (ref > invar)
	break;
    }
  }
  /* Pass #4: propagate non-redundant PHIs. */
  while (passx) {
    passx = 0;
    for (i = 0; i < nphi; i++) {
      IRRef lref = phi[i];
      IRIns *ir = IR(lref);
      if (!irt_ismarked(ir->t)) {  /* Propagate only from unmarked PHIs. */
	IRIns *irr = IR(subst[lref]);
	if (irt_ismarked(irr->t)) {  /* Right ref points to other PHI? */
	  irt_clearmark(irr->t);  /* Mark that PHI as non-redundant. */
	  passx = 1;  /* Retry. */
	}
      }
    }
  }
  /* Pass #5: emit PHI instructions or eliminate PHIs. */
  for (i = 0; i < nphi; i++) {
    IRRef lref = phi[i];
    IRIns *ir = IR(lref);
    if (!irt_ismarked(ir->t)) {  /* Emit PHI if not marked. */
      IRRef rref = subst[lref];
      if (rref > invar)
	irt_setphi(IR(rref)->t);
      emitir_raw(IRT(IR_PHI, irt_type(ir->t)), lref, rref);
    } else {  /* Otherwise eliminate PHI. */
      irt_clearmark(ir->t);
      irt_clearphi(ir->t);
    }
  }
}
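
/* Editor's note (illustration, not original code): continuing the toy
** example near the top of the file, the slot load 0 substituting to the
** pre-roll ADD 2 is what makes ref 2 a PHI candidate in loop_unroll();
** loop_emit_phi() then pairs it with subst[2] = 3 and emits PHI(2, 3)
** below the loop body, meaning "on entry use the pre-roll value 2, on
** each back-edge use the loop-body value 3". A candidate whose left and
** right refs coincide carries no loop-varying value and is dropped in
** pass #1; a candidate whose pre-roll value is referenced neither by the
** variant part, nor by a snapshot taken after the loop, nor by another
** surviving PHI is eliminated by the mark-and-propagate passes #2 and #4.
*/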

/* -- Loop unrolling using copy-substitution ------------------------------ */

/* Copy-substitute snapshot. */
static void loop_subst_snap(jit_State *J, SnapShot *osnap,
			    SnapEntry *loopmap, IRRef1 *subst)
{
  SnapEntry *nmap, *omap = &J->cur.snapmap[osnap->mapofs];
  SnapEntry *nextmap = &J->cur.snapmap[snap_nextofs(&J->cur, osnap)];
  MSize nmapofs;
  MSize on, ln, nn, onent = osnap->nent;
  BCReg nslots = osnap->nslots;
  SnapShot *snap = &J->cur.snap[J->cur.nsnap];
  if (irt_isguard(J->guardemit)) {  /* Guard in between? */
    nmapofs = J->cur.nsnapmap;
    J->cur.nsnap++;  /* Add new snapshot. */
  } else {  /* Otherwise overwrite previous snapshot. */
    snap--;
    nmapofs = snap->mapofs;
  }
  J->guardemit.irt = 0;
  /* Set up new snapshot. */
  snap->mapofs = (uint16_t)nmapofs;
  snap->ref = (IRRef1)J->cur.nins;
  snap->nslots = nslots;
  snap->topslot = osnap->topslot;
  snap->count = 0;
  nmap = &J->cur.snapmap[nmapofs];
  /* Substitute snapshot slots. */
  on = ln = nn = 0;
  while (on < onent) {
    SnapEntry osn = omap[on], lsn = loopmap[ln];
    if (snap_slot(lsn) < snap_slot(osn)) {  /* Copy slot from loop map. */
      nmap[nn++] = lsn;
      ln++;
    } else {  /* Copy substituted slot from snapshot map. */
      if (snap_slot(lsn) == snap_slot(osn)) ln++;  /* Shadowed loop slot. */
      if (!irref_isk(snap_ref(osn)))
	osn = snap_setref(osn, subst[snap_ref(osn)]);
      nmap[nn++] = osn;
      on++;
    }
  }
  while (snap_slot(loopmap[ln]) < nslots)  /* Copy remaining loop slots. */
    nmap[nn++] = loopmap[ln++];
  snap->nent = (uint8_t)nn;
  omap += onent;
  nmap += nn;
  while (omap < nextmap)  /* Copy PC + frame links. */
    *nmap++ = *omap++;
  J->cur.nsnapmap = (uint16_t)(nmap - J->cur.snapmap);
}
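
/* Editor's note (illustration, not original code): the merge above walks
** two slot-sorted entry lists in parallel. Assume, hypothetically, the
** loop snapshot maps slots {1, 3} and the old snapshot being substituted
** maps slots {2, 3}. Slot 1 only exists in the loop map and is copied as
** a fallback; slot 2 comes from the old snapshot with its ref rewritten
** through subst[]; slot 3 appears in both, so the loop entry is skipped
** (shadowed) and the substituted old entry wins. The result again lists
** slots {1, 2, 3} in order, followed by the copied PC and frame links.
*/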

typedef struct LoopState {
  jit_State *J;
  IRRef1 *subst;
  MSize sizesubst;
} LoopState;

/* Unroll loop. */
static void loop_unroll(LoopState *lps)
{
  jit_State *J = lps->J;
  IRRef1 phi[LJ_MAX_PHI];
  uint32_t nphi = 0;
  IRRef1 *subst;
  SnapNo onsnap;
  SnapShot *osnap, *loopsnap;
  SnapEntry *loopmap, *psentinel;
  IRRef ins, invar;

  /* Allocate substitution table.
  ** Only non-constant refs in [REF_BIAS,invar) are valid indexes.
  */
  invar = J->cur.nins;
  lps->sizesubst = invar - REF_BIAS;
  lps->subst = lj_mem_newvec(J->L, lps->sizesubst, IRRef1);
  subst = lps->subst - REF_BIAS;
  subst[REF_BASE] = REF_BASE;

  /* LOOP separates the pre-roll from the loop body. */
  emitir_raw(IRTG(IR_LOOP, IRT_NIL), 0, 0);

  /* Grow snapshot buffer and map for copy-substituted snapshots.
  ** Need up to twice the number of snapshots minus #0 and loop snapshot.
  ** Need up to twice the number of entries plus fallback substitutions
  ** from the loop snapshot entries for each new snapshot.
  ** Caveat: both calls may reallocate J->cur.snap and J->cur.snapmap!
  */
  onsnap = J->cur.nsnap;
  lj_snap_grow_buf(J, 2*onsnap-2);
  lj_snap_grow_map(J, J->cur.nsnapmap*2+(onsnap-2)*J->cur.snap[onsnap-1].nent);

  /* The loop snapshot is used for fallback substitutions. */
  loopsnap = &J->cur.snap[onsnap-1];
  loopmap = &J->cur.snapmap[loopsnap->mapofs];
  /* The PC of snapshot #0 and the loop snapshot must match. */
  psentinel = &loopmap[loopsnap->nent];
  lua_assert(*psentinel == J->cur.snapmap[J->cur.snap[0].nent]);
  *psentinel = SNAP(255, 0, 0);  /* Replace PC with temporary sentinel. */

  /* Start substitution with snapshot #1 (#0 is empty for root traces). */
  osnap = &J->cur.snap[1];

  /* Copy and substitute all recorded instructions and snapshots. */
  for (ins = REF_FIRST; ins < invar; ins++) {
    IRIns *ir;
    IRRef op1, op2;

    if (ins >= osnap->ref)  /* Instruction belongs to next snapshot? */
      loop_subst_snap(J, osnap++, loopmap, subst);  /* Copy-substitute it. */

    /* Substitute instruction operands. */
    ir = IR(ins);
    op1 = ir->op1;
    if (!irref_isk(op1)) op1 = subst[op1];
    op2 = ir->op2;
    if (!irref_isk(op2)) op2 = subst[op2];
    if (irm_kind(lj_ir_mode[ir->o]) == IRM_N &&
	op1 == ir->op1 && op2 == ir->op2) {  /* Regular invariant ins? */
      subst[ins] = (IRRef1)ins;  /* Shortcut. */
    } else {
      /* Re-emit substituted instruction to the FOLD/CSE/etc. pipeline. */
      IRType1 t = ir->t;  /* Get this first, since emitir may invalidate ir. */
      IRRef ref = tref_ref(emitir(ir->ot & ~IRT_ISPHI, op1, op2));
      subst[ins] = (IRRef1)ref;
      if (ref != ins) {
	IRIns *irr = IR(ref);
	if (ref < invar) {  /* Loop-carried dependency? */
	  /* Potential PHI? */
	  if (!irref_isk(ref) && !irt_isphi(irr->t) && !irt_ispri(irr->t)) {
	    irt_setphi(irr->t);
	    if (nphi >= LJ_MAX_PHI)
	      lj_trace_err(J, LJ_TRERR_PHIOV);
	    phi[nphi++] = (IRRef1)ref;
	  }
	  /* Check all loop-carried dependencies for type instability. */
	  if (!irt_sametype(t, irr->t)) {
	    if (irt_isinteger(t) && irt_isinteger(irr->t))
	      continue;
	    else if (irt_isnum(t) && irt_isinteger(irr->t))  /* Fix int->num. */
	      ref = tref_ref(emitir(IRTN(IR_CONV), ref, IRCONV_NUM_INT));
	    else if (irt_isnum(irr->t) && irt_isinteger(t))  /* Fix num->int. */
	      ref = tref_ref(emitir(IRTGI(IR_CONV), ref,
				    IRCONV_INT_NUM|IRCONV_CHECK));
	    else
	      lj_trace_err(J, LJ_TRERR_TYPEINS);
	    subst[ins] = (IRRef1)ref;
	    irr = IR(ref);
	    goto phiconv;
	  }
	} else if (ref != REF_DROP && irr->o == IR_CONV &&
		   ref > invar && irr->op1 < invar) {
	  /* May need an extra PHI for a CONV. */
	  ref = irr->op1;
	  irr = IR(ref);
	phiconv:
	  if (ref < invar && !irref_isk(ref) && !irt_isphi(irr->t)) {
	    irt_setphi(irr->t);
	    if (nphi >= LJ_MAX_PHI)
	      lj_trace_err(J, LJ_TRERR_PHIOV);
	    phi[nphi++] = (IRRef1)ref;
	  }
	}
      }
    }
  }
  if (!irt_isguard(J->guardemit))  /* Drop redundant snapshot. */
    J->cur.nsnapmap = (uint16_t)J->cur.snap[--J->cur.nsnap].mapofs;
  lua_assert(J->cur.nsnapmap <= J->sizesnapmap);
  *psentinel = J->cur.snapmap[J->cur.snap[0].nent];  /* Restore PC. */

  loop_emit_phi(J, subst, phi, nphi, onsnap);
}
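
/* Editor's note (illustration, not original code): the type-instability
** handling above deals with a loop-carried value whose pre-roll type and
** loop-body type disagree. An int/num mismatch is papered over by
** emitting a CONV (widening int->num unconditionally, narrowing num->int
** only with a checked conversion), and the CONV or its pre-roll operand
** is registered as an extra PHI candidate via the phiconv path. Any other
** mismatch (say, a slot that holds false in the first iteration and a
** number afterwards) throws LJ_TRERR_TYPEINS, which lj_opt_loop() below
** turns into "undo the unroll and keep recording".
*/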

/* Undo any partial changes made by the loop optimization. */
static void loop_undo(jit_State *J, IRRef ins, SnapNo nsnap, MSize nsnapmap)
{
  ptrdiff_t i;
  SnapShot *snap = &J->cur.snap[nsnap-1];
  SnapEntry *map = J->cur.snapmap;
  map[snap->mapofs + snap->nent] = map[J->cur.snap[0].nent];  /* Restore PC. */
  J->cur.nsnapmap = (uint16_t)nsnapmap;
  J->cur.nsnap = nsnap;
  J->guardemit.irt = 0;
  lj_ir_rollback(J, ins);
  for (i = 0; i < BPROP_SLOTS; i++) {  /* Remove backprop. cache entries. */
    BPropEntry *bp = &J->bpropcache[i];
    if (bp->val >= ins)
      bp->key = 0;
  }
  for (ins--; ins >= REF_FIRST; ins--) {  /* Remove flags. */
    IRIns *ir = IR(ins);
    irt_clearphi(ir->t);
    irt_clearmark(ir->t);
  }
}

/* Protected callback for loop optimization. */
static TValue *cploop_opt(lua_State *L, lua_CFunction dummy, void *ud)
{
  UNUSED(L); UNUSED(dummy);
  loop_unroll((LoopState *)ud);
  return NULL;
}

/* Loop optimization. */
int lj_opt_loop(jit_State *J)
{
  IRRef nins = J->cur.nins;
  SnapNo nsnap = J->cur.nsnap;
  MSize nsnapmap = J->cur.nsnapmap;
  LoopState lps;
  int errcode;
  lps.J = J;
  lps.subst = NULL;
  lps.sizesubst = 0;
  errcode = lj_vm_cpcall(J->L, NULL, &lps, cploop_opt);
  lj_mem_freevec(J2G(J), lps.subst, lps.sizesubst, IRRef1);
  if (LJ_UNLIKELY(errcode)) {
    lua_State *L = J->L;
    if (errcode == LUA_ERRRUN && tvisnumber(L->top-1)) {  /* Trace error? */
      int32_t e = numberVint(L->top-1);
      switch ((TraceError)e) {
      case LJ_TRERR_TYPEINS:  /* Type instability. */
      case LJ_TRERR_GFAIL:  /* Guard would always fail. */
	/* Unrolling via recording fixes many cases, e.g. a flipped boolean. */
	if (--J->instunroll < 0)  /* But do not unroll forever. */
	  break;
	L->top--;  /* Remove error object. */
	loop_undo(J, nins, nsnap, nsnapmap);
	return 1;  /* Loop optimization failed, continue recording. */
      default:
	break;
      }
    }
    lj_err_throw(L, errcode);  /* Propagate all other errors. */
  }
  return 0;  /* Loop optimization is ok. */
}

#undef IR
#undef emitir
#undef emitir_raw

#endif
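
/* Editor's note: a rough, hedged sketch of how this file is driven. When
** the +loop optimization is enabled, the trace compiler calls
** lj_opt_loop() after recording a trace that links back to itself. A
** return value of 0 means the unrolled IR is ready for the assembler; a
** return value of 1 means the unroll was undone (see loop_undo) and the
** recorder should simply record another iteration, which often settles
** the offending types so that a later unroll attempt succeeds. The exact
** call site lives outside this file (in lj_trace.c in the versions the
** editor checked) and is paraphrased here, not quoted.
*/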