ljx

FORK: LuaJIT with native 5.2 and 5.3 support
git clone https://git.neptards.moe/neptards/ljx.git

lj_opt_fold.c (68719B)


      1 /*
      2 ** FOLD: Constant Folding, Algebraic Simplifications and Reassociation.
      3 ** ABCelim: Array Bounds Check Elimination.
      4 ** CSE: Common-Subexpression Elimination.
      5 ** Copyright (C) 2005-2016 Mike Pall. See Copyright Notice in luajit.h
      6 */
      7 
      8 #define lj_opt_fold_c
      9 #define LUA_CORE
     10 
     11 #include <math.h>
     12 
     13 #include "lj_obj.h"
     14 
     15 #if LJ_HASJIT
     16 
     17 #include "lj_buf.h"
     18 #include "lj_str.h"
     19 #include "lj_tab.h"
     20 #include "lj_ir.h"
     21 #include "lj_jit.h"
     22 #include "lj_ircall.h"
     23 #include "lj_iropt.h"
     24 #include "lj_trace.h"
     25 #if LJ_HASFFI
     26 #include "lj_ctype.h"
     27 #include "lj_carith.h"
     28 #endif
     29 #include "lj_vm.h"
     30 #include "lj_strscan.h"
     31 #include "lj_strfmt.h"
     32 
     33 /* Here's a short description of how the FOLD engine processes instructions:
     34 **
     35 ** The FOLD engine receives a single instruction stored in fins (J->fold.ins).
     36 ** The instruction and its operands are used to select matching fold rules.
     37 ** These are applied iteratively until a fixed point is reached.
     38 **
     39 ** The 8 bit opcode of the instruction itself plus the opcodes of the
     40 ** two instructions referenced by its operands form a 24 bit key
     41 ** 'ins left right' (unused operands -> 0, literals -> lowest 8 bits).
     42 **
     43 ** This key is used for partial matching against the fold rules. The
     44 ** left/right operand fields of the key are successively masked with
     45 ** the 'any' wildcard, from most specific to least specific:
     46 **
     47 **   ins left right
     48 **   ins any  right
     49 **   ins left any
     50 **   ins any  any
     51 **
     52 ** The masked key is used to lookup a matching fold rule in a semi-perfect
     53 ** hash table. If a matching rule is found, the related fold function is run.
     54 ** Multiple rules can share the same fold function. A fold rule may return
     55 ** one of several special values:
     56 **
     57 ** - NEXTFOLD means no folding was applied, because an additional test
     58 **   inside the fold function failed. Matching continues against less
     59 **   specific fold rules. Finally the instruction is passed on to CSE.
     60 **
     61 ** - RETRYFOLD means the instruction was modified in-place. Folding is
     62 **   retried as if this instruction had just been received.
     63 **
     64 ** All other return values are terminal actions -- no further folding is
     65 ** applied:
     66 **
     67 ** - INTFOLD(i) returns a reference to the integer constant i.
     68 **
     69 ** - LEFTFOLD and RIGHTFOLD return the left/right operand reference
     70 **   without emitting an instruction.
     71 **
     72 ** - CSEFOLD and EMITFOLD pass the instruction directly to CSE or emit
     73 **   it without passing through any further optimizations.
     74 **
     75 ** - FAILFOLD, DROPFOLD and CONDFOLD only apply to instructions which have
     76 **   no result (e.g. guarded assertions): FAILFOLD means the guard would
     77 **   always fail, i.e. the current trace is pointless. DROPFOLD means
     78 **   the guard is always true and has been eliminated. CONDFOLD(cond) is a
     79 **   shortcut for cond ? DROPFOLD : FAILFOLD (drop if true, otherwise fail).
     80 **
     81 ** - Any other return value is interpreted as an IRRef or TRef. This
     82 **   can be a reference to an existing or a newly created instruction.
     83 **   Only the least-significant 16 bits (IRRef1) are used to form a TRef
     84 **   which is finally returned to the caller.
     85 **
     86 ** The FOLD engine receives instructions both from the trace recorder and
     87 ** as substituted instructions from LOOP unrolling. This means all types
     88 ** of instructions may end up here, even though the recorder bypasses
     89 ** FOLD in some cases. Thus all loads, stores and allocations must have
     90 ** an any/any rule to avoid being passed on to CSE.
     91 **
     92 ** Carefully read the following requirements before adding or modifying
     93 ** any fold rules:
     94 **
     95 ** Requirement #1: All fold rules must preserve their destination type.
     96 **
     97 ** Consistently use INTFOLD() (KINT result) or lj_ir_knum() (KNUM result).
     98 ** Never use lj_ir_knumint() which can have either a KINT or KNUM result.
     99 **
    100 ** Requirement #2: Fold rules should not create *new* instructions which
    101 ** reference operands *across* PHIs.
    102 **
    103 ** E.g. a RETRYFOLD with 'fins->op1 = fleft->op1' is invalid if the
    104 ** left operand is a PHI. Then fleft->op1 would point across the PHI
    105 ** frontier to an invariant instruction. Adding a PHI for this instruction
    106 ** would be counterproductive. The solution is to add a barrier which
    107 ** prevents folding across PHIs, i.e. 'PHIBARRIER(fleft)' in this case.
    108 ** The only exception is for recurrences with high latencies like
    109 ** repeated int->num->int conversions.
    110 **
    111 ** One could relax this condition a bit if the referenced instruction is
    112 ** a PHI, too. But this often leads to worse code due to excessive
    113 ** register shuffling.
    114 **
    115 ** Note: returning *existing* instructions (e.g. LEFTFOLD) is ok, though.
    116 ** Even returning fleft->op1 would be ok, because a new PHI will be added,
    117 ** if needed. But again, this leads to excessive register shuffling and
    118 ** should be avoided.
    119 **
    120 ** Requirement #3: The set of all fold rules must be monotonic to guarantee
    121 ** termination.
    122 **
    123 ** The goal is optimization, so one primarily wants to add strength-reducing
    124 ** rules. This means eliminating an instruction or replacing an instruction
    125 ** with one or more simpler instructions. Don't add fold rules which point
    126 ** into the other direction.
    127 **
    128 ** Some rules (like commutativity) do not directly reduce the strength of
    129 ** an instruction, but enable other fold rules (e.g. by moving constants
    130 ** to the right operand). These rules must be made unidirectional to avoid
    131 ** cycles.
    132 **
    133 ** Rule of thumb: the trace recorder expands the IR and FOLD shrinks it.
    134 */
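
/* A minimal sketch (illustration only, with hypothetical helper names) of
** the key construction and masked lookup described above. The real rule
** table and dispatcher are generated by buildvm from the LJFOLD() specs;
** fold_hash_sketch() stands in for the semi-perfect hash table lookup.
*/
#if 0
static uint32_t fold_key_sketch(uint32_t ins, uint32_t left, uint32_t right)
{
  return (ins << 16) | (left << 8) | right;  /* 24 bit key 'ins left right'. */
}

static FoldFunc fold_rule_sketch(uint32_t ins, uint32_t left, uint32_t right)
{
  uint32_t any = 0xff;  /* Hypothetical encoding of the 'any' wildcard. */
  uint32_t key[4];
  int i;
  key[0] = fold_key_sketch(ins, left, right);  /* ins left right */
  key[1] = fold_key_sketch(ins, any, right);   /* ins any  right */
  key[2] = fold_key_sketch(ins, left, any);    /* ins left any   */
  key[3] = fold_key_sketch(ins, any, any);     /* ins any  any   */
  for (i = 0; i < 4; i++) {  /* From most specific to least specific. */
    FoldFunc f = fold_hash_sketch(key[i]);
    if (f) return f;
  }
  return NULL;  /* No rule matched: the instruction is passed on to CSE. */
}
#endif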
    135 
    136 /* Some local macros to save typing. Undef'd at the end. */
    137 #define IR(ref)		(&J->cur.ir[(ref)])
    138 #define fins		(&J->fold.ins)
    139 #define fleft		(J->fold.left)
    140 #define fright		(J->fold.right)
    141 #define knumleft	(ir_knum(fleft)->n)
    142 #define knumright	(ir_knum(fright)->n)
    143 
    144 /* Pass IR on to next optimization in chain (FOLD). */
    145 #define emitir(ot, a, b)	(lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))
    146 
    147 /* Fold function type. Fastcall on x86 significantly reduces their size. */
    148 typedef IRRef (LJ_FASTCALL *FoldFunc)(jit_State *J);
    149 
    150 /* Macros for the fold specs, so buildvm can recognize them. */
    151 #define LJFOLD(x)
    152 #define LJFOLDX(x)
    153 #define LJFOLDF(name)	static TRef LJ_FASTCALL fold_##name(jit_State *J)
    154 /* Note: They must be at the start of a line or buildvm ignores them! */
    155 
    156 /* Barrier to prevent using operands across PHIs. */
    157 #define PHIBARRIER(ir)	if (irt_isphi((ir)->t)) return NEXTFOLD
    158 
    159 /* Barrier to prevent folding across a GC step.
    160 ** GC steps can only happen at the head of a trace and at LOOP.
    161 ** And the GC is only driven forward if there's at least one allocation.
    162 */
    163 #define gcstep_barrier(J, ref) \
    164   ((ref) < J->chain[IR_LOOP] && \
    165    (J->chain[IR_SNEW] || J->chain[IR_XSNEW] || \
    166     J->chain[IR_TNEW] || J->chain[IR_TDUP] || \
    167     J->chain[IR_CNEW] || J->chain[IR_CNEWI] || \
    168     J->chain[IR_BUFSTR] || J->chain[IR_TOSTR] || J->chain[IR_CALLA]))
    169 
    170 /* -- Constant folding for FP numbers ------------------------------------- */
    171 
    172 LJFOLD(ADD KNUM KNUM)
    173 LJFOLD(SUB KNUM KNUM)
    174 LJFOLD(MUL KNUM KNUM)
    175 LJFOLD(DIV KNUM KNUM)
    176 LJFOLD(NEG KNUM KNUM)
    177 LJFOLD(ABS KNUM KNUM)
    178 LJFOLD(ATAN2 KNUM KNUM)
    179 LJFOLD(LDEXP KNUM KNUM)
    180 LJFOLD(MIN KNUM KNUM)
    181 LJFOLD(MAX KNUM KNUM)
    182 LJFOLDF(kfold_numarith)
    183 {
    184   lua_Number a = knumleft;
    185   lua_Number b = knumright;
    186   lua_Number y = lj_vm_foldarith(a, b, fins->o - IR_ADD);
    187   return lj_ir_knum(J, y);
    188 }
    189 
    190 LJFOLD(LDEXP KNUM KINT)
    191 LJFOLDF(kfold_ldexp)
    192 {
    193 #if LJ_TARGET_X86ORX64
    194   UNUSED(J);
    195   return NEXTFOLD;
    196 #else
    197   return lj_ir_knum(J, ldexp(knumleft, fright->i));
    198 #endif
    199 }
    200 
    201 LJFOLD(FPMATH KNUM any)
    202 LJFOLDF(kfold_fpmath)
    203 {
    204   lua_Number a = knumleft;
    205   lua_Number y = lj_vm_foldfpm(a, fins->op2);
    206   return lj_ir_knum(J, y);
    207 }
    208 
    209 LJFOLD(POW KNUM KINT)
    210 LJFOLDF(kfold_numpow)
    211 {
    212   lua_Number a = knumleft;
    213   lua_Number b = (lua_Number)fright->i;
    214   lua_Number y = lj_vm_foldarith(a, b, IR_POW - IR_ADD);
    215   return lj_ir_knum(J, y);
    216 }
    217 
    218 /* Must not use kfold_kref for numbers (same ref does not imply equality: NaN ~= NaN). */
    219 LJFOLD(EQ KNUM KNUM)
    220 LJFOLD(NE KNUM KNUM)
    221 LJFOLD(LT KNUM KNUM)
    222 LJFOLD(GE KNUM KNUM)
    223 LJFOLD(LE KNUM KNUM)
    224 LJFOLD(GT KNUM KNUM)
    225 LJFOLD(ULT KNUM KNUM)
    226 LJFOLD(UGE KNUM KNUM)
    227 LJFOLD(ULE KNUM KNUM)
    228 LJFOLD(UGT KNUM KNUM)
    229 LJFOLDF(kfold_numcomp)
    230 {
    231   return CONDFOLD(lj_ir_numcmp(knumleft, knumright, (IROp)fins->o));
    232 }
    233 
    234 /* -- Constant folding for 32 bit integers -------------------------------- */
    235 
    236 static int32_t kfold_intop(int32_t k1, int32_t k2, IROp op)
    237 {
    238   switch (op) {
    239   case IR_ADD: k1 += k2; break;
    240   case IR_SUB: k1 -= k2; break;
    241   case IR_MUL: k1 *= k2; break;
    242   case IR_MOD: k1 = lj_vm_modi(k1, k2); break;
    243   case IR_NEG: k1 = -k1; break;
    244   case IR_BAND: k1 &= k2; break;
    245   case IR_BOR: k1 |= k2; break;
    246   case IR_BXOR: k1 ^= k2; break;
    247   case IR_BSHL: k1 <<= (k2 & 31); break;
    248   case IR_BSHR: k1 = (int32_t)((uint32_t)k1 >> (k2 & 31)); break;
    249   case IR_BSAR: k1 >>= (k2 & 31); break;
    250   case IR_BROL: k1 = (int32_t)lj_rol((uint32_t)k1, (k2 & 31)); break;
    251   case IR_BROR: k1 = (int32_t)lj_ror((uint32_t)k1, (k2 & 31)); break;
    252   case IR_MIN: k1 = k1 < k2 ? k1 : k2; break;
    253   case IR_MAX: k1 = k1 > k2 ? k1 : k2; break;
    254   default: lua_assert(0); break;
    255   }
    256   return k1;
    257 }
    258 
    259 LJFOLD(ADD KINT KINT)
    260 LJFOLD(SUB KINT KINT)
    261 LJFOLD(MUL KINT KINT)
    262 LJFOLD(MOD KINT KINT)
    263 LJFOLD(NEG KINT KINT)
    264 LJFOLD(BAND KINT KINT)
    265 LJFOLD(BOR KINT KINT)
    266 LJFOLD(BXOR KINT KINT)
    267 LJFOLD(BSHL KINT KINT)
    268 LJFOLD(BSHR KINT KINT)
    269 LJFOLD(BSAR KINT KINT)
    270 LJFOLD(BROL KINT KINT)
    271 LJFOLD(BROR KINT KINT)
    272 LJFOLD(MIN KINT KINT)
    273 LJFOLD(MAX KINT KINT)
    274 LJFOLDF(kfold_intarith)
    275 {
    276   return INTFOLD(kfold_intop(fleft->i, fright->i, (IROp)fins->o));
    277 }
    278 
    279 LJFOLD(ADDOV KINT KINT)
    280 LJFOLD(SUBOV KINT KINT)
    281 LJFOLD(MULOV KINT KINT)
    282 LJFOLDF(kfold_intovarith)
    283 {
    284   lua_Number n = lj_vm_foldarith((lua_Number)fleft->i, (lua_Number)fright->i,
    285 				 fins->o - IR_ADDOV);
    286   int32_t k = lj_num2int(n);
    287   if (n != (lua_Number)k)
    288     return FAILFOLD;
    289   return INTFOLD(k);
    290 }
    291 
    292 LJFOLD(BNOT KINT)
    293 LJFOLDF(kfold_bnot)
    294 {
    295   return INTFOLD(~fleft->i);
    296 }
    297 
    298 LJFOLD(BSWAP KINT)
    299 LJFOLDF(kfold_bswap)
    300 {
    301   return INTFOLD((int32_t)lj_bswap((uint32_t)fleft->i));
    302 }
    303 
    304 LJFOLD(LT KINT KINT)
    305 LJFOLD(GE KINT KINT)
    306 LJFOLD(LE KINT KINT)
    307 LJFOLD(GT KINT KINT)
    308 LJFOLD(ULT KINT KINT)
    309 LJFOLD(UGE KINT KINT)
    310 LJFOLD(ULE KINT KINT)
    311 LJFOLD(UGT KINT KINT)
    312 LJFOLD(ABC KINT KINT)
    313 LJFOLDF(kfold_intcomp)
    314 {
    315   int32_t a = fleft->i, b = fright->i;
    316   switch ((IROp)fins->o) {
    317   case IR_LT: return CONDFOLD(a < b);
    318   case IR_GE: return CONDFOLD(a >= b);
    319   case IR_LE: return CONDFOLD(a <= b);
    320   case IR_GT: return CONDFOLD(a > b);
    321   case IR_ULT: return CONDFOLD((uint32_t)a < (uint32_t)b);
    322   case IR_UGE: return CONDFOLD((uint32_t)a >= (uint32_t)b);
    323   case IR_ULE: return CONDFOLD((uint32_t)a <= (uint32_t)b);
    324   case IR_ABC:
    325   case IR_UGT: return CONDFOLD((uint32_t)a > (uint32_t)b);
    326   default: lua_assert(0); return FAILFOLD;
    327   }
    328 }
    329 
    330 LJFOLD(UGE any KINT)
    331 LJFOLDF(kfold_intcomp0)
    332 {
    333   if (fright->i == 0)
    334     return DROPFOLD;
    335   return NEXTFOLD;
    336 }
    337 
    338 /* -- Constant folding for 64 bit integers -------------------------------- */
    339 
    340 static uint64_t kfold_int64arith(uint64_t k1, uint64_t k2, IROp op)
    341 {
    342   switch (op) {
    343 #if LJ_HASFFI
    344   case IR_ADD: k1 += k2; break;
    345   case IR_SUB: k1 -= k2; break;
    346   case IR_MUL: k1 *= k2; break;
    347   case IR_BAND: k1 &= k2; break;
    348   case IR_BOR: k1 |= k2; break;
    349   case IR_BXOR: k1 ^= k2; break;
    350 #endif
    351   default: UNUSED(k2); lua_assert(0); break;
    352   }
    353   return k1;
    354 }
    355 
    356 LJFOLD(ADD KINT64 KINT64)
    357 LJFOLD(SUB KINT64 KINT64)
    358 LJFOLD(MUL KINT64 KINT64)
    359 LJFOLD(BAND KINT64 KINT64)
    360 LJFOLD(BOR KINT64 KINT64)
    361 LJFOLD(BXOR KINT64 KINT64)
    362 LJFOLDF(kfold_int64arith)
    363 {
    364   return INT64FOLD(kfold_int64arith(ir_k64(fleft)->u64,
    365 				    ir_k64(fright)->u64, (IROp)fins->o));
    366 }
    367 
    368 LJFOLD(DIV KINT64 KINT64)
    369 LJFOLD(MOD KINT64 KINT64)
    370 LJFOLD(POW KINT64 KINT64)
    371 LJFOLDF(kfold_int64arith2)
    372 {
    373 #if LJ_HASFFI
    374   uint64_t k1 = ir_k64(fleft)->u64, k2 = ir_k64(fright)->u64;
    375   if (irt_isi64(fins->t)) {
    376     k1 = fins->o == IR_DIV ? lj_carith_divi64((int64_t)k1, (int64_t)k2) :
    377 	 fins->o == IR_MOD ? lj_carith_modi64((int64_t)k1, (int64_t)k2) :
    378 			     lj_carith_powi64((int64_t)k1, (int64_t)k2);
    379   } else {
    380     k1 = fins->o == IR_DIV ? lj_carith_divu64(k1, k2) :
    381 	 fins->o == IR_MOD ? lj_carith_modu64(k1, k2) :
    382 			     lj_carith_powu64(k1, k2);
    383   }
    384   return INT64FOLD(k1);
    385 #else
    386   UNUSED(J); lua_assert(0); return FAILFOLD;
    387 #endif
    388 }
    389 
    390 LJFOLD(BSHL KINT64 KINT)
    391 LJFOLD(BSHR KINT64 KINT)
    392 LJFOLD(BSAR KINT64 KINT)
    393 LJFOLD(BROL KINT64 KINT)
    394 LJFOLD(BROR KINT64 KINT)
    395 LJFOLDF(kfold_int64shift)
    396 {
    397 #if LJ_HASFFI
    398   uint64_t k = ir_k64(fleft)->u64;
    399   int32_t sh = (fright->i & 63);
    400   return INT64FOLD(lj_carith_shift64(k, sh, fins->o - IR_BSHL));
    401 #else
    402   UNUSED(J); lua_assert(0); return FAILFOLD;
    403 #endif
    404 }
    405 
    406 LJFOLD(BNOT KINT64)
    407 LJFOLDF(kfold_bnot64)
    408 {
    409 #if LJ_HASFFI
    410   return INT64FOLD(~ir_k64(fleft)->u64);
    411 #else
    412   UNUSED(J); lua_assert(0); return FAILFOLD;
    413 #endif
    414 }
    415 
    416 LJFOLD(BSWAP KINT64)
    417 LJFOLDF(kfold_bswap64)
    418 {
    419 #if LJ_HASFFI
    420   return INT64FOLD(lj_bswap64(ir_k64(fleft)->u64));
    421 #else
    422   UNUSED(J); lua_assert(0); return FAILFOLD;
    423 #endif
    424 }
    425 
    426 LJFOLD(LT KINT64 KINT64)
    427 LJFOLD(GE KINT64 KINT64)
    428 LJFOLD(LE KINT64 KINT64)
    429 LJFOLD(GT KINT64 KINT64)
    430 LJFOLD(ULT KINT64 KINT64)
    431 LJFOLD(UGE KINT64 KINT64)
    432 LJFOLD(ULE KINT64 KINT64)
    433 LJFOLD(UGT KINT64 KINT64)
    434 LJFOLDF(kfold_int64comp)
    435 {
    436 #if LJ_HASFFI
    437   uint64_t a = ir_k64(fleft)->u64, b = ir_k64(fright)->u64;
    438   switch ((IROp)fins->o) {
    439   case IR_LT: return CONDFOLD(a < b);
    440   case IR_GE: return CONDFOLD(a >= b);
    441   case IR_LE: return CONDFOLD(a <= b);
    442   case IR_GT: return CONDFOLD(a > b);
    443   case IR_ULT: return CONDFOLD((uint64_t)a < (uint64_t)b);
    444   case IR_UGE: return CONDFOLD((uint64_t)a >= (uint64_t)b);
    445   case IR_ULE: return CONDFOLD((uint64_t)a <= (uint64_t)b);
    446   case IR_UGT: return CONDFOLD((uint64_t)a > (uint64_t)b);
    447   default: lua_assert(0); return FAILFOLD;
    448   }
    449 #else
    450   UNUSED(J); lua_assert(0); return FAILFOLD;
    451 #endif
    452 }
    453 
    454 LJFOLD(UGE any KINT64)
    455 LJFOLDF(kfold_int64comp0)
    456 {
    457 #if LJ_HASFFI
    458   if (ir_k64(fright)->u64 == 0)
    459     return DROPFOLD;
    460   return NEXTFOLD;
    461 #else
    462   UNUSED(J); lua_assert(0); return FAILFOLD;
    463 #endif
    464 }
    465 
    466 /* -- Constant folding for strings ---------------------------------------- */
    467 
    468 LJFOLD(SNEW KKPTR KINT)
    469 LJFOLDF(kfold_snew_kptr)
    470 {
    471   GCstr *s = lj_str_new(J->L, (const char *)ir_kptr(fleft), (size_t)fright->i);
    472   return lj_ir_kstr(J, s);
    473 }
    474 
    475 LJFOLD(SNEW any KINT)
    476 LJFOLDF(kfold_snew_empty)
    477 {
    478   if (fright->i == 0)
    479     return lj_ir_kstr(J, &J2G(J)->strempty);
    480   return NEXTFOLD;
    481 }
    482 
    483 LJFOLD(STRREF KGC KINT)
    484 LJFOLDF(kfold_strref)
    485 {
    486   GCstr *str = ir_kstr(fleft);
    487   lua_assert((MSize)fright->i <= str->len);
    488   return lj_ir_kkptr(J, (char *)strdata(str) + fright->i);
    489 }
    490 
    491 LJFOLD(STRREF SNEW any)
    492 LJFOLDF(kfold_strref_snew)
    493 {
    494   PHIBARRIER(fleft);
    495   if (irref_isk(fins->op2) && fright->i == 0) {
    496     return fleft->op1;  /* strref(snew(ptr, len), 0) ==> ptr */
    497   } else {
    498     /* Reassociate: strref(snew(strref(str, a), len), b) ==> strref(str, a+b) */
    499     IRIns *ir = IR(fleft->op1);
    500     if (ir->o == IR_STRREF) {
    501       IRRef1 str = ir->op1;  /* IRIns * is not valid across emitir. */
    502       PHIBARRIER(ir);
    503       fins->op2 = emitir(IRTI(IR_ADD), ir->op2, fins->op2); /* Clobbers fins! */
    504       fins->op1 = str;
    505       fins->ot = IRT(IR_STRREF, IRT_PGC);
    506       return RETRYFOLD;
    507     }
    508   }
    509   return NEXTFOLD;
    510 }
    511 
    512 LJFOLD(CALLN CARG IRCALL_lj_str_cmp)
    513 LJFOLDF(kfold_strcmp)
    514 {
    515   if (irref_isk(fleft->op1) && irref_isk(fleft->op2)) {
    516     GCstr *a = ir_kstr(IR(fleft->op1));
    517     GCstr *b = ir_kstr(IR(fleft->op2));
    518     return INTFOLD(lj_str_cmp(a, b));
    519   }
    520   return NEXTFOLD;
    521 }
    522 
    523 /* -- Constant folding and forwarding for buffers ------------------------- */
    524 
    525 /*
    526 ** Buffer ops perform stores, but their effect is limited to the buffer
    527 ** itself. Also, buffer ops are chained: a use of an op implies a use of
    528 ** all other ops up the chain. Conversely, if an op is unused, all ops
    529 ** up the chain can go unused. This largely eliminates the need to treat
    530 ** them as stores.
    531 **
    532 ** Alas, treating them as normal (IRM_N) ops doesn't work, because they
    533 ** cannot be CSEd in isolation. CSE for IRM_N is implicitly done in LOOP
    534 ** or if FOLD is disabled.
    535 **
    536 ** The compromise is to declare them as loads, emit them like stores and
    537 ** CSE whole chains manually when the BUFSTR is to be emitted. Any chain
    538 ** fragments left over from CSE are eliminated by DCE.
    539 */
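
/* Schematic illustration (hand-written, not actual -jdump output): the
** chain for a two-piece concatenation like "ab"..x might be recorded as
**
**   t1  BUFHDR  tmpbuf  RESET
**   t2  BUFPUT  t1      "ab"
**   t3  BUFPUT  t2      x
**   t4  BUFSTR  t3      t1
**
** A use of t4 implies a use of t3, t2 and t1. Conversely, if t4 is unused,
** the whole chain is dead and DCE eliminates it.
*/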
    540 
    541 /* BUFHDR is emitted like a store, see below. */
    542 
    543 LJFOLD(BUFPUT BUFHDR BUFSTR)
    544 LJFOLDF(bufput_append)
    545 {
    546   /* New buffer, no other buffer op in between and the same buffer? */
    547   if ((J->flags & JIT_F_OPT_FWD) &&
    548       !(fleft->op2 & IRBUFHDR_APPEND) &&
    549       fleft->prev == fright->op2 &&
    550       fleft->op1 == IR(fright->op2)->op1) {
    551     IRRef ref = fins->op1;
    552     IR(ref)->op2 = (fleft->op2 | IRBUFHDR_APPEND);  /* Modify BUFHDR. */
    553     IR(ref)->op1 = fright->op1;
    554     return ref;
    555   }
    556   return EMITFOLD;  /* Always emit, CSE later. */
    557 }
    558 
    559 LJFOLD(BUFPUT any any)
    560 LJFOLDF(bufput_kgc)
    561 {
    562   if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD) && fright->o == IR_KGC) {
    563     GCstr *s2 = ir_kstr(fright);
    564     if (s2->len == 0) {  /* Empty string? */
    565       return LEFTFOLD;
    566     } else {
    567       if (fleft->o == IR_BUFPUT && irref_isk(fleft->op2) &&
    568 	  !irt_isphi(fleft->t)) {  /* Join two constant string puts in a row. */
    569 	GCstr *s1 = ir_kstr(IR(fleft->op2));
    570 	IRRef kref = lj_ir_kstr(J, lj_buf_cat2str(J->L, s1, s2));
    571 	/* lj_ir_kstr() may realloc the IR and invalidate any IRIns *. */
    572 	IR(fins->op1)->op2 = kref;  /* Modify previous BUFPUT. */
    573 	return fins->op1;
    574       }
    575     }
    576   }
    577   return EMITFOLD;  /* Always emit, CSE later. */
    578 }
    579 
    580 LJFOLD(BUFSTR any any)
    581 LJFOLDF(bufstr_kfold_cse)
    582 {
    583   lua_assert(fleft->o == IR_BUFHDR || fleft->o == IR_BUFPUT ||
    584 	     fleft->o == IR_CALLL);
    585   if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD)) {
    586     if (fleft->o == IR_BUFHDR) {  /* No put operations? */
    587       if (!(fleft->op2 & IRBUFHDR_APPEND))  /* Empty buffer? */
    588 	return lj_ir_kstr(J, &J2G(J)->strempty);
    589       fins->op1 = fleft->op1;
    590       fins->op2 = fleft->prev;  /* Relies on checks in bufput_append. */
    591       return CSEFOLD;
    592     } else if (fleft->o == IR_BUFPUT) {
    593       IRIns *irb = IR(fleft->op1);
    594       if (irb->o == IR_BUFHDR && !(irb->op2 & IRBUFHDR_APPEND))
    595 	return fleft->op2;  /* Shortcut for a single put operation. */
    596     }
    597   }
    598   /* Try to CSE the whole chain. */
    599   if (LJ_LIKELY(J->flags & JIT_F_OPT_CSE)) {
    600     IRRef ref = J->chain[IR_BUFSTR];
    601     while (ref) {
    602       IRIns *irs = IR(ref), *ira = fleft, *irb = IR(irs->op1);
    603       while (ira->o == irb->o && ira->op2 == irb->op2) {
    604 	lua_assert(ira->o == IR_BUFHDR || ira->o == IR_BUFPUT ||
    605 		   ira->o == IR_CALLL || ira->o == IR_CARG);
    606 	if (ira->o == IR_BUFHDR && !(ira->op2 & IRBUFHDR_APPEND))
    607 	  return ref;  /* CSE succeeded. */
    608 	if (ira->o == IR_CALLL && ira->op2 == IRCALL_lj_buf_puttab)
    609 	  break;
    610 	ira = IR(ira->op1);
    611 	irb = IR(irb->op1);
    612       }
    613       ref = irs->prev;
    614     }
    615   }
    616   return EMITFOLD;  /* No CSE possible. */
    617 }
    618 
    619 LJFOLD(CALLL CARG IRCALL_lj_buf_putstr_reverse)
    620 LJFOLD(CALLL CARG IRCALL_lj_buf_putstr_upper)
    621 LJFOLD(CALLL CARG IRCALL_lj_buf_putstr_lower)
    622 LJFOLD(CALLL CARG IRCALL_lj_strfmt_putquoted)
    623 LJFOLDF(bufput_kfold_op)
    624 {
    625   if (irref_isk(fleft->op2)) {
    626     const CCallInfo *ci = &lj_ir_callinfo[fins->op2];
    627     SBuf *sb = lj_buf_tmp_(J->L);
    628     sb = ((SBuf * (LJ_FASTCALL *)(SBuf *, GCstr *))ci->func)(sb,
    629 						       ir_kstr(IR(fleft->op2)));
    630     fins->o = IR_BUFPUT;
    631     fins->op1 = fleft->op1;
    632     fins->op2 = lj_ir_kstr(J, lj_buf_tostr(sb));
    633     return RETRYFOLD;
    634   }
    635   return EMITFOLD;  /* Always emit, CSE later. */
    636 }
    637 
    638 LJFOLD(CALLL CARG IRCALL_lj_buf_putstr_rep)
    639 LJFOLDF(bufput_kfold_rep)
    640 {
    641   if (irref_isk(fleft->op2)) {
    642     IRIns *irc = IR(fleft->op1);
    643     if (irref_isk(irc->op2)) {
    644       SBuf *sb = lj_buf_tmp_(J->L);
    645       sb = lj_buf_putstr_rep(sb, ir_kstr(IR(irc->op2)), IR(fleft->op2)->i);
    646       fins->o = IR_BUFPUT;
    647       fins->op1 = irc->op1;
    648       fins->op2 = lj_ir_kstr(J, lj_buf_tostr(sb));
    649       return RETRYFOLD;
    650     }
    651   }
    652   return EMITFOLD;  /* Always emit, CSE later. */
    653 }
    654 
    655 LJFOLD(CALLL CARG IRCALL_lj_strfmt_putfxint)
    656 LJFOLD(CALLL CARG IRCALL_lj_strfmt_putfnum_int)
    657 LJFOLD(CALLL CARG IRCALL_lj_strfmt_putfnum_uint)
    658 LJFOLD(CALLL CARG IRCALL_lj_strfmt_putfnum)
    659 LJFOLD(CALLL CARG IRCALL_lj_strfmt_putfstr)
    660 LJFOLD(CALLL CARG IRCALL_lj_strfmt_putfchar)
    661 LJFOLDF(bufput_kfold_fmt)
    662 {
    663   IRIns *irc = IR(fleft->op1);
    664   lua_assert(irref_isk(irc->op2));  /* SFormat must be const. */
    665   if (irref_isk(fleft->op2)) {
    666     SFormat sf = (SFormat)IR(irc->op2)->i;
    667     IRIns *ira = IR(fleft->op2);
    668     SBuf *sb = lj_buf_tmp_(J->L);
    669     switch (fins->op2) {
    670     case IRCALL_lj_strfmt_putfxint:
    671       sb = lj_strfmt_putfxint(sb, sf, ir_k64(ira)->u64);
    672       break;
    673     case IRCALL_lj_strfmt_putfstr:
    674       sb = lj_strfmt_putfstr(sb, sf, ir_kstr(ira));
    675       break;
    676     case IRCALL_lj_strfmt_putfchar:
    677       sb = lj_strfmt_putfchar(sb, sf, ira->i);
    678       break;
    679     case IRCALL_lj_strfmt_putfnum_int:
    680     case IRCALL_lj_strfmt_putfnum_uint:
    681     case IRCALL_lj_strfmt_putfnum:
    682     default: {
    683       const CCallInfo *ci = &lj_ir_callinfo[fins->op2];
    684       sb = ((SBuf * (*)(SBuf *, SFormat, lua_Number))ci->func)(sb, sf,
    685 							 ir_knum(ira)->n);
    686       break;
    687       }
    688     }
    689     fins->o = IR_BUFPUT;
    690     fins->op1 = irc->op1;
    691     fins->op2 = lj_ir_kstr(J, lj_buf_tostr(sb));
    692     return RETRYFOLD;
    693   }
    694   return EMITFOLD;  /* Always emit, CSE later. */
    695 }
    696 
    697 /* -- Constant folding of pointer arithmetic ------------------------------ */
    698 
    699 LJFOLD(ADD KGC KINT)
    700 LJFOLD(ADD KGC KINT64)
    701 LJFOLDF(kfold_add_kgc)
    702 {
    703   GCobj *o = ir_kgc(fleft);
    704 #if LJ_64
    705   ptrdiff_t ofs = (ptrdiff_t)ir_kint64(fright)->u64;
    706 #else
    707   ptrdiff_t ofs = fright->i;
    708 #endif
    709 #if LJ_HASFFI
    710   if (irt_iscdata(fleft->t)) {
    711     CType *ct = ctype_raw(ctype_ctsG(J2G(J)), gco2cd(o)->ctypeid);
    712     if (ctype_isnum(ct->info) || ctype_isenum(ct->info) ||
    713 	ctype_isptr(ct->info) || ctype_isfunc(ct->info) ||
    714 	ctype_iscomplex(ct->info) || ctype_isvector(ct->info))
    715       return lj_ir_kkptr(J, (char *)o + ofs);
    716   }
    717 #endif
    718   return lj_ir_kptr(J, (char *)o + ofs);
    719 }
    720 
    721 LJFOLD(ADD KPTR KINT)
    722 LJFOLD(ADD KPTR KINT64)
    723 LJFOLD(ADD KKPTR KINT)
    724 LJFOLD(ADD KKPTR KINT64)
    725 LJFOLDF(kfold_add_kptr)
    726 {
    727   void *p = ir_kptr(fleft);
    728 #if LJ_64
    729   ptrdiff_t ofs = (ptrdiff_t)ir_kint64(fright)->u64;
    730 #else
    731   ptrdiff_t ofs = fright->i;
    732 #endif
    733   return lj_ir_kptr_(J, fleft->o, (char *)p + ofs);
    734 }
    735 
    736 LJFOLD(ADD any KGC)
    737 LJFOLD(ADD any KPTR)
    738 LJFOLD(ADD any KKPTR)
    739 LJFOLDF(kfold_add_kright)
    740 {
    741   if (fleft->o == IR_KINT || fleft->o == IR_KINT64) {
    742     IRRef1 tmp = fins->op1; fins->op1 = fins->op2; fins->op2 = tmp;
    743     return RETRYFOLD;
    744   }
    745   return NEXTFOLD;
    746 }
    747 
    748 /* -- Constant folding of conversions ------------------------------------- */
    749 
    750 LJFOLD(TOBIT KNUM KNUM)
    751 LJFOLDF(kfold_tobit)
    752 {
    753   return INTFOLD(lj_num2bit(knumleft));
    754 }
    755 
    756 LJFOLD(CONV KINT IRCONV_NUM_INT)
    757 LJFOLDF(kfold_conv_kint_num)
    758 {
    759   return lj_ir_knum(J, (lua_Number)fleft->i);
    760 }
    761 
    762 LJFOLD(CONV KINT IRCONV_NUM_U32)
    763 LJFOLDF(kfold_conv_kintu32_num)
    764 {
    765   return lj_ir_knum(J, (lua_Number)(uint32_t)fleft->i);
    766 }
    767 
    768 LJFOLD(CONV KINT IRCONV_INT_I8)
    769 LJFOLD(CONV KINT IRCONV_INT_U8)
    770 LJFOLD(CONV KINT IRCONV_INT_I16)
    771 LJFOLD(CONV KINT IRCONV_INT_U16)
    772 LJFOLDF(kfold_conv_kint_ext)
    773 {
    774   int32_t k = fleft->i;
    775   if ((fins->op2 & IRCONV_SRCMASK) == IRT_I8) k = (int8_t)k;
    776   else if ((fins->op2 & IRCONV_SRCMASK) == IRT_U8) k = (uint8_t)k;
    777   else if ((fins->op2 & IRCONV_SRCMASK) == IRT_I16) k = (int16_t)k;
    778   else k = (uint16_t)k;
    779   return INTFOLD(k);
    780 }
    781 
    782 LJFOLD(CONV KINT IRCONV_I64_INT)
    783 LJFOLD(CONV KINT IRCONV_U64_INT)
    784 LJFOLD(CONV KINT IRCONV_I64_U32)
    785 LJFOLD(CONV KINT IRCONV_U64_U32)
    786 LJFOLDF(kfold_conv_kint_i64)
    787 {
    788   if ((fins->op2 & IRCONV_SEXT))
    789     return INT64FOLD((uint64_t)(int64_t)fleft->i);
    790   else
    791     return INT64FOLD((uint64_t)(int64_t)(uint32_t)fleft->i);
    792 }
    793 
    794 LJFOLD(CONV KINT64 IRCONV_NUM_I64)
    795 LJFOLDF(kfold_conv_kint64_num_i64)
    796 {
    797   return lj_ir_knum(J, (lua_Number)(int64_t)ir_kint64(fleft)->u64);
    798 }
    799 
    800 LJFOLD(CONV KINT64 IRCONV_NUM_U64)
    801 LJFOLDF(kfold_conv_kint64_num_u64)
    802 {
    803   return lj_ir_knum(J, (lua_Number)ir_kint64(fleft)->u64);
    804 }
    805 
    806 LJFOLD(CONV KINT64 IRCONV_INT_I64)
    807 LJFOLD(CONV KINT64 IRCONV_U32_I64)
    808 LJFOLDF(kfold_conv_kint64_int_i64)
    809 {
    810   return INTFOLD((int32_t)ir_kint64(fleft)->u64);
    811 }
    812 
    813 LJFOLD(CONV KNUM IRCONV_INT_NUM)
    814 LJFOLDF(kfold_conv_knum_int_num)
    815 {
    816   lua_Number n = knumleft;
    817   int32_t k = lj_num2int(n);
    818   if (irt_isguard(fins->t) && n != (lua_Number)k) {
    819     /* We're about to create a guard which always fails, like CONV +1.5.
    820     ** Some pathological loops cause this during LICM, e.g.:
    821     **   local x,k,t = 0,1.5,{1,[1.5]=2}
    822     **   for i=1,200 do x = x+ t[k]; k = k == 1 and 1.5 or 1 end
    823     **   assert(x == 300)
    824     */
    825     return FAILFOLD;
    826   }
    827   return INTFOLD(k);
    828 }
    829 
    830 LJFOLD(CONV KNUM IRCONV_U32_NUM)
    831 LJFOLDF(kfold_conv_knum_u32_num)
    832 {
    833 #ifdef _MSC_VER
    834   {  /* Workaround for MSVC bug. */
    835     volatile uint32_t u = (uint32_t)knumleft;
    836     return INTFOLD((int32_t)u);
    837   }
    838 #else
    839   return INTFOLD((int32_t)(uint32_t)knumleft);
    840 #endif
    841 }
    842 
    843 LJFOLD(CONV KNUM IRCONV_I64_NUM)
    844 LJFOLDF(kfold_conv_knum_i64_num)
    845 {
    846   return INT64FOLD((uint64_t)(int64_t)knumleft);
    847 }
    848 
    849 LJFOLD(CONV KNUM IRCONV_U64_NUM)
    850 LJFOLDF(kfold_conv_knum_u64_num)
    851 {
    852   return INT64FOLD(lj_num2u64(knumleft));
    853 }
    854 
    855 LJFOLD(TOSTR KNUM any)
    856 LJFOLDF(kfold_tostr_knum)
    857 {
    858   return lj_ir_kstr(J, lj_strfmt_num(J->L, ir_knum(fleft)));
    859 }
    860 
    861 LJFOLD(TOSTR KINT any)
    862 LJFOLDF(kfold_tostr_kint)
    863 {
    864   return lj_ir_kstr(J, fins->op2 == IRTOSTR_INT ?
    865 		       lj_strfmt_int(J->L, fleft->i) :
    866 		       lj_strfmt_char(J->L, fleft->i));
    867 }
    868 
    869 LJFOLD(STRTO KGC)
    870 LJFOLDF(kfold_strto)
    871 {
    872   TValue n;
    873   if (lj_strscan_num(ir_kstr(fleft), &n))
    874     return lj_ir_knum(J, numV(&n));
    875   return FAILFOLD;
    876 }
    877 
    878 /* -- Constant folding of equality checks --------------------------------- */
    879 
    880 /* Don't constant-fold away FLOAD checks against KNULL. */
    881 LJFOLD(EQ FLOAD KNULL)
    882 LJFOLD(NE FLOAD KNULL)
    883 LJFOLDX(lj_opt_cse)
    884 
    885 /* But fold all other KNULL compares, since only KNULL is equal to KNULL. */
    886 LJFOLD(EQ any KNULL)
    887 LJFOLD(NE any KNULL)
    888 LJFOLD(EQ KNULL any)
    889 LJFOLD(NE KNULL any)
    890 LJFOLD(EQ KINT KINT)  /* Constants are unique, so same refs <==> same value. */
    891 LJFOLD(NE KINT KINT)
    892 LJFOLD(EQ KINT64 KINT64)
    893 LJFOLD(NE KINT64 KINT64)
    894 LJFOLD(EQ KGC KGC)
    895 LJFOLD(NE KGC KGC)
    896 LJFOLDF(kfold_kref)
    897 {
    898   return CONDFOLD((fins->op1 == fins->op2) ^ (fins->o == IR_NE));
    899 }
    900 
    901 /* -- Algebraic shortcuts ------------------------------------------------- */
    902 
    903 LJFOLD(FPMATH FPMATH IRFPM_FLOOR)
    904 LJFOLD(FPMATH FPMATH IRFPM_CEIL)
    905 LJFOLD(FPMATH FPMATH IRFPM_TRUNC)
    906 LJFOLDF(shortcut_round)
    907 {
    908   IRFPMathOp op = (IRFPMathOp)fleft->op2;
    909   if (op == IRFPM_FLOOR || op == IRFPM_CEIL || op == IRFPM_TRUNC)
    910     return LEFTFOLD;  /* round(round_left(x)) = round_left(x) */
    911   return NEXTFOLD;
    912 }
    913 
    914 LJFOLD(ABS ABS KNUM)
    915 LJFOLDF(shortcut_left)
    916 {
    917   return LEFTFOLD;  /* f(g(x)) ==> g(x) */
    918 }
    919 
    920 LJFOLD(ABS NEG KNUM)
    921 LJFOLDF(shortcut_dropleft)
    922 {
    923   PHIBARRIER(fleft);
    924   fins->op1 = fleft->op1;  /* abs(neg(x)) ==> abs(x) */
    925   return RETRYFOLD;
    926 }
    927 
    928 /* Note: no safe shortcuts with STRTO and TOSTR ("1e2" ==> +100 ==> "100"). */
    929 LJFOLD(NEG NEG any)
    930 LJFOLD(BNOT BNOT)
    931 LJFOLD(BSWAP BSWAP)
    932 LJFOLDF(shortcut_leftleft)
    933 {
    934   PHIBARRIER(fleft);  /* See above. Fold would be ok, but not beneficial. */
    935   return fleft->op1;  /* f(g(x)) ==> x */
    936 }
    937 
    938 /* -- FP algebraic simplifications ---------------------------------------- */
    939 
    940 /* FP arithmetic is tricky -- there's not much to simplify.
    941 ** Please note the following common pitfalls before sending "improvements":
    942 **   x+0 ==> x  is INVALID for x=-0
    943 **   0-x ==> -x is INVALID for x=+0
    944 **   x*0 ==> 0  is INVALID for x=-0, x=+-Inf or x=NaN
    945 */
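
/* A self-contained demo of the pitfalls above, assuming IEEE 754 doubles
** with the default rounding mode. Not part of the VM; compile separately.
*/
#if 0
#include <stdio.h>

int main(void)
{
  double pz = 0.0, nz = -0.0, inf = 1.0/0.0;
  printf("%g vs %g\n", nz + 0.0, nz);    /* "0 vs -0":  x+0 ==> x fails. */
  printf("%g vs %g\n", 0.0 - pz, -pz);   /* "0 vs -0":  0-x ==> -x fails. */
  printf("%g vs %g\n", inf * 0.0, 0.0);  /* "nan vs 0": x*0 ==> 0 fails. */
  return 0;
}
#endif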
    946 
    947 LJFOLD(ADD NEG any)
    948 LJFOLDF(simplify_numadd_negx)
    949 {
    950   PHIBARRIER(fleft);
    951   fins->o = IR_SUB;  /* (-a) + b ==> b - a */
    952   fins->op1 = fins->op2;
    953   fins->op2 = fleft->op1;
    954   return RETRYFOLD;
    955 }
    956 
    957 LJFOLD(ADD any NEG)
    958 LJFOLDF(simplify_numadd_xneg)
    959 {
    960   PHIBARRIER(fright);
    961   fins->o = IR_SUB;  /* a + (-b) ==> a - b */
    962   fins->op2 = fright->op1;
    963   return RETRYFOLD;
    964 }
    965 
    966 LJFOLD(SUB any KNUM)
    967 LJFOLDF(simplify_numsub_k)
    968 {
    969   lua_Number n = knumright;
    970   if (n == 0.0)  /* x - (+-0) ==> x */
    971     return LEFTFOLD;
    972   return NEXTFOLD;
    973 }
    974 
    975 LJFOLD(SUB NEG KNUM)
    976 LJFOLDF(simplify_numsub_negk)
    977 {
    978   PHIBARRIER(fleft);
    979   fins->op2 = fleft->op1;  /* (-x) - k ==> (-k) - x */
    980   fins->op1 = (IRRef1)lj_ir_knum(J, -knumright);
    981   return RETRYFOLD;
    982 }
    983 
    984 LJFOLD(SUB any NEG)
    985 LJFOLDF(simplify_numsub_xneg)
    986 {
    987   PHIBARRIER(fright);
    988   fins->o = IR_ADD;  /* a - (-b) ==> a + b */
    989   fins->op2 = fright->op1;
    990   return RETRYFOLD;
    991 }
    992 
    993 LJFOLD(MUL any KNUM)
    994 LJFOLD(DIV any KNUM)
    995 LJFOLDF(simplify_nummuldiv_k)
    996 {
    997   lua_Number n = knumright;
    998   if (n == 1.0) {  /* x o 1 ==> x */
    999     return LEFTFOLD;
   1000   } else if (n == -1.0) {  /* x o -1 ==> -x */
   1001     fins->o = IR_NEG;
   1002     fins->op2 = (IRRef1)lj_ir_ksimd(J, LJ_KSIMD_NEG);
   1003     return RETRYFOLD;
   1004   } else if (fins->o == IR_MUL && n == 2.0) {  /* x * 2 ==> x + x */
   1005     fins->o = IR_ADD;
   1006     fins->op2 = fins->op1;
   1007     return RETRYFOLD;
   1008   } else if (fins->o == IR_DIV) {  /* x / 2^k ==> x * 2^-k */
   1009     uint64_t u = ir_knum(fright)->u64;
   1010     uint32_t ex = ((uint32_t)(u >> 52) & 0x7ff);
   1011     if ((u & U64x(000fffff,ffffffff)) == 0 && ex - 1 < 0x7fd) {
   1012       u = (u & ((uint64_t)1 << 63)) | ((uint64_t)(0x7fe - ex) << 52);
   1013       fins->o = IR_MUL;  /* Multiply by exact reciprocal. */
   1014       fins->op2 = lj_ir_knum_u64(J, u);
   1015       return RETRYFOLD;
   1016     }
   1017   }
   1018   return NEXTFOLD;
   1019 }
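
/* Worked example for the reciprocal rewrite above: 4.0 has the bit pattern
** u = 0x4010000000000000 (biased exponent ex = 0x401, zero mantissa), so
** the folded constant gets exponent 0x7fe - 0x401 = 0x3fd, i.e.
** 0x3fd0000000000000 == 0.25, and x / 4.0 becomes x * 0.25. This is exact
** because the reciprocal of a power of two is itself a power of two; the
** ex - 1 < 0x7fd range check keeps only exponents whose reciprocal is
** again a normal power of two.
*/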
   1020 
   1021 LJFOLD(MUL NEG KNUM)
   1022 LJFOLD(DIV NEG KNUM)
   1023 LJFOLDF(simplify_nummuldiv_negk)
   1024 {
   1025   PHIBARRIER(fleft);
   1026   fins->op1 = fleft->op1;  /* (-a) o k ==> a o (-k) */
   1027   fins->op2 = (IRRef1)lj_ir_knum(J, -knumright);
   1028   return RETRYFOLD;
   1029 }
   1030 
   1031 LJFOLD(MUL NEG NEG)
   1032 LJFOLD(DIV NEG NEG)
   1033 LJFOLDF(simplify_nummuldiv_negneg)
   1034 {
   1035   PHIBARRIER(fleft);
   1036   PHIBARRIER(fright);
   1037   fins->op1 = fleft->op1;  /* (-a) o (-b) ==> a o b */
   1038   fins->op2 = fright->op1;
   1039   return RETRYFOLD;
   1040 }
   1041 
   1042 LJFOLD(POW any KINT)
   1043 LJFOLDF(simplify_numpow_xk)
   1044 {
   1045   int32_t k = fright->i;
   1046   TRef ref = fins->op1;
   1047   if (k == 0)  /* x ^ 0 ==> 1 */
   1048     return lj_ir_knum_one(J);  /* Result must be a number, not an int. */
   1049   if (k == 1)  /* x ^ 1 ==> x */
   1050     return LEFTFOLD;
   1051   if ((uint32_t)(k+65536) > 2*65536u)  /* Limit code explosion. */
   1052     return NEXTFOLD;
   1053   if (k < 0) {  /* x ^ (-k) ==> (1/x) ^ k. */
   1054     ref = emitir(IRTN(IR_DIV), lj_ir_knum_one(J), ref);
   1055     k = -k;
   1056   }
   1057   /* Unroll x^k for 1 <= k <= 65536. */
   1058   for (; (k & 1) == 0; k >>= 1)  /* Handle leading zeros. */
   1059     ref = emitir(IRTN(IR_MUL), ref, ref);
   1060   if ((k >>= 1) != 0) {  /* Handle trailing bits. */
   1061     TRef tmp = emitir(IRTN(IR_MUL), ref, ref);
   1062     for (; k != 1; k >>= 1) {
   1063       if (k & 1)
   1064 	ref = emitir(IRTN(IR_MUL), ref, tmp);
   1065       tmp = emitir(IRTN(IR_MUL), tmp, tmp);
   1066     }
   1067     ref = emitir(IRTN(IR_MUL), ref, tmp);
   1068   }
   1069   return ref;
   1070 }
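
/* Worked example for the unrolling above, k = 13 (binary 1101): tmp = x^2,
** then tmp = x^4, ref = x * x^4 = x^5, tmp = x^8, and finally
** ref = x^5 * x^8 = x^13, i.e. five IR_MULs via square-and-multiply
** instead of twelve naive multiplies.
*/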
   1071 
   1072 LJFOLD(POW KNUM any)
   1073 LJFOLDF(simplify_numpow_kx)
   1074 {
   1075   lua_Number n = knumleft;
   1076   if (n == 2.0) {  /* 2.0 ^ i ==> ldexp(1.0, tonum(i)) */
   1077     fins->o = IR_CONV;
   1078 #if LJ_TARGET_X86ORX64
   1079     fins->op1 = fins->op2;
   1080     fins->op2 = IRCONV_NUM_INT;
   1081     fins->op2 = (IRRef1)lj_opt_fold(J);
   1082 #endif
   1083     fins->op1 = (IRRef1)lj_ir_knum_one(J);
   1084     fins->o = IR_LDEXP;
   1085     return RETRYFOLD;
   1086   }
   1087   return NEXTFOLD;
   1088 }
   1089 
   1090 /* -- Simplify conversions ------------------------------------------------ */
   1091 
   1092 LJFOLD(CONV CONV IRCONV_NUM_INT)  /* _NUM */
   1093 LJFOLDF(shortcut_conv_num_int)
   1094 {
   1095   PHIBARRIER(fleft);
   1096   /* Only safe with a guarded conversion to int. */
   1097   if ((fleft->op2 & IRCONV_SRCMASK) == IRT_NUM && irt_isguard(fleft->t))
   1098     return fleft->op1;  /* f(g(x)) ==> x */
   1099   return NEXTFOLD;
   1100 }
   1101 
   1102 LJFOLD(CONV CONV IRCONV_INT_NUM)  /* _INT */
   1103 LJFOLD(CONV CONV IRCONV_U32_NUM)  /* _U32 */
   1104 LJFOLDF(simplify_conv_int_num)
   1105 {
   1106   /* Fold even across PHI to avoid expensive num->int conversions in loop. */
   1107   if ((fleft->op2 & IRCONV_SRCMASK) ==
   1108       ((fins->op2 & IRCONV_DSTMASK) >> IRCONV_DSH))
   1109     return fleft->op1;
   1110   return NEXTFOLD;
   1111 }
   1112 
   1113 LJFOLD(CONV CONV IRCONV_I64_NUM)  /* _INT or _U32 */
   1114 LJFOLD(CONV CONV IRCONV_U64_NUM)  /* _INT or _U32 */
   1115 LJFOLDF(simplify_conv_i64_num)
   1116 {
   1117   PHIBARRIER(fleft);
   1118   if ((fleft->op2 & IRCONV_SRCMASK) == IRT_INT) {
   1119     /* Reduce to a sign-extension. */
   1120     fins->op1 = fleft->op1;
   1121     fins->op2 = ((IRT_I64<<5)|IRT_INT|IRCONV_SEXT);
   1122     return RETRYFOLD;
   1123   } else if ((fleft->op2 & IRCONV_SRCMASK) == IRT_U32) {
   1124 #if LJ_TARGET_X64
   1125     return fleft->op1;
   1126 #else
   1127     /* Reduce to a zero-extension. */
   1128     fins->op1 = fleft->op1;
   1129     fins->op2 = (IRT_I64<<5)|IRT_U32;
   1130     return RETRYFOLD;
   1131 #endif
   1132   }
   1133   return NEXTFOLD;
   1134 }
   1135 
   1136 LJFOLD(CONV CONV IRCONV_INT_I64)  /* _INT or _U32 */
   1137 LJFOLD(CONV CONV IRCONV_INT_U64)  /* _INT or _U32 */
   1138 LJFOLD(CONV CONV IRCONV_U32_I64)  /* _INT or _U32 */
   1139 LJFOLD(CONV CONV IRCONV_U32_U64)  /* _INT or _U32 */
   1140 LJFOLDF(simplify_conv_int_i64)
   1141 {
   1142   int src;
   1143   PHIBARRIER(fleft);
   1144   src = (fleft->op2 & IRCONV_SRCMASK);
   1145   if (src == IRT_INT || src == IRT_U32) {
   1146     if (src == ((fins->op2 & IRCONV_DSTMASK) >> IRCONV_DSH)) {
   1147       return fleft->op1;
   1148     } else {
   1149       fins->op2 = ((fins->op2 & IRCONV_DSTMASK) | src);
   1150       fins->op1 = fleft->op1;
   1151       return RETRYFOLD;
   1152     }
   1153   }
   1154   return NEXTFOLD;
   1155 }
   1156 
   1157 LJFOLD(CONV CONV IRCONV_FLOAT_NUM)  /* _FLOAT */
   1158 LJFOLDF(simplify_conv_flt_num)
   1159 {
   1160   PHIBARRIER(fleft);
   1161   if ((fleft->op2 & IRCONV_SRCMASK) == IRT_FLOAT)
   1162     return fleft->op1;
   1163   return NEXTFOLD;
   1164 }
   1165 
   1166 /* Shortcut TOBIT + IRT_NUM <- IRT_INT/IRT_U32 conversion. */
   1167 LJFOLD(TOBIT CONV KNUM)
   1168 LJFOLDF(simplify_tobit_conv)
   1169 {
   1170   /* Fold even across PHI to avoid expensive num->int conversions in loop. */
   1171   if ((fleft->op2 & IRCONV_SRCMASK) == IRT_INT) {
   1172     lua_assert(irt_isnum(fleft->t));
   1173     return fleft->op1;
   1174   } else if ((fleft->op2 & IRCONV_SRCMASK) == IRT_U32) {
   1175     lua_assert(irt_isnum(fleft->t));
   1176     fins->o = IR_CONV;
   1177     fins->op1 = fleft->op1;
   1178     fins->op2 = (IRT_INT<<5)|IRT_U32;
   1179     return RETRYFOLD;
   1180   }
   1181   return NEXTFOLD;
   1182 }
   1183 
   1184 /* Shortcut floor/ceil/round + IRT_NUM <- IRT_INT/IRT_U32 conversion. */
   1185 LJFOLD(FPMATH CONV IRFPM_FLOOR)
   1186 LJFOLD(FPMATH CONV IRFPM_CEIL)
   1187 LJFOLD(FPMATH CONV IRFPM_TRUNC)
   1188 LJFOLDF(simplify_floor_conv)
   1189 {
   1190   if ((fleft->op2 & IRCONV_SRCMASK) == IRT_INT ||
   1191       (fleft->op2 & IRCONV_SRCMASK) == IRT_U32)
   1192     return LEFTFOLD;
   1193   return NEXTFOLD;
   1194 }
   1195 
   1196 /* Strength reduction of widening. */
   1197 LJFOLD(CONV any IRCONV_I64_INT)
   1198 LJFOLD(CONV any IRCONV_U64_INT)
   1199 LJFOLDF(simplify_conv_sext)
   1200 {
   1201   IRRef ref = fins->op1;
   1202   int64_t ofs = 0;
   1203   if (!(fins->op2 & IRCONV_SEXT))
   1204     return NEXTFOLD;
   1205   PHIBARRIER(fleft);
   1206   if (fleft->o == IR_XLOAD && (irt_isu8(fleft->t) || irt_isu16(fleft->t)))
   1207     goto ok_reduce;
   1208   if (fleft->o == IR_ADD && irref_isk(fleft->op2)) {
   1209     ofs = (int64_t)IR(fleft->op2)->i;
   1210     ref = fleft->op1;
   1211   }
   1212   /* Use scalar evolution analysis results to strength-reduce sign-extension. */
   1213   if (ref == J->scev.idx) {
   1214     IRRef lo = J->scev.dir ? J->scev.start : J->scev.stop;
   1215     lua_assert(irt_isint(J->scev.t));
   1216     if (lo && IR(lo)->i + ofs >= 0) {
   1217     ok_reduce:
   1218 #if LJ_TARGET_X64
   1219       /* Eliminate widening. All 32 bit ops do an implicit zero-extension. */
   1220       return LEFTFOLD;
   1221 #else
   1222       /* Reduce to a (cheaper) zero-extension. */
   1223       fins->op2 &= ~IRCONV_SEXT;
   1224       return RETRYFOLD;
   1225 #endif
   1226     }
   1227   }
   1228   return NEXTFOLD;
   1229 }
   1230 
   1231 /* Strength reduction of narrowing. */
   1232 LJFOLD(CONV ADD IRCONV_INT_I64)
   1233 LJFOLD(CONV SUB IRCONV_INT_I64)
   1234 LJFOLD(CONV MUL IRCONV_INT_I64)
   1235 LJFOLD(CONV ADD IRCONV_INT_U64)
   1236 LJFOLD(CONV SUB IRCONV_INT_U64)
   1237 LJFOLD(CONV MUL IRCONV_INT_U64)
   1238 LJFOLD(CONV ADD IRCONV_U32_I64)
   1239 LJFOLD(CONV SUB IRCONV_U32_I64)
   1240 LJFOLD(CONV MUL IRCONV_U32_I64)
   1241 LJFOLD(CONV ADD IRCONV_U32_U64)
   1242 LJFOLD(CONV SUB IRCONV_U32_U64)
   1243 LJFOLD(CONV MUL IRCONV_U32_U64)
   1244 LJFOLDF(simplify_conv_narrow)
   1245 {
   1246   IROp op = (IROp)fleft->o;
   1247   IRType t = irt_type(fins->t);
   1248   IRRef op1 = fleft->op1, op2 = fleft->op2, mode = fins->op2;
   1249   PHIBARRIER(fleft);
   1250   op1 = emitir(IRTI(IR_CONV), op1, mode);
   1251   op2 = emitir(IRTI(IR_CONV), op2, mode);
   1252   fins->ot = IRT(op, t);
   1253   fins->op1 = op1;
   1254   fins->op2 = op2;
   1255   return RETRYFOLD;
   1256 }
   1257 
   1258 /* Special CSE rule for CONV. */
   1259 LJFOLD(CONV any any)
   1260 LJFOLDF(cse_conv)
   1261 {
   1262   if (LJ_LIKELY(J->flags & JIT_F_OPT_CSE)) {
   1263     IRRef op1 = fins->op1, op2 = (fins->op2 & IRCONV_MODEMASK);
   1264     uint8_t guard = irt_isguard(fins->t);
   1265     IRRef ref = J->chain[IR_CONV];
   1266     while (ref > op1) {
   1267       IRIns *ir = IR(ref);
   1268       /* Commoning with stronger checks is ok. */
   1269       if (ir->op1 == op1 && (ir->op2 & IRCONV_MODEMASK) == op2 &&
   1270 	  irt_isguard(ir->t) >= guard)
   1271 	return ref;
   1272       ref = ir->prev;
   1273     }
   1274   }
   1275   return EMITFOLD;  /* No fallthrough to regular CSE. */
   1276 }
   1277 
   1278 /* FP conversion narrowing. */
   1279 LJFOLD(TOBIT ADD KNUM)
   1280 LJFOLD(TOBIT SUB KNUM)
   1281 LJFOLD(CONV ADD IRCONV_INT_NUM)
   1282 LJFOLD(CONV SUB IRCONV_INT_NUM)
   1283 LJFOLD(CONV ADD IRCONV_I64_NUM)
   1284 LJFOLD(CONV SUB IRCONV_I64_NUM)
   1285 LJFOLDF(narrow_convert)
   1286 {
   1287   PHIBARRIER(fleft);
   1288   /* Narrowing ignores PHIs and repeating it inside the loop is not useful. */
   1289   if (J->chain[IR_LOOP])
   1290     return NEXTFOLD;
   1291   lua_assert(fins->o != IR_CONV || (fins->op2&IRCONV_CONVMASK) != IRCONV_TOBIT);
   1292   return lj_opt_narrow_convert(J);
   1293 }
   1294 
   1295 /* -- Integer algebraic simplifications ----------------------------------- */
   1296 
   1297 LJFOLD(ADD any KINT)
   1298 LJFOLD(ADDOV any KINT)
   1299 LJFOLD(SUBOV any KINT)
   1300 LJFOLDF(simplify_intadd_k)
   1301 {
   1302   if (fright->i == 0)  /* i o 0 ==> i */
   1303     return LEFTFOLD;
   1304   return NEXTFOLD;
   1305 }
   1306 
   1307 LJFOLD(MULOV any KINT)
   1308 LJFOLDF(simplify_intmul_k)
   1309 {
   1310   if (fright->i == 0)  /* i * 0 ==> 0 */
   1311     return RIGHTFOLD;
   1312   if (fright->i == 1)  /* i * 1 ==> i */
   1313     return LEFTFOLD;
   1314   if (fright->i == 2) {  /* i * 2 ==> i + i */
   1315     fins->o = IR_ADDOV;
   1316     fins->op2 = fins->op1;
   1317     return RETRYFOLD;
   1318   }
   1319   return NEXTFOLD;
   1320 }
   1321 
   1322 LJFOLD(SUB any KINT)
   1323 LJFOLDF(simplify_intsub_k)
   1324 {
   1325   if (fright->i == 0)  /* i - 0 ==> i */
   1326     return LEFTFOLD;
   1327   fins->o = IR_ADD;  /* i - k ==> i + (-k) */
   1328   fins->op2 = (IRRef1)lj_ir_kint(J, -fright->i);  /* Overflow for -2^31 ok. */
   1329   return RETRYFOLD;
   1330 }
   1331 
   1332 LJFOLD(SUB KINT any)
   1333 LJFOLD(SUB KINT64 any)
   1334 LJFOLDF(simplify_intsub_kleft)
   1335 {
   1336   if (fleft->o == IR_KINT ? (fleft->i == 0) : (ir_kint64(fleft)->u64 == 0)) {
   1337     fins->o = IR_NEG;  /* 0 - i ==> -i */
   1338     fins->op1 = fins->op2;
   1339     return RETRYFOLD;
   1340   }
   1341   return NEXTFOLD;
   1342 }
   1343 
   1344 LJFOLD(ADD any KINT64)
   1345 LJFOLDF(simplify_intadd_k64)
   1346 {
   1347   if (ir_kint64(fright)->u64 == 0)  /* i + 0 ==> i */
   1348     return LEFTFOLD;
   1349   return NEXTFOLD;
   1350 }
   1351 
   1352 LJFOLD(SUB any KINT64)
   1353 LJFOLDF(simplify_intsub_k64)
   1354 {
   1355   uint64_t k = ir_kint64(fright)->u64;
   1356   if (k == 0)  /* i - 0 ==> i */
   1357     return LEFTFOLD;
   1358   fins->o = IR_ADD;  /* i - k ==> i + (-k) */
   1359   fins->op2 = (IRRef1)lj_ir_kint64(J, (uint64_t)-(int64_t)k);
   1360   return RETRYFOLD;
   1361 }
   1362 
   1363 static TRef simplify_intmul_k(jit_State *J, int32_t k)
   1364 {
   1365   /* Note: many more simplifications are possible, e.g. 2^k1 +- 2^k2.
   1366   ** But this is mainly intended for simple address arithmetic.
   1367   ** Also it's easier for the backend to optimize the original multiplies.
   1368   */
   1369   if (k == 0) {  /* i * 0 ==> 0 */
   1370     return RIGHTFOLD;
   1371   } else if (k == 1) {  /* i * 1 ==> i */
   1372     return LEFTFOLD;
   1373   } else if ((k & (k-1)) == 0) {  /* i * 2^k ==> i << k */
   1374     fins->o = IR_BSHL;
   1375     fins->op2 = lj_ir_kint(J, lj_fls((uint32_t)k));
   1376     return RETRYFOLD;
   1377   }
   1378   return NEXTFOLD;
   1379 }
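
/* E.g. i * 8 becomes i << 3: 8 is a power of two and lj_fls(8) == 3, the
** index of its highest set bit.
*/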
   1380 
   1381 LJFOLD(MUL any KINT)
   1382 LJFOLDF(simplify_intmul_k32)
   1383 {
   1384   if (fright->i >= 0)
   1385     return simplify_intmul_k(J, fright->i);
   1386   return NEXTFOLD;
   1387 }
   1388 
   1389 LJFOLD(MUL any KINT64)
   1390 LJFOLDF(simplify_intmul_k64)
   1391 {
   1392 #if LJ_HASFFI
   1393   if (ir_kint64(fright)->u64 < 0x80000000u)
   1394     return simplify_intmul_k(J, (int32_t)ir_kint64(fright)->u64);
   1395   return NEXTFOLD;
   1396 #else
   1397   UNUSED(J); lua_assert(0); return FAILFOLD;
   1398 #endif
   1399 }
   1400 
   1401 LJFOLD(MOD any KINT)
   1402 LJFOLDF(simplify_intmod_k)
   1403 {
   1404   int32_t k = fright->i;
   1405   lua_assert(k != 0);
   1406   if (k > 0 && (k & (k-1)) == 0) {  /* i % (2^k) ==> i & (2^k-1) */
   1407     fins->o = IR_BAND;
   1408     fins->op2 = lj_ir_kint(J, k-1);
   1409     return RETRYFOLD;
   1410   }
   1411   return NEXTFOLD;
   1412 }
   1413 
   1414 LJFOLD(MOD KINT any)
   1415 LJFOLDF(simplify_intmod_kleft)
   1416 {
   1417   if (fleft->i == 0)
   1418     return INTFOLD(0);
   1419   return NEXTFOLD;
   1420 }
   1421 
   1422 LJFOLD(SUB any any)
   1423 LJFOLD(SUBOV any any)
   1424 LJFOLDF(simplify_intsub)
   1425 {
   1426   if (fins->op1 == fins->op2 && !irt_isnum(fins->t))  /* i - i ==> 0 */
   1427     return irt_is64(fins->t) ? INT64FOLD(0) : INTFOLD(0);
   1428   return NEXTFOLD;
   1429 }
   1430 
   1431 LJFOLD(SUB ADD any)
   1432 LJFOLDF(simplify_intsubadd_leftcancel)
   1433 {
   1434   if (!irt_isnum(fins->t)) {
   1435     PHIBARRIER(fleft);
   1436     if (fins->op2 == fleft->op1)  /* (i + j) - i ==> j */
   1437       return fleft->op2;
   1438     if (fins->op2 == fleft->op2)  /* (i + j) - j ==> i */
   1439       return fleft->op1;
   1440   }
   1441   return NEXTFOLD;
   1442 }
   1443 
   1444 LJFOLD(SUB SUB any)
   1445 LJFOLDF(simplify_intsubsub_leftcancel)
   1446 {
   1447   if (!irt_isnum(fins->t)) {
   1448     PHIBARRIER(fleft);
   1449     if (fins->op2 == fleft->op1) {  /* (i - j) - i ==> 0 - j */
   1450       fins->op1 = (IRRef1)lj_ir_kint(J, 0);
   1451       fins->op2 = fleft->op2;
   1452       return RETRYFOLD;
   1453     }
   1454   }
   1455   return NEXTFOLD;
   1456 }
   1457 
   1458 LJFOLD(SUB any SUB)
   1459 LJFOLDF(simplify_intsubsub_rightcancel)
   1460 {
   1461   if (!irt_isnum(fins->t)) {
   1462     PHIBARRIER(fright);
   1463     if (fins->op1 == fright->op1)  /* i - (i - j) ==> j */
   1464       return fright->op2;
   1465   }
   1466   return NEXTFOLD;
   1467 }
   1468 
   1469 LJFOLD(SUB any ADD)
   1470 LJFOLDF(simplify_intsubadd_rightcancel)
   1471 {
   1472   if (!irt_isnum(fins->t)) {
   1473     PHIBARRIER(fright);
   1474     if (fins->op1 == fright->op1) {  /* i - (i + j) ==> 0 - j */
   1475       fins->op2 = fright->op2;
   1476       fins->op1 = (IRRef1)lj_ir_kint(J, 0);
   1477       return RETRYFOLD;
   1478     }
   1479     if (fins->op1 == fright->op2) {  /* i - (j + i) ==> 0 - j */
   1480       fins->op2 = fright->op1;
   1481       fins->op1 = (IRRef1)lj_ir_kint(J, 0);
   1482       return RETRYFOLD;
   1483     }
   1484   }
   1485   return NEXTFOLD;
   1486 }
   1487 
   1488 LJFOLD(SUB ADD ADD)
   1489 LJFOLDF(simplify_intsubaddadd_cancel)
   1490 {
   1491   if (!irt_isnum(fins->t)) {
   1492     PHIBARRIER(fleft);
   1493     PHIBARRIER(fright);
   1494     if (fleft->op1 == fright->op1) {  /* (i + j1) - (i + j2) ==> j1 - j2 */
   1495       fins->op1 = fleft->op2;
   1496       fins->op2 = fright->op2;
   1497       return RETRYFOLD;
   1498     }
   1499     if (fleft->op1 == fright->op2) {  /* (i + j1) - (j2 + i) ==> j1 - j2 */
   1500       fins->op1 = fleft->op2;
   1501       fins->op2 = fright->op1;
   1502       return RETRYFOLD;
   1503     }
   1504     if (fleft->op2 == fright->op1) {  /* (j1 + i) - (i + j2) ==> j1 - j2 */
   1505       fins->op1 = fleft->op1;
   1506       fins->op2 = fright->op2;
   1507       return RETRYFOLD;
   1508     }
   1509     if (fleft->op2 == fright->op2) {  /* (j1 + i) - (j2 + i) ==> j1 - j2 */
   1510       fins->op1 = fleft->op1;
   1511       fins->op2 = fright->op1;
   1512       return RETRYFOLD;
   1513     }
   1514   }
   1515   return NEXTFOLD;
   1516 }
   1517 
   1518 LJFOLD(BAND any KINT)
   1519 LJFOLD(BAND any KINT64)
   1520 LJFOLDF(simplify_band_k)
   1521 {
   1522   int64_t k = fright->o == IR_KINT ? (int64_t)fright->i :
   1523 				     (int64_t)ir_k64(fright)->u64;
   1524   if (k == 0)  /* i & 0 ==> 0 */
   1525     return RIGHTFOLD;
   1526   if (k == -1)  /* i & -1 ==> i */
   1527     return LEFTFOLD;
   1528   return NEXTFOLD;
   1529 }
   1530 
   1531 LJFOLD(BOR any KINT)
   1532 LJFOLD(BOR any KINT64)
   1533 LJFOLDF(simplify_bor_k)
   1534 {
   1535   int64_t k = fright->o == IR_KINT ? (int64_t)fright->i :
   1536 				     (int64_t)ir_k64(fright)->u64;
   1537   if (k == 0)  /* i | 0 ==> i */
   1538     return LEFTFOLD;
   1539   if (k == -1)  /* i | -1 ==> -1 */
   1540     return RIGHTFOLD;
   1541   return NEXTFOLD;
   1542 }
   1543 
   1544 LJFOLD(BXOR any KINT)
   1545 LJFOLD(BXOR any KINT64)
   1546 LJFOLDF(simplify_bxor_k)
   1547 {
   1548   int64_t k = fright->o == IR_KINT ? (int64_t)fright->i :
   1549 				     (int64_t)ir_k64(fright)->u64;
   1550   if (k == 0)  /* i xor 0 ==> i */
   1551     return LEFTFOLD;
   1552   if (k == -1) {  /* i xor -1 ==> ~i */
   1553     fins->o = IR_BNOT;
   1554     fins->op2 = 0;
   1555     return RETRYFOLD;
   1556   }
   1557   return NEXTFOLD;
   1558 }
   1559 
   1560 LJFOLD(BSHL any KINT)
   1561 LJFOLD(BSHR any KINT)
   1562 LJFOLD(BSAR any KINT)
   1563 LJFOLD(BROL any KINT)
   1564 LJFOLD(BROR any KINT)
   1565 LJFOLDF(simplify_shift_ik)
   1566 {
   1567   int32_t mask = irt_is64(fins->t) ? 63 : 31;
   1568   int32_t k = (fright->i & mask);
   1569   if (k == 0)  /* i o 0 ==> i */
   1570     return LEFTFOLD;
   1571   if (k == 1 && fins->o == IR_BSHL) {  /* i << 1 ==> i + i */
   1572     fins->o = IR_ADD;
   1573     fins->op2 = fins->op1;
   1574     return RETRYFOLD;
   1575   }
   1576   if (k != fright->i) {  /* i o k ==> i o (k & mask) */
   1577     fins->op2 = (IRRef1)lj_ir_kint(J, k);
   1578     return RETRYFOLD;
   1579   }
   1580 #ifndef LJ_TARGET_UNIFYROT
   1581   if (fins->o == IR_BROR) {  /* bror(i, k) ==> brol(i, (-k)&mask) */
   1582     fins->o = IR_BROL;
   1583     fins->op2 = (IRRef1)lj_ir_kint(J, (-k)&mask);
   1584     return RETRYFOLD;
   1585   }
   1586 #endif
   1587   return NEXTFOLD;
   1588 }
   1589 
   1590 LJFOLD(BSHL any BAND)
   1591 LJFOLD(BSHR any BAND)
   1592 LJFOLD(BSAR any BAND)
   1593 LJFOLD(BROL any BAND)
   1594 LJFOLD(BROR any BAND)
   1595 LJFOLDF(simplify_shift_andk)
   1596 {
   1597   IRIns *irk = IR(fright->op2);
   1598   PHIBARRIER(fright);
   1599   if ((fins->o < IR_BROL ? LJ_TARGET_MASKSHIFT : LJ_TARGET_MASKROT) &&
   1600       irk->o == IR_KINT) {  /* i o (j & mask) ==> i o j */
   1601     int32_t mask = irt_is64(fins->t) ? 63 : 31;
   1602     int32_t k = irk->i & mask;
   1603     if (k == mask) {
   1604       fins->op2 = fright->op1;
   1605       return RETRYFOLD;
   1606     }
   1607   }
   1608   return NEXTFOLD;
   1609 }
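
         /* Example: BSHL(i, BAND(j, 31)) drops the BAND and becomes BSHL(i, j)
         ** for 32 bit operands, but only on targets whose shift instructions
         ** mask the count in hardware anyway (LJ_TARGET_MASKSHIFT; e.g. x86
         ** variable shifts only use the low 5 or 6 bits of the count).
         */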
   1610 
   1611 LJFOLD(BSHL KINT any)
   1612 LJFOLD(BSHR KINT any)
   1613 LJFOLD(BSHL KINT64 any)
   1614 LJFOLD(BSHR KINT64 any)
   1615 LJFOLDF(simplify_shift1_ki)
   1616 {
   1617   int64_t k = fleft->o == IR_KINT ? (int64_t)fleft->i :
   1618 				    (int64_t)ir_k64(fleft)->u64;
   1619   if (k == 0)  /* 0 o i ==> 0 */
   1620     return LEFTFOLD;
   1621   return NEXTFOLD;
   1622 }
   1623 
   1624 LJFOLD(BSAR KINT any)
   1625 LJFOLD(BROL KINT any)
   1626 LJFOLD(BROR KINT any)
   1627 LJFOLD(BSAR KINT64 any)
   1628 LJFOLD(BROL KINT64 any)
   1629 LJFOLD(BROR KINT64 any)
   1630 LJFOLDF(simplify_shift2_ki)
   1631 {
   1632   int64_t k = fleft->o == IR_KINT ? (int64_t)fleft->i :
   1633 				    (int64_t)ir_k64(fleft)->u64;
   1634   if (k == 0 || k == -1)  /* 0 o i ==> 0; -1 o i ==> -1 */
   1635     return LEFTFOLD;
   1636   return NEXTFOLD;
   1637 }
   1638 
   1639 LJFOLD(BSHL BAND KINT)
   1640 LJFOLD(BSHR BAND KINT)
   1641 LJFOLD(BROL BAND KINT)
   1642 LJFOLD(BROR BAND KINT)
   1643 LJFOLDF(simplify_shiftk_andk)
   1644 {
   1645   IRIns *irk = IR(fleft->op2);
   1646   PHIBARRIER(fleft);
   1647   if (irk->o == IR_KINT) {  /* (i & k1) o k2 ==> (i o k2) & (k1 o k2) */
   1648     int32_t k = kfold_intop(irk->i, fright->i, (IROp)fins->o);
    1649     fins->op1 = fleft->op1;  /* Temporarily turn fins into (i o k2). */
    1650     fins->op1 = (IRRef1)lj_opt_fold(J);  /* Fold it; keep only the ref. */
   1651     fins->op2 = (IRRef1)lj_ir_kint(J, k);
   1652     fins->ot = IRTI(IR_BAND);
   1653     return RETRYFOLD;
   1654   }
   1655   return NEXTFOLD;
   1656 }
   1657 
   1658 LJFOLD(BAND BSHL KINT)
   1659 LJFOLD(BAND BSHR KINT)
   1660 LJFOLDF(simplify_andk_shiftk)
   1661 {
   1662   IRIns *irk = IR(fleft->op2);
   1663   if (irk->o == IR_KINT &&
   1664       kfold_intop(-1, irk->i, (IROp)fleft->o) == fright->i)
   1665     return LEFTFOLD;  /* (i o k1) & k2 ==> i, if (-1 o k1) == k2 */
   1666   return NEXTFOLD;
   1667 }
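
         /* Example: BAND(BSHL(i, 8), 0xffffff00) folds to BSHL(i, 8), since
         ** kfold_intop(-1, 8, IR_BSHL) == 0xffffff00 -- the shift already
         ** clears exactly the bits the mask would clear.
         */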
   1668 
   1669 /* -- Reassociation ------------------------------------------------------- */
   1670 
   1671 LJFOLD(ADD ADD KINT)
   1672 LJFOLD(MUL MUL KINT)
   1673 LJFOLD(BAND BAND KINT)
   1674 LJFOLD(BOR BOR KINT)
   1675 LJFOLD(BXOR BXOR KINT)
   1676 LJFOLDF(reassoc_intarith_k)
   1677 {
   1678   IRIns *irk = IR(fleft->op2);
   1679   if (irk->o == IR_KINT) {
   1680     int32_t k = kfold_intop(irk->i, fright->i, (IROp)fins->o);
   1681     if (k == irk->i)  /* (i o k1) o k2 ==> i o k1, if (k1 o k2) == k1. */
   1682       return LEFTFOLD;
   1683     PHIBARRIER(fleft);
   1684     fins->op1 = fleft->op1;
   1685     fins->op2 = (IRRef1)lj_ir_kint(J, k);
   1686     return RETRYFOLD;  /* (i o k1) o k2 ==> i o (k1 o k2) */
   1687   }
   1688   return NEXTFOLD;
   1689 }
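
         /* Examples: ADD(ADD(i, 5), 7) reassociates to ADD(i, 12), while
         ** BOR(BOR(i, 0xff), 0x0f) returns the inner BOR unchanged, since
         ** (0xff | 0x0f) == 0xff.
         */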
   1690 
   1691 LJFOLD(ADD ADD KINT64)
   1692 LJFOLD(MUL MUL KINT64)
   1693 LJFOLD(BAND BAND KINT64)
   1694 LJFOLD(BOR BOR KINT64)
   1695 LJFOLD(BXOR BXOR KINT64)
   1696 LJFOLDF(reassoc_intarith_k64)
   1697 {
   1698 #if LJ_HASFFI
   1699   IRIns *irk = IR(fleft->op2);
   1700   if (irk->o == IR_KINT64) {
   1701     uint64_t k = kfold_int64arith(ir_k64(irk)->u64,
   1702 				  ir_k64(fright)->u64, (IROp)fins->o);
   1703     PHIBARRIER(fleft);
   1704     fins->op1 = fleft->op1;
   1705     fins->op2 = (IRRef1)lj_ir_kint64(J, k);
   1706     return RETRYFOLD;  /* (i o k1) o k2 ==> i o (k1 o k2) */
   1707   }
   1708   return NEXTFOLD;
   1709 #else
   1710   UNUSED(J); lua_assert(0); return FAILFOLD;
   1711 #endif
   1712 }
   1713 
   1714 LJFOLD(MIN MIN any)
   1715 LJFOLD(MAX MAX any)
   1716 LJFOLD(BAND BAND any)
   1717 LJFOLD(BOR BOR any)
   1718 LJFOLDF(reassoc_dup)
   1719 {
   1720   if (fins->op2 == fleft->op1 || fins->op2 == fleft->op2)
   1721     return LEFTFOLD;  /* (a o b) o a ==> a o b; (a o b) o b ==> a o b */
   1722   return NEXTFOLD;
   1723 }
   1724 
   1725 LJFOLD(BXOR BXOR any)
   1726 LJFOLDF(reassoc_bxor)
   1727 {
   1728   PHIBARRIER(fleft);
   1729   if (fins->op2 == fleft->op1)  /* (a xor b) xor a ==> b */
   1730     return fleft->op2;
   1731   if (fins->op2 == fleft->op2)  /* (a xor b) xor b ==> a */
   1732     return fleft->op1;
   1733   return NEXTFOLD;
   1734 }
   1735 
   1736 LJFOLD(BSHL BSHL KINT)
   1737 LJFOLD(BSHR BSHR KINT)
   1738 LJFOLD(BSAR BSAR KINT)
   1739 LJFOLD(BROL BROL KINT)
   1740 LJFOLD(BROR BROR KINT)
   1741 LJFOLDF(reassoc_shift)
   1742 {
   1743   IRIns *irk = IR(fleft->op2);
   1744   PHIBARRIER(fleft);  /* The (shift any KINT) rule covers k2 == 0 and more. */
   1745   if (irk->o == IR_KINT) {  /* (i o k1) o k2 ==> i o (k1 + k2) */
   1746     int32_t mask = irt_is64(fins->t) ? 63 : 31;
   1747     int32_t k = (irk->i & mask) + (fright->i & mask);
   1748     if (k > mask) {  /* Combined shift too wide? */
   1749       if (fins->o == IR_BSHL || fins->o == IR_BSHR)
   1750 	return mask == 31 ? INTFOLD(0) : INT64FOLD(0);
   1751       else if (fins->o == IR_BSAR)
   1752 	k = mask;
   1753       else
   1754 	k &= mask;
   1755     }
   1756     fins->op1 = fleft->op1;
   1757     fins->op2 = (IRRef1)lj_ir_kint(J, k);
   1758     return RETRYFOLD;
   1759   }
   1760   return NEXTFOLD;
   1761 }
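
         /* Examples (32 bit): BSHR(BSHR(i, 20), 20) folds to the constant 0
         ** (combined count 40 > 31), BSAR(BSAR(i, 20), 20) saturates to
         ** BSAR(i, 31), and BROL(BROL(i, 20), 20) wraps to BROL(i, 8).
         */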
   1762 
   1763 LJFOLD(MIN MIN KNUM)
   1764 LJFOLD(MAX MAX KNUM)
   1765 LJFOLD(MIN MIN KINT)
   1766 LJFOLD(MAX MAX KINT)
   1767 LJFOLDF(reassoc_minmax_k)
   1768 {
   1769   IRIns *irk = IR(fleft->op2);
   1770   if (irk->o == IR_KNUM) {
   1771     lua_Number a = ir_knum(irk)->n;
   1772     lua_Number y = lj_vm_foldarith(a, knumright, fins->o - IR_ADD);
   1773     if (a == y)  /* (x o k1) o k2 ==> x o k1, if (k1 o k2) == k1. */
   1774       return LEFTFOLD;
   1775     PHIBARRIER(fleft);
   1776     fins->op1 = fleft->op1;
   1777     fins->op2 = (IRRef1)lj_ir_knum(J, y);
   1778     return RETRYFOLD;  /* (x o k1) o k2 ==> x o (k1 o k2) */
   1779   } else if (irk->o == IR_KINT) {
   1780     int32_t a = irk->i;
   1781     int32_t y = kfold_intop(a, fright->i, fins->o);
   1782     if (a == y)  /* (x o k1) o k2 ==> x o k1, if (k1 o k2) == k1. */
   1783       return LEFTFOLD;
   1784     PHIBARRIER(fleft);
   1785     fins->op1 = fleft->op1;
   1786     fins->op2 = (IRRef1)lj_ir_kint(J, y);
   1787     return RETRYFOLD;  /* (x o k1) o k2 ==> x o (k1 o k2) */
   1788   }
   1789   return NEXTFOLD;
   1790 }
   1791 
   1792 LJFOLD(MIN MAX any)
   1793 LJFOLD(MAX MIN any)
   1794 LJFOLDF(reassoc_minmax_left)
   1795 {
   1796   if (fins->op2 == fleft->op1 || fins->op2 == fleft->op2)
   1797     return RIGHTFOLD;  /* (b o1 a) o2 b ==> b; (a o1 b) o2 b ==> b */
   1798   return NEXTFOLD;
   1799 }
   1800 
   1801 LJFOLD(MIN any MAX)
   1802 LJFOLD(MAX any MIN)
   1803 LJFOLDF(reassoc_minmax_right)
   1804 {
   1805   if (fins->op1 == fright->op1 || fins->op1 == fright->op2)
   1806     return LEFTFOLD;  /* a o2 (a o1 b) ==> a; a o2 (b o1 a) ==> a */
   1807   return NEXTFOLD;
   1808 }
   1809 
   1810 /* -- Array bounds check elimination -------------------------------------- */
   1811 
   1812 /* Eliminate ABC across PHIs to handle t[i-1] forwarding case.
   1813 ** ABC(asize, (i+k)+(-k)) ==> ABC(asize, i), but only if it already exists.
   1814 ** Could be generalized to (i+k1)+k2 ==> i+(k1+k2), but needs better disambig.
   1815 */
   1816 LJFOLD(ABC any ADD)
   1817 LJFOLDF(abc_fwd)
   1818 {
   1819   if (LJ_LIKELY(J->flags & JIT_F_OPT_ABC)) {
   1820     if (irref_isk(fright->op2)) {
   1821       IRIns *add2 = IR(fright->op1);
   1822       if (add2->o == IR_ADD && irref_isk(add2->op2) &&
   1823 	  IR(fright->op2)->i == -IR(add2->op2)->i) {
   1824 	IRRef ref = J->chain[IR_ABC];
   1825 	IRRef lim = add2->op1;
   1826 	if (fins->op1 > lim) lim = fins->op1;
   1827 	while (ref > lim) {
   1828 	  IRIns *ir = IR(ref);
   1829 	  if (ir->op1 == fins->op1 && ir->op2 == add2->op1)
   1830 	    return DROPFOLD;
   1831 	  ref = ir->prev;
   1832 	}
   1833       }
   1834     }
   1835   }
   1836   return NEXTFOLD;
   1837 }
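
         /* Example: ABC(asize, ADD(ADD(i, 1), -1)) is dropped, but only if
         ** ABC(asize, i) is already present in the ABC chain -- this fold
         ** reuses an existing check and never emits a new one.
         */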
   1838 
   1839 /* Eliminate ABC for constants.
    1840 ** ABC(asize, k1), ABC(asize, k2) ==> ABC(asize, max(k1, k2))
   1841 ** Drop second ABC if k2 is lower. Otherwise patch first ABC with k2.
   1842 */
   1843 LJFOLD(ABC any KINT)
   1844 LJFOLDF(abc_k)
   1845 {
   1846   if (LJ_LIKELY(J->flags & JIT_F_OPT_ABC)) {
   1847     IRRef ref = J->chain[IR_ABC];
   1848     IRRef asize = fins->op1;
   1849     while (ref > asize) {
   1850       IRIns *ir = IR(ref);
   1851       if (ir->op1 == asize && irref_isk(ir->op2)) {
   1852 	int32_t k = IR(ir->op2)->i;
   1853 	if (fright->i > k)
   1854 	  ir->op2 = fins->op2;
   1855 	return DROPFOLD;
   1856       }
   1857       ref = ir->prev;
   1858     }
   1859     return EMITFOLD;  /* Already performed CSE. */
   1860   }
   1861   return NEXTFOLD;
   1862 }
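
         /* Example: after ABC(asize, 8) has been emitted, a later ABC(asize, 5)
         ** is dropped outright, while a later ABC(asize, 12) is dropped after
         ** widening the existing check to ABC(asize, 12).
         */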
   1863 
   1864 /* Eliminate invariant ABC inside loop. */
   1865 LJFOLD(ABC any any)
   1866 LJFOLDF(abc_invar)
   1867 {
   1868   /* Invariant ABC marked as PTR. Drop if op1 is invariant, too. */
   1869   if (!irt_isint(fins->t) && fins->op1 < J->chain[IR_LOOP] &&
   1870       !irt_isphi(IR(fins->op1)->t))
   1871     return DROPFOLD;
   1872   return NEXTFOLD;
   1873 }
   1874 
   1875 /* -- Commutativity ------------------------------------------------------- */
   1876 
   1877 /* The refs of commutative ops are canonicalized. Lower refs go to the right.
   1878 ** Rationale behind this:
   1879 ** - It (also) moves constants to the right.
   1880 ** - It reduces the number of FOLD rules (e.g. (BOR any KINT) suffices).
   1881 ** - It helps CSE to find more matches.
   1882 ** - The assembler generates better code with constants at the right.
   1883 */
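
         /* Example: constant refs are allocated below REF_BIAS and thus always
         ** compare lower than instruction refs, so BOR(KINT, i) is swapped to
         ** BOR(i, KINT), which the single (BOR any KINT) rule above matches.
         */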
   1884 
   1885 LJFOLD(ADD any any)
   1886 LJFOLD(MUL any any)
   1887 LJFOLD(ADDOV any any)
   1888 LJFOLD(MULOV any any)
   1889 LJFOLDF(comm_swap)
   1890 {
   1891   if (fins->op1 < fins->op2) {  /* Move lower ref to the right. */
   1892     IRRef1 tmp = fins->op1;
   1893     fins->op1 = fins->op2;
   1894     fins->op2 = tmp;
   1895     return RETRYFOLD;
   1896   }
   1897   return NEXTFOLD;
   1898 }
   1899 
   1900 LJFOLD(EQ any any)
   1901 LJFOLD(NE any any)
   1902 LJFOLDF(comm_equal)
   1903 {
   1904   /* For non-numbers only: x == x ==> drop; x ~= x ==> fail */
   1905   if (fins->op1 == fins->op2 && !irt_isnum(fins->t))
   1906     return CONDFOLD(fins->o == IR_EQ);
   1907   return fold_comm_swap(J);
   1908 }
   1909 
   1910 LJFOLD(LT any any)
   1911 LJFOLD(GE any any)
   1912 LJFOLD(LE any any)
   1913 LJFOLD(GT any any)
   1914 LJFOLD(ULT any any)
   1915 LJFOLD(UGE any any)
   1916 LJFOLD(ULE any any)
   1917 LJFOLD(UGT any any)
   1918 LJFOLDF(comm_comp)
   1919 {
   1920   /* For non-numbers only: x <=> x ==> drop; x <> x ==> fail */
   1921   if (fins->op1 == fins->op2 && !irt_isnum(fins->t))
   1922     return CONDFOLD((fins->o ^ (fins->o >> 1)) & 1);
   1923   if (fins->op1 < fins->op2) {  /* Move lower ref to the right. */
   1924     IRRef1 tmp = fins->op1;
   1925     fins->op1 = fins->op2;
   1926     fins->op2 = tmp;
    1927     fins->o ^= 3; /* GT <-> LT, GE <-> LE; the unsigned bit is unaffected. */
   1928     return RETRYFOLD;
   1929   }
   1930   return NEXTFOLD;
   1931 }
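
         /* The ordered comparison opcodes are laid out as LT, GE, LE, GT (and
         ** the same pattern again for ULT..UGT), so (o ^ (o >> 1)) & 1 yields
         ** the x o x truth table: 0 for LT/GT (always false ==> fail) and
         ** 1 for GE/LE (always true ==> drop).
         */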
   1932 
   1933 LJFOLD(BAND any any)
   1934 LJFOLD(BOR any any)
   1935 LJFOLD(MIN any any)
   1936 LJFOLD(MAX any any)
   1937 LJFOLDF(comm_dup)
   1938 {
   1939   if (fins->op1 == fins->op2)  /* x o x ==> x */
   1940     return LEFTFOLD;
   1941   return fold_comm_swap(J);
   1942 }
   1943 
   1944 LJFOLD(BXOR any any)
   1945 LJFOLDF(comm_bxor)
   1946 {
   1947   if (fins->op1 == fins->op2)  /* i xor i ==> 0 */
   1948     return irt_is64(fins->t) ? INT64FOLD(0) : INTFOLD(0);
   1949   return fold_comm_swap(J);
   1950 }
   1951 
   1952 /* -- Simplification of compound expressions ------------------------------ */
   1953 
   1954 static TRef kfold_xload(jit_State *J, IRIns *ir, const void *p)
   1955 {
   1956   int32_t k;
   1957   switch (irt_type(ir->t)) {
   1958   case IRT_NUM: return lj_ir_knum_u64(J, *(uint64_t *)p);
   1959   case IRT_I8: k = (int32_t)*(int8_t *)p; break;
   1960   case IRT_U8: k = (int32_t)*(uint8_t *)p; break;
   1961   case IRT_I16: k = (int32_t)(int16_t)lj_getu16(p); break;
   1962   case IRT_U16: k = (int32_t)(uint16_t)lj_getu16(p); break;
   1963   case IRT_INT: case IRT_U32: k = (int32_t)lj_getu32(p); break;
   1964   case IRT_I64: case IRT_U64: return lj_ir_kint64(J, *(uint64_t *)p);
   1965   default: return 0;
   1966   }
   1967   return lj_ir_kint(J, k);
   1968 }
   1969 
   1970 /* Turn: string.sub(str, a, b) == kstr
   1971 ** into: string.byte(str, a) == string.byte(kstr, 1) etc.
   1972 ** Note: this creates unaligned XLOADs on x86/x64.
   1973 */
   1974 LJFOLD(EQ SNEW KGC)
   1975 LJFOLD(NE SNEW KGC)
   1976 LJFOLDF(merge_eqne_snew_kgc)
   1977 {
   1978   GCstr *kstr = ir_kstr(fright);
   1979   int32_t len = (int32_t)kstr->len;
   1980   lua_assert(irt_isstr(fins->t));
   1981 
   1982 #if LJ_TARGET_UNALIGNED
   1983 #define FOLD_SNEW_MAX_LEN	4  /* Handle string lengths 0, 1, 2, 3, 4. */
   1984 #define FOLD_SNEW_TYPE8		IRT_I8	/* Creates shorter immediates. */
   1985 #else
   1986 #define FOLD_SNEW_MAX_LEN	1  /* Handle string lengths 0 or 1. */
   1987 #define FOLD_SNEW_TYPE8		IRT_U8  /* Prefer unsigned loads. */
   1988 #endif
   1989 
   1990   PHIBARRIER(fleft);
   1991   if (len <= FOLD_SNEW_MAX_LEN) {
   1992     IROp op = (IROp)fins->o;
   1993     IRRef strref = fleft->op1;
   1994     if (IR(strref)->o != IR_STRREF)
   1995       return NEXTFOLD;
   1996     if (op == IR_EQ) {
   1997       emitir(IRTGI(IR_EQ), fleft->op2, lj_ir_kint(J, len));
   1998       /* Caveat: fins/fleft/fright is no longer valid after emitir. */
   1999     } else {
   2000       /* NE is not expanded since this would need an OR of two conds. */
   2001       if (!irref_isk(fleft->op2))  /* Only handle the constant length case. */
   2002 	return NEXTFOLD;
   2003       if (IR(fleft->op2)->i != len)
   2004 	return DROPFOLD;
   2005     }
   2006     if (len > 0) {
   2007       /* A 4 byte load for length 3 is ok -- all strings have an extra NUL. */
   2008       uint16_t ot = (uint16_t)(len == 1 ? IRT(IR_XLOAD, FOLD_SNEW_TYPE8) :
   2009 			       len == 2 ? IRT(IR_XLOAD, IRT_U16) :
   2010 			       IRTI(IR_XLOAD));
   2011       TRef tmp = emitir(ot, strref,
   2012 			IRXLOAD_READONLY | (len > 1 ? IRXLOAD_UNALIGNED : 0));
   2013       TRef val = kfold_xload(J, IR(tref_ref(tmp)), strdata(kstr));
   2014       if (len == 3)
   2015 	tmp = emitir(IRTI(IR_BAND), tmp,
   2016 		     lj_ir_kint(J, LJ_ENDIAN_SELECT(0x00ffffff, 0xffffff00)));
   2017       fins->op1 = (IRRef1)tmp;
   2018       fins->op2 = (IRRef1)val;
   2019       fins->ot = (IROpT)IRTGI(op);
   2020       return RETRYFOLD;
   2021     } else {
   2022       return DROPFOLD;
   2023     }
   2024   }
   2025   return NEXTFOLD;
   2026 }
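
         /* Example: EQ(SNEW(strref, len), "abc") becomes a guarded check for
         ** len == 3, a 4 byte XLOAD through the STRREF (safe, since all
         ** strings have a trailing NUL), a BAND masking off the fourth byte,
         ** and an integer compare against the folded bytes of "abc\0".
         */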
   2027 
   2028 /* -- Loads --------------------------------------------------------------- */
   2029 
   2030 /* Loads cannot be folded or passed on to CSE in general.
   2031 ** Alias analysis is needed to check for forwarding opportunities.
   2032 **
   2033 ** Caveat: *all* loads must be listed here or they end up at CSE!
   2034 */
   2035 
   2036 LJFOLD(ALOAD any)
   2037 LJFOLDX(lj_opt_fwd_aload)
   2038 
   2039 /* From HREF fwd (see below). Must eliminate, not supported by fwd/backend. */
   2040 LJFOLD(HLOAD KKPTR)
   2041 LJFOLDF(kfold_hload_kkptr)
   2042 {
   2043   UNUSED(J);
   2044   lua_assert(ir_kptr(fleft) == niltvg(J2G(J)));
   2045   return TREF_NIL;
   2046 }
   2047 
   2048 LJFOLD(HLOAD any)
   2049 LJFOLDX(lj_opt_fwd_hload)
   2050 
   2051 LJFOLD(ULOAD any)
   2052 LJFOLDX(lj_opt_fwd_uload)
   2053 
   2054 LJFOLD(CALLL any IRCALL_lj_tab_len)
   2055 LJFOLDX(lj_opt_fwd_tab_len)
   2056 
   2057 /* Upvalue refs are really loads, but there are no corresponding stores.
   2058 ** So CSE is ok for them, except for UREFO across a GC step (see below).
   2059 ** If the referenced function is const, its upvalue addresses are const, too.
   2060 ** This can be used to improve CSE by looking for the same address,
   2061 ** even if the upvalues originate from a different function.
   2062 */
   2063 LJFOLD(UREFO KGC any)
   2064 LJFOLD(UREFC KGC any)
   2065 LJFOLDF(cse_uref)
   2066 {
   2067   if (LJ_LIKELY(J->flags & JIT_F_OPT_CSE)) {
   2068     IRRef ref = J->chain[fins->o];
   2069     GCfunc *fn = ir_kfunc(fleft);
   2070     GCupval *uv = gco2uv(gcref(fn->l.uvptr[(fins->op2 >> 8)]));
   2071     while (ref > 0) {
   2072       IRIns *ir = IR(ref);
   2073       if (irref_isk(ir->op1)) {
   2074 	GCfunc *fn2 = ir_kfunc(IR(ir->op1));
   2075 	if (gco2uv(gcref(fn2->l.uvptr[(ir->op2 >> 8)])) == uv) {
   2076 	  if (fins->o == IR_UREFO && gcstep_barrier(J, ref))
   2077 	    break;
   2078 	  return ref;
   2079 	}
   2080       }
   2081       ref = ir->prev;
   2082     }
   2083   }
   2084   return EMITFOLD;
   2085 }
   2086 
   2087 LJFOLD(HREFK any any)
   2088 LJFOLDX(lj_opt_fwd_hrefk)
   2089 
   2090 LJFOLD(HREF TNEW any)
   2091 LJFOLDF(fwd_href_tnew)
   2092 {
   2093   if (lj_opt_fwd_href_nokey(J))
   2094     return lj_ir_kkptr(J, niltvg(J2G(J)));
   2095   return NEXTFOLD;
   2096 }
   2097 
   2098 LJFOLD(HREF TDUP KPRI)
   2099 LJFOLD(HREF TDUP KGC)
   2100 LJFOLD(HREF TDUP KNUM)
   2101 LJFOLDF(fwd_href_tdup)
   2102 {
   2103   TValue keyv;
   2104   lj_ir_kvalue(J->L, &keyv, fright);
   2105   if (lj_tab_get(J->L, ir_ktab(IR(fleft->op1)), &keyv) == niltvg(J2G(J)) &&
   2106       lj_opt_fwd_href_nokey(J))
   2107     return lj_ir_kkptr(J, niltvg(J2G(J)));
   2108   return NEXTFOLD;
   2109 }
   2110 
   2111 /* We can safely FOLD/CSE array/hash refs and field loads, since there
   2112 ** are no corresponding stores. But we need to check for any NEWREF with
   2113 ** an aliased table, as it may invalidate all of the pointers and fields.
   2114 ** Only HREF needs the NEWREF check -- AREF and HREFK already depend on
   2115 ** FLOADs. And NEWREF itself is treated like a store (see below).
   2116 ** LREF is constant (per trace) since coroutine switches are not inlined.
   2117 */
   2118 LJFOLD(FLOAD TNEW IRFL_TAB_ASIZE)
   2119 LJFOLDF(fload_tab_tnew_asize)
   2120 {
   2121   if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD) && lj_opt_fwd_tptr(J, fins->op1))
   2122     return INTFOLD(fleft->op1);
   2123   return NEXTFOLD;
   2124 }
   2125 
   2126 LJFOLD(FLOAD TNEW IRFL_TAB_HMASK)
   2127 LJFOLDF(fload_tab_tnew_hmask)
   2128 {
   2129   if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD) && lj_opt_fwd_tptr(J, fins->op1))
   2130     return INTFOLD((1 << fleft->op2)-1);
   2131   return NEXTFOLD;
   2132 }
   2133 
   2134 LJFOLD(FLOAD TDUP IRFL_TAB_ASIZE)
   2135 LJFOLDF(fload_tab_tdup_asize)
   2136 {
   2137   if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD) && lj_opt_fwd_tptr(J, fins->op1))
   2138     return INTFOLD((int32_t)ir_ktab(IR(fleft->op1))->asize);
   2139   return NEXTFOLD;
   2140 }
   2141 
   2142 LJFOLD(FLOAD TDUP IRFL_TAB_HMASK)
   2143 LJFOLDF(fload_tab_tdup_hmask)
   2144 {
   2145   if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD) && lj_opt_fwd_tptr(J, fins->op1))
   2146     return INTFOLD((int32_t)ir_ktab(IR(fleft->op1))->hmask);
   2147   return NEXTFOLD;
   2148 }
   2149 
   2150 LJFOLD(HREF any any)
   2151 LJFOLD(FLOAD any IRFL_TAB_ARRAY)
   2152 LJFOLD(FLOAD any IRFL_TAB_NODE)
   2153 LJFOLD(FLOAD any IRFL_TAB_ASIZE)
   2154 LJFOLD(FLOAD any IRFL_TAB_HMASK)
   2155 LJFOLDF(fload_tab_ah)
   2156 {
   2157   TRef tr = lj_opt_cse(J);
   2158   return lj_opt_fwd_tptr(J, tref_ref(tr)) ? tr : EMITFOLD;
   2159 }
   2160 
   2161 /* Strings are immutable, so we can safely FOLD/CSE the related FLOAD. */
   2162 LJFOLD(FLOAD KGC IRFL_STR_LEN)
   2163 LJFOLDF(fload_str_len_kgc)
   2164 {
   2165   if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD))
   2166     return INTFOLD((int32_t)ir_kstr(fleft)->len);
   2167   return NEXTFOLD;
   2168 }
   2169 
   2170 LJFOLD(FLOAD SNEW IRFL_STR_LEN)
   2171 LJFOLDF(fload_str_len_snew)
   2172 {
   2173   if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD)) {
   2174     PHIBARRIER(fleft);
   2175     return fleft->op2;
   2176   }
   2177   return NEXTFOLD;
   2178 }
   2179 
   2180 LJFOLD(FLOAD TOSTR IRFL_STR_LEN)
   2181 LJFOLDF(fload_str_len_tostr)
   2182 {
   2183   if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD) && fleft->op2 == IRTOSTR_CHAR)
   2184     return INTFOLD(1);
   2185   return NEXTFOLD;
   2186 }
   2187 
   2188 /* The C type ID of cdata objects is immutable. */
   2189 LJFOLD(FLOAD KGC IRFL_CDATA_CTYPEID)
   2190 LJFOLDF(fload_cdata_typeid_kgc)
   2191 {
   2192   if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD))
   2193     return INTFOLD((int32_t)ir_kcdata(fleft)->ctypeid);
   2194   return NEXTFOLD;
   2195 }
   2196 
   2197 /* Get the contents of immutable cdata objects. */
   2198 LJFOLD(FLOAD KGC IRFL_CDATA_PTR)
   2199 LJFOLD(FLOAD KGC IRFL_CDATA_INT)
   2200 LJFOLD(FLOAD KGC IRFL_CDATA_INT64)
   2201 LJFOLDF(fload_cdata_int64_kgc)
   2202 {
   2203   if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD)) {
   2204     void *p = cdataptr(ir_kcdata(fleft));
   2205     if (irt_is64(fins->t))
   2206       return INT64FOLD(*(uint64_t *)p);
   2207     else
   2208       return INTFOLD(*(int32_t *)p);
   2209   }
   2210   return NEXTFOLD;
   2211 }
   2212 
   2213 LJFOLD(FLOAD CNEW IRFL_CDATA_CTYPEID)
   2214 LJFOLD(FLOAD CNEWI IRFL_CDATA_CTYPEID)
   2215 LJFOLDF(fload_cdata_typeid_cnew)
   2216 {
   2217   if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD))
   2218     return fleft->op1;  /* No PHI barrier needed. CNEW/CNEWI op1 is const. */
   2219   return NEXTFOLD;
   2220 }
   2221 
   2222 /* Pointer, int and int64 cdata objects are immutable. */
   2223 LJFOLD(FLOAD CNEWI IRFL_CDATA_PTR)
   2224 LJFOLD(FLOAD CNEWI IRFL_CDATA_INT)
   2225 LJFOLD(FLOAD CNEWI IRFL_CDATA_INT64)
   2226 LJFOLDF(fload_cdata_ptr_int64_cnew)
   2227 {
   2228   if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD))
   2229     return fleft->op2;  /* Fold even across PHI to avoid allocations. */
   2230   return NEXTFOLD;
   2231 }
   2232 
   2233 LJFOLD(FLOAD any IRFL_STR_LEN)
   2234 LJFOLD(FLOAD any IRFL_FUNC_ENV)
   2235 LJFOLD(FLOAD any IRFL_THREAD_ENV)
   2236 LJFOLD(FLOAD any IRFL_CDATA_CTYPEID)
   2237 LJFOLD(FLOAD any IRFL_CDATA_PTR)
   2238 LJFOLD(FLOAD any IRFL_CDATA_INT)
   2239 LJFOLD(FLOAD any IRFL_CDATA_INT64)
   2240 LJFOLD(VLOAD any any)  /* Vararg loads have no corresponding stores. */
   2241 LJFOLDX(lj_opt_cse)
   2242 
   2243 /* All other field loads need alias analysis. */
   2244 LJFOLD(FLOAD any any)
   2245 LJFOLDX(lj_opt_fwd_fload)
   2246 
   2247 /* This is for LOOP only. Recording handles SLOADs internally. */
   2248 LJFOLD(SLOAD any any)
   2249 LJFOLDF(fwd_sload)
   2250 {
   2251   if ((fins->op2 & IRSLOAD_FRAME)) {
   2252     TRef tr = lj_opt_cse(J);
   2253     return tref_ref(tr) < J->chain[IR_RETF] ? EMITFOLD : tr;
   2254   } else {
   2255     lua_assert(J->slot[fins->op1] != 0);
   2256     return J->slot[fins->op1];
   2257   }
   2258 }
   2259 
   2260 /* Only fold for KKPTR. The pointer _and_ the contents must be const. */
   2261 LJFOLD(XLOAD KKPTR any)
   2262 LJFOLDF(xload_kptr)
   2263 {
   2264   TRef tr = kfold_xload(J, fins, ir_kptr(fleft));
   2265   return tr ? tr : NEXTFOLD;
   2266 }
   2267 
   2268 LJFOLD(XLOAD any any)
   2269 LJFOLDX(lj_opt_fwd_xload)
   2270 
   2271 /* -- Write barriers ------------------------------------------------------ */
   2272 
   2273 /* Write barriers are amenable to CSE, but not across any incremental
   2274 ** GC steps.
   2275 **
   2276 ** The same logic applies to open upvalue references, because a stack
   2277 ** may be resized during a GC step (not the current stack, but maybe that
   2278 ** of a coroutine).
   2279 */
   2280 LJFOLD(TBAR any)
   2281 LJFOLD(OBAR any any)
   2282 LJFOLD(UREFO any any)
   2283 LJFOLDF(barrier_tab)
   2284 {
   2285   TRef tr = lj_opt_cse(J);
   2286   if (gcstep_barrier(J, tref_ref(tr)))  /* CSE across GC step? */
   2287     return EMITFOLD;  /* Raw emit. Assumes fins is left intact by CSE. */
   2288   return tr;
   2289 }
   2290 
   2291 LJFOLD(TBAR TNEW)
   2292 LJFOLD(TBAR TDUP)
   2293 LJFOLDF(barrier_tnew_tdup)
   2294 {
   2295   /* New tables are always white and never need a barrier. */
   2296   if (fins->op1 < J->chain[IR_LOOP])  /* Except across a GC step. */
   2297     return NEXTFOLD;
   2298   return DROPFOLD;
   2299 }
   2300 
   2301 /* -- Profiling ----------------------------------------------------------- */
   2302 
   2303 LJFOLD(PROF any any)
   2304 LJFOLDF(prof)
   2305 {
   2306   IRRef ref = J->chain[IR_PROF];
   2307   if (ref+1 == J->cur.nins)  /* Drop neighbouring IR_PROF. */
   2308     return ref;
   2309   return EMITFOLD;
   2310 }
   2311 
   2312 /* -- Stores and allocations ---------------------------------------------- */
   2313 
   2314 /* Stores and allocations cannot be folded or passed on to CSE in general.
   2315 ** But some stores can be eliminated with dead-store elimination (DSE).
   2316 **
   2317 ** Caveat: *all* stores and allocs must be listed here or they end up at CSE!
   2318 */
   2319 
   2320 LJFOLD(ASTORE any any)
   2321 LJFOLD(HSTORE any any)
   2322 LJFOLDX(lj_opt_dse_ahstore)
   2323 
   2324 LJFOLD(USTORE any any)
   2325 LJFOLDX(lj_opt_dse_ustore)
   2326 
   2327 LJFOLD(FSTORE any any)
   2328 LJFOLDX(lj_opt_dse_fstore)
   2329 
   2330 LJFOLD(XSTORE any any)
   2331 LJFOLDX(lj_opt_dse_xstore)
   2332 
   2333 LJFOLD(NEWREF any any)  /* Treated like a store. */
   2334 LJFOLD(CALLA any any)
   2335 LJFOLD(CALLL any any)  /* Safeguard fallback. */
   2336 LJFOLD(CALLS any any)
   2337 LJFOLD(CALLXS any any)
   2338 LJFOLD(XBAR)
   2339 LJFOLD(RETF any any)  /* Modifies BASE. */
   2340 LJFOLD(TNEW any any)
   2341 LJFOLD(TDUP any)
   2342 LJFOLD(CNEW any any)
   2343 LJFOLD(XSNEW any any)
   2344 LJFOLD(BUFHDR any any)
   2345 LJFOLDX(lj_ir_emit)
   2346 
   2347 /* ------------------------------------------------------------------------ */
   2348 
   2349 /* Every entry in the generated hash table is a 32 bit pattern:
   2350 **
   2351 ** xxxxxxxx iiiiiii lllllll rrrrrrrrrr
   2352 **
   2353 **   xxxxxxxx = 8 bit index into fold function table
   2354 **    iiiiiii = 7 bit folded instruction opcode
   2355 **    lllllll = 7 bit left instruction opcode
   2356 ** rrrrrrrrrr = 8 bit right instruction opcode or 10 bits from literal field
   2357 */
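
         /* For instance, the (BOR any KINT) rule from above is stored with the
         ** left field set to all ones -- the lookup below ORs the wildcard
         ** bits into the key, so 'any' must be encoded that way -- and the
         ** right field set to the opcode of KINT.
         */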
   2358 
   2359 #include "lj_folddef.h"
   2360 
   2361 /* ------------------------------------------------------------------------ */
   2362 
   2363 /* Fold IR instruction. */
   2364 TRef LJ_FASTCALL lj_opt_fold(jit_State *J)
   2365 {
   2366   uint32_t key, any;
   2367   IRRef ref;
   2368 
   2369   if (LJ_UNLIKELY((J->flags & JIT_F_OPT_MASK) != JIT_F_OPT_DEFAULT)) {
   2370     lua_assert(((JIT_F_OPT_FOLD|JIT_F_OPT_FWD|JIT_F_OPT_CSE|JIT_F_OPT_DSE) |
   2371 		JIT_F_OPT_DEFAULT) == JIT_F_OPT_DEFAULT);
   2372     /* Folding disabled? Chain to CSE, but not for loads/stores/allocs. */
   2373     if (!(J->flags & JIT_F_OPT_FOLD) && irm_kind(lj_ir_mode[fins->o]) == IRM_N)
   2374       return lj_opt_cse(J);
   2375 
   2376     /* No FOLD, forwarding or CSE? Emit raw IR for loads, except for SLOAD. */
   2377     if ((J->flags & (JIT_F_OPT_FOLD|JIT_F_OPT_FWD|JIT_F_OPT_CSE)) !=
   2378 		    (JIT_F_OPT_FOLD|JIT_F_OPT_FWD|JIT_F_OPT_CSE) &&
   2379 	irm_kind(lj_ir_mode[fins->o]) == IRM_L && fins->o != IR_SLOAD)
   2380       return lj_ir_emit(J);
   2381 
   2382     /* No FOLD or DSE? Emit raw IR for stores. */
   2383     if ((J->flags & (JIT_F_OPT_FOLD|JIT_F_OPT_DSE)) !=
   2384 		    (JIT_F_OPT_FOLD|JIT_F_OPT_DSE) &&
   2385 	irm_kind(lj_ir_mode[fins->o]) == IRM_S)
   2386       return lj_ir_emit(J);
   2387   }
   2388 
   2389   /* Fold engine start/retry point. */
   2390 retry:
   2391   /* Construct key from opcode and operand opcodes (unless literal/none). */
   2392   key = ((uint32_t)fins->o << 17);
   2393   if (fins->op1 >= J->cur.nk) {
   2394     key += (uint32_t)IR(fins->op1)->o << 10;
   2395     *fleft = *IR(fins->op1);
   2396     if (fins->op1 < REF_TRUE)
    2397       fleft[1] = IR(fins->op1)[1];  /* Copy second part of 64 bit constant. */
   2398   }
   2399   if (fins->op2 >= J->cur.nk) {
   2400     key += (uint32_t)IR(fins->op2)->o;
   2401     *fright = *IR(fins->op2);
   2402     if (fins->op2 < REF_TRUE)
    2403       fright[1] = IR(fins->op2)[1];  /* Copy second part of 64 bit constant. */
   2404   } else {
   2405     key += (fins->op2 & 0x3ffu);  /* Literal mask. Must include IRCONV_*MASK. */
   2406   }
   2407 
   2408   /* Check for a match in order from most specific to least specific. */
   2409   any = 0;
   2410   for (;;) {
   2411     uint32_t k = key | (any & 0x1ffff);
   2412     uint32_t h = fold_hashkey(k);
   2413     uint32_t fh = fold_hash[h];  /* Lookup key in semi-perfect hash table. */
   2414     if ((fh & 0xffffff) == k || (fh = fold_hash[h+1], (fh & 0xffffff) == k)) {
   2415       ref = (IRRef)tref_ref(fold_func[fh >> 24](J));
   2416       if (ref != NEXTFOLD)
   2417 	break;
   2418     }
   2419     if (any == 0xfffff)  /* Exhausted folding. Pass on to CSE. */
   2420       return lj_opt_cse(J);
    2421     any = (any | (any >> 10)) ^ 0xffc00;  /* 0 -> 0xffc00 -> 0x3ff -> 0xfffff */
   2422   }
   2423 
   2424   /* Return value processing, ordered by frequency. */
   2425   if (LJ_LIKELY(ref >= MAX_FOLD))
   2426     return TREF(ref, irt_t(IR(ref)->t));
   2427   if (ref == RETRYFOLD)
   2428     goto retry;
   2429   if (ref == KINTFOLD)
   2430     return lj_ir_kint(J, fins->i);
   2431   if (ref == FAILFOLD)
   2432     lj_trace_err(J, LJ_TRERR_GFAIL);
   2433   lua_assert(ref == DROPFOLD);
   2434   return REF_DROP;
   2435 }
   2436 
   2437 /* -- Common-Subexpression Elimination ------------------------------------ */
   2438 
   2439 /* CSE an IR instruction. This is very fast due to the skip-list chains. */
   2440 TRef LJ_FASTCALL lj_opt_cse(jit_State *J)
   2441 {
   2442   /* Avoid narrow to wide store-to-load forwarding stall */
   2443   IRRef2 op12 = (IRRef2)fins->op1 + ((IRRef2)fins->op2 << 16);
   2444   IROp op = fins->o;
   2445   if (LJ_LIKELY(J->flags & JIT_F_OPT_CSE)) {
   2446     /* Limited search for same operands in per-opcode chain. */
   2447     IRRef ref = J->chain[op];
   2448     IRRef lim = fins->op1;
   2449     if (fins->op2 > lim) lim = fins->op2;  /* Relies on lit < REF_BIAS. */
    2450     while (ref > lim) {  /* A matching ins must come after both operands. */
   2451       if (IR(ref)->op12 == op12)
   2452 	return TREF(ref, irt_t(IR(ref)->t));  /* Common subexpression found. */
   2453       ref = IR(ref)->prev;
   2454     }
   2455   }
   2456   /* Otherwise emit IR (inlined for speed). */
   2457   {
   2458     IRRef ref = lj_ir_nextins(J);
   2459     IRIns *ir = IR(ref);
   2460     ir->prev = J->chain[op];
   2461     ir->op12 = op12;
   2462     J->chain[op] = (IRRef1)ref;
   2463     ir->o = fins->o;
   2464     J->guardemit.irt |= fins->t.irt;
   2465     return TREF(ref, irt_t((ir->t = fins->t)));
   2466   }
   2467 }
   2468 
   2469 /* CSE with explicit search limit. */
   2470 TRef LJ_FASTCALL lj_opt_cselim(jit_State *J, IRRef lim)
   2471 {
   2472   IRRef ref = J->chain[fins->o];
   2473   IRRef2 op12 = (IRRef2)fins->op1 + ((IRRef2)fins->op2 << 16);
   2474   while (ref > lim) {
   2475     if (IR(ref)->op12 == op12)
   2476       return ref;
   2477     ref = IR(ref)->prev;
   2478   }
   2479   return lj_ir_emit(J);
   2480 }
   2481 
   2482 /* ------------------------------------------------------------------------ */
   2483 
   2484 #undef IR
   2485 #undef fins
   2486 #undef fleft
   2487 #undef fright
   2488 #undef knumleft
   2489 #undef knumright
   2490 #undef emitir
   2491 
   2492 #endif