path: root/src/lj_opt_narrow.c
/*
** NARROW: Narrowing of numbers to integers (double to int32_t).
** STRIPOV: Stripping of overflow checks.
** Copyright (C) 2005-2022 Mike Pall. See Copyright Notice in luajit.h
*/

#define lj_opt_narrow_c
#define LUA_CORE

#include "lj_obj.h"

#if LJ_HASJIT

#include "lj_bc.h"
#include "lj_ir.h"
#include "lj_jit.h"
#include "lj_iropt.h"
#include "lj_trace.h"
#include "lj_vm.h"
#include "lj_strscan.h"

/* Rationale for narrowing optimizations:
**
** Lua has only a single number type and this is a FP double by default.
** Narrowing doubles to integers does not pay off for the interpreter on a
** current-generation x86/x64 machine. Most FP operations need the same
** amount of execution resources as their integer counterparts, except
** with slightly longer latencies. Longer latencies are a non-issue for
** the interpreter, since they are usually hidden by other overhead.
**
** The total CPU execution bandwidth is the sum of the bandwidth of the FP
** and the integer units, because they execute in parallel. The FP units
** have an equal or higher bandwidth than the integer units. Not using
** them means losing execution bandwidth. Moving work away from them to
** the already quite busy integer units is a losing proposition.
**
** The situation for JIT-compiled code is a bit different: the higher code
** density makes the extra latencies much more visible. Tight loops expose
** the latencies for updating the induction variables. Array indexing
** requires narrowing conversions with high latencies and additional
** guards (to check that the index is really an integer). And many common
** optimizations only work on integers.
**
** One solution would be speculative, eager narrowing of all number loads.
** This causes many problems, like losing -0 or the need to resolve type
** mismatches between traces. It also effectively forces the integer type
** to have overflow-checking semantics. This impedes many basic
** optimizations and requires adding overflow checks to all integer
** arithmetic operations (whereas FP arithmetic can do without).
**
** Always replacing an FP op with an integer op plus an overflow check is
** counter-productive on a current-generation super-scalar CPU. Although
** the overflow check branches are highly predictable, they will clog the
** execution port for the branch unit and tie up reorder buffers. This is
** turning a pure data-flow dependency into a different data-flow
** dependency (with slightly lower latency) *plus* a control dependency.
** In general, you don't want to do this since latencies due to data-flow
** dependencies can be well hidden by out-of-order execution.
**
** A better solution is to keep all numbers as FP values and only narrow
** when it's beneficial to do so. LuaJIT uses predictive narrowing for
** induction variables and demand-driven narrowing for index expressions,
** integer arguments and bit operations. Additionally it can eliminate or
** hoist most of the resulting overflow checks. Regular arithmetic
** computations are never narrowed to integers.
**
** The integer type in the IR has convenient wrap-around semantics and
** ignores overflow. Extra operations have been added for
** overflow-checking arithmetic (ADDOV/SUBOV) instead of an extra type.
** Apart from reducing overall complexity of the compiler, this also
** nicely solves the problem where you want to apply algebraic
** simplifications to ADD, but not to ADDOV. And the x86/x64 assembler can
** use lea instead of an add for integer ADD, but not for ADDOV (lea does
** not affect the flags, but it helps to avoid register moves).
**
**
** All of the above has to be reconsidered for architectures with slow FP
** operations or without a hardware FPU. The dual-number mode of LuaJIT
** addresses this issue. Arithmetic operations are performed on integers
** as far as possible and overflow checks are added as needed.
**
** This implies that narrowing for integer arguments and bit operations
** should also strip overflow checks, e.g. replace ADDOV with ADD. The
** original overflow guards are weak and can be eliminated by DCE, if
** there's no other use.
**
** A slight twist is that it's usually beneficial to use overflow-checked
** integer arithmetic if all inputs are already integers. This is the only
** one of these changes that also affects the single-number mode.
*/
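
/* Illustrative sketch (not compiled, not part of LuaJIT): the semantic
** difference between the plain wrap-around ADD of the IR integer type and
** an overflow-checked ADDOV, as discussed above. The helper names and the
** branch-based overflow test are made up for illustration only; this is
** not the code the JIT emits.
*/
#if 0
#include <stdint.h>
#include <stdio.h>

/* Wrap-around add: always defined, result is taken modulo 2^32. */
static int32_t add_wrap(int32_t a, int32_t b)
{
  return (int32_t)((uint32_t)a + (uint32_t)b);
}

/* Overflow-checked add: returns 0 on overflow (a guarded ADDOV would exit
** the trace here), otherwise stores the result and returns 1. */
static int add_ov(int32_t a, int32_t b, int32_t *res)
{
  int32_t r = (int32_t)((uint32_t)a + (uint32_t)b);
  if (((a ^ r) & (b ^ r)) < 0)  /* Result sign differs from both inputs. */
    return 0;
  *res = r;
  return 1;
}

int main(void)
{
  int32_t r;
  printf("%d\n", add_wrap(INT32_MAX, 1));    /* Wraps to -2147483648. */
  printf("%d\n", add_ov(INT32_MAX, 1, &r));  /* 0: overflow detected. */
  return 0;
}
#endif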

/* Some local macros to save typing. Undef'd at the end. */
#define IR(ref)			(&J->cur.ir[(ref)])
#define fins			(&J->fold.ins)

/* Pass IR on to next optimization in chain (FOLD). */
#define emitir(ot, a, b)	(lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))

#define emitir_raw(ot, a, b)	(lj_ir_set(J, (ot), (a), (b)), lj_ir_emit(J))

/* -- Elimination of narrowing type conversions --------------------------- */

/* Narrowing of index expressions and bit operations is demand-driven. The
** trace recorder emits a narrowing type conversion (CONV.int.num or TOBIT)
** in all of these cases (e.g. array indexing or string indexing). FOLD
** already takes care of eliminating simple redundant conversions like
** CONV.int.num(CONV.num.int(x)) ==> x.
**
** But the surrounding code is FP-heavy and arithmetic operations are
** performed on FP numbers (for the single-number mode). Consider a common
** example such as 'x=t[i+1]', with 'i' already an integer (due to induction
** variable narrowing). The index expression would be recorded as
**   CONV.int.num(ADD(CONV.num.int(i), 1))
** which is clearly suboptimal.
**
** One can do better by recursively backpropagating the narrowing type
** conversion across FP arithmetic operations. This turns FP ops into
** their corresponding integer counterparts. Depending on the semantics of
** the conversion they also need to check for overflow. Currently only ADD
** and SUB are supported.
**
** The above example can be rewritten as
**   ADDOV(CONV.int.num(CONV.num.int(i)), 1)
** and then into ADDOV(i, 1) after folding of the conversions. The original
** FP ops remain in the IR and are eliminated by DCE since all references to
** them are gone.
**
** [In dual-number mode the trace recorder already emits ADDOV etc., but
** this can be further reduced. See below.]
**
** Special care has to be taken to avoid narrowing across an operation
** which is potentially operating on non-integral operands. One obvious
** case is when an expression contains a non-integral constant, but ends
** up as an integer index at runtime (like t[x+1.5] with x=0.5).
**
** Operations with two non-constant operands illustrate a similar problem
** (like t[a+b] with a=1.5 and b=2.5). Backpropagation has to stop there,
** unless it can be proven that either operand is integral (e.g. by CSEing
** a previous conversion). As a not-so-obvious corollary this logic also
** applies for a whole expression tree (e.g. t[(a+1)+(b+1)]).
**
** Correctness of the transformation is guaranteed by never expanding the
** tree, i.e. by not adding more conversions than the single one we would
** need to emit if not backpropagating. TOBIT employs a more optimistic
** rule, because
** the conversion has special semantics, designed to make the life of the
** compiler writer easier. ;-)
**
** Using on-the-fly backpropagation of an expression tree doesn't work
** because it's unknown whether the transform is correct until the end.
** This either requires IR rollback and cache invalidation for every
** subtree or a two-pass algorithm. The former didn't work out too well,
** so the code now combines a recursive collector with a stack-based
** emitter.
**
** [A recursive backpropagation algorithm with backtracking, employing
** skip-list lookup and round-robin caching, emitting stack operations
** on-the-fly for a stack-based interpreter -- and all of that in a meager
** kilobyte? Yep, compilers are a great treasure chest. Throw away your
** textbooks and read the codebase of a compiler today!]
**
** There's another optimization opportunity for array indexing: it's
** always accompanied by an array bounds-check. The outermost overflow
** check may be delegated to the ABC operation. This works because ABC is
** an unsigned comparison and wrap-around due to overflow creates negative
** numbers.
**
** But this optimization is only valid for constants that cannot overflow
** an int32_t into the range of valid array indexes [0..2^27+1). A check
** for +-2^30 is safe since -2^31 - 2^30 wraps to 2^30 and 2^31-1 + 2^30
** wraps to -2^30-1.
**
** It's also good enough in practice, since e.g. t[i+1] or t[i-10] are
** quite common. So the above example finally ends up as ADD(i, 1)!
**
** Later on, the assembler is able to fuse the whole array reference and
** the ADD into the memory operands of loads and other instructions. This
** is why LuaJIT is able to generate very pretty (and fast) machine code
** for array indexing. And that, my dear, concludes another story about
** one of the hidden secrets of LuaJIT ...
*/
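
/* Illustrative sketch (not compiled, not part of LuaJIT): the two unsigned
** tricks described above. The biased compare accepts a constant k only if
** -2^30 <= k < 2^30, and the bounds check shows why an index that wrapped
** around on overflow is still rejected by a single unsigned comparison
** against the array size. The helper names are made up for illustration.
*/
#if 0
#include <stdint.h>
#include <stdio.h>

/* 1 iff -2^30 <= k < 2^30, using the same biased compare as the code below. */
static int small_const(int32_t k)
{
  return (uint32_t)k + 0x40000000u < 0x80000000u;
}

/* Array bounds check as an unsigned comparison: negative indexes turn into
** huge unsigned values and fail the same test as too-large indexes. */
static int in_bounds(int32_t idx, uint32_t asize)
{
  return (uint32_t)idx < asize;
}

int main(void)
{
  int32_t wrapped = (int32_t)((uint32_t)INT32_MAX + 1u);  /* Overflowed index. */
  printf("%d %d %d\n", small_const(1), small_const(1<<30), small_const(-(1<<30)));
  /* Prints "1 0 1": 2^30 itself is rejected, -2^30 is accepted. */
  printf("%d\n", in_bounds(wrapped, 100u));  /* Prints "0". */
  return 0;
}
#endif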

/* Maximum backpropagation depth and maximum stack size. */
#define NARROW_MAX_BACKPROP	100
#define NARROW_MAX_STACK	256

/* The stack machine has a 32 bit instruction format: [IROpT | IRRef1]
** The lower 16 bits hold a reference (or 0). The upper 16 bits hold
** the IR opcode + type or one of the following special opcodes:
*/
enum {
  NARROW_REF,		/* Push ref. */
  NARROW_CONV,		/* Push conversion of ref. */
  NARROW_SEXT,		/* Push sign-extension of ref. */
  NARROW_INT		/* Push KINT ref. The next code holds an int32_t. */
};

typedef uint32_t NarrowIns;

#define NARROWINS(op, ref)	(((op) << 16) + (ref))
#define narrow_op(ins)		((IROpT)((ins) >> 16))
#define narrow_ref(ins)		((IRRef1)(ins))
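
/* Illustrative sketch (not compiled, not part of LuaJIT): packing and
** unpacking one stack-machine code as described above -- opcode or special
** op in the upper 16 bits, IR reference (or 0) in the lower 16 bits. The
** opcode value is a stand-in, not the real NARROW_CONV.
*/
#if 0
#include <stdint.h>
#include <assert.h>

int main(void)
{
  enum { EX_OP = 1 };  /* Stand-in opcode. */
  uint32_t ins = ((uint32_t)EX_OP << 16) + 0x1234u;  /* Cf. NARROWINS(op, ref). */
  assert((ins >> 16) == EX_OP);     /* Cf. narrow_op(ins). */
  assert((uint16_t)ins == 0x1234);  /* Cf. narrow_ref(ins). */
  return 0;
}
#endif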

/* Context used for narrowing of type conversions. */
typedef struct NarrowConv {
  jit_State *J;		/* JIT compiler state. */
  NarrowIns *sp;	/* Current stack pointer. */
  NarrowIns *maxsp;	/* Maximum stack pointer minus redzone. */
  IRRef mode;		/* Conversion mode (IRCONV_*). */
  IRType t;		/* Destination type: IRT_INT or IRT_I64. */
  NarrowIns stack[NARROW_MAX_STACK];  /* Stack holding stack-machine code. */
} NarrowConv;

/* Lookup a reference in the backpropagation cache. */
static BPropEntry *narrow_bpc_get(jit_State *J, IRRef1 key, IRRef mode)
{
  ptrdiff_t i;
  for (i = 0; i < BPROP_SLOTS; i++) {
    BPropEntry *bp = &J->bpropcache[i];
    /* Stronger checks are ok, too. */
    if (bp->key == key && bp->mode >= mode &&
	((bp->mode ^ mode) & IRCONV_MODEMASK) == 0)
      return bp;
  }
  return NULL;
}

/* Add an entry to the backpropagation cache. */
static void narrow_bpc_set(jit_State *J, IRRef1 key, IRRef1 val, IRRef mode)
{
  uint32_t slot = J->bpropslot;
  BPropEntry *bp = &J->bpropcache[slot];
  J->bpropslot = (slot + 1) & (BPROP_SLOTS-1);
  bp->key = key;
  bp->val = val;
  bp->mode = mode;
}

/* Backpropagate overflow stripping. */
static void narrow_stripov_backprop(NarrowConv *nc, IRRef ref, int depth)
{
  jit_State *J = nc->J;
  IRIns *ir = IR(ref);
  if (ir->o == IR_ADDOV || ir->o == IR_SUBOV ||
      (ir->o == IR_MULOV && (nc->mode & IRCONV_CONVMASK) == IRCONV_ANY)) {
    BPropEntry *bp = narrow_bpc_get(nc->J, ref, IRCONV_TOBIT);
    if (bp) {
      ref = bp->val;
    } else if (++depth < NARROW_MAX_BACKPROP && nc->sp < nc->maxsp) {
      NarrowIns *savesp = nc->sp;
      narrow_stripov_backprop(nc, ir->op1, depth);
      if (nc->sp < nc->maxsp) {
	narrow_stripov_backprop(nc, ir->op2, depth);
	if (nc->sp < nc->maxsp) {
	  *nc->sp++ = NARROWINS(IRT(ir->o - IR_ADDOV + IR_ADD, IRT_INT), ref);
	  return;
	}
      }
      nc->sp = savesp;  /* Path too deep, need to backtrack. */
    }
  }
  *nc->sp++ = NARROWINS(NARROW_REF, ref);
}

/* Backpropagate narrowing conversion. Return number of needed conversions. */
static int narrow_conv_backprop(NarrowConv *nc, IRRef ref, int depth)
{
  jit_State *J = nc->J;
  IRIns *ir = IR(ref);
  IRRef cref;

  if (nc->sp >= nc->maxsp) return 10;  /* Path too deep. */

  /* Check the easy cases first. */
  if (ir->o == IR_CONV && (ir->op2 & IRCONV_SRCMASK) == IRT_INT) {
    if ((nc->mode & IRCONV_CONVMASK) <= IRCONV_ANY)
      narrow_stripov_backprop(nc, ir->op1, depth+1);
    else
      *nc->sp++ = NARROWINS(NARROW_REF, ir->op1);  /* Undo conversion. */
    if (nc->t == IRT_I64)
      *nc->sp++ = NARROWINS(NARROW_SEXT, 0);  /* Sign-extend integer. */
    return 0;
  } else if (ir->o == IR_KNUM) {  /* Narrow FP constant. */
    lua_Number n = ir_knum(ir)->n;
    if ((nc->mode & IRCONV_CONVMASK) == IRCONV_TOBIT) {
      /* Allows a wider range of constants. */
      int64_t k64 = (int64_t)n;
      if (n == (lua_Number)k64) {  /* Only if const doesn't lose precision. */
	*nc->sp++ = NARROWINS(NARROW_INT, 0);
	*nc->sp++ = (NarrowIns)k64;  /* But always truncate to 32 bits. */
	return 0;
      }
    } else {
      int32_t k = lj_num2int(n);
      /* Only if constant is a small integer. */
      if (checki16(k) && n == (lua_Number)k) {
	*nc->sp++ = NARROWINS(NARROW_INT, 0);
	*nc->sp++ = (NarrowIns)k;
	return 0;
      }
    }
    return 10;  /* Never narrow other FP constants (this is rare). */
  }

  /* Try to CSE the conversion. Stronger checks are ok, too. */
  cref = J->chain[fins->o];
  while (cref > ref) {
    IRIns *cr = IR(cref);
    if (cr->op1 == ref &&
	(fins->o == IR_TOBIT ||
	 ((cr->op2 & IRCONV_MODEMASK) == (nc->mode & IRCONV_MODEMASK) &&
	  irt_isguard(cr->t) >= irt_isguard(fins->t)))) {
      *nc->sp++ = NARROWINS(NARROW_REF, cref);
      return 0;  /* Already there, no additional conversion needed. */
    }
    cref = cr->prev;
  }

  /* Backpropagate across ADD/SUB. */
  if (ir->o == IR_ADD || ir->o == IR_SUB) {
    /* Try cache lookup first. */
    IRRef mode = nc->mode;
    BPropEntry *bp;
    /* Inner conversions need a stronger check. */
    if ((mode & IRCONV_CONVMASK) == IRCONV_INDEX && depth > 0)
      mode += IRCONV_CHECK-IRCONV_INDEX;
    bp = narrow_bpc_get(nc->J, (IRRef1)ref, mode);
    if (bp) {
      *nc->sp++ = NARROWINS(NARROW_REF, bp->val);
      return 0;
    } else if (nc->t == IRT_I64) {
      /* Try sign-extending from an existing (checked) conversion to int. */
      mode = (IRT_INT<<5)|IRT_NUM|IRCONV_INDEX;
      bp = narrow_bpc_get(nc->J, (IRRef1)ref, mode);
      if (bp) {
	*nc->sp++ = NARROWINS(NARROW_REF, bp->val);
	*nc->sp++ = NARROWINS(NARROW_SEXT, 0);
	return 0;
      }
    }
    if (++depth < NARROW_MAX_BACKPROP && nc->sp < nc->maxsp) {
      NarrowIns *savesp = nc->sp;
      int count = narrow_conv_backprop(nc, ir->op1, depth);
      count += narrow_conv_backprop(nc, ir->op2, depth);
      if (count <= 1) {  /* Limit total number of conversions. */
	*nc->sp++ = NARROWINS(IRT(ir->o, nc->t), ref);
	return count;
      }
      nc->sp = savesp;  /* Too many conversions, need to backtrack. */
    }
  }

  /* Otherwise add a conversion. */
  *nc->sp++ = NARROWINS(NARROW_CONV, ref);
  return 1;
}

/* Emit the conversions collected during backpropagation. */
static IRRef narrow_conv_emit(jit_State *J, NarrowConv *nc)
{
  /* The fins fields must be saved now -- emitir() overwrites them. */
  IROpT guardot = irt_isguard(fins->t) ? IRTG(IR_ADDOV-IR_ADD, 0) : 0;
  IROpT convot = fins->ot;
  IRRef1 convop2 = fins->op2;
  NarrowIns *next = nc->stack;  /* List of instructions from backpropagation. */
  NarrowIns *last = nc->sp;
  NarrowIns *sp = nc->stack;  /* Recycle the stack to store operands. */
  while (next < last) {  /* Simple stack machine to process the ins. list. */
    NarrowIns ref = *next++;
    IROpT op = narrow_op(ref);
    if (op == NARROW_REF) {
      *sp++ = ref;
    } else if (op == NARROW_CONV) {
      *sp++ = emitir_raw(convot, ref, convop2);  /* Raw emit avoids a loop. */
    } else if (op == NARROW_SEXT) {
      lua_assert(sp >= nc->stack+1);
      sp[-1] = emitir(IRT(IR_CONV, IRT_I64), sp[-1],
		      (IRT_I64<<5)|IRT_INT|IRCONV_SEXT);
    } else if (op == NARROW_INT) {
      lua_assert(next < last);
      *sp++ = nc->t == IRT_I64 ?
	      lj_ir_kint64(J, (int64_t)(int32_t)*next++) :
	      lj_ir_kint(J, *next++);
    } else {  /* Regular IROpT. Pops two operands and pushes one result. */
      IRRef mode = nc->mode;
      lua_assert(sp >= nc->stack+2);
      sp--;
      /* Omit some overflow checks for array indexing. See comments above. */
      if ((mode & IRCONV_CONVMASK) == IRCONV_INDEX) {
	if (next == last && irref_isk(narrow_ref(sp[0])) &&
	  (uint32_t)IR(narrow_ref(sp[0]))->i + 0x40000000u < 0x80000000u)
	  guardot = 0;
	else  /* Otherwise cache a stronger check. */
	  mode += IRCONV_CHECK-IRCONV_INDEX;
      }
      sp[-1] = emitir(op+guardot, sp[-1], sp[0]);
      /* Add to cache. */
      if (narrow_ref(ref))
	narrow_bpc_set(J, narrow_ref(ref), narrow_ref(sp[-1]), mode);
    }
  }
  lua_assert(sp == nc->stack+1);
  return nc->stack[0];
}

/* Narrow a type conversion of an arithmetic operation. */
TRef LJ_FASTCALL lj_opt_narrow_convert(jit_State *J)
{
  if ((J->flags & JIT_F_OPT_NARROW)) {
    NarrowConv nc;
    nc.J = J;
    nc.sp = nc.stack;
    nc.maxsp = &nc.stack[NARROW_MAX_STACK-4];
    nc.t = irt_type(fins->t);
    if (fins->o == IR_TOBIT) {
      nc.mode = IRCONV_TOBIT;  /* Used only in the backpropagation cache. */
    } else {
      nc.mode = fins->op2;
    }
    if (narrow_conv_backprop(&nc, fins->op1, 0) <= 1)
      return narrow_conv_emit(J, &nc);
  }
  return NEXTFOLD;
}

/* -- Narrowing of implicit conversions ----------------------------------- */

/* Recursively strip overflow checks. */
static TRef narrow_stripov(jit_State *J, TRef tr, int lastop, IRRef mode)
{
  IRRef ref = tref_ref(tr);
  IRIns *ir = IR(ref);
  int op = ir->o;
  if (op >= IR_ADDOV && op <= lastop) {
    BPropEntry *bp = narrow_bpc_get(J, ref, mode);
    if (bp) {
      return TREF(bp->val, irt_t(IR(bp->val)->t));
    } else {
      IRRef op1 = ir->op1, op2 = ir->op2;  /* The IR may be reallocated. */
      op1 = narrow_stripov(J, op1, lastop, mode);
      op2 = narrow_stripov(J, op2, lastop, mode);
      tr = emitir(IRT(op - IR_ADDOV + IR_ADD,
		      ((mode & IRCONV_DSTMASK) >> IRCONV_DSH)), op1, op2);
      narrow_bpc_set(J, ref, tref_ref(tr), mode);
    }
  } else if (LJ_64 && (mode & IRCONV_SEXT) && !irt_is64(ir->t)) {
    tr = emitir(IRT(IR_CONV, IRT_INTP), tr, mode);
  }
  return tr;
}

/* Narrow array index. */
TRef LJ_FASTCALL lj_opt_narrow_index(jit_State *J, TRef tr)
{
  IRIns *ir;
  lua_assert(tref_isnumber(tr));
  if (tref_isnum(tr))  /* Conversion may be narrowed, too. See above. */
    return emitir(IRTGI(IR_CONV), tr, IRCONV_INT_NUM|IRCONV_INDEX);
  /* Omit some overflow checks for array indexing. See comments above. */
  ir = IR(tref_ref(tr));
  if ((ir->o == IR_ADDOV || ir->o == IR_SUBOV) && irref_isk(ir->op2) &&
      (uint32_t)IR(ir->op2)->i + 0x40000000u < 0x80000000u)
    return emitir(IRTI(ir->o - IR_ADDOV + IR_ADD), ir->op1, ir->op2);
  return tr;
}

/* Narrow conversion to integer operand (overflow undefined). */
TRef LJ_FASTCALL lj_opt_narrow_toint(jit_State *J, TRef tr)
{
  if (tref_isstr(tr))
    tr = emitir(IRTG(IR_STRTO, IRT_NUM), tr, 0);
  if (tref_isnum(tr))  /* Conversion may be narrowed, too. See above. */
    return emitir(IRTI(IR_CONV), tr, IRCONV_INT_NUM|IRCONV_ANY);
  if (!tref_isinteger(tr))
    lj_trace_err(J, LJ_TRERR_BADTYPE);
  /*
  ** Undefined overflow semantics allow stripping of ADDOV, SUBOV and MULOV.
  ** Use IRCONV_TOBIT for the cache entries, since the semantics are the same.
  */
  return narrow_stripov(J, tr, IR_MULOV, (IRT_INT<<5)|IRT_INT|IRCONV_TOBIT);
}

/* Narrow conversion to bitop operand (overflow wrapped). */
TRef LJ_FASTCALL lj_opt_narrow_tobit(jit_State *J, TRef tr)
{
  if (tref_isstr(tr))
    tr = emitir(IRTG(IR_STRTO, IRT_NUM), tr, 0);
  if (tref_isnum(tr))  /* Conversion may be narrowed, too. See above. */
    return emitir(IRTI(IR_TOBIT), tr, lj_ir_knum_tobit(J));
  if (!tref_isinteger(tr))
    lj_trace_err(J, LJ_TRERR_BADTYPE);
  /*
  ** Wrapped overflow semantics allow stripping of ADDOV and SUBOV.
  ** MULOV cannot be stripped due to precision widening.
  */
  return narrow_stripov(J, tr, IR_SUBOV, (IRT_INT<<5)|IRT_INT|IRCONV_TOBIT);
}
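
/* Illustrative sketch (not compiled, not part of LuaJIT): why MULOV is not
** stripped for bit operands. Bit operands are taken modulo 2^32; for
** integer inputs an FP add or sub is exact (the result stays well below
** 2^53), so wrap-around ADD/SUB yields the same low 32 bits as the FP op.
** An FP multiply can exceed 2^53 and gets rounded, so its low 32 bits may
** differ from the exact wrap-around product. The values below are chosen
** for illustration only.
*/
#if 0
#include <stdint.h>
#include <stdio.h>

int main(void)
{
  int32_t a = (1 << 27) + 1;                   /* 134217729 */
  uint32_t wrap = (uint32_t)a * (uint32_t)a;   /* Exact product mod 2^32. */
  double fp = (double)a * (double)a;           /* Rounded to 2^54 + 2^28. */
  uint32_t bits = (uint32_t)(uint64_t)fp;      /* Low 32 bits of the FP result. */
  printf("%u %u\n", wrap, bits);               /* Prints "268435457 268435456". */
  return 0;
}
#endif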

#if LJ_HASFFI
/* Narrow C array index (overflow undefined). */
TRef LJ_FASTCALL lj_opt_narrow_cindex(jit_State *J, TRef tr)
{
  lua_assert(tref_isnumber(tr));
  if (tref_isnum(tr))
    return emitir(IRT(IR_CONV, IRT_INTP), tr, (IRT_INTP<<5)|IRT_NUM|IRCONV_ANY);
  /* Undefined overflow semantics allow stripping of ADDOV, SUBOV and MULOV. */
  return narrow_stripov(J, tr, IR_MULOV,
			LJ_64 ? ((IRT_INTP<<5)|IRT_INT|IRCONV_SEXT) :
				((IRT_INTP<<5)|IRT_INT|IRCONV_TOBIT));
}
#endif

/* -- Narrowing of arithmetic operators ----------------------------------- */

/* Check whether a number fits into an int32_t (-0 is ok, too). */
static int numisint(lua_Number n)
{
  return (n == (lua_Number)lj_num2int(n));
}

/* Convert string to number. Error out for non-numeric string values. */
static TRef conv_str_tonum(jit_State *J, TRef tr, TValue *o)
{
  if (tref_isstr(tr)) {
    tr = emitir(IRTG(IR_STRTO, IRT_NUM), tr, 0);
    /* Would need an inverted STRTO for this rare and useless case. */
    if (!lj_strscan_num(strV(o), o))  /* Convert in-place. Value used below. */
      lj_trace_err(J, LJ_TRERR_BADTYPE);  /* Punt if non-numeric. */
  }
  return tr;
}

/* Narrowing of arithmetic operations. */
TRef lj_opt_narrow_arith(jit_State *J, TRef rb, TRef rc,
			 TValue *vb, TValue *vc, IROp op)
{
  rb = conv_str_tonum(J, rb, vb);
  rc = conv_str_tonum(J, rc, vc);
  /* Must not narrow MUL in non-DUALNUM variant, because it loses -0. */
  if ((op >= IR_ADD && op <= (LJ_DUALNUM ? IR_MUL : IR_SUB)) &&
      tref_isinteger(rb) && tref_isinteger(rc) &&
      numisint(lj_vm_foldarith(numberVnum(vb), numberVnum(vc),
			       (int)op - (int)IR_ADD)))
    return emitir(IRTGI((int)op - (int)IR_ADD + (int)IR_ADDOV), rb, rc);
  if (!tref_isnum(rb)) rb = emitir(IRTN(IR_CONV), rb, IRCONV_NUM_INT);
  if (!tref_isnum(rc)) rc = emitir(IRTN(IR_CONV), rc, IRCONV_NUM_INT);
  return emitir(IRTN(op), rb, rc);
}

/* Narrowing of unary minus operator. */
TRef lj_opt_narrow_unm(jit_State *J, TRef rc, TValue *vc)
{
  rc = conv_str_tonum(J, rc, vc);
  if (tref_isinteger(rc)) {
    if ((uint32_t)numberVint(vc) != 0x80000000u)
      return emitir(IRTGI(IR_SUBOV), lj_ir_kint(J, 0), rc);
    rc = emitir(IRTN(IR_CONV), rc, IRCONV_NUM_INT);
  }
  return emitir(IRTN(IR_NEG), rc, lj_ir_knum_neg(J));
}
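
/* Illustrative sketch (not compiled, not part of LuaJIT): why the
** 0x80000000 case is excluded above. -INT32_MIN is not representable in an
** int32_t, so SUBOV(0, x) would always fail its overflow check for that one
** value; the recorder falls back to an FP negation instead.
*/
#if 0
#include <stdint.h>
#include <stdio.h>

int main(void)
{
  int32_t x = INT32_MIN;
  /* 0 - INT32_MIN wraps back to INT32_MIN in two's complement. */
  printf("%d\n", (int32_t)(0u - (uint32_t)x));  /* Prints "-2147483648". */
  return 0;
}
#endif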

/* Narrowing of modulo operator. */
TRef lj_opt_narrow_mod(jit_State *J, TRef rb, TRef rc, TValue *vb, TValue *vc)
{
  TRef tmp;
  rb = conv_str_tonum(J, rb, vb);
  rc = conv_str_tonum(J, rc, vc);
  if ((LJ_DUALNUM || (J->flags & JIT_F_OPT_NARROW)) &&
      tref_isinteger(rb) && tref_isinteger(rc) &&
      (tvisint(vc) ? intV(vc) != 0 : !tviszero(vc))) {
    emitir(IRTGI(IR_NE), rc, lj_ir_kint(J, 0));
    return emitir(IRTI(IR_MOD), rb, rc);
  }
  /* b % c ==> b - floor(b/c)*c */
  rb = lj_ir_tonum(J, rb);
  rc = lj_ir_tonum(J, rc);
  tmp = emitir(IRTN(IR_DIV), rb, rc);
  tmp = emitir(IRTN(IR_FPMATH), tmp, IRFPM_FLOOR);
  tmp = emitir(IRTN(IR_MUL), tmp, rc);
  return emitir(IRTN(IR_SUB), rb, tmp);
}
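
/* Illustrative sketch (not compiled, not part of LuaJIT): Lua's modulo is
** defined as b - floor(b/c)*c, which is what the FP fallback above emits as
** DIV/FPMATH(FLOOR)/MUL/SUB. It differs from C's fmod() when the operands
** have mixed signs: the Lua result takes the sign of the divisor. The
** helper name is made up for illustration.
*/
#if 0
#include <stdio.h>
#include <math.h>

static double lua_mod(double b, double c)
{
  return b - floor(b/c)*c;
}

int main(void)
{
  printf("%g %g\n", lua_mod(5.0, -3.0), fmod(5.0, -3.0));  /* Prints "-1 2". */
  printf("%g %g\n", lua_mod(-5.0, 3.0), fmod(-5.0, 3.0));  /* Prints "1 -2". */
  return 0;
}
#endif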

/* Narrowing of power operator or math.pow. */
TRef lj_opt_narrow_pow(jit_State *J, TRef rb, TRef rc, TValue *vb, TValue *vc)
{
  rb = conv_str_tonum(J, rb, vb);
  rb = lj_ir_tonum(J, rb);  /* Left arg is always treated as an FP number. */
  rc = conv_str_tonum(J, rc, vc);
  /* Narrowing must be unconditional to preserve (-x)^i semantics. */
  if (tvisint(vc) || numisint(numV(vc))) {
    int checkrange = 0;
    /* Split pow is faster for bigger exponents. But do this only for (+k)^i. */
    if (tref_isk(rb) && (int32_t)ir_knum(IR(tref_ref(rb)))->u32.hi >= 0) {
      int32_t k = numberVint(vc);
      if (!(k >= -65536 && k <= 65536)) goto split_pow;
      checkrange = 1;
    }
    if (!tref_isinteger(rc)) {
      /* Guarded conversion to integer! */
      rc = emitir(IRTGI(IR_CONV), rc, IRCONV_INT_NUM|IRCONV_CHECK);
    }
    if (checkrange && !tref_isk(rc)) {  /* Range guard: -65536 <= i <= 65536 */
      TRef tmp = emitir(IRTI(IR_ADD), rc, lj_ir_kint(J, 65536));
      emitir(IRTGI(IR_ULE), tmp, lj_ir_kint(J, 2*65536));
    }
    return emitir(IRTN(IR_POW), rb, rc);
  }
split_pow:
  /* FOLD covers most cases, but some are easier to do here. */
  if (tref_isk(rb) && tvispone(ir_knum(IR(tref_ref(rb)))))
    return rb;  /* 1 ^ x ==> 1 */
  rc = lj_ir_tonum(J, rc);
  if (tref_isk(rc) && ir_knum(IR(tref_ref(rc)))->n == 0.5)
    return emitir(IRTN(IR_FPMATH), rb, IRFPM_SQRT);  /* x ^ 0.5 ==> sqrt(x) */
  /* Split up b^c into exp2(c*log2(b)). Assembler may rejoin later. */
  rb = emitir(IRTN(IR_FPMATH), rb, IRFPM_LOG2);
  rc = emitir(IRTN(IR_MUL), rb, rc);
  return emitir(IRTN(IR_FPMATH), rc, IRFPM_EXP2);
}
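
/* Illustrative sketch (not compiled, not part of LuaJIT): the identity
** behind the final split above. For a positive base b, b^c equals
** exp2(c*log2(b)) up to FP rounding, which is what the LOG2/MUL/EXP2
** sequence computes. The values are arbitrary.
*/
#if 0
#include <stdio.h>
#include <math.h>

int main(void)
{
  double b = 3.5, c = 2.25;
  printf("%.17g\n", pow(b, c));
  printf("%.17g\n", exp2(c * log2(b)));  /* Same value up to rounding. */
  return 0;
}
#endif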

/* -- Predictive narrowing of induction variables ------------------------- */

/* Narrow a single runtime value. */
static int narrow_forl(jit_State *J, cTValue *o)
{
  if (tvisint(o)) return 1;
  if (LJ_DUALNUM || (J->flags & JIT_F_OPT_NARROW)) return numisint(numV(o));
  return 0;
}

/* Narrow the FORL index type by looking at the runtime values. */
IRType lj_opt_narrow_forl(jit_State *J, cTValue *tv)
{
  lua_assert(tvisnumber(&tv[FORL_IDX]) &&
	     tvisnumber(&tv[FORL_STOP]) &&
	     tvisnumber(&tv[FORL_STEP]));
  /* Narrow only if the runtime values of start/stop/step are all integers. */
  if (narrow_forl(J, &tv[FORL_IDX]) &&
      narrow_forl(J, &tv[FORL_STOP]) &&
      narrow_forl(J, &tv[FORL_STEP])) {
    /* And if the loop index can't possibly overflow. */
    lua_Number step = numberVnum(&tv[FORL_STEP]);
    lua_Number sum = numberVnum(&tv[FORL_STOP]) + step;
    if (0 <= step ? (sum <= 2147483647.0) : (sum >= -2147483648.0))
      return IRT_INT;
  }
  return IRT_NUM;
}
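
/* Illustrative sketch (not compiled, not part of LuaJIT): the overflow
** condition checked above. A narrowed 'for' index is bumped one final time
** past the stop value before the loop exits, so the largest value it can
** reach is stop+step (for a positive step). Requiring stop+step to stay in
** int32_t range keeps that last increment from overflowing. The helper name
** is made up; it merely restates the check for two example loops.
*/
#if 0
#include <stdio.h>

static int forl_fits_int32(double stop, double step)
{
  double sum = stop + step;
  return 0 <= step ? (sum <= 2147483647.0) : (sum >= -2147483648.0);
}

int main(void)
{
  printf("%d\n", forl_fits_int32(100.0, 1.0));         /* 1: can be narrowed. */
  printf("%d\n", forl_fits_int32(2147483647.0, 1.0));  /* 0: stays FP. */
  return 0;
}
#endif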

#undef IR
#undef fins
#undef emitir
#undef emitir_raw

#endif