ljx

FORK: LuaJIT with native 5.2 and 5.3 support
git clone https://git.neptards.moe/neptards/ljx.git
Log | Files | Refs | README

lj_lex.c (13885B)


      1 /*
      2 ** Lexical analyzer.
      3 ** Copyright (C) 2005-2016 Mike Pall. See Copyright Notice in luajit.h
      4 **
      5 ** Major portions taken verbatim or adapted from the Lua interpreter.
      6 ** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
      7 */
      8 
      9 #define lj_lex_c
     10 #define LUA_CORE
     11 
     12 #include "lj_obj.h"
     13 #include "lj_gc.h"
     14 #include "lj_err.h"
     15 #include "lj_buf.h"
     16 #include "lj_str.h"
     17 #if LJ_HASFFI
     18 #include "lj_tab.h"
     19 #include "lj_ctype.h"
     20 #include "lj_cdata.h"
     21 #include "lualib.h"
     22 #endif
     23 #include "lj_state.h"
     24 #include "lj_lex.h"
     25 #include "lj_parse.h"
     26 #include "lj_char.h"
     27 #include "lj_strscan.h"
     28 #include "lj_strfmt.h"
     29 
     30 /* Lua lexer token names. */
     31 static const char *const tokennames[] = {
     32 #define TKSTR1(name)		#name,
     33 #define TKSTR2(name, sym)	#sym,
     34 TKDEF(TKSTR1, TKSTR2)
     35 #undef TKSTR1
     36 #undef TKSTR2
     37   NULL
     38 };
     39 
     40 /* -- Buffer handling ----------------------------------------------------- */
     41 
     42 #define LEX_EOF			(-1)
     43 #define lex_iseol(ls)		(ls->c == '\n' || ls->c == '\r')
     44 
     45 /* Get more input from reader. */
     46 static LJ_NOINLINE LexChar lex_more(LexState *ls)
     47 {
     48   size_t sz;
     49   const char *p = ls->rfunc(ls->L, ls->rdata, &sz);
     50   if (p == NULL || sz == 0) return LEX_EOF;
     51   ls->pe = p + sz;
     52   ls->p = p + 1;
     53   return (LexChar)(uint8_t)p[0];
     54 }
     55 
     56 /* Get next character. */
     57 static LJ_AINLINE LexChar lex_next(LexState *ls)
     58 {
     59   return (ls->c = ls->p < ls->pe ? (LexChar)(uint8_t)*ls->p++ : lex_more(ls));
     60 }
     61 
     62 /* Save character. */
     63 static LJ_AINLINE void lex_save(LexState *ls, LexChar c)
     64 {
     65   lj_buf_putb(&ls->sb, c);
     66 }
     67 
     68 /* Save previous character and get next character. */
     69 static LJ_AINLINE LexChar lex_savenext(LexState *ls)
     70 {
     71   lex_save(ls, ls->c);
     72   return lex_next(ls);
     73 }
     74 
     75 /* Skip line break. Handles "\n", "\r", "\r\n" or "\n\r". */
     76 static void lex_newline(LexState *ls)
     77 {
     78   LexChar old = ls->c;
     79   lua_assert(lex_iseol(ls));
     80   lex_next(ls);  /* Skip "\n" or "\r". */
     81   if (lex_iseol(ls) && ls->c != old) lex_next(ls);  /* Skip "\n\r" or "\r\n". */
     82   if (++ls->linenumber >= LJ_MAX_LINE)
     83     lj_lex_error(ls, ls->tok, LJ_ERR_XLINES);
     84 }
     85 
     86 /* -- Scanner for terminals ----------------------------------------------- */
     87 
     88 /* Parse a number literal. */
     89 static void lex_number(LexState *ls, TValue *tv)
     90 {
     91   StrScanFmt fmt;
     92   LexChar c, xp = 'e';
     93   lua_assert(lj_char_isdigit(ls->c));
     94   if ((c = ls->c) == '0' && (lex_savenext(ls) | 0x20) == 'x')
     95     xp = 'p';
     96   while (lj_char_isident(ls->c) || ls->c == '.' ||
     97 	 ((ls->c == '-' || ls->c == '+') && (c | 0x20) == xp)) {
     98     c = ls->c;
     99     lex_savenext(ls);
    100   }
    101   lex_save(ls, '\0');
    102   fmt = lj_strscan_scan((const uint8_t *)sbufB(&ls->sb), tv,
    103 	  (LJ_DUALNUM ? STRSCAN_OPT_TOINT : STRSCAN_OPT_TONUM) |
    104 	  (LJ_HASFFI ? (STRSCAN_OPT_LL|STRSCAN_OPT_IMAG) : 0));
    105   if (LJ_DUALNUM && fmt == STRSCAN_INT) {
    106     setitype(tv, LJ_TISNUM);
    107   } else if (fmt == STRSCAN_NUM) {
    108     /* Already in correct format. */
    109 #if LJ_HASFFI
    110   } else if (fmt != STRSCAN_ERROR) {
    111     lua_State *L = ls->L;
    112     GCcdata *cd;
    113     lua_assert(fmt == STRSCAN_I64 || fmt == STRSCAN_U64 || fmt == STRSCAN_IMAG);
    114     if (!ctype_ctsG(G(L))) {
    115       ptrdiff_t oldtop = savestack(L, L->top);
    116       luaopen_ffi(L);  /* Load FFI library on-demand. */
    117       L->top = restorestack(L, oldtop);
    118     }
    119     if (fmt == STRSCAN_IMAG) {
    120       cd = lj_cdata_new_(L, CTID_COMPLEX_DOUBLE, 2*sizeof(double));
    121       ((double *)cdataptr(cd))[0] = 0;
    122       ((double *)cdataptr(cd))[1] = numV(tv);
    123     } else {
    124       cd = lj_cdata_new_(L, fmt==STRSCAN_I64 ? CTID_INT64 : CTID_UINT64, 8);
    125       *(uint64_t *)cdataptr(cd) = tv->u64;
    126     }
    127     lj_parse_keepcdata(ls, tv, cd);
    128 #endif
    129   } else {
    130     lua_assert(fmt == STRSCAN_ERROR);
    131     lj_lex_error(ls, TK_number, LJ_ERR_XNUMBER);
    132   }
    133 }
    134 
    135 /* Skip equal signs for "[=...=[" and "]=...=]" and return their count. */
    136 static int lex_skipeq(LexState *ls)
    137 {
    138   int count = 0;
    139   LexChar s = ls->c;
    140   lua_assert(s == '[' || s == ']');
    141   while (lex_savenext(ls) == '=')
    142     count++;
    143   return (ls->c == s) ? count : (-count) - 1;
    144 }
    145 
    146 /* Parse a long string or long comment (tv set to NULL). */
    147 static void lex_longstring(LexState *ls, TValue *tv, int sep)
    148 {
    149   lex_savenext(ls);  /* Skip second '['. */
    150   if (lex_iseol(ls))  /* Skip initial newline. */
    151     lex_newline(ls);
    152   for (;;) {
    153     switch (ls->c) {
    154     case LEX_EOF:
    155       lj_lex_error(ls, TK_eof, tv ? LJ_ERR_XLSTR : LJ_ERR_XLCOM);
    156       break;
    157     case ']':
    158       if (lex_skipeq(ls) == sep) {
    159 	lex_savenext(ls);  /* Skip second ']'. */
    160 	goto endloop;
    161       }
    162       break;
    163     case '\n':
    164     case '\r':
    165       lex_save(ls, '\n');
    166       lex_newline(ls);
    167       if (!tv) lj_buf_reset(&ls->sb);  /* Don't waste space for comments. */
    168       break;
    169     default:
    170       lex_savenext(ls);
    171       break;
    172     }
    173   } endloop:
    174   if (tv) {
    175     GCstr *str = lj_parse_keepstr(ls, sbufB(&ls->sb) + (2 + (MSize)sep),
    176 				      sbuflen(&ls->sb) - 2*(2 + (MSize)sep));
    177     setstrV(ls->L, tv, str);
    178   }
    179 }
    180 
    181 /* Parse a string. */
    182 static void lex_string(LexState *ls, TValue *tv)
    183 {
    184   LexChar delim = ls->c;  /* Delimiter is '\'' or '"'. */
    185   lex_savenext(ls);
    186   while (ls->c != delim) {
    187     switch (ls->c) {
    188     case LEX_EOF:
    189       lj_lex_error(ls, TK_eof, LJ_ERR_XSTR);
    190       continue;
    191     case '\n':
    192     case '\r':
    193       lj_lex_error(ls, TK_string, LJ_ERR_XSTR);
    194       continue;
    195     case '\\': {
    196       LexChar c = lex_next(ls);  /* Skip the '\\'. */
    197       switch (c) {
    198       case 'a': c = '\a'; break;
    199       case 'b': c = '\b'; break;
    200       case 'f': c = '\f'; break;
    201       case 'n': c = '\n'; break;
    202       case 'r': c = '\r'; break;
    203       case 't': c = '\t'; break;
    204       case 'v': c = '\v'; break;
    205       case 'x':  /* Hexadecimal escape '\xXX'. */
    206 	c = (lex_next(ls) & 15u) << 4;
    207 	if (!lj_char_isdigit(ls->c)) {
    208 	  if (!lj_char_isxdigit(ls->c))
    209 	    lj_lex_error(ls, TK_string, LJ_ERR_XHEX);
    210 	  c += 9 << 4;
    211 	}
    212 	c += (lex_next(ls) & 15u);
    213 	if (!lj_char_isdigit(ls->c)) {
    214 	  if (!lj_char_isxdigit(ls->c))
    215 	    lj_lex_error(ls, TK_string, LJ_ERR_XHEX);
    216 	  c += 9;
    217 	}
    218 	break;
    219       case 'u':  /* Unicode escape '\u{XX...}'. */
    220 	if (lex_next(ls) != '{') goto err_xesc;
    221 	lex_next(ls);
    222 	c = 0;
    223 	do {
    224 	  c = (c << 4) | (ls->c & 15u);
    225 	  if (!lj_char_isdigit(ls->c)) {
    226 	    if (!lj_char_isxdigit(ls->c)) goto err_xesc;
    227 	    c += 9;
    228 	  }
    229 	  if (c >= 0x110000) goto err_xesc;  /* Out of Unicode range. */
    230 	} while (lex_next(ls) != '}');
    231 	if (c < 0x800) {
    232 	  if (c < 0x80) break;
    233 	  lex_save(ls, 0xc0 | (c >> 6));
    234 	} else {
    235 	  if (c >= 0x10000) {
    236 	    lex_save(ls, 0xf0 | (c >> 18));
    237 	    lex_save(ls, 0x80 | ((c >> 12) & 0x3f));
    238 	  } else {
    239 	    if (c >= 0xd800 && c < 0xe000) goto err_xesc;  /* No surrogates. */
    240 	    lex_save(ls, 0xe0 | (c >> 12));
    241 	  }
    242 	  lex_save(ls, 0x80 | ((c >> 6) & 0x3f));
    243 	}
    244 	c = 0x80 | (c & 0x3f);
    245 	break;
    246       case 'z':  /* Skip whitespace. */
    247 	lex_next(ls);
    248 	while (lj_char_isspace(ls->c))
    249 	  if (lex_iseol(ls)) lex_newline(ls); else lex_next(ls);
    250 	continue;
    251       case '\n': case '\r': lex_save(ls, '\n'); lex_newline(ls); continue;
    252       case '\\': case '\"': case '\'': break;
    253       case LEX_EOF: continue;
    254       default:
    255 	if (!lj_char_isdigit(c))
    256 	  goto err_xesc;
    257 	c -= '0';  /* Decimal escape '\ddd'. */
    258 	if (lj_char_isdigit(lex_next(ls))) {
    259 	  c = c*10 + (ls->c - '0');
    260 	  if (lj_char_isdigit(lex_next(ls))) {
    261 	    c = c*10 + (ls->c - '0');
    262 	    if (c > 255) {
    263 	    err_xesc:
    264 	      lj_lex_error(ls, TK_string, LJ_ERR_XESC);
    265 	    }
    266 	    lex_next(ls);
    267 	  }
    268 	}
    269 	lex_save(ls, c);
    270 	continue;
    271       }
    272       lex_save(ls, c);
    273       lex_next(ls);
    274       continue;
    275       }
    276     default:
    277       lex_savenext(ls);
    278       break;
    279     }
    280   }
    281   lex_savenext(ls);  /* Skip trailing delimiter. */
    282   setstrV(ls->L, tv,
    283 	  lj_parse_keepstr(ls, sbufB(&ls->sb)+1, sbuflen(&ls->sb)-2));
    284 }
    285 
    286 /* -- Main lexical scanner ------------------------------------------------ */
    287 static LJ_AINLINE int check(LexState *ls, LexChar c)
    288 {
    289   if (ls->c == c) {
    290     lex_next(ls);
    291     return 1;
    292   }
    293   return 0;
    294 }
    295 
    296 /* Get next lexical token. */
    297 static LexToken lex_scan(LexState *ls, TValue *tv)
    298 {
    299   lj_buf_reset(&ls->sb);
    300   for (;;) {
    301     if (lj_char_isident(ls->c)) {
    302       GCstr *s;
    303       if (lj_char_isdigit(ls->c)) {  /* Numeric literal. */
    304 	lex_number(ls, tv);
    305 	return TK_number;
    306       }
    307       /* Identifier or reserved word. */
    308       do {
    309 	lex_savenext(ls);
    310       } while (lj_char_isident(ls->c));
    311       s = lj_parse_keepstr(ls, sbufB(&ls->sb), sbuflen(&ls->sb));
    312       setstrV(ls->L, tv, s);
    313       if (s->reserved > 0)  /* Reserved word? */
    314 	return TK_OFS + s->reserved;
    315       return TK_name;
    316     }
    317     switch (ls->c) {
    318     case '\n':
    319     case '\r':
    320       lex_newline(ls);
    321       continue;
    322     case ' ':
    323     case '\t':
    324     case '\v':
    325     case '\f':
    326       lex_next(ls);
    327       continue;
    328     case '-':
    329       lex_next(ls);
    330       if (ls->c != '-') return '-';
    331       lex_next(ls);
    332       if (ls->c == '[') {  /* Long comment "--[=*[...]=*]". */
    333 	int sep = lex_skipeq(ls);
    334 	lj_buf_reset(&ls->sb);  /* `lex_skipeq' may dirty the buffer */
    335 	if (sep >= 0) {
    336 	  lex_longstring(ls, NULL, sep);
    337 	  lj_buf_reset(&ls->sb);
    338 	  continue;
    339 	}
    340       }
    341       /* Short comment "--.*\n". */
    342       while (!lex_iseol(ls) && ls->c != LEX_EOF)
    343 	lex_next(ls);
    344       continue;
    345     case '[': {
    346       int sep = lex_skipeq(ls);
    347       if (sep >= 0) {
    348 	lex_longstring(ls, tv, sep);
    349 	return TK_string;
    350       } else if (sep == -1) {
    351 	return '[';
    352       } else {
    353 	lj_lex_error(ls, TK_string, LJ_ERR_XLDELIM);
    354 	continue;
    355       }
    356       }
    357     case '=':
    358       lex_next(ls);
    359       if (ls->c != '=') return '='; else { lex_next(ls); return TK_eq; }
    360     case '<':
    361       lex_next(ls);
    362       if (check(ls, '=')) return TK_le;
    363 #if LJ_53
    364       if (check(ls, '<')) return TK_shl;
    365 #endif
    366       return '<';
    367     case '>':
    368       lex_next(ls);
    369       if (check(ls, '=')) return TK_ge;
    370 #if LJ_53
    371       if (check(ls, '>')) return TK_shr;
    372 #endif
    373       return '>';
    374 #if LJ_53
    375     case '/':
    376       lex_next(ls);
    377       if (check(ls, '/')) return TK_idiv;
    378       return '/';
    379 #endif
    380     case '~':
    381       lex_next(ls);
    382       if (ls->c != '=') return '~'; else { lex_next(ls); return TK_ne; }
    383     case ':':
    384       lex_next(ls);
    385       if (ls->c != ':') return ':'; else { lex_next(ls); return TK_label; }
    386     case '"':
    387     case '\'':
    388       lex_string(ls, tv);
    389       return TK_string;
    390     case '.':
    391       if (lex_savenext(ls) == '.') {
    392 	lex_next(ls);
    393 	if (ls->c == '.') {
    394 	  lex_next(ls);
    395 	  return TK_dots;   /* ... */
    396 	}
    397 	return TK_concat;   /* .. */
    398       } else if (!lj_char_isdigit(ls->c)) {
    399 	return '.';
    400       } else {
    401 	lex_number(ls, tv);
    402 	return TK_number;
    403       }
    404     case LEX_EOF:
    405       return TK_eof;
    406     default: {
    407       LexChar c = ls->c;
    408       lex_next(ls);
    409       return c;  /* Single-char tokens (+ - / ...). */
    410     }
    411     }
    412   }
    413 }
    414 
    415 /* -- Lexer API ----------------------------------------------------------- */
    416 
    417 /* Setup lexer state. */
    418 int lj_lex_setup(lua_State *L, LexState *ls)
    419 {
    420   int header = 0;
    421   ls->L = L;
    422   ls->fs = NULL;
    423   ls->pe = ls->p = NULL;
    424   ls->vstack = NULL;
    425   ls->sizevstack = 0;
    426   ls->vtop = 0;
    427   ls->bcstack = NULL;
    428   ls->sizebcstack = 0;
    429   ls->tok = 0;
    430   ls->lookahead = TK_eof;  /* No look-ahead token. */
    431   ls->linenumber = 1;
    432   ls->lastline = 1;
    433   lex_next(ls);  /* Read-ahead first char. */
    434   if (ls->c == 0xef && ls->p + 2 <= ls->pe && (uint8_t)ls->p[0] == 0xbb &&
    435       (uint8_t)ls->p[1] == 0xbf) {  /* Skip UTF-8 BOM (if buffered). */
    436     ls->p += 2;
    437     lex_next(ls);
    438     header = 1;
    439   }
    440   if (ls->c == '#') {  /* Skip POSIX #! header line. */
    441     do {
    442       lex_next(ls);
    443       if (ls->c == LEX_EOF) return 0;
    444     } while (!lex_iseol(ls));
    445     lex_newline(ls);
    446     header = 1;
    447   }
    448   if (ls->c == LUA_SIGNATURE[0]) {  /* Bytecode dump. */
    449     if (header) {
    450       /*
    451       ** Loading bytecode with an extra header is disabled for security
    452       ** reasons. This may circumvent the usual check for bytecode vs.
    453       ** Lua code by looking at the first char. Since this is a potential
    454       ** security violation no attempt is made to echo the chunkname either.
    455       */
    456       setstrV(L, L->top++, lj_err_str(L, LJ_ERR_BCBAD));
    457       lj_err_throw(L, LUA_ERRSYNTAX);
    458     }
    459     return 1;
    460   }
    461   return 0;
    462 }
    463 
    464 /* Cleanup lexer state. */
    465 void lj_lex_cleanup(lua_State *L, LexState *ls)
    466 {
    467   global_State *g = G(L);
    468   lj_mem_freevec(g, ls->bcstack, ls->sizebcstack, BCInsLine);
    469   lj_mem_freevec(g, ls->vstack, ls->sizevstack, VarInfo);
    470   lj_buf_free(g, &ls->sb);
    471 }
    472 
    473 /* Return next lexical token. */
    474 void lj_lex_next(LexState *ls)
    475 {
    476   ls->lastline = ls->linenumber;
    477   if (LJ_LIKELY(ls->lookahead == TK_eof)) {  /* No lookahead token? */
    478     ls->tok = lex_scan(ls, &ls->tokval);  /* Get next token. */
    479   } else {  /* Otherwise return lookahead token. */
    480     ls->tok = ls->lookahead;
    481     ls->lookahead = TK_eof;
    482     ls->tokval = ls->lookaheadval;
    483   }
    484 }
    485 
    486 /* Look ahead for the next token. */
    487 LexToken lj_lex_lookahead(LexState *ls)
    488 {
    489   lua_assert(ls->lookahead == TK_eof);
    490   ls->lookahead = lex_scan(ls, &ls->lookaheadval);
    491   return ls->lookahead;
    492 }
    493 
    494 /* Convert token to string. */
    495 const char *lj_lex_token2str(LexState *ls, LexToken tok)
    496 {
    497   if (tok > TK_OFS)
    498     return tokennames[tok-TK_OFS-1];
    499   else if (!lj_char_iscntrl(tok))
    500     return lj_strfmt_pushf(ls->L, LUA_QL("%c"), tok);
    501   else
    502     return lj_strfmt_pushf(ls->L, "char(%d)", tok);
    503 }
    504 
    505 /* Lexer error. */
    506 void lj_lex_error(LexState *ls, LexToken tok, ErrMsg em, ...)
    507 {
    508   const char *tokstr;
    509   va_list argp;
    510   if (tok == 0) {
    511     tokstr = NULL;
    512   } else if (tok == TK_name || tok == TK_string || tok == TK_number) {
    513     lex_save(ls, '\0');
    514     tokstr = sbufB(&ls->sb);
    515   } else {
    516     tokstr = lj_lex_token2str(ls, tok);
    517   }
    518   va_start(argp, em);
    519   lj_err_lex(ls->L, ls->chunkname, tokstr, ls->linenumber, em, argp, tok == TK_string);
    520   va_end(argp);
    521 }
    522 
    523 /* Initialize strings for reserved words. */
    524 void lj_lex_init(lua_State *L)
    525 {
    526   uint32_t i;
    527   for (i = 0; i < TK_RESERVED; i++) {
    528     GCstr *s = lj_str_newz(L, tokennames[i]);
    529     fixstring(s);  /* Reserved words are never collected. */
    530     s->reserved = (uint8_t)(i+1);
    531   }
    532 }
    533