ref: 63d88a9c1b48234e50ef3e349c546f5e9fee1bea
parent: 7bf5257d008758f60bdffe0b68f4fa8fdca2e0f3
author: Tor Andersson <[email protected]>
date: Sat Jan 18 10:07:08 EST 2014
Rewrite lexer to use a proper lookahead token. Easier to understand, and in preparation for reading UTF-8.
--- a/jslex.c
+++ b/jslex.c
@@ -131,23 +131,23 @@
return 0;
}
-#define PEEK() (**sp)
+#define PEEK (J->lexchar)
#define NEXT() next(J, sp)
-#define UNGET() ((*sp)--)
-#define NEXTPEEK() (NEXT(), PEEK())
-#define ACCEPT(x) (PEEK() == x ? (NEXT(), 1) : 0)
+#define NEXTPEEK (NEXT(), PEEK)
+#define ACCEPT(x) (PEEK == x ? (NEXT(), 1) : 0)
+#define EXPECT(x) (ACCEPT(x) || (jsP_error(J, "expected '%c'", x), 0))
-static int next(js_State *J, const char **sp)
+/*
+ * Read one character from *sp into the lookahead J->lexchar.
+ * A CR LF pair is consumed as one unit; any character accepted by
+ * isnewline() increments J->line and is stored as '\n'.
+ * The pointer is never advanced past the terminating NUL, so calling
+ * this again at end of input keeps the lookahead at 0.
+ */
+static void next(js_State *J, const char **sp)
{
- int c = *(*sp)++;
+ int c = **sp;
+ if (c != 0)
+ (*sp)++;
/* consume CR LF as one unit */
- if (c == '\r' && PEEK() == '\n')
+ /* test the raw input: PEEK is J->lexchar, which still holds the previous character here */
+ if (c == '\r' && **sp == '\n')
(*sp)++;
if (isnewline(c)) {
J->line++;
- J->newline = 1;
+ c = '\n';
}
- return c;
+ J->lexchar = c;
}
static void textinit(js_State *J)
@@ -176,34 +176,33 @@
static inline void lexlinecomment(js_State *J, const char **sp)
{
- int c = PEEK();
- while (c && !isnewline(c)) {
- c = NEXTPEEK();
- }
+ while (PEEK && !isnewline(PEEK))
+ NEXT();
}
+/*
+ * Skip the remainder of a block comment.
+ * Returns 0 once the closing '*' '/' has been consumed, or -1 when the
+ * lookahead becomes 0 before the comment is closed.
+ */
static inline int lexcomment(js_State *J, const char **sp)
{
- while (1) {
- int c = NEXT();
- if (c == '*') {
- while (c == '*')
- c = NEXT();
- if (c == '/')
+ /* already consumed initial '/' '*' sequence */
+ while (PEEK != 0) {
+ if (ACCEPT('*')) {
+ /* skip the rest of a run of '*' without eating the character that follows it */
+ while (PEEK == '*')
+ NEXT();
+ if (ACCEPT('/'))
return 0;
- } else if (c == 0) {
- return -1;
}
+ /* advance only when no '*' was taken; otherwise re-test the lookahead, which may be 0 */
+ else
+ NEXT();
}
+ return -1;
}
static inline double lexhex(js_State *J, const char **sp)
{
double n = 0;
- int c = PEEK();
- while (ishex(c)) {
- n = n * 16 + tohex(c);
- c = NEXTPEEK();
+ if (!ishex(PEEK))
+ return jsP_error(J, "malformed hexadecimal number");
+ while (ishex(PEEK)) {
+ n = n * 16 + tohex(PEEK);
+ NEXT();
}
return n;
}
@@ -211,10 +210,11 @@
static inline double lexinteger(js_State *J, const char **sp)
{
double n = 0;
- int c = PEEK();
- while (isdec(c)) {
- n = n * 10 + (c - '0');
- c = NEXTPEEK();
+ if (!isdec(PEEK))
+ return jsP_error(J, "malformed number");
+ while (isdec(PEEK)) {
+ n = n * 10 + (PEEK - '0');
+ NEXT();
}
return n;
}
@@ -223,11 +223,10 @@
{
double n = 0;
double d = 1;
- int c = PEEK();
- while (isdec(c)) {
- n = n * 10 + (c - '0');
+ while (isdec(PEEK)) {
+ n = n * 10 + (PEEK - '0');
d = d * 10;
- c = NEXTPEEK();
+ NEXT();
}
return n / d;
}
@@ -234,13 +233,12 @@
static inline double lexexponent(js_State *J, const char **sp)
{
+ double sign;
if (ACCEPT('e') || ACCEPT('E')) {
- if (ACCEPT('-'))
- return -lexinteger(J, sp);
- else if (ACCEPT('+'))
- return lexinteger(J, sp);
- else
- return lexinteger(J, sp);
+ if (ACCEPT('-')) sign = -1;
+ else if (ACCEPT('+')) sign = 1;
+ else sign = 1;
+ return sign * lexinteger(J, sp);
}
return 0;
}
@@ -247,25 +245,30 @@
+/*
+ * Lex a numeric literal starting at the lookahead character.
+ * Handles 0x/0X hexadecimal, rejects a leading zero followed by a digit,
+ * and returns the '.' token when a '.' is not followed by a digit.
+ * The parsed value is stored in J->number.
+ */
static inline int lexnumber(js_State *J, const char **sp)
{
- double n;
+ double n = 0;
- if ((*sp)[0] == '0' && ((*sp)[1] == 'x' || (*sp)[1] == 'X')) {
- *sp += 2;
- if (!ishex(PEEK()))
- return jsP_error(J, "0x not followed by hexademical digit");
- J->number = lexhex(J, sp);
- return TK_NUMBER;
+ if (ACCEPT('0')) {
+ if (ACCEPT('x') || ACCEPT('X')) {
+ J->number = lexhex(J, sp);
+ return TK_NUMBER;
+ }
+ if (isdec(PEEK))
+ return jsP_error(J, "number with leading zero");
+ /* the consumed '0' is the whole integer part; lexinteger() would reject the missing digit */
+ if (ACCEPT('.'))
+ n = lexfraction(J, sp);
+ n *= pow(10, lexexponent(J, sp));
}
- if ((*sp)[0] == '0' && isdec((*sp)[1]))
- return jsP_error(J, "number with leading zero");
+ else if (ACCEPT('.')) {
+ if (!isdec(PEEK))
+ return '.';
+ n = lexfraction(J, sp);
+ n *= pow(10, lexexponent(J, sp));
+ } else {
+ n = lexinteger(J, sp);
+ if (ACCEPT('.'))
+ n += lexfraction(J, sp);
+ n *= pow(10, lexexponent(J, sp));
+ }
- n = lexinteger(J, sp);
- if (ACCEPT('.'))
- n += lexfraction(J, sp);
- n *= pow(10, lexexponent(J, sp));
-
- if (isidentifierstart(PEEK()))
+ if (isidentifierstart(PEEK))
return jsP_error(J, "number with letter suffix");
J->number = n;
@@ -274,58 +277,66 @@
+/*
+ * Lex the remainder of a backslash escape and push the resulting
+ * character with textpush(). A backslash followed by a newline pushes nothing.
+ * Returns 0 on success and 1 when the escape is malformed (a non-hex
+ * digit in \xHH or \uHHHH, or end of input right after the backslash).
+ */
static inline int lexescape(js_State *J, const char **sp)
{
- int c = NEXT();
int x = 0;
- if (isnewline(c))
+ /* already consumed '\' */
+
+ if (isnewline(PEEK)) {
+ NEXT();
return 0;
+ }
- switch (c) {
+ switch (PEEK) {
+ case 0: /* backslash at end of input: fail instead of advancing past the NUL */
+ return 1;
case 'u':
- if (!ishex(PEEK())) return 1; else x |= NEXTPEEK() << 12;
- if (!ishex(PEEK())) return 1; else x |= NEXTPEEK() << 8;
- if (!ishex(PEEK())) return 1; else x |= NEXTPEEK() << 4;
- if (!ishex(PEEK())) return 1; else x |= NEXTPEEK();
+ NEXT();
+ /* accumulate digit values, not raw character codes */
+ if (!ishex(PEEK)) return 1; else { x |= tohex(PEEK) << 12; NEXT(); }
+ if (!ishex(PEEK)) return 1; else { x |= tohex(PEEK) << 8; NEXT(); }
+ if (!ishex(PEEK)) return 1; else { x |= tohex(PEEK) << 4; NEXT(); }
+ if (!ishex(PEEK)) return 1; else { x |= tohex(PEEK); NEXT(); }
textpush(J, x);
break;
case 'x':
- if (!ishex(PEEK())) return 1; else x |= NEXTPEEK() << 4;
- if (!ishex(PEEK())) return 1; else x |= NEXTPEEK();
+ NEXT();
+ if (!ishex(PEEK)) return 1; else { x |= tohex(PEEK) << 4; NEXT(); }
+ if (!ishex(PEEK)) return 1; else { x |= tohex(PEEK); NEXT(); }
textpush(J, x);
break;
- case '0': textpush(J, 0); break;
- case '\\': textpush(J, '\\'); break;
- case '\'': textpush(J, '\''); break;
- case '"': textpush(J, '"'); break;
- case 'b': textpush(J, '\b'); break;
- case 'f': textpush(J, '\f'); break;
- case 'n': textpush(J, '\n'); break;
- case 'r': textpush(J, '\r'); break;
- case 't': textpush(J, '\t'); break;
- case 'v': textpush(J, '\v'); break;
- default: textpush(J, c); break;
+ case '0': textpush(J, 0); NEXT(); break;
+ case '\\': textpush(J, '\\'); NEXT(); break;
+ case '\'': textpush(J, '\''); NEXT(); break;
+ case '"': textpush(J, '"'); NEXT(); break;
+ case 'b': textpush(J, '\b'); NEXT(); break;
+ case 'f': textpush(J, '\f'); NEXT(); break;
+ case 'n': textpush(J, '\n'); NEXT(); break;
+ case 'r': textpush(J, '\r'); NEXT(); break;
+ case 't': textpush(J, '\t'); NEXT(); break;
+ case 'v': textpush(J, '\v'); NEXT(); break;
+ default: textpush(J, PEEK); NEXT(); break;
}
return 0;
}
-static inline int lexstring(js_State *J, const char **sp, int q)
+static inline int lexstring(js_State *J, const char **sp)
{
const char *s;
- int c = NEXT();
+ int q = PEEK;
+ NEXT();
+
textinit(J);
- while (c != q) {
- if (c == 0 || isnewline(c))
+ while (PEEK != q) {
+ if (PEEK == 0 || isnewline(PEEK))
return jsP_error(J, "string not terminated");
- if (c == '\\') {
+ if (ACCEPT('\\')) {
if (lexescape(J, sp))
return jsP_error(J, "malformed escape sequence");
} else {
- textpush(J, c);
+ textpush(J, PEEK);
+ NEXT();
}
- c = NEXT();
}
+ EXPECT(q);
s = textend(J);
@@ -359,25 +370,26 @@
int g, m, i;
int c;
+ /* already consumed initial '/' */
+
textinit(J);
/* regexp body */
- c = NEXT();
- while (c != '/') {
- if (c == 0 || isnewline(c)) {
+ while (PEEK != '/') {
+ if (PEEK == 0 || isnewline(PEEK)) {
return jsP_error(J, "regular expression not terminated");
- } else if (c == '\\') {
- textpush(J, c);
- c = NEXT();
- if (c == 0 || isnewline(c))
+ } else if (ACCEPT('\\')) {
+ textpush(J, '\\');
+ if (PEEK == 0 || isnewline(PEEK))
return jsP_error(J, "regular expression not terminated");
- textpush(J, c);
- c = NEXT();
+ textpush(J, PEEK);
+ NEXT();
} else {
- textpush(J, c);
- c = NEXT();
+ textpush(J, PEEK);
+ NEXT();
}
}
+ EXPECT('/');
s = textend(J);
@@ -384,13 +396,12 @@
/* regexp flags */
g = i = m = 0;
- c = PEEK();
- while (isidentifierpart(c)) {
- if (c == 'g') ++g;
- else if (c == 'i') ++i;
- else if (c == 'm') ++m;
- else return jsP_error(J, "illegal flag in regular expression: %c", c);
- c = NEXTPEEK();
+ c = PEEK;
+ while (isidentifierpart(PEEK)) {
+ if (ACCEPT('g')) ++g;
+ else if (ACCEPT('i')) ++i;
+ else if (ACCEPT('m')) ++m;
+ else return jsP_error(J, "illegal flag in regular expression: %c", PEEK);
}
if (g > 1 || i > 1 || m > 1)
@@ -424,18 +435,19 @@
while (1) {
J->lexline = J->line; /* save location of beginning of token */
- int c = NEXT();
- while (iswhite(c))
- c = NEXT();
+ while (iswhite(PEEK))
+ NEXT();
- if (isnewline(c)) {
+ if (isnewline(PEEK)) {
+ NEXT();
+ J->newline = 1;
if (isnlthcontext(J->lasttoken))
return ';';
continue;
}
- if (c == '/') {
+ if (ACCEPT('/')) {
if (ACCEPT('/')) {
lexlinecomment(J, sp);
continue;
@@ -453,14 +465,14 @@
}
// TODO: \uXXXX escapes
- if (isidentifierstart(c)) {
+ if (isidentifierstart(PEEK)) {
textinit(J);
- textpush(J, c);
+ textpush(J, PEEK);
- c = PEEK();
- while (isidentifierpart(c)) {
- textpush(J, c);
- c = NEXTPEEK();
+ NEXT();
+ while (isidentifierpart(PEEK)) {
+ textpush(J, PEEK);
+ NEXT();
}
textend(J);
@@ -468,37 +480,32 @@
return findkeyword(J, J->buf.text);
}
- if (c >= '0' && c <= '9') {
- UNGET();
+ if (PEEK >= '0' && PEEK <= '9') {
return lexnumber(J, sp);
}
- switch (c) {
- case '(':
- case ')':
- case ',':
- case ':':
- case ';':
- case '?':
- case '[':
- case ']':
- case '{':
- case '}':
- case '~':
- return c;
+ switch (PEEK) {
+ case '(': NEXT(); return '(';
+ case ')': NEXT(); return ')';
+ case ',': NEXT(); return ',';
+ case ':': NEXT(); return ':';
+ case ';': NEXT(); return ';';
+ case '?': NEXT(); return '?';
+ case '[': NEXT(); return '[';
+ case ']': NEXT(); return ']';
+ case '{': NEXT(); return '{';
+ case '}': NEXT(); return '}';
+ case '~': NEXT(); return '~';
case '\'':
case '"':
- return lexstring(J, sp, c);
+ return lexstring(J, sp);
case '.':
- if (isdec(PEEK())) {
- UNGET();
- return lexnumber(J, sp);
- }
- return '.';
+ return lexnumber(J, sp);
case '<':
+ NEXT();
if (ACCEPT('<')) {
if (ACCEPT('='))
return TK_SHL_ASS;
@@ -509,6 +516,7 @@
return '<';
case '>':
+ NEXT();
if (ACCEPT('>')) {
if (ACCEPT('>')) {
if (ACCEPT('='))
@@ -524,6 +532,7 @@
return '>';
case '=':
+ NEXT();
if (ACCEPT('=')) {
if (ACCEPT('='))
return TK_STRICTEQ;
@@ -532,6 +541,7 @@
return '=';
case '!':
+ NEXT();
if (ACCEPT('=')) {
if (ACCEPT('='))
return TK_STRICTNE;
@@ -540,6 +550,7 @@
return '!';
case '+':
+ NEXT();
if (ACCEPT('+'))
return TK_INC;
if (ACCEPT('='))
@@ -547,6 +558,7 @@
return '+';
case '-':
+ NEXT();
if (ACCEPT('-'))
return TK_DEC;
if (ACCEPT('='))
@@ -554,16 +566,19 @@
return '-';
case '*':
+ NEXT();
if (ACCEPT('='))
return TK_MUL_ASS;
return '*';
case '%':
+ NEXT();
if (ACCEPT('='))
return TK_MOD_ASS;
return '%';
case '&':
+ NEXT();
if (ACCEPT('&'))
return TK_AND;
if (ACCEPT('='))
@@ -571,6 +586,7 @@
return '&';
case '|':
+ NEXT();
if (ACCEPT('|'))
return TK_OR;
if (ACCEPT('='))
@@ -578,6 +594,7 @@
return '|';
case '^':
+ NEXT();
if (ACCEPT('='))
return TK_XOR_ASS;
return '^';
@@ -586,9 +603,9 @@
return 0; /* EOF */
}
- if (c >= 0x20 && c <= 0x7E)
- return jsP_error(J, "unexpected character: '%c'", c);
- return jsP_error(J, "unexpected character: \\u%04X", c);
+ if (PEEK >= 0x20 && PEEK <= 0x7E)
+ return jsP_error(J, "unexpected character: '%c'", PEEK);
+ return jsP_error(J, "unexpected character: \\u%04X", PEEK);
}
}
@@ -598,9 +615,7 @@
J->source = source;
J->line = 1;
J->lasttoken = 0;
- // FIXME: parse utf-8 proper instead of just skipping BOM
- if (!strncmp(source, "\357\273\277", 3))
- J->source += 3;
+ next(J, &J->source); /* load first lookahead character */
}
int jsP_lex(js_State *J)
--- a/jsstate.h
+++ b/jsstate.h
@@ -20,6 +20,7 @@
/* lexer */
struct { char *text; size_t len, cap; } buf;
int lexline;
+ int lexchar;
int lasttoken;
int newline;