ref: bb2279e43d905ca8383d5dec5eee57feec5462b1
parent: c816b1193efeabbeeddcf8c5324e92b895956dd5
author: Sebastian Rasmussen <[email protected]>
date: Mon Dec 23 14:47:12 EST 2013
Add lexer.
--- /dev/null
+++ b/Makefile
@@ -1,0 +1,25 @@
+# Build the js lexer driver: library sources are js*.c, the driver is main.c.
+SRCS := $(wildcard js*.c)
+HDRS := $(wildcard js*.h)
+OBJS := $(SRCS:%.c=build/%.o)
+
+CFLAGS = -Wall -g
+
+# These targets name actions, not files. Without .PHONY an existing file
+# called "tags", "clean" or "default" would make the rule silently do nothing
+# (in particular "make tags" would never refresh an existing tags file).
+.PHONY: default tags clean
+
+default: js
+
+build:
+	mkdir -p build
+
+# Every object depends on every header. The build directory is an order-only
+# prerequisite: it has to exist before compiling, but its timestamp must not
+# trigger recompilation.
+build/%.o : %.c $(HDRS) | build
+	$(CC) -c $< -o $@ $(CFLAGS)
+
+build/libjs.a: $(OBJS)
+	ar cru $@ $^
+
+js: build/main.o build/libjs.a
+	$(CC) -o $@ $^ -lm
+
+tags:
+	ctags *.c *.h
+
+clean:
+	rm -f build/* js
--- /dev/null
+++ b/js-lex.c
@@ -1,0 +1,468 @@
+#include "js.h"
+
+#define nelem(a) (sizeof (a) / sizeof (a)[0])
+
+/*
+ * Reserved words and the token each one maps to.
+ *
+ * findkeyword() does a binary search over this table, so the entries MUST
+ * stay sorted in strcmp() order; insert new words at their sorted position.
+ * The table is private to this file and never modified, hence static const.
+ */
+static const struct {
+	const char *string;
+	js_Token token;
+} keywords[] = {
+	{"abstract", JS_ABSTRACT},
+	{"boolean", JS_BOOLEAN},
+	{"break", JS_BREAK},
+	{"byte", JS_BYTE},
+	{"case", JS_CASE},
+	{"catch", JS_CATCH},
+	{"char", JS_CHAR},
+	{"class", JS_CLASS},
+	{"const", JS_CONST},
+	{"continue", JS_CONTINUE},
+	{"debugger", JS_DEBUGGER},
+	{"default", JS_DEFAULT},
+	{"delete", JS_DELETE},
+	{"do", JS_DO},
+	{"double", JS_DOUBLE},
+	{"else", JS_ELSE},
+	{"enum", JS_ENUM},
+	{"export", JS_EXPORT},
+	{"extends", JS_EXTENDS},
+	{"false", JS_FALSE},
+	{"final", JS_FINAL},
+	{"finally", JS_FINALLY},
+	{"float", JS_FLOAT},
+	{"for", JS_FOR},
+	{"function", JS_FUNCTION},
+	{"goto", JS_GOTO},
+	{"if", JS_IF},
+	{"implements", JS_IMPLEMENTS},
+	{"import", JS_IMPORT},
+	{"in", JS_IN},
+	{"instanceof", JS_INSTANCEOF},
+	{"int", JS_INT},
+	{"interface", JS_INTERFACE},
+	{"long", JS_LONG},
+	{"native", JS_NATIVE},
+	{"new", JS_NEW},
+	{"null", JS_NULL},
+	{"package", JS_PACKAGE},
+	{"private", JS_PRIVATE},
+	{"protected", JS_PROTECTED},
+	{"public", JS_PUBLIC},
+	{"return", JS_RETURN},
+	{"short", JS_SHORT},
+	{"static", JS_STATIC},
+	{"super", JS_SUPER},
+	{"switch", JS_SWITCH},
+	{"synchronized", JS_SYNCHRONIZED},
+	{"this", JS_THIS},
+	{"throw", JS_THROW},
+	{"throws", JS_THROWS},
+	{"transient", JS_TRANSIENT},
+	{"true", JS_TRUE},
+	{"try", JS_TRY},
+	{"typeof", JS_TYPEOF},
+	{"var", JS_VAR},
+	{"void", JS_VOID},
+	{"volatile", JS_VOLATILE},
+	{"while", JS_WHILE},
+	{"with", JS_WITH},
+};
+
+/*
+ * Printable name of every token, indexed by js_Token value. The order must
+ * mirror the enum js_Token declaration exactly: one entry per enumerator,
+ * in declaration order. Private to this file and read-only.
+ */
+static const char *const tokenstrings[] = {
+	"ERROR", "EOF", "(identifier)", "null", "true", "false", "(number)",
+	"(string)", "(regexp)", "\\n", "{", "}", "(", ")", "[", "]", ".", ";",
+	",", "<", ">", "<=", ">=", "==", "!=", "===", "!==", "+", "-", "*",
+	"%", "++", "--", "<<", ">>", ">>>", "&", "|", "^", "!", "~", "&&",
+	"||", "?", ":", "=", "+=", "-=", "*=", "%=", "<<=", ">>=", ">>>=",
+	"&=", "|=", "^=", "/", "/=", "break", "case", "catch", "continue",
+	"default", "delete", "do", "else", "finally", "for", "function", "if",
+	"in", "instanceof", "new", "return", "switch", "this", "throw", "try",
+	"typeof", "var", "void", "while", "with", "abstract", "boolean",
+	"byte", "char", "class", "const", "debugger", "double", "enum",
+	"export", "extends", "final", "float", "goto", "implements", "import",
+	"int", "interface", "long", "native", "package", "private",
+	"protected", "public", "short", "static", "super", "synchronized",
+	"throws", "transient", "volatile",
+};
+
+/*
+ * Return the printable name of token t (a pointer into a static table; the
+ * caller must not free or modify it).
+ *
+ * A value outside the table -- a negative value, or an enumerator added to
+ * js_Token without a matching table entry -- yields the name of JS_ERROR
+ * instead of indexing out of bounds.
+ */
+const char *js_tokentostring(js_Token t)
+{
+	/* the size_t cast also turns a negative value into an out-of-range index */
+	if ((size_t)t >= nelem(tokenstrings))
+		return tokenstrings[JS_ERROR];
+	return tokenstrings[t];
+}
+
+/*
+ * Classify an identifier: return the token of the reserved word equal to s,
+ * or JS_IDENTIFIER when s is not in the keywords table. The lookup is a
+ * binary search, so it relies on keywords[] being sorted in strcmp() order.
+ */
+static inline js_Token findkeyword(const char *s)
+{
+	int lo = 0;
+	int hi = nelem(keywords);	/* candidates live in [lo, hi) */
+
+	while (lo < hi) {
+		int mid = (lo + hi - 1) / 2;
+		int order = strcmp(s, keywords[mid].string);
+
+		if (order == 0)
+			return keywords[mid].token;
+		if (order < 0)
+			hi = mid;
+		else
+			lo = mid + 1;
+	}
+
+	return JS_IDENTIFIER;
+}
+
+/*
+ * Return 1 when c is horizontal white space: tab (0x9), vertical tab (0xb),
+ * form feed (0xc), space (0x20) or no-break space (0xa0); otherwise 0.
+ * Line terminators are not white space here; see isnewline().
+ */
+static inline int iswhite(int c)
+{
+	switch (c) {
+	case 0x9:
+	case 0xb:
+	case 0xc:
+	case 0x20:
+	case 0xa0:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Return non-zero when c is a line terminator: line feed (0xa), carriage
+ * return (0xd), line separator (0x2028) or paragraph separator (0x2029).
+ *
+ * The parameter is declared int explicitly; an untyped parameter relies on
+ * implicit int, which has not been valid C since C99.
+ */
+static inline int isnewline(int c)
+{
+	return c == 0xa || c == 0xd || c == 0x2028 || c == 0x2029;
+}
+
+/*
+ * Cursor helpers. Each one operates on a variable named sp of type
+ * `const char **` that must be in scope at the point of use.
+ *
+ *   GETC()    read the character at the cursor and advance past it
+ *   UNGETC()  step the cursor back by one character
+ *   LOOK(x)   if the character at the cursor equals x, consume and return
+ *             it; otherwise return 0 and leave the cursor alone
+ *
+ * The expansions and the argument are fully parenthesized so the macros
+ * behave as single expressions whatever surrounds them and whatever
+ * expression is passed as x. LOOK evaluates x exactly once.
+ */
+#define GETC() (*(*sp)++)
+#define UNGETC() ((*sp)--)
+#define LOOK(x) (**sp == (x) ? *(*sp)++ : 0)
+
+/*
+ * Skip the remainder of a line comment. On return *sp points at the line
+ * terminator that ended the comment, or at the string terminator when the
+ * comment runs to the end of the input, so the caller sees that character
+ * next.
+ *
+ * The scan stops at NUL as well as at a newline: a comment on the last line
+ * of a file without a trailing newline must not run off the end of the
+ * buffer.
+ */
+static inline void lexlinecomment(const char **sp)
+{
+	int c = GETC();
+	while (c != 0 && !isnewline(c))
+		c = GETC();
+	UNGETC();
+}
+
+/*
+ * Skip a block comment whose opening delimiter has already been consumed.
+ *
+ * Returns 0 with *sp just past the closing delimiter, or -1 when the input
+ * ends before the comment is closed; in that case *sp is left pointing at
+ * the string terminator.
+ */
+static inline int lexcomment(const char **sp)
+{
+	int c = GETC();
+
+	while (c != 0) {
+		if (c == '*') {
+			/* a run of stars closes the comment only if '/' follows it */
+			while (c == '*')
+				c = GETC();
+			if (c == '/')
+				return 0;
+			/*
+			 * c is now the first character after the stars; loop
+			 * without reading so that a NUL here is detected rather
+			 * than skipped over.
+			 */
+		} else {
+			c = GETC();
+		}
+	}
+
+	UNGETC();	/* do not leave the cursor beyond the terminator */
+	return -1;
+}
+
+/*
+ * Return 1 when c may begin an identifier: an ASCII letter, '$' or '_';
+ * otherwise 0.
+ */
+static inline int isidentifierstart(int c)
+{
+	if (c == '$' || c == '_')
+		return 1;
+	if (c >= 'A' && c <= 'Z')
+		return 1;
+	if (c >= 'a' && c <= 'z')
+		return 1;
+	return 0;
+}
+
+/*
+ * Return 1 when c may appear after the first character of an identifier:
+ * a decimal digit or anything isidentifierstart() accepts; otherwise 0.
+ */
+static inline int isidentifierpart(int c)
+{
+	if (c >= '0' && c <= '9')
+		return 1;
+	return isidentifierstart(c) != 0;
+}
+
+/* Return 1 when c is an ASCII decimal digit, otherwise 0. */
+static inline int isdec(int c)
+{
+	/* unsigned wrap-around folds both range bounds into a single compare */
+	return (unsigned)c - (unsigned)'0' <= 9u;
+}
+
+/*
+ * Return 1 when c is an ASCII hexadecimal digit (0-9, a-f or A-F),
+ * otherwise 0.
+ */
+static inline int ishex(int c)
+{
+	if (c >= '0' && c <= '9')
+		return 1;
+	if (c >= 'a' && c <= 'f')
+		return 1;
+	if (c >= 'A' && c <= 'F')
+		return 1;
+	return 0;
+}
+
+/*
+ * Return the numeric value (0..15) of the hexadecimal digit c. Any other
+ * character yields 0, so callers that care must validate with ishex()
+ * first.
+ */
+static inline int tohex(int c)
+{
+	int value = 0;
+
+	if (c >= '0' && c <= '9')
+		value = c - '0';
+	else if (c >= 'a' && c <= 'f')
+		value = 0xa + (c - 'a');
+	else if (c >= 'A' && c <= 'F')
+		value = 0xa + (c - 'A');
+
+	return value;
+}
+
+/*
+ * Scan the digits of a hexadecimal literal; the "0x"/"0X" prefix has
+ * already been consumed by the caller.
+ *
+ * Stores the value in *yynumber and returns JS_NUMBER, leaving *sp at the
+ * first character after the digits. Returns JS_ERROR when no hex digit
+ * follows the prefix.
+ */
+static inline js_Token lexhex(const char **sp, double *yynumber)
+{
+	double value = 0;
+	int digit = GETC();
+
+	if (!ishex(digit))
+		return JS_ERROR;
+
+	while (ishex(digit)) {
+		value = value * 16 + tohex(digit);
+		digit = GETC();
+	}
+
+	UNGETC();	/* give back the character that ended the literal */
+	*yynumber = value;
+
+	return JS_NUMBER;
+}
+
+/*
+ * Consume a possibly empty run of decimal digits and return its value as a
+ * double (0 when there are no digits). *sp ends up at the first non-digit.
+ */
+static inline double lexinteger(const char **sp)
+{
+	double value = 0;
+	int ch;
+
+	for (ch = GETC(); isdec(ch); ch = GETC())
+		value = value * 10 + (ch - '0');
+
+	UNGETC();	/* the character that stopped the scan is not ours */
+
+	return value;
+}
+
+/*
+ * Consume the digits that follow a decimal point and return the fraction
+ * they denote (0 when there are no digits). *sp ends up at the first
+ * non-digit.
+ */
+static inline double lexfraction(const char **sp)
+{
+	double digits = 0;	/* the digit run read as an integer */
+	double scale = 1;	/* 10 raised to the number of digits read */
+	int ch;
+
+	for (ch = GETC(); isdec(ch); ch = GETC()) {
+		digits = digits * 10 + (ch - '0');
+		scale = scale * 10;
+	}
+
+	UNGETC();
+
+	return digits / scale;
+}
+
+/*
+ * Scan a numeric literal whose first character c has already been consumed
+ * from *sp.
+ *
+ * A leading "0x"/"0X" is handed to lexhex(). Otherwise the literal is an
+ * optional integer part, an optional '.' plus fraction digits, and an
+ * optional exponent introduced by 'e' or 'E' with an optional sign. The
+ * value is stored in *yynumber and JS_NUMBER is returned.
+ */
+static inline js_Token lexnumber(int c, const char **sp, double *yynumber)
+{
+	double whole;
+	double frac = 0;
+	double expo = 0;
+
+	if (c == '0' && (LOOK('x') || LOOK('X')))
+		return lexhex(sp, yynumber);
+
+	/* push the first character back so the integer scan sees every digit */
+	UNGETC();
+
+	whole = lexinteger(sp);
+
+	if (LOOK('.'))
+		frac = lexfraction(sp);
+
+	if (LOOK('e') || LOOK('E')) {
+		int negative = LOOK('-') != 0;
+
+		if (!negative)
+			(void)LOOK('+');	/* an explicit plus sign is simply skipped */
+
+		expo = lexinteger(sp);
+		if (negative)
+			expo = -expo;
+	}
+
+	*yynumber = (whole + frac) * pow(10, expo);
+
+	return JS_NUMBER;
+}
+
+/*
+ * Decode one escape sequence; the backslash has already been consumed.
+ * Returns the character value the escape stands for.
+ *
+ *   \0                   the value 0
+ *   \uHHHH               up to four hex digits
+ *   \xHH                 up to two hex digits
+ *   \' \" \\             the character itself
+ *   \b \f \n \r \t \v    the usual control characters
+ *   anything else        the escaped character unchanged
+ *
+ * Hex digits are consumed only while they really are hex digits, so a
+ * truncated escape stops at the offending character instead of swallowing
+ * it. This matters at the end of the input, where reading a fixed number of
+ * characters would run past the string terminator. For the same reason a
+ * backslash directly followed by the terminator returns 0 and leaves the
+ * terminator unread.
+ */
+static inline int lexescape(const char **sp)
+{
+	int c = GETC();
+	int value, digits;
+
+	switch (c) {
+	case 0:
+		UNGETC();	/* keep the cursor on the string terminator */
+		return 0;
+	case '0': return 0;
+	case 'u':
+	case 'x':
+		value = 0;
+		for (digits = (c == 'u') ? 4 : 2; digits > 0 && ishex(**sp); digits--)
+			value = (value << 4) | tohex(GETC());
+		return value;
+	case '\'': return '\'';
+	case '"': return '"';
+	case '\\': return '\\';
+	case 'b': return '\b';
+	case 'f': return '\f';
+	case 'n': return '\n';
+	case 'r': return '\r';
+	case 't': return '\t';
+	case 'v': return '\v';
+	default: return c;
+	}
+}
+
+/*
+ * Scan a string literal whose opening quote q has already been consumed.
+ *
+ * The decoded contents are copied into yytext (capacity yylen bytes) and
+ * NUL-terminated; JS_STRING is returned with *sp just past the closing
+ * quote. Returns JS_ERROR when the input or the line ends before the
+ * closing quote, or when the contents plus terminator do not fit in yytext.
+ *
+ * Each decoded character is stored in a single char, so escape values above
+ * the range of char are truncated.
+ */
+static inline js_Token lexstring(int q, const char **sp, char *yytext, size_t yylen)
+{
+	char *p = yytext;
+	int c = GETC();
+
+	if (yylen == 0)
+		return JS_ERROR;	/* no room even for the terminator */
+
+	while (c != q) {
+		if (c == 0) {
+			UNGETC();	/* keep the cursor on the string terminator */
+			return JS_ERROR;
+		}
+		if (isnewline(c))
+			return JS_ERROR;
+
+		if (c == '\\') {
+			/* a backslash at the very end of the input escapes nothing */
+			if (**sp == 0)
+				return JS_ERROR;
+			c = lexescape(sp);
+		}
+
+		/* always keep one byte in reserve for the terminator written below */
+		if ((size_t)(p - yytext) + 1 >= yylen)
+			return JS_ERROR;
+		*p++ = c;
+		c = GETC();
+	}
+
+	*p = 0;
+
+	return JS_STRING;
+}
+
+/*
+ * Scan the next token from the NUL-terminated text at *sp and advance *sp
+ * past it.
+ *
+ *   J         not used by the lexer at present
+ *   sp        cursor into the source text; updated on return
+ *   yytext    receives the NUL-terminated text of identifiers, keywords and
+ *             string literals
+ *   yylen     capacity of yytext in bytes, terminator included
+ *   yynumber  receives the value when JS_NUMBER is returned
+ *
+ * Returns the token kind: JS_NEWLINE for each line terminator, JS_EOF at
+ * the end of the text (the cursor then stays on the terminator, so further
+ * calls keep returning JS_EOF), and JS_ERROR for an unterminated comment, a
+ * malformed literal, or text that does not fit in yytext. White space and
+ * comments are skipped. Characters that start no known token are skipped
+ * silently.
+ */
+js_Token js_lex(js_State *J, const char **sp, char *yytext, size_t yylen, double *yynumber)
+{
+	int c = GETC();
+
+	while (c) {
+		while (iswhite(c))
+			c = GETC();
+
+		/* white space ran up to the end of the text: stop before reading past it */
+		if (c == 0)
+			break;
+
+		if (isnewline(c))
+			return JS_NEWLINE;
+
+		if (c == '/') {
+			c = GETC();
+			if (c == '/') {
+				lexlinecomment(sp);
+				c = GETC();
+				continue;
+			} else if (c == '*') {
+				if (lexcomment(sp))
+					return JS_ERROR;
+				/*
+				 * Fetch a fresh character: falling through with the
+				 * '*' of the opening delimiter still in c would report
+				 * a spurious JS_STAR after every block comment.
+				 */
+				c = GETC();
+				continue;
+			} else if (c == '=') {
+				return JS_SLASH_EQ;
+			} else {
+				UNGETC();
+				return JS_SLASH;
+			}
+		}
+
+		if (isidentifierstart(c)) {
+			char *p = yytext;
+
+			do {
+				/* keep one byte in reserve for the terminator written below */
+				if ((size_t)(p - yytext) + 1 >= yylen)
+					return JS_ERROR;
+				*p++ = c;
+				c = GETC();
+			} while (isidentifierpart(c));
+
+			UNGETC();
+			*p = 0;
+
+			return findkeyword(yytext);
+		}
+
+		/* a '.' starts a number only when a digit follows; otherwise it is JS_PERIOD */
+		if (isdec(c) || (c == '.' && isdec(**sp)))
+			return lexnumber(c, sp, yynumber);
+
+		if (c == '\'' || c == '"')
+			return lexstring(c, sp, yytext, yylen);
+
+		switch (c) {
+		case '{': return JS_LCURLY;
+		case '}': return JS_RCURLY;
+		case '(': return JS_LPAREN;
+		case ')': return JS_RPAREN;
+		case '[': return JS_LSQUARE;
+		case ']': return JS_RSQUARE;
+		case '.': return JS_PERIOD;
+		case ';': return JS_SEMICOLON;
+		case ',': return JS_COMMA;
+
+		case '<':
+			if (LOOK('<')) {
+				if (LOOK('='))
+					return JS_LT_LT_EQ;
+				return JS_LT_LT;
+			}
+			if (LOOK('='))
+				return JS_LT_EQ;
+			return JS_LT;
+
+		case '>':
+			if (LOOK('>')) {
+				if (LOOK('>')) {
+					if (LOOK('='))
+						return JS_GT_GT_GT_EQ;
+					return JS_GT_GT_GT;
+				}
+				if (LOOK('='))
+					return JS_GT_GT_EQ;
+				return JS_GT_GT;
+			}
+			if (LOOK('='))
+				return JS_GT_EQ;
+			return JS_GT;
+
+		case '=':
+			if (LOOK('=')) {
+				if (LOOK('='))
+					return JS_EQ_EQ_EQ;
+				return JS_EQ_EQ;
+			}
+			return JS_EQ;
+
+		case '!':
+			if (LOOK('=')) {
+				if (LOOK('='))
+					return JS_EXCL_EQ_EQ;
+				return JS_EXCL_EQ;
+			}
+			return JS_EXCL;
+
+		case '+':
+			if (LOOK('+'))
+				return JS_PLUS_PLUS;
+			if (LOOK('='))
+				return JS_PLUS_EQ;
+			return JS_PLUS;
+
+		case '-':
+			if (LOOK('-'))
+				return JS_MINUS_MINUS;
+			if (LOOK('='))
+				return JS_MINUS_EQ;
+			return JS_MINUS;
+
+		case '*':
+			if (LOOK('='))
+				return JS_STAR_EQ;
+			return JS_STAR;
+
+		case '%':
+			if (LOOK('='))
+				return JS_PERCENT_EQ;
+			return JS_PERCENT;
+
+		case '&':
+			if (LOOK('&'))
+				return JS_AND_AND;
+			if (LOOK('='))
+				return JS_AND_EQ;
+			return JS_AND;
+
+		case '|':
+			if (LOOK('|'))
+				return JS_BAR_BAR;
+			if (LOOK('='))
+				return JS_BAR_EQ;
+			return JS_BAR;
+
+		case '^':
+			if (LOOK('='))
+				return JS_HAT_EQ;
+			return JS_HAT;
+
+		case '~': return JS_TILDE;
+		case '?': return JS_QUESTION;
+		case ':': return JS_COLON;
+		}
+
+		/* no token starts with this character: skip it */
+		c = GETC();
+	}
+
+	/* the terminator was consumed above; step back so the cursor stays in bounds */
+	UNGETC();
+
+	return JS_EOF;
+}
--- /dev/null
+++ b/js-load.c
@@ -1,0 +1,59 @@
+#include "js.h"
+
+/*
+ * Tokenize source and print one line per token to stdout: the value of a
+ * number, "id:" plus the name of an identifier, the quoted contents of a
+ * string, or the token's printable name for everything else. Scanning stops
+ * after JS_EOF or JS_ERROR has been printed. Always returns 0.
+ */
+int js_loadstring(js_State *J, const char *source)
+{
+	char yytext[512];
+	double yynumber;
+	js_Token t;
+
+	for (;;) {
+		t = js_lex(J, &source, yytext, sizeof yytext, &yynumber);
+
+		switch (t) {
+		case JS_NUMBER:
+			printf("%g\n", yynumber);
+			break;
+		case JS_IDENTIFIER:
+			printf("id:%s\n", yytext);
+			break;
+		case JS_STRING:
+			printf("'%s'\n", yytext);
+			break;
+		default:
+			printf("%s\n", js_tokentostring(t));
+			break;
+		}
+
+		if (t == JS_EOF || t == JS_ERROR)
+			break;
+	}
+
+	return 0;
+}
+
+/*
+ * Read the whole of filename into memory and pass it to js_loadstring().
+ *
+ * Returns the result of js_loadstring(). When the file cannot be opened,
+ * measured, or read, or memory cannot be allocated, the failure is reported
+ * through js_error() and its return value is returned instead.
+ *
+ * The file is opened in binary mode so that the size reported by ftell()
+ * equals the number of bytes fread() delivers; in text mode the two can
+ * differ on platforms that translate line endings, which made every such
+ * file fail with a read error.
+ */
+int js_loadfile(js_State *J, const char *filename)
+{
+	FILE *f;
+	char *s;
+	long n;
+	int t;
+
+	f = fopen(filename, "rb");
+	if (!f)
+		return js_error(J, "cannot open file: '%s'", filename);
+
+	/* ftell() returns -1 on failure; never let that reach malloc()/fread() */
+	if (fseek(f, 0, SEEK_END) != 0 || (n = ftell(f)) < 0 || fseek(f, 0, SEEK_SET) != 0) {
+		fclose(f);
+		return js_error(J, "cannot seek in file: '%s'", filename);
+	}
+
+	s = malloc((size_t)n + 1); /* add space for string terminator */
+	if (!s) {
+		fclose(f);
+		return js_error(J, "cannot allocate storage for file contents: '%s'", filename);
+	}
+
+	if (fread(s, 1, (size_t)n, f) != (size_t)n) {
+		free(s);
+		fclose(f);
+		return js_error(J, "cannot read data from file: '%s'", filename);
+	}
+
+	s[n] = 0; /* zero-terminate string containing file data */
+
+	t = js_loadstring(J, s);
+
+	free(s);
+	fclose(f);
+	return t;
+}
--- /dev/null
+++ b/js-state.c
@@ -1,0 +1,28 @@
+#include "js.h"
+
+/*
+ * Allocate a new state with every byte set to zero.
+ *
+ * Returns NULL when memory cannot be allocated; previously the unchecked
+ * malloc() result was handed to memset(), which is undefined behaviour for
+ * a null pointer. Release the state with js_close().
+ */
+js_State *js_newstate(void)
+{
+	js_State *J = calloc(1, sizeof *J);
+	return J;
+}
+
+/*
+ * Release a state obtained from js_newstate(). J is passed straight to
+ * free(), so a null pointer is accepted and ignored.
+ */
+void js_close(js_State *J)
+{
+ free(J);
+}
+
+/*
+ * Report an error on stderr as "error: " followed by the printf-style
+ * message built from fmt and its arguments, then a newline. J is not used.
+ * Always returns 0.
+ */
+int js_error(js_State *J, const char *fmt, ...)
+{
+	va_list args;
+
+	fputs("error: ", stderr);
+
+	va_start(args, fmt);
+	vfprintf(stderr, fmt, args);
+	va_end(args);
+
+	fputs("\n", stderr);
+
+	return 0;
+}
--- /dev/null
+++ b/js.h
@@ -1,0 +1,162 @@
+#ifndef js_h
+#define js_h
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdarg.h>
+#include <string.h>
+#include <math.h>
+
+/* Handle for the state created by js_newstate(); the struct is defined further down in this header. */
+typedef struct js_State js_State;
+
+/*
+ * Function pointer type taking a state and returning int.
+ * NOTE(review): nothing in this change uses it yet; presumably the signature
+ * for native functions exposed to scripts -- confirm.
+ */
+typedef int (*js_CFunction)(js_State *J);
+
+/* Create a zero-initialized state / release one again. */
+js_State *js_newstate(void);
+void js_close(js_State *J);
+
+/*
+ * Print "error: ", the printf-style message and a newline to stderr.
+ * The value it returns is what js_loadfile() hands back on failure.
+ */
+int js_error(js_State *J, const char *fmt, ...);
+
+/*
+ * js_loadstring: tokenize the NUL-terminated text s; at this stage each
+ * token is only printed to stdout.
+ * js_loadfile: read the named file into memory and pass its contents to
+ * js_loadstring(), reporting I/O and allocation failures via js_error().
+ */
+int js_loadstring(js_State *J, const char *s);
+int js_loadfile(js_State *J, const char *filename);
+
+/* private */
+
+/*
+ * Token kinds produced by js_lex().
+ *
+ * The enumerators are used as indices into the table of printable token
+ * names in js-lex.c, so their order must stay in step with that table: add
+ * new tokens in both places at the same position.
+ *
+ * The typedef is part of the enum definition itself; naming an enum type
+ * before it has been defined is a forward reference that ISO C does not
+ * permit (some compilers accept it only as an extension).
+ */
+typedef enum js_Token
+{
+	JS_ERROR,
+	JS_EOF,
+
+	JS_IDENTIFIER,
+	JS_NULL,
+	JS_TRUE,
+	JS_FALSE,
+	JS_NUMBER,
+	JS_STRING,
+	JS_REGEXP,
+	JS_NEWLINE,
+
+	/* punctuators */
+	JS_LCURLY,
+	JS_RCURLY,
+	JS_LPAREN,
+	JS_RPAREN,
+	JS_LSQUARE,
+	JS_RSQUARE,
+	JS_PERIOD,
+	JS_SEMICOLON,
+	JS_COMMA,
+	JS_LT,
+	JS_GT,
+	JS_LT_EQ,
+	JS_GT_EQ,
+	JS_EQ_EQ,
+	JS_EXCL_EQ,
+	JS_EQ_EQ_EQ,
+	JS_EXCL_EQ_EQ,
+	JS_PLUS,
+	JS_MINUS,
+	JS_STAR,
+	JS_PERCENT,
+	JS_PLUS_PLUS,
+	JS_MINUS_MINUS,
+	JS_LT_LT,
+	JS_GT_GT,
+	JS_GT_GT_GT,
+	JS_AND,
+	JS_BAR,
+	JS_HAT,
+	JS_EXCL,
+	JS_TILDE,
+	JS_AND_AND,
+	JS_BAR_BAR,
+	JS_QUESTION,
+	JS_COLON,
+	JS_EQ,
+	JS_PLUS_EQ,
+	JS_MINUS_EQ,
+	JS_STAR_EQ,
+	JS_PERCENT_EQ,
+	JS_LT_LT_EQ,
+	JS_GT_GT_EQ,
+	JS_GT_GT_GT_EQ,
+	JS_AND_EQ,
+	JS_BAR_EQ,
+	JS_HAT_EQ,
+	JS_SLASH,
+	JS_SLASH_EQ,
+
+	/* keywords */
+	JS_BREAK,
+	JS_CASE,
+	JS_CATCH,
+	JS_CONTINUE,
+	JS_DEFAULT,
+	JS_DELETE,
+	JS_DO,
+	JS_ELSE,
+	JS_FINALLY,
+	JS_FOR,
+	JS_FUNCTION,
+	JS_IF,
+	JS_IN,
+	JS_INSTANCEOF,
+	JS_NEW,
+	JS_RETURN,
+	JS_SWITCH,
+	JS_THIS,
+	JS_THROW,
+	JS_TRY,
+	JS_TYPEOF,
+	JS_VAR,
+	JS_VOID,
+	JS_WHILE,
+	JS_WITH,
+
+	/* future reserved words */
+	JS_ABSTRACT,
+	JS_BOOLEAN,
+	JS_BYTE,
+	JS_CHAR,
+	JS_CLASS,
+	JS_CONST,
+	JS_DEBUGGER,
+	JS_DOUBLE,
+	JS_ENUM,
+	JS_EXPORT,
+	JS_EXTENDS,
+	JS_FINAL,
+	JS_FLOAT,
+	JS_GOTO,
+	JS_IMPLEMENTS,
+	JS_IMPORT,
+	JS_INT,
+	JS_INTERFACE,
+	JS_LONG,
+	JS_NATIVE,
+	JS_PACKAGE,
+	JS_PRIVATE,
+	JS_PROTECTED,
+	JS_PUBLIC,
+	JS_SHORT,
+	JS_STATIC,
+	JS_SUPER,
+	JS_SYNCHRONIZED,
+	JS_THROWS,
+	JS_TRANSIENT,
+	JS_VOLATILE
+} js_Token;
+
+/*
+ * Definition of the state behind the js_State handle.
+ * NOTE(review): neither field is read or written by the code in this change
+ * (js_loadstring() uses its own local 512-byte token buffer); yytext is
+ * presumably meant to hold token text and top a stack index -- confirm.
+ */
+struct js_State
+{
+ char yytext[512];
+ int top;
+};
+
+/*
+ * js_lex: scan one token from the NUL-terminated text at *sp and advance
+ * *sp. Identifier, keyword and string text is written to yytext (capacity
+ * yylen bytes); the value of a JS_NUMBER is written to *yynumber.
+ * js_tokentostring: printable name of a token, as a pointer into a static
+ * table.
+ */
+js_Token js_lex(js_State *J, const char **sp, char *yytext, size_t yylen, double *yynumber);
+const char *js_tokentostring(js_Token t);
+
+#endif
--- /dev/null
+++ b/main.c
@@ -1,0 +1,19 @@
+#include "js.h"
+
+/*
+ * Command-line driver: load every file named on the command line, in
+ * order, through js_loadfile(). Returns 1 when the interpreter state cannot
+ * be created, otherwise 0.
+ */
+int
+main(int argc, char **argv)
+{
+	js_State *J;
+	int i;
+
+	J = js_newstate();
+	if (!J) {
+		js_error(NULL, "cannot allocate interpreter state");
+		return 1;
+	}
+
+	for (i = 1; i < argc; i++) {
+		/* index with i: argv[1] would load the first file once per argument */
+		js_loadfile(J, argv[i]);
+		/* TODO: run the loaded program here; no js_run() is declared in js.h yet */
+	}
+
+	js_close(J);
+
+	return 0;
+}