;;; ECMAScript for Guile
;; Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc.
;;;; This library is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU Lesser General Public
;;;; License as published by the Free Software Foundation; either
;;;; version 3 of the License, or (at your option) any later version.
;;;;
;;;; This library is distributed in the hope that it will be useful,
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;;;; Lesser General Public License for more details.
;;;;
;;;; You should have received a copy of the GNU Lesser General Public
;;;; License along with this library; if not, write to the Free Software
;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;;; Code:
(define-module (language ecmascript tokenize)
#:use-module (ice-9 rdelim)
#:use-module ((srfi srfi-1) #:select (unfold-right))
#:use-module (system base lalr)
#:export (next-token make-tokenizer make-tokenizer/1 tokenize tokenize/1))
(define (syntax-error what loc form . args)
(throw 'syntax-error #f what
(and=> loc source-location->source-properties)
form #f args))
(define (port-source-location port)
(make-source-location (port-filename port)
(port-line port)
(port-column port)
(false-if-exception (ftell port))
#f))
;; taken from SSAX, sorta
(define (read-until delims port loc)
(if (eof-object? (peek-char port))
(syntax-error "EOF while reading a token" loc #f)
(let ((token (read-delimited delims port 'peek)))
(if (eof-object? (peek-char port))
(syntax-error "EOF while reading a token" loc token)
token))))
(define (char-hex? c)
(and (not (eof-object? c))
(or (char-numeric? c)
(memv c '(#\a #\b #\c #\d #\e #\f))
(memv c '(#\A #\B #\C #\D #\E #\F)))))
(define (digit->number c)
(- (char->integer c) (char->integer #\0)))
(define (hex->number c)
(if (char-numeric? c)
(digit->number c)
(+ 10 (- (char->integer (char-downcase c)) (char->integer #\a)))))
(define (read-slash port loc div?)
(let ((c1 (begin
(read-char port)
(peek-char port))))
(cond
((eof-object? c1)
;; hmm. error if we're not looking for a div? ?
(make-lexical-token '/ loc #f))
((char=? c1 #\/)
(read-line port)
(next-token port div?))
((char=? c1 #\*)
(read-char port)
(let lp ((c (read-char port)))
(cond
((eof-object? c)
(syntax-error "EOF while in multi-line comment" loc #f))
((char=? c #\*)
(if (eqv? (peek-char port) #\/)
(begin
(read-char port)
(next-token port div?))
(lp (read-char port))))
(else
(lp (read-char port))))))
(div?
(case c1
((#\=) (read-char port) (make-lexical-token '/= loc #f))
(else (make-lexical-token '/ loc #f))))
(else
(read-regexp port loc)))))
(define (read-regexp port loc)
;; first slash already read
(let ((terms (string #\/ #\\ #\nl #\cr)))
(let lp ((str (read-until terms port loc)) (head ""))
(let ((terminator (peek-char port)))
(cond
((char=? terminator #\/)
(read-char port)
;; flags
(let lp ((c (peek-char port)) (flags '()))
(if (or (eof-object? c)
(not (or (char-alphabetic? c)
(char-numeric? c)
(char=? c #\$)
(char=? c #\_))))
(make-lexical-token 'RegexpLiteral loc
(cons (string-append head str)
(reverse flags)))
(begin (read-char port)
(lp (peek-char port) (cons c flags))))))
((char=? terminator #\\)
(read-char port)
(let ((echar (read-char port)))
(lp (read-until terms port loc)
(string-append head str (string #\\ echar)))))
(else
(syntax-error "regexp literals may not contain newlines"
loc str)))))))
(define (read-string port loc)
(let ((c (read-char port)))
(let ((terms (string c #\\ #\nl #\cr)))
(define (read-escape port)
(let ((c (read-char port)))
(case c
((#\' #\" #\\) c)
((#\b) #\bs)
((#\f) #\np)
((#\n) #\nl)
((#\r) #\cr)
((#\t) #\tab)
((#\v) #\vt)
((#\0)
(let ((next (peek-char port)))
(cond
((eof-object? next) #\nul)
((char-numeric? next)
(syntax-error "octal escape sequences are not supported"
loc #f))
(else #\nul))))
((#\x)
(let* ((a (read-char port))
(b (read-char port)))
(cond
((and (char-hex? a) (char-hex? b))
(integer->char (+ (* 16 (hex->number a)) (hex->number b))))
(else
(syntax-error "bad hex character escape" loc (string a b))))))
((#\u)
(let* ((a (read-char port))
(b (read-char port))
(c (read-char port))
(d (read-char port)))
(integer->char (string->number (string a b c d) 16))))
(else
c))))
(let lp ((str (read-until terms port loc)))
(let ((terminator (peek-char port)))
(cond
((char=? terminator c)
(read-char port)
(make-lexical-token 'StringLiteral loc str))
((char=? terminator #\\)
(read-char port)
(let ((echar (read-escape port)))
(lp (string-append str (string echar)
(read-until terms port loc)))))
(else
(syntax-error "string literals may not contain newlines"
loc str))))))))
(define *keywords*
'(("break" . break)
("else" . else)
("new" . new)
("var" . var)
("case" . case)
("finally" . finally)
("return" . return)
("void" . void)
("catch" . catch)
("for" . for)
("switch" . switch)
("while" . while)
("continue" . continue)
("function" . function)
("this" . this)
("with" . with)
("default" . default)
("if" . if)
("throw" . throw)
("delete" . delete)
("in" . in)
("try" . try)
("do" . do)
("instanceof" . instanceof)
("typeof" . typeof)
;; these aren't exactly keywords, but hey
("null" . null)
("true" . true)
("false" . false)))
(define *future-reserved-words*
'(("abstract" . abstract)
("enum" . enum)
("int" . int)
("short" . short)
("boolean" . boolean)
("export" . export)
("interface" . interface)
("static" . static)
("byte" . byte)
("extends" . extends)
("long" . long)
("super" . super)
("char" . char)
("final" . final)
("native" . native)
("synchronized" . synchronized)
("class" . class)
("float" . float)
("package" . package)
("throws" . throws)
("const" . const)
("goto" . goto)
("private" . private)
("transient" . transient)
("debugger" . debugger)
("implements" . implements)
("protected" . protected)
("volatile" . volatile)
("double" . double)
("import" . import)
("public" . public)))
(define (read-identifier port loc)
(let lp ((c (peek-char port)) (chars '()))
(if (or (eof-object? c)
(not (or (char-alphabetic? c)
(char-numeric? c)
(char=? c #\$)
(char=? c #\_))))
(let ((word (list->string (reverse chars))))
(cond ((assoc-ref *keywords* word)
=> (lambda (x) (make-lexical-token x loc #f)))
((assoc-ref *future-reserved-words* word)
(syntax-error "word is reserved for the future, dude."
loc word))
(else (make-lexical-token 'Identifier loc
(string->symbol word)))))
(begin (read-char port)
(lp (peek-char port) (cons c chars))))))
(define (read-numeric port loc)
(let* ((c0 (if (char=? (peek-char port) #\.)
#\0
(read-char port)))
(c1 (peek-char port)))
(cond
((eof-object? c1) (digit->number c0))
((and (char=? c0 #\0) (or (char=? c1 #\x) (char=? c1 #\X)))
(read-char port)
(let ((c (peek-char port)))
(if (not (char-hex? c))
(syntax-error "bad digit reading hexadecimal number"
loc c))
(let lp ((c c) (acc 0))
(cond ((char-hex? c)
(read-char port)
(lp (peek-char port)
(+ (* 16 acc) (hex->number c))))
(else
acc)))))
((and (char=? c0 #\0) (char-numeric? c1))
(let lp ((c c1) (acc 0))
(cond ((eof-object? c) acc)
((char-numeric? c)
(if (or (char=? c #\8) (char=? c #\9))
(syntax-error "invalid digit in octal sequence"
loc c))
(read-char port)
(lp (peek-char port)
(+ (* 8 acc) (digit->number c))))
(else
acc))))
(else
(let lp ((c1 c1) (acc (digit->number c0)))
(cond
((eof-object? c1) acc)
((char-numeric? c1)
(read-char port)
(lp (peek-char port)
(+ (* 10 acc) (digit->number c1))))
((or (char=? c1 #\e) (char=? c1 #\E))
(read-char port)
(let ((add (let ((c (peek-char port)))
(cond ((eof-object? c)
(syntax-error "error reading exponent: EOF"
loc #f))
((char=? c #\+) (read-char port) +)
((char=? c #\-) (read-char port) -)
((char-numeric? c) +)
(else
(syntax-error "error reading exponent: non-digit"
loc c))))))
(let lp ((c (peek-char port)) (e 0))
(cond ((and (not (eof-object? c)) (char-numeric? c))
(read-char port)
(lp (peek-char port) (add (* 10 e) (digit->number c))))
(else
(* (if (negative? e) (* acc 1.0) acc) (expt 10 e)))))))
((char=? c1 #\.)
(read-char port)
(let lp2 ((c (peek-char port)) (dec 0.0) (n -1))
(cond ((and (not (eof-object? c)) (char-numeric? c))
(read-char port)
(lp2 (peek-char port)
(+ dec (* (digit->number c) (expt 10 n)))
(1- n)))
(else
;; loop back to catch an exponential part
(lp c (+ acc dec))))))
(else
acc)))))))
(define *punctuation*
'(("{" . lbrace)
("}" . rbrace)
("(" . lparen)
(")" . rparen)
("[" . lbracket)
("]" . rbracket)
("." . dot)
(";" . semicolon)
("," . comma)
("<" . <)
(">" . >)
("<=" . <=)
(">=" . >=)
("==" . ==)
("!=" . !=)
("===" . ===)
("!==" . !==)
("+" . +)
("-" . -)
("*" . *)
("%" . %)
("++" . ++)
("--" . --)
("<<" . <<)
(">>" . >>)
(">>>" . >>>)
("&" . &)
("|" . bor)
("^" . ^)
("!" . !)
("~" . ~)
("&&" . &&)
("||" . or)
("?" . ?)
(":" . colon)
("=" . =)
("+=" . +=)
("-=" . -=)
("*=" . *=)
("%=" . %=)
("<<=" . <<=)
(">>=" . >>=)
(">>>=" . >>>=)
("&=" . &=)
("|=" . bor=)
("^=" . ^=)))
(define *div-punctuation*
'(("/" . /)
("/=" . /=)))
;; node ::= (char (symbol | #f) node*)
(define read-punctuation
(let ((punc-tree (let lp ((nodes '()) (puncs *punctuation*))
(cond ((null? puncs)
nodes)
((assv-ref nodes (string-ref (caar puncs) 0))
=> (lambda (node-tail)
(if (= (string-length (caar puncs)) 1)
(set-car! node-tail (cdar puncs))
(set-cdr! node-tail
(lp (cdr node-tail)
`((,(substring (caar puncs) 1)
. ,(cdar puncs))))))
(lp nodes (cdr puncs))))
(else
(lp (cons (list (string-ref (caar puncs) 0) #f) nodes)
puncs))))))
(lambda (port loc)
(let lp ((c (peek-char port)) (tree punc-tree) (candidate #f))
(cond
((assv-ref tree c)
=> (lambda (node-tail)
(read-char port)
(lp (peek-char port) (cdr node-tail) (car node-tail))))
(candidate
(make-lexical-token candidate loc #f))
(else
(syntax-error "bad syntax: character not allowed" loc c)))))))
(define (next-token port div?)
(let ((c (peek-char port))
(loc (port-source-location port)))
(case c
((#\ht #\vt #\np #\space #\x00A0) ; whitespace
(read-char port)
(next-token port div?))
((#\newline #\cr) ; line break
(read-char port)
(next-token port div?))
((#\/)
;; division, single comment, double comment, or regexp
(read-slash port loc div?))
((#\" #\') ; string literal
(read-string port loc))
(else
(cond
((eof-object? c)
'*eoi*)
((or (char-alphabetic? c)
(char=? c #\$)
(char=? c #\_))
;; reserved word or identifier
(read-identifier port loc))
((char-numeric? c)
;; numeric -- also accept . FIXME, requires lookahead
(make-lexical-token 'NumericLiteral loc (read-numeric port loc)))
(else
;; punctuation
(read-punctuation port loc)))))))
(define (make-tokenizer port)
(let ((div? #f))
(lambda ()
(let ((tok (next-token port div?)))
(set! div? (and (lexical-token? tok)
(let ((cat (lexical-token-category tok)))
(or (eq? cat 'Identifier)
(eq? cat 'NumericLiteral)
(eq? cat 'StringLiteral)))))
tok))))
(define (make-tokenizer/1 port)
(let ((div? #f)
(eoi? #f)
(stack '()))
(lambda ()
(if eoi?
'*eoi*
(let ((tok (next-token port div?)))
(case (if (lexical-token? tok) (lexical-token-category tok) tok)
((lparen)
(set! stack (cons tok stack)))
((rparen)
(if (and (pair? stack)
(eq? (lexical-token-category (car stack)) 'lparen))
(set! stack (cdr stack))
(syntax-error "unexpected right parenthesis"
(lexical-token-source tok)
#f)))
((lbracket)
(set! stack (cons tok stack)))
((rbracket)
(if (and (pair? stack)
(eq? (lexical-token-category (car stack)) 'lbracket))
(set! stack (cdr stack))
(syntax-error "unexpected right bracket"
(lexical-token-source tok)
#f)))
((lbrace)
(set! stack (cons tok stack)))
((rbrace)
(if (and (pair? stack)
(eq? (lexical-token-category (car stack)) 'lbrace))
(set! stack (cdr stack))
(syntax-error "unexpected right brace"
(lexical-token-source tok)
#f)))
((semicolon)
(set! eoi? (null? stack))))
(set! div? (and (lexical-token? tok)
(let ((cat (lexical-token-category tok)))
(or (eq? cat 'Identifier)
(eq? cat 'NumericLiteral)
(eq? cat 'StringLiteral)))))
tok)))))
(define (tokenize port)
(let ((next (make-tokenizer port)))
(let lp ((out '()))
(let ((tok (next)))
(if (eq? tok '*eoi*)
(reverse! out)
(lp (cons tok out)))))))
(define (tokenize/1 port)
(let ((next (make-tokenizer/1 port)))
(let lp ((out '()))
(let ((tok (next)))
(if (eq? tok '*eoi*)
(reverse! out)
(lp (cons tok out)))))))