package com.hurlant.eval.parse
{
    import com.hurlant.eval.Debug;

    /**
     * Character-level scanner that turns a source string into token codes.
     *
     * The scanner keeps a cursor (curIndex) and a mark (markIndex); the text
     * between the two is the lexeme of the token currently being scanned.
     * Character codes are compared against constants on Char and results are
     * reported as codes from Token; both classes are defined outside this file.
     *
     * Errors are reported by throwing plain strings.
     */
    public class Scanner
    {
        var src : String;          // text being scanned
        var origin : String;       // caller-supplied label for the text
        var curIndex : int;        // index of the next character to read
        var markIndex : int;       // index where the current token started
        var lastMarkIndex : int;   // reference index used to compute colCoord
        var colCoord : int;        // column of the most recently pushed token
        var lnCoord : int;         // number of line breaks seen so far

        function Scanner (src:String, origin:String)
        {
            this.src = src;
            this.origin = origin;
            this.curIndex = 0;
            this.markIndex = 0;
            this.lastMarkIndex = 0;
            this.colCoord = 0;
            this.lnCoord = 0;
        }

        /**
         * Returns the next character code and advances the cursor.
         * At or beyond the end of the text it returns Char.EOS. The cursor is
         * still advanced in that case so that every next() can be undone by
         * exactly one retract().
         */
        function next () : int
        {
            // ">=" rather than "==": once EOS has been returned without a
            // matching retract() the cursor sits past the end, and
            // charCodeAt() would yield NaN there instead of EOS.
            if (curIndex >= src.length) {
                curIndex++;
                return Char.EOS;
            }
            return src.charCodeAt(curIndex++);
        }

        /** Text between the mark and the cursor. */
        function lexeme() : String
        {
            return src.slice (markIndex,curIndex)
        }

        /** Un-reads the character most recently returned by next(). */
        function retract() : void
        {
            curIndex--;
        }

        /** Records the cursor position as the start of the next token. */
        function mark () : void
        {
            markIndex = curIndex;
        }

        /**
         * Bookkeeping for a consumed line break: bumps the line counter and
         * re-bases column tracking so that the next pushed token gets a
         * column measured from the first character after the line break.
         * Must be called with the cursor just past the line break.
         */
        function beginLine () : void
        {
            lnCoord++;
            colCoord = 0;
            lastMarkIndex = curIndex;
        }

        /**
         * Scans tokens until BREAK, EOS or ERROR is produced.
         *
         * @param lexPrefix function used to scan the first token (for example
         *                  start or regexp); every later token is scanned
         *                  with start().
         * @return a two-element array [tokenArray, coordList]: tokenArray is a
         *         ByteArray holding one int per token, coordList holds one
         *         [line, column] pair per token. Eol tokens only update the
         *         line/column state and are not written to either list.
         */
        function tokenList (lexPrefix)  // : [[int],[[int,int]]]
        {
            import flash.utils.*;

            function pushToken (token)
            {
                if (token == Token.Eol) {
                    // Previously lastMarkIndex was left pointing at the last
                    // token of the previous line, so the first column of the
                    // new line was measured across the line break.
                    beginLine ();
                }
                else {
                    colCoord = colCoord + markIndex - lastMarkIndex;
                    coordList.push ([lnCoord,colCoord]);
                    tokenArray.writeInt (token);
                    lastMarkIndex = markIndex;
                }
            }

            var tokenArray = new ByteArray;
            var coordList = new Array;

            var token = lexPrefix ();
            pushToken (token);
            while (token != Token.BREAK &&
                   token != Token.EOS &&
                   token != Token.ERROR)
            {
                token = start ();
                pushToken (token);
            }
            return [tokenArray,coordList];
        }

        /**
         * Scans the body of a regular expression literal up to its closing
         * slash, then the flags. Throws if the input ends first.
         */
        function regexp ()
        {
            // Iterative: one stack frame per character overflowed on long literals.
            while (true) {
                switch (next ()) {
                case Char.Slash :
                    return regexpFlags ();
                case Char.EOS :
                    throw "unexpected end of program in regexp literal";
                case Char.BackSlash :
                    // The escaped character (notably "\/") is part of the
                    // body and must not terminate the literal.
                    if (next () == Char.EOS) {
                        throw "unexpected end of program in regexp literal";
                    }
                    break;
                default:
                    break;
                }
            }
        }

        /** Consumes trailing regexp flag characters and builds the RegexpLiteral token. */
        function regexpFlags ()
        {
            var c /*: int*/ = next ();
            // EOS is excluded explicitly, as identifier() does, so that a
            // literal at the very end of the input terminates.
            while (c != Char.EOS && Char.isIdentifierPart (c)) {
                c = next ();
            }
            retract ();
            return Token.makeInstance (Token.RegexpLiteral,lexeme());
        }

        /**
         * Scans one token starting at the cursor and returns its code.
         * Spaces and tabs are skipped; comments are skipped via slash().
         * Throws a string if the character cannot start any token.
         */
        function start () : int
        {
            var c /*: int*/;
            while (true) {
                mark();
                c = next();
                switch (c) {
                // case 0xffffffef: return utf8sig ();
                case Char.EOS: return Token.EOS;
                case Char.Slash: return slash ();
                case Char.Newline: return Token.Eol;
                case Char.CarriageReturn:
                    // Treat CR LF as a single line break so it is counted once.
                    if (next () != Char.Newline) {
                        retract ();
                    }
                    return Token.Eol;
                case Char.Space:
                case Char.Tab:
                    // Loop instead of recursing: long whitespace runs no
                    // longer consume one stack frame per character.
                    continue;
                case Char.LeftParen: return Token.LeftParen;
                case Char.RightParen: return Token.RightParen;
                case Char.Comma: return Token.Comma;
                case Char.Semicolon: return Token.SemiColon;
                case Char.QuestionMark: return Token.QuestionMark;
                case Char.LeftBracket: return Token.LeftBracket;
                case Char.RightBracket: return Token.RightBracket;
                case Char.LeftBrace: return Token.LeftBrace;
                case Char.RightBrace: return Token.RightBrace;
                case Char.Tilde: return Token.BitwiseNot;
                case Char.At: return Token.At;
                case Char.SingleQuote: return stringLiteral (c);
                case Char.DoubleQuote: return stringLiteral (c);
                case Char.Dot: return dot ();
                case Char.Dash: return minus ();
                case Char.Bang: return not ();
                case Char.Percent: return remainder ();
                case Char.Ampersand: return and ();
                case Char.Asterisk: return mult ();
                case Char.Colon: return colon ();
                case Char.Caret: return bitwiseXor ();
                case Char.Bar: return bitwiseOr ();
                case Char.Plus: return plus ();
                case Char.LeftAngle: return leftAngle ();
                case Char.Equal: return equal ();
                case Char.RightAngle: return rightAngle ();
                case Char.b: return b_ ();
                case Char.c: return identifier ("c");
                case Char.d: return d_ ();
                case Char.e: return identifier ("e");
                case Char.f: return identifier ("f");
                case Char.g: return identifier ("g");
                case Char.i: return identifier ("i");
                case Char.n: return n_ ();
                case Char.o: return identifier ("o");
                case Char.p: return identifier ("p");
                case Char.r: return identifier ("r");
                case Char.s: return identifier ("s");
                case Char.t: return identifier ("t");
                case Char.u: return identifier ("u");
                case Char.v: return identifier ("v");
                case Char.w: return identifier ("w");
                case Char.BackSlash:
                    c = escapeSequence ();
                    return identifier (String.fromCharCode(c));
                case Char.Zero: return zero ();
                case Char.One:
                case Char.Two:
                case Char.Three:
                case Char.Four:
                case Char.Five:
                case Char.Six:
                case Char.Seven:
                case Char.Eight:
                case Char.Nine:
                    return decimalInteger ();
                default:
                    if (Char.isIdentifierStart (c)) {
                        return identifier (String.fromCharCode(c));
                    }
                    else {
                        trace ("prefix=",c);
                        // Was `throw "...", c;` - the comma operator made the
                        // thrown value the bare character code.
                        throw "scanning with invalid prefix " + c;
                    }
                }
            }
            Debug.assert(false);
            return int.MIN_VALUE;   // never executed, but needed by as3 compiler.
        }

        /** Scans a numeric literal whose first digit is 0. */
        function zero () : int
        {
            var c /*: int*/ = next ();
            switch (c) {
            case Char.x:
            case Char.X:
                return hexLiteral ();
            case Char.Zero:
            case Char.One:
            case Char.Two:
            case Char.Three:
            case Char.Four:
            case Char.Five:
            case Char.Six:
            case Char.Seven:
                return octalLiteral ();
            case Char.Dot:
                // The dot has been consumed, so continue with the fraction
                // digits. Going back to decimalInteger() allowed a second dot
                // ("0.1.2") inside one literal.
                return decimalFraction ();
            case Char.Eight:  // what do we do with these?
            case Char.Nine:
            default :
                retract ();
                return numberSuffix ();
            }
        }

        /** Consumes hexadecimal digits, then the optional number suffix. */
        function hexLiteral () : int
        {
            var c /*: int*/ = next ();
            switch (c) {
            case Char.Zero:
            case Char.One:
            case Char.Two:
            case Char.Three:
            case Char.Four:
            case Char.Five:
            case Char.Six:
            case Char.Seven:
            case Char.Eight:
            case Char.Nine:
            case Char.a: case Char.A:
            case Char.b: case Char.B:
            case Char.c: case Char.C:
            case Char.d: case Char.D:
            case Char.e: case Char.E:
            case Char.f: case Char.F:
                return hexLiteral ();
            default:
                retract ();
                return numberSuffix ();
            }
        }

        /** Consumes octal digits, then the optional number suffix. */
        function octalLiteral () : int
        {
            var c /*: int*/ = next ();
            switch (c) {
            case Char.Zero:
            case Char.One:
            case Char.Two:
            case Char.Three:
            case Char.Four:
            case Char.Five:
            case Char.Six:
            case Char.Seven:
                return octalLiteral ();
            case Char.Eight:  // what do we do with these?
            case Char.Nine:
            default:
                retract ();
                return numberSuffix ();
            }
        }

        /** Consumes the integer part of a decimal literal. */
        function decimalInteger () : int
        {
            var c /*: int*/ = next ();
            switch (c) {
            case Char.Zero:
            case Char.One:
            case Char.Two:
            case Char.Three:
            case Char.Four:
            case Char.Five:
            case Char.Six:
            case Char.Seven:
            case Char.Eight:
            case Char.Nine:
                return decimalInteger ();
            case Char.Dot:
                return decimalFraction ();
            case Char.e: case Char.E:
                // Accept an optional sign here too, as decimalFraction()
                // does; otherwise "1e+3" was split into three tokens.
                return decimalExponentSign ();
            default:
                retract ();
                return numberSuffix ();
            }
        }

        /** Consumes the digits after the decimal point. */
        function decimalFraction () : int
        {
            var c /*: int*/ = next ();
            switch (c) {
            case Char.Zero:
            case Char.One:
            case Char.Two:
            case Char.Three:
            case Char.Four:
            case Char.Five:
            case Char.Six:
            case Char.Seven:
            case Char.Eight:
            case Char.Nine:
                return decimalFraction ();
            case Char.e: case Char.E:
                return decimalExponentSign ();
            default:
                retract ();
                return numberSuffix ();
            }
        }

        /**
         * Consumes the optional sign that may follow the exponent marker and
         * continues with the exponent digits.
         */
        function decimalExponentSign () : int
        {
            switch (next()) {
            case Char.Plus:
            // NOTE(review): start() dispatches "-" through Char.Dash while
            // this label uses Char.Minus; Char is not visible from this
            // file - confirm both constants exist and are equal.
            case Char.Minus:
                return decimalExponent ();
            default:
                retract ();
                return decimalExponent ();
            }
        }

        /** Consumes exponent digits, then the optional number suffix. */
        function decimalExponent () : int
        {
            var c /*: int*/ = next ();
            switch (c) {
            case Char.Zero:
            case Char.One:
            case Char.Two:
            case Char.Three:
            case Char.Four:
            case Char.Five:
            case Char.Six:
            case Char.Seven:
            case Char.Eight:
            case Char.Nine:
                return decimalExponent ();
            default:
                retract ();
                return numberSuffix ();
            }
        }

        /**
         * Finishes a numeric literal. A trailing i, u, d or m is consumed
         * (and therefore included in the lexeme) and selects the explicit
         * literal kind; anything else is left unread.
         */
        function numberSuffix () : int
        {
            var c /*: int*/ = next ();
            switch (c) {
            case Char.i:
                return Token.makeInstance (Token.ExplicitIntLiteral, lexeme ());
            case Char.u:
                return Token.makeInstance (Token.ExplicitUIntLiteral, lexeme ());
            case Char.d:
                return Token.makeInstance (Token.ExplicitDoubleLiteral, lexeme ());
            case Char.m:
                return Token.makeInstance (Token.ExplicitDecimalLiteral, lexeme ());
            default:
                retract ();
                return Token.makeInstance (Token.DecimalLiteral, lexeme ());
            }
        }

        /**
         * Called after a "/" has been read. Comments are skipped and scanning
         * resumes; otherwise Token.BREAK is returned with the cursor just
         * after the slash, which ends tokenList().
         */
        function slash () : int
        {
            var c /*: int*/ = next ();
            switch (c) {
            case Char.Slash:
                lineComment ();
                return start ();
            case Char.Asterisk:
                blockComment ();
                return start ();
            default:
                retract ();
                return Token.BREAK;
            }
        }

        /** Skips to the end of the line, leaving the line break unread. */
        function lineComment () : void
        {
            // Iterative: recursing once per character overflowed the stack
            // on long comment lines.
            while (true) {
                switch (next ()) {
                case Char.Newline:
                case Char.CarriageReturn:
                case Char.EOS:
                    retract ();  // leave newline for asi
                    return;
                default:
                    break;
                }
            }
        }

        /**
         * Skips a block comment, counting the newlines inside it. Returns
         * silently at end of input if the comment is unterminated.
         */
        function blockComment () : void
        {
            var c /*: int*/ = next ();
            while (true) {
                switch (c) {
                case Char.Asterisk :
                    switch (next()) {
                    case Char.Slash:
                        return;
                    case Char.EOS :
                        retract ();
                        return;
                    case Char.Asterisk:
                        retract ();  // leave in case next char is a slash
                        break;
                    case Char.Newline:
                        beginLine ();
                        break;
                    default:
                        break;
                    }
                    break;
                case Char.EOS :
                    retract ();
                    return;
                case Char.Newline:
                    // Same bookkeeping as the newline-after-asterisk case
                    // above; previously only the line counter was updated here.
                    beginLine ();
                    break;
                default :
                    break;
                }
                c = next ();
            }
        }

        /**
         * Scans a string literal whose opening quote has been consumed.
         *
         * @param delimiter character code of the opening quote
         * @param text      text accumulated so far
         * @return a StringLiteral token whose lexeme is the delimiter
         *         followed by the decoded text
         * Throws a string if the input ends before the closing quote.
         */
        function stringLiteral (delimiter, text="") : int
        {
            var c /*: int*/ = next ();
            while (c != Char.EOS) {
                switch (c) {
                case delimiter:
                    // encode delimiter in string lexeme by prepending it to text
                    return Token.makeInstance (Token.StringLiteral, String.fromCharCode(delimiter)+text);
                case Char.BackSlash:
                    c = escapeSequence ();
                    text = text+String.fromCharCode(c);
                    break;
                default:
                    text = text+String.fromCharCode(c);
                    break;
                }
                c = next ();
            }
            throw "unterminated string literal: " + text;
        }

        /**
         * Decodes the escape that follows a consumed backslash and returns
         * the resulting character code. Throws a string for an unknown escape.
         */
        function escapeSequence () : int
        {
            var c /*: int*/ = next ();
            switch (c) {
            case Char.Zero:
            case Char.One:
            case Char.Two:
            case Char.Three:
            case Char.Four:
            case Char.Five:
            case Char.Six:
            case Char.Seven:
                retract ();
                return octalOrNulEscape (0);
            case Char.x:
                return hexEscape (2);
            case Char.u:
                return hexEscape (4);
            case Char.b:
                return Char.Backspace;
            case Char.f:
                return Char.Formfeed;
            case Char.n:
                return Char.Newline;
            case Char.r:
                return Char.CarriageReturn;
            case Char.t:
                return Char.Tab;
            case Char.v:
                return Char.VerticalTab;
            case Char.SingleQuote:
            case Char.DoubleQuote:
            case Char.BackSlash:
                return c;
            default:
                throw "lexer error escapeSequence " + c;
            }
        }

        /**
         * Decodes an escape that starts with an octal digit. The digit itself
         * is still unread when this is called.
         */
        function octalOrNulEscape (n:int) : uint
        {
            var c /*: int*/ = next ();
            switch (c) {
            case Char.Zero:
                switch (next()) {
                case Char.One:
                case Char.Two:
                case Char.Three:
                case Char.Four:
                case Char.Five:
                case Char.Six:
                case Char.Seven:
                    retract ();
                    return octalEscapeFull (n+1);
                default:
                    // The look-ahead character is not part of the escape.
                    // Without this retract it was silently dropped, so "\0"
                    // swallowed the closing quote of its string.
                    retract ();
                    return 0;  // \0
                }
            case Char.One:
            case Char.Two:
            case Char.Three:
                return octalEscapeFull (n+1);
            case Char.Four:
            case Char.Five:
            case Char.Six:
            case Char.Seven:
                return octalEscapeShort (n+1);
            default:
                throw "internal error: expecting octal character";
            }
        }

        // NOTE(review): the text of this file is damaged from the middle of
        // octalEscapeFull() up to the comment that preceded div(). Everything
        // in between is missing from this copy, including definitions that
        // are called elsewhere in the class (octalEscapeShort, hexEscape,
        // dot, minus, not, remainder, and, mult, plus, leftAngle, equal).
        // The damaged text is kept verbatim below; restore it from a clean
        // copy before compiling.
        function octalEscapeFull (n:int) : uint
        {
            if (n==3) {
                for (var i=0; i */

        function div () : int
        {
            var c /*: int*/ = next ();
            switch (c) {
            case Char.Equal : return Token.DivAssign;
            case Char.RightAngle : return Token.XmlTagEndEnd;
            default :
                retract ();
                return Token.Div;
            }
        }

        /*
            >  >=  >>  >>=  >>>  >>>=

            NOTE(review): this comment was damaged in this copy of the file
            and a leftAngle() definition that presumably preceded it is missing.
        */

        function rightAngle () : int
        {
            var c /*: int*/ = next ();
            switch (c) {
            case Char.Equal : return Token.GreaterThanOrEqual;
            case Char.RightAngle : return rightShift ();
            default :
                retract ();
                return Token.GreaterThan;
            }
        }

        function rightShift () : int
        {
            var c /*: int*/ = next ();
            switch (c) {
            case Char.Equal : return Token.RightShiftAssign;
            case Char.RightAngle : return unsignedRightShift ();
            default :
                retract ();
                return Token.RightShift;
            }
        }

        function unsignedRightShift () : int
        {
            var c /*: int*/ = next ();
            switch (c) {
            case Char.Equal : return Token.UnsignedRightShiftAssign;
            default :
                retract ();
                return Token.UnsignedRightShift;
            }
        }

        /*
            ^ ^=
        */

        function bitwiseXor () : int
        {
            var c /*: int*/ = next ();
            switch (c) {
            case Char.Equal : return Token.BitwiseXorAssign;
            default :
                retract ();
                return Token.BitwiseXor;
            }
        }

        /*
            | |= || ||=
        */

        function bitwiseOr () : int
        {
            var c /*: int*/ = next ();
            switch (c) {
            case Char.Equal : return Token.BitwiseOrAssign;
            case Char.Bar : return logicalOr ();
            default :
                retract ();
                return Token.BitwiseOr;
            }
        }

        function logicalOr () : int
        {
            var c /*: int*/ = next ();
            switch (c) {
            case Char.Equal : return Token.LogicalOrAssign;
            default :
                retract ();
                return Token.LogicalOr;
            }
        }

        /*
            : ::
        */

        function colon () : int
        {
            var c /*: int*/ = next ();
            switch (c) {
            case Char.Colon : return Token.DoubleColon;
            default :
                retract ();
                return Token.Colon;
            }
        }

        /*
            identifier
        */

        /**
         * Consumes the remaining characters of an identifier.
         *
         * @param str the identifier text consumed so far
         * @return the result of Token.maybeReservedIdentifier for the full text
         */
        function identifier (str:String) : int
        {
            // Iterative rather than one recursive call per character.
            while (true) {
                var c /*: int*/ = next ();
                switch (c) {
                case Char.a : case Char.b : case Char.c : case Char.d :
                case Char.e : case Char.f : case Char.g : case Char.h :
                case Char.i : case Char.j : case Char.k : case Char.l :
                case Char.m : case Char.n : case Char.o : case Char.p :
                case Char.q : case Char.r : case Char.s : case Char.t :
                case Char.u : case Char.v : case Char.w : case Char.x :
                case Char.y : case Char.z :
                case Char.A : case Char.B : case Char.C : case Char.D :
                case Char.E : case Char.F : case Char.G : case Char.H :
                case Char.I : case Char.J : case Char.K : case Char.L :
                case Char.M : case Char.N : case Char.O : case Char.P :
                case Char.Q : case Char.R : case Char.S : case Char.T :
                case Char.U : case Char.V : case Char.W : case Char.X :
                case Char.Y : case Char.Z :
                    str = str+String.fromCharCode(c);
                    break;
                case Char.BackSlash:
                    c = escapeSequence ();
                    str = str+String.fromCharCode(c);
                    break;
                default:
                    if (Char.isIdentifierPart (c) && c != Char.EOS) {
                        str = str+String.fromCharCode(c);
                        break;
                    }
                    retract ();
                    return Token.maybeReservedIdentifier (str);
                }
            }
            Debug.assert(false);
            return int.MIN_VALUE;   // never executed, but needed by as3 compiler.
        }

        function b_ () : int
        {
            var c /*: int*/ = next ();
            switch (c) {
            case Char.r:
                return br_ ();
            default:
                retract ();
                return identifier ("b");
            }
        }

        function br_ () : int
        {
            var c /*: int*/ = next ();
            switch (c) {
            case Char.e :
                return identifier ("bre");
            default:
                retract ();
                return identifier ("br");
            }
        }

        function d_ () : int
        {
            var c /*: int*/ = next ();
            switch (c) {
            case Char.e :
                return identifier ("de");
            default:
                retract ();
                return identifier ("d");
            }
        }

        function n_ () : int
        {
            var c /*: int*/ = next();
            switch (c) {
            case Char.a :
                return identifier ("na");
            case Char.e :
                return identifier ("ne");
            case Char.u :
                return nu_ ();
            default:
                retract ();
                return identifier ("n");
            }
        }

        function nu_ () : int
        {
            var c /*: int*/ = next ();
            switch (c) {
            case Char.l :
                return nul_ ();
            default:
                retract ();
                return identifier ("nu");
            }
        }

        function nul_ () : int
        {
            var c /*: int*/ = next ();
            switch (c) {
            case Char.l :
                return null_ ();
            default:
                retract ();
                return identifier ("nul");
            }
        }

        /** "null" has been read: it is the Null token unless the identifier continues. */
        function null_ () : int
        {
            var c /*: int*/ = next ();
            // EOS is excluded explicitly, as identifier() does, so that
            // "null" at the very end of the input is still recognised.
            if (c != Char.EOS && Char.isIdentifierPart (c)) {
                return identifier ("null"+String.fromCharCode(c));
            }
            else {
                retract();
                return Token.Null;
            }
        }
    }
}

import com.hurlant.eval.parse.Scanner;
import com.hurlant.eval.parse.Token;

// NOTE(review): the end of test() is missing from this copy of the file (the
// text stops inside the inner for loop), and the operator test strings below
// appear to have lost some of their contents. Both are kept verbatim.
function test()
{
    trace ("testing lex-scan.es");
    var testCases = [ "break case catch continue default delete do else enum extends"
                    , "false finally for function if in instanceof new null return"
                    , "super switch this throw true try typeof var void while with"
                    , "call cast const decimal double dynamic each eval final get has"
                    , "implements import int interface internal intrinsic is let namespace"
                    , "native Number override package precision private protected prototype public"
                    , "rounding standard strict static to type uint undefined use xml yield"
                    , ". .< .. ... ! != !== % %= & && &&= * *= + +- ++ - -- -="
                    , "/ /= /> < <= >= >> >>= >>> >>>="
                    , "^ ^= | |= || ||= : :: ( ) [ ] { } ~ @ , ; ?"
                    , "/* hello nobody */ hello // goodbye world"
                    , "0 0i 00 001u 0123d 045m 0x0 0xCAFEBABE 0x12345678u 1. .0 .2e+3 1.23m"
                    // , "\\u0050 \\x50gh \\073 \\73 \\073123 \\7398"
                    , "/abc/ 'hi' \"bye\" null break /def/xyz"
                    ];

    for (var i = 0; i < testCases.length; ++i) {
        var scan = new Scanner (testCases[i],"test"+i);
        var tmp = scan.tokenList (scan.start);
        var tokens=tmp[0], coords=tmp[1]
        trace ("tokens ", tokens);
        trace ("coords ", coords);
        for (var j=0; j