/**
    Copyright 2010 Steve Hanov

    This file is part of qb.js

    qb.js is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    qb.js is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with qb.js.  If not, see <http://www.gnu.org/licenses/>.
*/    
//#include <debug.js>
var NextStateId = 0;

var POST_NEWLINE = -1;
var PRE_NEWLINE = -2;
var DIGIT_CHAR = -3;
var ANY_CHAR = -4;

/**
  Represents a location in the source file. (The name "location" cannot be used
  because it has a special meaning in browsers.) This is used throughout the
  compiler to map program statements to token positions.

  @constructor 
 */
function Locus( line, position )
{
    this.line = line;
    this.position = position;
}

Locus.prototype = 
{
    toString: function()
    {
        return "" + (this.line + 1) + ":" + (this.position + 1 );
    }
};

/** 
  When the match function is called, it will return true if the argument
  matches a particular character.

  @constructor 
 */
function CharMatcher( ch)
{
    this.mchar = ch;
}

CharMatcher.prototype = {
    match: function( ch ) 
    {
        //dbg.printf("Compare %s with %s\n", this.mchar, ch );
        if ( this.mchar == DIGIT_CHAR ) {
            return ch >= '0' && ch <= '9';
        } else if ( this.mchar == ANY_CHAR ) {
            return ch != POST_NEWLINE && ch != PRE_NEWLINE && ch != '\n';
        } else {
            return ch == this.mchar;
        }
    },

    toString: function()
    {
        if ( this.mchar == DIGIT_CHAR ) {
            return "\\d";
        } else {
            return this.mchar;
        }
    }
};

/** 
  When the match function is called, it will return true if the argument
  matches a particular character range.

  @constructor 
 */
function RangeMatcher( ranges, include )
{
    // list of [ start, end ]
    this.ranges = ranges;
    this.include = include; // boolean
}

RangeMatcher.prototype = {
    match: function( ch ) 
    {
        for ( var i = 0; i < this.ranges.length; i++ ) {
            var range = this.ranges[i];
            if ( ch >= range[0] && ch <= range[1] ) {
                return this.include;
            }
        }

        return !this.include;
    },

    toString: function()
    {
        var str = "[";
        if ( !this.include ) {
            str += "^";
        }
        for( var i = 0; i < this.ranges.length; i++ ) {
            if ( this.ranges[i][0] == this.ranges[i][1] ) {
                str += this.ranges[i][0];
            } else {
                str += this.ranges[i][0] + "-" + this.ranges[i][1];
            }
        }
        return str + "]";
    }
};

/** @constructor */
function NfaState(charMatcher)
{
    this.mchar = charMatcher;
    this.next = []; 
    this.id = NextStateId++;
    this.lastList = 0;
    this.accept = undefined;
}

/** @constructor */
function DfaState()
{
    this.nfaStates = [];
    this.next = {};
    this.accepts = [];
    this.id = NextStateId++;
}

NfaState.prototype.toString = function()
{
    var str = "\nState [" + this.id + "] ch=" + this.mchar + "\n";
    if ( this.accept !== undefined ) {
        str += "    Accept " + this.accept + "\n";
    }
    for ( var i = 0; i < this.next.length; i++ ) {
        str += "    ch=" + this.next[i].mchar + " goto [" + this.next[i].id +
            "]\n";
    }
    return str;
};

/** @constructor */
function NFA( start, end )
{
    this.start = start;
    this.end = end;
}

NFA.prototype.toString = function()
{
   var processed = {};
   var stack = [ this.start ];
   var str = "";

   while( stack.length > 0 ) {
        var state = stack.pop();
        if ( processed[state] ) {
            continue;
        }
        processed[state] = 1;

        for( var i = 0; i < state.next.length; i++ ) {
            stack.push( state.next[i] );
        }
        str += state.toString();
   }
   return str;
};

/** @constructor */
function Token( id, text, line, position )
{
    this.id = id;
    this.text = text;
    this.locus = new Locus( line, position );
}

Token.prototype = 
{
    toString: function() 
    {
        return "Token(" + this.text + ")";
    }
    
};

/** @constructor */
function Tokenizer()
{
    this.root = new NfaState(null);
    this.expr = null;
    this.index = 0;
    this.listId = 1;
    this.dfaCache = {};

    // text to tokenize.
    this.text = "";

    // for each line, the character position of that line in the text.
    this.lineNumbers = [];

    // users can redefine these if they want.
    this.EOF_TOKEN = {};
    this.IGNORE_TOKEN = {};

    // check this to determine if we have reached the end of the text.
    this.finished = true;
}

Tokenizer.prototype = {

    addToken: function( id, expr )
    {
        this.expr = expr;
        this.index = 0;
        var nfa = this.parseAlternation();
        this.root.next.push( nfa.start );
        nfa.end.accept = id;
    },

    ignore: function( expr )
    {
        this.addToken( this.IGNORE_TOKEN, expr );
    },

    eof: function()
    {
        return this.index == this.expr.length;
    },

    matchChar: function( ch )
    {
        if ( this.expr[this.index] == ch ) {
            this.index++;
            return true;
        }
        return false;
    },

    peek: function( ch ) 
    {
        return this.expr[this.index] == ch;
    },

    parseChar: function()
    {
        if ( this.matchChar( '\\' ) ) {
            if ( this.matchChar( 'n' ) ) {
                return '\n';
            } else if ( this.matchChar( 'r' ) ) {
                return '\r';
            } else if ( this.matchChar( 't' ) ) {
                return '\t';
            } else if ( this.matchChar( 'd' ) ) {
                return DIGIT_CHAR;
            } else {
                return this.expr[this.index++];
            }
        } else if ( this.matchChar( '.' ) ) {
            return ANY_CHAR;
        } else if ( this.matchChar( '$' ) ) {
            return PRE_NEWLINE;
        } else if ( this.matchChar( '^' ) ) {
            return POST_NEWLINE;
        } else {
            return this.expr[this.index++];
        }
    },

    parseRange: function()
    {
        var include = true;
        var ranges = [];

        while( !this.eof() && !this.peek(']' ) ) {
            if ( this.matchChar( '^' ) ) {
                include = false;
            }
            var first = this.parseChar();
            var last = first;
            if ( this.matchChar( '-' ) ) {
                last = this.parseChar();
            }

            if ( first == DIGIT_CHAR ) {
                first = '0';
                last = '9';
            }

            //dbg.printf("Pushing range %s..%s\n", first, last );
            ranges.push( [ first, last ] );
        }

        var state = new NfaState( new RangeMatcher( ranges, include ) );
        return new NFA( state, state );
    },

    parseBasic: function()
    {
        var nfa;

        if ( this.matchChar( '(' ) ) {
            nfa = this.parseAlternation();
            if ( !this.matchChar( ')' ) ) {
                throw "Expected ')'";
            }
        } else if ( this.matchChar( '[' ) ) {
            //dbg.printf("Encountered RANGE!\n");
            nfa = this.parseRange();
            if ( !this.matchChar( ']' ) ) {
                throw "Expected ']'";
            }
        } else {
            var state = new NfaState( new CharMatcher( this.parseChar() ) );
            nfa = new NFA( state, state );
        }

        return nfa;
    },

    parseKleene: function()
    {
        var nfa = this.parseBasic();
        var splitter;
        if ( this.matchChar( "+" ) ) {
            splitter = new NfaState( null );
            nfa.end.next.push( splitter );
            splitter.next.push( nfa.start );
            nfa.end = splitter;
        } else if ( this.matchChar( "*" ) ) {
            splitter = new NfaState(null);
            splitter.next.push( nfa.start );
            nfa.end.next.push( splitter );
            nfa.start = splitter;
            nfa.end = splitter;
        } else if ( this.matchChar( "?" ) ) {
            var start = new NfaState( null );
            var end = new NfaState( null );
            start.next.push( nfa.start );
            start.next.push( end );
            nfa.end.next.push( end );
            nfa.start = start;
            nfa.end = end;
        }

        return nfa;
    },

    parseConcat: function()
    {
        var start = new NfaState(null);
        var end = start;
        for( ;; ) {
            if ( this.peek( "|" ) || this.peek( ")" ) || this.eof() ) {
                break;
            }
            var nfa = this.parseKleene();
            end.next.push( nfa.start );
            end = nfa.end;
        }
        return new NFA( start, end );
    },

    parseAlternation: function()
    {
        var start = new NfaState(null);
        var end = new NfaState(null);
        do {
            var nfa = this.parseConcat();
            start.next.push( nfa.start );
            nfa.end.next.push( end );
        } while( this.matchChar( "|" ) );

        return new NFA(start,end);
    },

    addState: function( nfaStateList, accepts, nfaState )
    {
        if ( nfaState.lastList == this.listId ) {
            //dbg.printf("Skip adding nfa State [%d]\n", nfaState.id );
            return;
        }

        //dbg.printf("Add NFA state [%d]\n", nfaState.id );
        if ( nfaState.accept !== undefined ) {
            accepts.push( nfaState.accept );
        }

        nfaState.lastList = this.listId;
        nfaStateList.push( nfaState );

        if ( nfaState.mchar === null ) {
            for ( var i = 0; i < nfaState.next.length; i++ ) {
                this.addState( nfaStateList, accepts, nfaState.next[i] );
            }
        }
    },

    nextState: function( dfaState, ch )
    {
        var nfaStateList = [];
        var accepts = [];
        var i;
        //dbg.printf("Transition from DFA[%d] on ch=%s\n", dfaState.id, ch );

        this.listId++;

        for ( i = 0; i < dfaState.nfaStates.length; i++ ) {
            var nfaState = dfaState.nfaStates[i];
            if ( nfaState.mchar !== null ) {
                if ( nfaState.mchar.match( ch ) ) {
                    this.addState( nfaStateList, accepts, nfaState.next[0] );
                } else if ( ch == PRE_NEWLINE || ch == POST_NEWLINE ) {
                    this.addState( nfaStateList, accepts, nfaState );
                }
            }
        }

        nfaStateList.sort( function( a, b ) { return a.id - b.id; } );

        var key = "";
        for ( i = 0; i < nfaStateList.length; i++ ) {
            key += nfaStateList[i].id + ".";
        }

        if ( this.dfaCache[key] === undefined ) {
            dfaState = new DfaState();
            //dbg.printf("Created DFA state [%d] accepts=%s\n", dfaState.id, accepts );
            dfaState.nfaStates = nfaStateList;
            dfaState.accepts = accepts;
            this.dfaCache[key] = dfaState;
        } else {
            //dbg.printf("Returning cached DFA state [%d] accepts=%s\n",
            //        this.dfaCache[key].id, this.dfaCache[key].accepts );
        }

        return this.dfaCache[key];
    },

    setText: function(text)
    {
        this.text = text;
        this.lineNumbers.length = 0;
        this.lineNumbers.push( 0 );
        this.finished = false;

        for ( var i = 0; i < this.text.length; i++ ) {
            if ( this.text[i] == '\n' ) {
                this.lineNumbers.push( i + 1 );
            }
        }
    },

    getLine: function( lineno ) 
    {
        return this.text.substr( this.lineNumbers[lineno],
            this.lineNumbers[lineno+1] - this.lineNumbers[lineno]);
    },

    /**
        Retrieve a list of tokens that match at a given position. The list is
        returned sorted in order of length.

        @param text Text to match.
        @param line Line number to begin matching, starting from 0
        @param position Character position on the line to begin matching.
    */
    nextTokenInternal: function( line, position )
    {
        var last = 0;
        var i;
        var accept = null;

        if ( this.rootDfa === undefined ) {
            this.rootDfa = new DfaState();
            this.addState( this.rootDfa.nfaStates, this.rootDfa.accepts, 
                this.root );
        }

        var dfaState = this.rootDfa;

        var startPosition = this.lineNumbers[line] + position;
        //dbg.printf("Start match from %d:%d\n", line, position );

        if ( startPosition == this.text.length ) {
            this.finished = true;
            return new Token( this.EOF_TOKEN, "!EOF", line, position );
        }

        if ( startPosition > 0 ) {
            last = this.text[startPosition-1];
        }

        for( i = startPosition; i < this.text.length; i++ ) {
            //dbg.printf("Enter DFA state %d\n", dfaState.id );
            var ch = this.text[i];

            if ( ch === '\n' && last != PRE_NEWLINE ) {
                ch = PRE_NEWLINE;
                i--;
            } else if ( last === '\n' || last === 0 ) {
                ch = POST_NEWLINE;
                i--;
            }

            if ( last === '\n' ) {
                line++;
            }
            last = ch;

            if ( dfaState.next[ch] === undefined ) {
                dfaState.next[ch] = this.nextState( dfaState, ch );
            }
            dfaState = dfaState.next[ch];

            if ( dfaState.accepts.length ) {
                //dbg.printf("Would accept %s\n", dfaState.accepts[0] );
                //dbg.printf("i:%d line:%d lineNumbers=%d\n",
                //    i, line, this.lineNumbers[line] );
                accept = new Token( 
                        dfaState.accepts[0],
                        this.text.substr(startPosition, i - startPosition + 1),
                        line, 
                        startPosition - this.lineNumbers[line]);
            }

            if ( dfaState.nfaStates.length === 0 ) {
                break;
            }
        }

        if ( accept ) {
            //dbg.printf("Returning match id=%s at %d:%d text=%s\n", accept.id,
            //    accept.locus.line, accept.locus.position, accept.text );
        } else if ( 0 ) {
            dbg.printf("Bad token at '%s'\n", this.text.substr(startPosition,
            10));
            dbg.printf("ascii %d\n", this.text.charCodeAt( startPosition ));
        }

        return accept;
    },

    nextToken: function( line, position )
    {
        for( ;; ) {
            var token = this.nextTokenInternal( line, position );
            if ( token === null || token.id !== this.IGNORE_TOKEN ) {
                return token;
            }
            line = token.locus.line;
            position = token.locus.position + token.text.length;
        }
    }
};