/*
 * Tokenizer.cs
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 3
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307, USA.
 *
 * Copyright (c) 2003-2009 Per Cederberg. All rights reserved.
 */

using System;
using System.Collections;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using PerCederberg.Grammatica.Runtime.RE;

namespace PerCederberg.Grammatica.Runtime {

    /**
     * A character stream tokenizer. This class groups the characters read
     * from the stream together into tokens ("words"). The grouping is
     * controlled by token patterns that contain either a fixed string to
     * search for, or a regular expression. If the stream of characters
     * don't match any of the token patterns, a parse exception is thrown.
     *
     * @author   Per Cederberg, <per at percederberg dot net>
     * @version  1.5
     */
    public class Tokenizer {

        /**
         * The token list feature flag.
         */
        private bool useTokenList = false;

        /**
         * The string DFA token matcher. This token matcher uses a
         * deterministic finite automaton (DFA) implementation and is
         * used for all string token patterns. It has a slight speed
         * advantage to the NFA implementation, but should be equivalent
         * on memory usage.
         */
        private StringDFAMatcher stringDfaMatcher;

        /**
         * The regular expression NFA token matcher. This token matcher
         * uses a non-deterministic finite automaton (DFA) implementation
         * and is used for most regular expression token patterns. It is
         * somewhat faster than the other recursive regular expression
         * implementations available, but doesn't support the full
         * syntax. It conserves memory by using a fast queue instead of
         * the stack during processing (no stack overflow).
         */
        private NFAMatcher nfaMatcher;

        /**
         * The regular expression token matcher. This token matcher is
         * used for complex regular expressions, but should be avoided
         * due to possibly degraded speed and memory usage compared to
         * the automaton implementations.
         */
        private RegExpMatcher regExpMatcher;

        /**
        * The character stream reader buffer.
        */
        private ReaderBuffer buffer = null;

        /**
        * The last token match found.
        */
        private TokenMatch lastMatch = new TokenMatch();

        /**
         * The previous token in the token list.
         */
        private Token previousToken = null;

        /**
         * Creates a new case-sensitive tokenizer for the specified
         * input stream.
         *
         * @param input          the input stream to read
         */
        public Tokenizer(TextReader input)
            : this(input, false) {
        }

        /**
         * Creates a new tokenizer for the specified input stream. The
         * tokenizer can be set to process tokens either in
         * case-sensitive or case-insensitive mode.
         *
         * @param input          the input stream to read
         * @param ignoreCase     the character case ignore flag
         *
         * @since 1.5
         */
        public Tokenizer(TextReader input, bool ignoreCase) {
            this.stringDfaMatcher = new StringDFAMatcher(ignoreCase);
            this.nfaMatcher = new NFAMatcher(ignoreCase);
            this.regExpMatcher = new RegExpMatcher(ignoreCase);
            this.buffer = new ReaderBuffer(input);
        }

        /**
         * The token list flag property. If the token list flag is
         * set, all tokens (including ignored tokens) link to each
         * other in a double-linked list. By default the token list
         * flag is set to false.
         *
         * @see Token#Previous
         * @see Token#Next
         *
         * @since 1.5
         */
        public bool UseTokenList {
            get {
                return useTokenList;
            }
            set {
                useTokenList = value;
            }
        }

        /**
         * Checks if the token list feature is used. The token list
         * feature makes all tokens (including ignored tokens) link to
         * each other in a linked list. By default the token list feature
         * is not used.
         *
         * @return true if the token list feature is used, or
         *         false otherwise
         *
         * @see #UseTokenList
         * @see #SetUseTokenList
         * @see Token#GetPreviousToken
         * @see Token#GetNextToken
         *
         * @since 1.4
         *
         * @deprecated Use the UseTokenList property instead.
         */
        public bool GetUseTokenList() {
            return useTokenList;
        }

        /**
         * Sets the token list feature flag. The token list feature makes
         * all tokens (including ignored tokens) link to each other in a
         * linked list when active. By default the token list feature is
         * not used.
         *
         * @param useTokenList   the token list feature flag
         *
         * @see #UseTokenList
         * @see #GetUseTokenList
         * @see Token#GetPreviousToken
         * @see Token#GetNextToken
         *
         * @since 1.4
         *
         * @deprecated Use the UseTokenList property instead.
         */
        public void SetUseTokenList(bool useTokenList) {
            this.useTokenList = useTokenList;
        }

        /**
         * Returns a description of the token pattern with the
         * specified id.
         *
         * @param id             the token pattern id
         *
         * @return the token pattern description, or
         *         null if not present
         */
        public string GetPatternDescription(int id) {
            TokenPattern  pattern;

            pattern = stringDfaMatcher.GetPattern(id);
            if (pattern == null) {
                pattern = nfaMatcher.GetPattern(id);
            }
            if (pattern == null) {
                pattern = regExpMatcher.GetPattern(id);
            }
            return (pattern == null) ? null : pattern.ToShortString();
        }

        /**
         * Returns the current line number. This number will be the line
         * number of the next token returned.
         *
         * @return the current line number
         */
        public int GetCurrentLine() {
            return buffer.LineNumber;
        }

        /**
         * Returns the current column number. This number will be the
         * column number of the next token returned.
         *
         * @return the current column number
         */
        public int GetCurrentColumn() {
            return buffer.ColumnNumber;
        }

        /**
         * Adds a new token pattern to the tokenizer. The pattern will be
         * added last in the list, choosing a previous token pattern in
         * case two matches the same string.
         *
         * @param pattern        the pattern to add
         *
         * @throws ParserCreationException if the pattern couldn't be
         *             added to the tokenizer
         */
        public void AddPattern(TokenPattern pattern) {
            switch (pattern.Type) {
            case TokenPattern.PatternType.STRING:
                try {
                    stringDfaMatcher.AddPattern(pattern);
                } catch (Exception e) {
                    throw new ParserCreationException(
                        ParserCreationException.ErrorType.INVALID_TOKEN,
                        pattern.Name,
                        "error adding string token: " +
                        e.Message);
                }
                break;
            case TokenPattern.PatternType.REGEXP:
                try {
                    nfaMatcher.AddPattern(pattern);
                } catch (Exception) {
                    try {
                        regExpMatcher.AddPattern(pattern);
                    } catch (Exception e) {
                        throw new ParserCreationException(
                            ParserCreationException.ErrorType.INVALID_TOKEN,
                            pattern.Name,
                            "regular expression contains error(s): " +
                            e.Message);
                    }
                }
                break;
            default:
                throw new ParserCreationException(
                    ParserCreationException.ErrorType.INVALID_TOKEN,
                    pattern.Name,
                    "pattern type " + pattern.Type +
                    " is undefined");
            }
        }

        /**
         * Resets this tokenizer for usage with another input stream.
         * This method will clear all the internal state in the
         * tokenizer as well as close the previous input stream. It
         * is normally called in order to reuse a parser and
         * tokenizer pair with multiple input streams, thereby
         * avoiding the cost of re-analyzing the grammar structures.
         *
         * @param input          the new input stream to read
         *
         * @see Parser#reset(Reader)
         *
         * @since 1.5
         */
        public void Reset(TextReader input) {
            this.buffer.Dispose();
            this.buffer = new ReaderBuffer(input);
            this.previousToken = null;
            this.lastMatch.Clear();
        }

        /**
         * Finds the next token on the stream. This method will return
         * null when end of file has been reached. It will return a
         * parse exception if no token matched the input stream, or if
         * a token pattern with the error flag set matched. Any tokens
         * matching a token pattern with the ignore flag set will be
         * silently ignored and the next token will be returned.
         *
         * @return the next token found, or
         *         null if end of file was encountered
         *
         * @throws ParseException if the input stream couldn't be read or
         *             parsed correctly
         */
        public Token Next() {
            Token  token = null;

            do {
                token = NextToken();
                if (token == null) {
                    return null;
                }
                if (useTokenList) {
                    token.Previous = previousToken;
                    previousToken = token;
                }
                if (token.Pattern.Ignore) {
                    token = null;
                } else if (token.Pattern.Error) {
                    throw new ParseException(
                        ParseException.ErrorType.INVALID_TOKEN,
                        token.Pattern.ErrorMessage,
                        token.StartLine,
                        token.StartColumn);
                }
            } while (token == null);
            return token;
        }

        /**
         * Finds the next token on the stream. This method will return
         * null when end of file has been reached. It will return a
         * parse exception if no token matched the input stream.
         *
         * @return the next token found, or
         *         null if end of file was encountered
         *
         * @throws ParseException if the input stream couldn't be read or
         *             parsed correctly
         */
        private Token NextToken() {
            string  str;
            int     line;
            int     column;

            try {
                lastMatch.Clear();
                stringDfaMatcher.Match(buffer, lastMatch);
                nfaMatcher.Match(buffer, lastMatch);
                regExpMatcher.Match(buffer, lastMatch);
                if (lastMatch.Length > 0) {
                    line = buffer.LineNumber;
                    column = buffer.ColumnNumber;
                    str = buffer.Read(lastMatch.Length);
                    return NewToken(lastMatch.Pattern, str, line, column);
                } else if (buffer.Peek(0) < 0) {
                    return null;
                } else {
                    line = buffer.LineNumber;
                    column = buffer.ColumnNumber;
                    throw new ParseException(
                        ParseException.ErrorType.UNEXPECTED_CHAR,
                        buffer.Read(1),
                        line,
                        column);
                }
            } catch (IOException e) {
                throw new ParseException(ParseException.ErrorType.IO,
                                         e.Message,
                                         -1,
                                         -1);
            }
        }

        /**
         * Factory method for creating a new token. This method can be
         * overridden to provide other token implementations than the
         * default one.
         *
         * @param pattern        the token pattern
         * @param image          the token image (i.e. characters)
         * @param line           the line number of the first character
         * @param column         the column number of the first character
         *
         * @return the token created
         *
         * @since 1.5
         */
        protected virtual Token NewToken(TokenPattern pattern,
                                         string image,
                                         int line,
                                         int column) {

            return new Token(pattern, image, line, column);
        }
        
        /**
         * Returns a string representation of this object. The returned
         * string will contain the details of all the token patterns
         * contained in this tokenizer.
         *
         * @return a detailed string representation
         */
        public override string ToString() {
            StringBuilder  buffer = new StringBuilder();

            buffer.Append(stringDfaMatcher);
            buffer.Append(nfaMatcher);
            buffer.Append(regExpMatcher);
            return buffer.ToString();
        }
    }


    /**
     * A token pattern matcher. This class is the base class for the
     * various types of token matchers that exist. The token matcher
     * checks for matches with the tokenizer buffer, and maintains the
     * state of the last match.
     */
    internal abstract class TokenMatcher {

        /**
         * The array of token patterns.
         */
        protected TokenPattern[] patterns = new TokenPattern[0];

        /**
         * The ignore character case flag.
         */
        protected bool ignoreCase = false;

        /**
         * Creates a new token matcher.
         *
         * @param ignoreCase      the character case ignore flag
         */
        public TokenMatcher(bool ignoreCase) {
            this.ignoreCase = ignoreCase;
        }

        /**
         * Searches for matching token patterns at the start of the
         * input stream. If a match is found, the token match object
         * is updated.
         *
         * @param buffer         the input buffer to check
         * @param match          the token match to update
         *
         * @throws IOException if an I/O error occurred
         */
        public abstract void Match(ReaderBuffer buffer, TokenMatch match);

        /**
         * Returns the token pattern with the specified id. Only
         * token patterns handled by this matcher can be returned.
         *
         * @param id         the token pattern id
         *
         * @return the token pattern found, or
         *         null if not found
         */
        public TokenPattern GetPattern(int id) {
            for (int i = 0; i < patterns.Length; i++) {
                if (patterns[i].Id == id) {
                    return patterns[i];
                }
            }
            return null;
        }

        /**
         * Adds a string token pattern to this matcher.
         *
         * @param pattern        the pattern to add
         *
         * @throws Exception if the pattern couldn't be added to the matcher
         */
        public virtual void AddPattern(TokenPattern pattern) {
            Array.Resize(ref patterns, patterns.Length + 1);
            patterns[patterns.Length -1] = pattern;
        }

        /**
         * Returns a string representation of this matcher. This will
         * contain all the token patterns.
         *
         * @return a detailed string representation of this matcher
         */
        public override string ToString() {
            StringBuilder  buffer = new StringBuilder();

            for (int i = 0; i < patterns.Length; i++) {
                buffer.Append(patterns[i]);
                buffer.Append("\n\n");
            }
            return buffer.ToString();
        }
    }


    /**
     * A token pattern matcher using a DFA for string tokens. This
     * class only supports string tokens and must be complemented
     * with another matcher for regular expressions. Internally it
     * uses a DFA to provide high performance.
     */
    internal class StringDFAMatcher : TokenMatcher {

        /**
         * The deterministic finite state automaton used for
         * matching.
         */
        private TokenStringDFA automaton = new TokenStringDFA();

        /**
         * Creates a new string token matcher.
         *
         * @param ignoreCase      the character case ignore flag
         */
        public StringDFAMatcher(bool ignoreCase) : base(ignoreCase) {
        }

        /**
         * Adds a string token pattern to this matcher.
         *
         * @param pattern        the pattern to add
         */
        public override void AddPattern(TokenPattern pattern) {
            automaton.AddMatch(pattern.Pattern, ignoreCase, pattern);
            base.AddPattern(pattern);
        }

        /**
         * Searches for matching token patterns at the start of the
         * input stream. If a match is found, the token match object
         * is updated.
         *
         * @param buffer         the input buffer to check
         * @param match          the token match to update
         *
         * @throws IOException if an I/O error occurred
         */
        public override void Match(ReaderBuffer buffer, TokenMatch match) {
            TokenPattern  res = automaton.Match(buffer, ignoreCase);

            if (res != null) {
                match.Update(res.Pattern.Length, res);
            }
        }
    }


    /**
     * A token pattern matcher using a NFA for both string and
     * regular expression tokens. This class has limited support for
     * regular expressions and must be complemented with another
     * matcher providing full regular expression support. Internally
     * it uses a NFA to provide high performance and low memory
     * usage.
     */
    internal class NFAMatcher : TokenMatcher {

        /**
         * The non-deterministic finite state automaton used for
         * matching.
         */
        private TokenNFA automaton = new TokenNFA();

        /**
         * Creates a new NFA token matcher.
         *
         * @param ignoreCase      the character case ignore flag
         */
        public NFAMatcher(bool ignoreCase) : base(ignoreCase) {
        }

        /**
         * Adds a token pattern to this matcher.
         *
         * @param pattern        the pattern to add
         *
         * @throws Exception if the pattern couldn't be added to the matcher
         */
        public override void AddPattern(TokenPattern pattern) {
            if (pattern.Type == TokenPattern.PatternType.STRING) {
                automaton.AddTextMatch(pattern.Pattern, ignoreCase, pattern);
            } else {
                automaton.AddRegExpMatch(pattern.Pattern, ignoreCase, pattern);
            }
            base.AddPattern(pattern);
        }

        /**
         * Searches for matching token patterns at the start of the
         * input stream. If a match is found, the token match object
         * is updated.
         *
         * @param buffer         the input buffer to check
         * @param match          the token match to update
         *
         * @throws IOException if an I/O error occurred
         */
        public override void Match(ReaderBuffer buffer, TokenMatch match) {
            automaton.Match(buffer, match);
        }
    }


    /**
     * A token pattern matcher for complex regular expressions. This
     * class only supports regular expression tokens and must be
     * complemented with another matcher for string tokens.
     * Internally it uses the Grammatica RE package for high
     * performance or the native java.util.regex package for maximum
     * compatibility.
     */
    internal class RegExpMatcher : TokenMatcher {

        /**
         * The regular expression handlers.
         */
        private REHandler[] regExps = new REHandler[0];

        /**
         * Creates a new regular expression token matcher.
         *
         * @param ignoreCase      the character case ignore flag
         */
        public RegExpMatcher(bool ignoreCase) : base(ignoreCase) {
        }

        /**
         * Adds a regular expression token pattern to this matcher.
         *
         * @param pattern        the pattern to add
         *
         * @throws Exception if the pattern couldn't be added to the matcher
         */
        public override void AddPattern(TokenPattern pattern) {
            REHandler  re;

            try {
                re = new GrammaticaRE(pattern.Pattern, ignoreCase);
                pattern.DebugInfo = "Grammatica regexp\n" + re;
            } catch (Exception) {
                re = new SystemRE(pattern.Pattern, ignoreCase);
                pattern.DebugInfo = "native .NET regexp";
            }
            Array.Resize(ref regExps, regExps.Length + 1);
            regExps[regExps.Length -1] = re;
            base.AddPattern(pattern);
        }

        /**
         * Searches for matching token patterns at the start of the
         * input stream. If a match is found, the token match object
         * is updated.
         *
         * @param buffer         the input buffer to check
         * @param match          the token match to update
         *
         * @throws IOException if an I/O error occurred
         */
        public override void Match(ReaderBuffer buffer, TokenMatch match) {
            for (int i = 0; i < regExps.Length; i++) {
                int length = regExps[i].Match(buffer);
                if (length > 0) {
                    match.Update(length, patterns[i]);
                }
            }
        }
    }


    /**
     * The regular expression handler base class.
     */
    internal abstract class REHandler {

        /**
         * Checks if the start of the input stream matches this
         * regular expression.
         *
         * @param buffer         the input buffer to check
         *
         * @return the longest match found, or
         *         zero (0) if no match was found
         *
         * @throws IOException if an I/O error occurred
         */
        public abstract int Match(ReaderBuffer buffer);
    }


    /**
     * The Grammatica built-in regular expression handler.
     */
    internal class GrammaticaRE : REHandler {

        /**
         * The compiled regular expression.
         */
        private RegExp regExp;

        /**
         * The regular expression matcher to use.
         */
        private Matcher matcher = null;

        /**
         * Creates a new Grammatica regular expression handler.
         *
         * @param regex          the regular expression text
         * @param ignoreCase      the character case ignore flag
         *
         * @throws Exception if the regular expression contained
         *             invalid syntax
         */
        public GrammaticaRE(string regex, bool ignoreCase) {
            regExp = new RegExp(regex, ignoreCase);
        }

        /**
         * Checks if the start of the input stream matches this
         * regular expression.
         *
         * @param buffer         the input buffer to check
         *
         * @return the longest match found, or
         *         zero (0) if no match was found
         *
         * @throws IOException if an I/O error occurred
         */
        public override int Match(ReaderBuffer buffer) {
            if (matcher == null) {
                matcher = regExp.Matcher(buffer);
            } else {
                matcher.Reset(buffer);
            }
            return matcher.MatchFromBeginning() ? matcher.Length() : 0;
        }
    }


    /**
     * The .NET system regular expression handler.
     */
    internal class SystemRE : REHandler {

        /**
         * The parsed regular expression.
         */
        private Regex reg;

        /**
         * Creates a new .NET system regular expression handler.
         *
         * @param regex          the regular expression text
         * @param ignoreCase      the character case ignore flag
         *
         * @throws Exception if the regular expression contained
         *             invalid syntax
         */
        public SystemRE(string regex, bool ignoreCase) {
            if (ignoreCase) {
                reg = new Regex(regex, RegexOptions.IgnoreCase);
            } else {
                reg = new Regex(regex);
            }
        }

        /**
         * Checks if the start of the input stream matches this
         * regular expression.
         *
         * @param buffer         the input buffer to check
         *
         * @return the longest match found, or
         *         zero (0) if no match was found
         *
         * @throws IOException if an I/O error occurred
         */
        public override int Match(ReaderBuffer buffer) {
            Match  m;

            // Ugly hack since .NET doesn't have a flag for when the
            // end of the input string was encountered...
            buffer.Peek(1024 * 16);
            // Also, there is no API to limit the search to the specified
            // position, so we double-check the index afterwards instead.
            m = reg.Match(buffer.ToString(), buffer.Position);
            if (m.Success && m.Index == buffer.Position) {
                return m.Length;
            } else {
                return 0;
            }
        }
    }
}