/*
 * TokenRegExpParser.cs
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 3
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307, USA.
 *
 * Copyright (c) 2003-2009 Per Cederberg. All rights reserved.
 */

using System;
using System.Collections;
using System.Globalization;
using System.Text;
using PerCederberg.Grammatica.Runtime.RE;

namespace PerCederberg.Grammatica.Runtime {

    /**
     * A regular expression parser. The parser creates an NFA for the
     * regular expression having a single start and acceptance states.
     *
     * @author   Per Cederberg, <per at percederberg dot net>
     * @version  1.5
     * @since    1.5
     */
    internal class TokenRegExpParser {

        /**
        * The regular expression pattern.
        */
        private string pattern;

        /**
        * The character case ignore flag.
        */
        private bool ignoreCase;

        /**
        * The current position in the pattern. This variable is used by
        * the parsing methods.
        */
        private int pos;

        /**
        * The start NFA state for this regular expression.
        */
        internal NFAState start = new NFAState();

        /**
        * The end NFA state for this regular expression.
        */
        internal NFAState end = null;

        /**
        * The number of states found.
        */
        private int stateCount = 0;

        /**
        * The number of transitions found.
        */
        private int transitionCount = 0;

        /**
        * The number of epsilon transitions found.
        */
        private int epsilonCount = 0;

        /**
         * Creates a new case-sensitive regular expression parser. Note
         * that this will trigger the parsing of the regular expression.
         *
         * @param pattern        the regular expression pattern
         *
         * @throws RegExpException if the regular expression couldn't be
         *             parsed correctly
         */
        public TokenRegExpParser(string pattern) : this(pattern, false) {
        }

        /**
         * Creates a new regular expression parser. The regular
         * expression can be either case-sensitive or case-insensitive.
         * Note that this will trigger the parsing of the regular
         * expression.
         *
         * @param pattern        the regular expression pattern
         * @param ignoreCase     the character case ignore flag
         *
         * @throws RegExpException if the regular expression couldn't be
         *             parsed correctly
         */
        public TokenRegExpParser(string pattern, bool ignoreCase) {
            this.pattern = pattern;
            this.ignoreCase = ignoreCase;
            this.pos = 0;
            this.end = ParseExpr(start);
            if (pos < pattern.Length) {
                throw new RegExpException(
                    RegExpException.ErrorType.UNEXPECTED_CHARACTER,
                    pos,
                    pattern);
            }
        }

        /**
         * Returns the debug information for the generated NFA.
         *
         * @return the debug information for the generated NFA
         */
        public string GetDebugInfo() {
            if (stateCount == 0) {
                UpdateStats(start, new Hashtable());
            }
            return stateCount + " states, " +
                   transitionCount + " transitions, " +
                   epsilonCount + " epsilons";
        }

        /**
         * Updates the statistical counters for the NFA generated.
         *
         * @param state          the current state to visit
         * @param visited        the lookup map of visited states
         */
        private void UpdateStats(NFAState state, Hashtable visited) {
            if (!visited.ContainsKey(state)) {
                visited.Add(state, state);
                stateCount++;
                for (int i = 0; i < state.outgoing.Length; i++) {
                    transitionCount++;
                    if (state.outgoing[i] is NFAEpsilonTransition) {
                        epsilonCount++;
                    }
                    UpdateStats(state.outgoing[i].state, visited);
                }
            }
        }

        /**
         * Parses a regular expression. This method handles the Expr
         * production in the grammar (see regexp.grammar).
         *
         * @param start          the initial NFA state
         *
         * @return the terminating NFA state
         *
         * @throws RegExpException if an error was encountered in the
         *             pattern string
         */
        private NFAState ParseExpr(NFAState start) {
            NFAState  end = new NFAState();
            NFAState  subStart;
            NFAState  subEnd;

            do {
                if (PeekChar(0) == '|') {
                    ReadChar('|');
                }
                subStart = new NFAState();
                subEnd = ParseTerm(subStart);
                if (subStart.incoming.Length == 0) {
                    subStart.MergeInto(start);
                } else {
                    start.AddOut(new NFAEpsilonTransition(subStart));
                }
                if (subEnd.outgoing.Length == 0 ||
                    (!end.HasTransitions() && PeekChar(0) != '|')) {
                    subEnd.MergeInto(end);
                } else {
                    subEnd.AddOut(new NFAEpsilonTransition(end));
                }
            } while (PeekChar(0) == '|');
            return end;
        }

        /**
         * Parses a regular expression term. This method handles the
         * Term production in the grammar (see regexp.grammar).
         *
         * @param start          the initial NFA state
         *
         * @return the terminating NFA state
         *
         * @throws RegExpException if an error was encountered in the
         *             pattern string
         */
        private NFAState ParseTerm(NFAState start) {
            NFAState  end;

            end = ParseFact(start);
            while (true) {
                switch (PeekChar(0)) {
                case -1:
                case ')':
                case ']':
                case '{':
                case '}':
                case '?':
                case '+':
                case '|':
                    return end;
                default:
                    end = ParseFact(end);
                    break;
                }
            }
        }

        /**
         * Parses a regular expression factor. This method handles the
         * Fact production in the grammar (see regexp.grammar).
         *
         * @param start          the initial NFA state
         *
         * @return the terminating NFA state
         *
         * @throws RegExpException if an error was encountered in the
         *             pattern string
         */
        private NFAState ParseFact(NFAState start) {
            NFAState  placeholder = new NFAState();
            NFAState  end;

            end = ParseAtom(placeholder);
            switch (PeekChar(0)) {
            case '?':
            case '*':
            case '+':
            case '{':
                end = ParseAtomModifier(placeholder, end);
                break;
            }
            if (placeholder.incoming.Length > 0 && start.outgoing.Length > 0) {
                start.AddOut(new NFAEpsilonTransition(placeholder));
                return end;
            } else {
                placeholder.MergeInto(start);
                return (end == placeholder) ? start : end;
            }
        }

        /**
         * Parses a regular expression atom. This method handles the
         * Atom production in the grammar (see regexp.grammar).
         *
         * @param start          the initial NFA state
         *
         * @return the terminating NFA state
         *
         * @throws RegExpException if an error was encountered in the
         *             pattern string
         */
        private NFAState ParseAtom(NFAState start) {
            NFAState  end;

            switch (PeekChar(0)) {
            case '.':
                ReadChar('.');
                return start.AddOut(new NFADotTransition(new NFAState()));
            case '(':
                ReadChar('(');
                end = ParseExpr(start);
                ReadChar(')');
                return end;
            case '[':
                ReadChar('[');
                end = ParseCharSet(start);
                ReadChar(']');
                return end;
            case -1:
            case ')':
            case ']':
            case '{':
            case '}':
            case '?':
            case '*':
            case '+':
            case '|':
                throw new RegExpException(
                    RegExpException.ErrorType.UNEXPECTED_CHARACTER,
                    pos,
                    pattern);
            default:
                return ParseChar(start);
            }
        }

        /**
         * Parses a regular expression atom modifier. This method handles
         * the AtomModifier production in the grammar (see regexp.grammar).
         *
         * @param start          the initial NFA state
         * @param end            the terminal NFA state
         *
         * @return the terminating NFA state
         *
         * @throws RegExpException if an error was encountered in the
         *             pattern string
         */
        private NFAState ParseAtomModifier(NFAState start, NFAState end) {
            int  min = 0;
            int  max = -1;
            int  firstPos = pos;

            // Read min and max
            switch (ReadChar()) {
            case '?':
                min = 0;
                max = 1;
                break;
            case '*':
                min = 0;
                max = -1;
                break;
            case '+':
                min = 1;
                max = -1;
                break;
            case '{':
                min = ReadNumber();
                max = min;
                if (PeekChar(0) == ',') {
                    ReadChar(',');
                    max = -1;
                    if (PeekChar(0) != '}') {
                        max = ReadNumber();
                    }
                }
                ReadChar('}');
                if (max == 0 || (max > 0 && min > max)) {
                    throw new RegExpException(
                        RegExpException.ErrorType.INVALID_REPEAT_COUNT,
                        firstPos,
                        pattern);
                }
                break;
            default:
                throw new RegExpException(
                    RegExpException.ErrorType.UNEXPECTED_CHARACTER,
                    pos -1,
                    pattern);
            }

            // Read possessive or reluctant modifiers
            if (PeekChar(0) == '?') {
                throw new RegExpException(
                    RegExpException.ErrorType.UNSUPPORTED_SPECIAL_CHARACTER,
                    pos,
                    pattern);
            } else if (PeekChar(0) == '+') {
                throw new RegExpException(
                    RegExpException.ErrorType.UNSUPPORTED_SPECIAL_CHARACTER,
                    pos,
                    pattern);
            }

            // Handle supported repeaters
            if (min == 0 && max == 1) {
                return start.AddOut(new NFAEpsilonTransition(end));
            } else if (min == 0 && max == -1) {
                if (end.outgoing.Length == 0) {
                    end.MergeInto(start);
                } else {
                    end.AddOut(new NFAEpsilonTransition(start));
                }
                return start;
            } else if (min == 1 && max == -1) {
                if (start.outgoing.Length == 1 &&
                    end.outgoing.Length == 0 &&
                    end.incoming.Length == 1 &&
                    start.outgoing[0] == end.incoming[0]) {

                    end.AddOut(start.outgoing[0].Copy(end));
                } else {
                    end.AddOut(new NFAEpsilonTransition(start));
                }
                return end;
            } else {
                throw new RegExpException(
                    RegExpException.ErrorType.INVALID_REPEAT_COUNT,
                    firstPos,
                    pattern);
            }
        }

        /**
         * Parses a regular expression character set. This method handles
         * the contents of the '[...]' construct in a regular expression.
         *
         * @param start          the initial NFA state
         *
         * @return the terminating NFA state
         *
         * @throws RegExpException if an error was encountered in the
         *             pattern string
         */
        private NFAState ParseCharSet(NFAState start) {
            NFAState                end = new NFAState();
            NFACharRangeTransition  range;
            char                    min;
            char                    max;

            if (PeekChar(0) == '^') {
                ReadChar('^');
                range = new NFACharRangeTransition(true, ignoreCase, end);
            } else {
                range = new NFACharRangeTransition(false, ignoreCase, end);
            }
            start.AddOut(range);
            while (PeekChar(0) > 0) {
                min = (char) PeekChar(0);
                switch (min) {
                case ']':
                    return end;
                case '\\':
                    range.AddCharacter(ReadEscapeChar());
                    break;
                default:
                    ReadChar(min);
                    if (PeekChar(0) == '-' &&
                        PeekChar(1) > 0 &&
                        PeekChar(1) != ']') {

                        ReadChar('-');
                        max = ReadChar();
                        range.AddRange(min, max);
                    } else {
                        range.AddCharacter(min);
                    }
                    break;
                }
            }
            return end;
        }

        /**
         * Parses a regular expression character. This method handles
         * a single normal character in a regular expression.
         *
         * @param start          the initial NFA state
         *
         * @return the terminating NFA state
         *
         * @throws RegExpException if an error was encountered in the
         *             pattern string
         */
        private NFAState ParseChar(NFAState start) {
            switch (PeekChar(0)) {
            case '\\':
                return ParseEscapeChar(start);
            case '^':
            case '$':
                throw new RegExpException(
                    RegExpException.ErrorType.UNSUPPORTED_SPECIAL_CHARACTER,
                    pos,
                    pattern);
            default:
                return start.AddOut(ReadChar(), ignoreCase, new NFAState());
            }
        }

        /**
         * Parses a regular expression character escape. This method
         * handles a single character escape in a regular expression.
         *
         * @param start          the initial NFA state
         *
         * @return the terminating NFA state
         *
         * @throws RegExpException if an error was encountered in the
         *             pattern string
         */
        private NFAState ParseEscapeChar(NFAState start) {
            NFAState  end = new NFAState();

            if (PeekChar(0) == '\\' && PeekChar(1) > 0) {
                switch ((char) PeekChar(1)) {
                case 'd':
                    ReadChar();
                    ReadChar();
                    return start.AddOut(new NFADigitTransition(end));
                case 'D':
                    ReadChar();
                    ReadChar();
                    return start.AddOut(new NFANonDigitTransition(end));
                case 's':
                    ReadChar();
                    ReadChar();
                    return start.AddOut(new NFAWhitespaceTransition(end));
                case 'S':
                    ReadChar();
                    ReadChar();
                    return start.AddOut(new NFANonWhitespaceTransition(end));
                case 'w':
                    ReadChar();
                    ReadChar();
                    return start.AddOut(new NFAWordTransition(end));
                case 'W':
                    ReadChar();
                    ReadChar();
                    return start.AddOut(new NFANonWordTransition(end));
                }
            }
            return start.AddOut(ReadEscapeChar(), ignoreCase, end);
        }

        /**
         * Reads a regular expression character escape. This method
         * handles a single character escape in a regular expression.
         *
         * @return the character read
         *
         * @throws RegExpException if an error was encountered in the
         *             pattern string
         */
        private char ReadEscapeChar() {
            char    c;
            string  str;
            int     value;

            ReadChar('\\');
            c = ReadChar();
            switch (c) {
            case '0':
                c = ReadChar();
                if (c < '0' || c > '3') {
                    throw new RegExpException(
                        RegExpException.ErrorType.UNSUPPORTED_ESCAPE_CHARACTER,
                        pos -3,
                        pattern);
                }
                value = c -'0';
                c = (char) PeekChar(0);
                if ('0' <= c && c <= '7') {
                    value *= 8;
                    value += ReadChar() -'0';
                    c = (char) PeekChar(0);
                    if ('0' <= c && c <= '7') {
                        value *= 8;
                        value += ReadChar() -'0';
                    }
                }
                return (char) value;
            case 'x':
                str = ReadChar().ToString() + ReadChar().ToString();
                try {
                    value = Int32.Parse(str, NumberStyles.AllowHexSpecifier);
                    return (char) value;
                } catch (FormatException) {
                    throw new RegExpException(
                        RegExpException.ErrorType.UNSUPPORTED_ESCAPE_CHARACTER,
                        pos -str.Length -2,
                        pattern);
                }
            case 'u':
                str = ReadChar().ToString() +
                      ReadChar().ToString() +
                      ReadChar().ToString() +
                      ReadChar().ToString();
                try {
                    value = Int32.Parse(str, NumberStyles.AllowHexSpecifier);
                    return (char) value;
                } catch (FormatException) {
                    throw new RegExpException(
                        RegExpException.ErrorType.UNSUPPORTED_ESCAPE_CHARACTER,
                        pos -str.Length -2,
                        pattern);
                }
            case 't':
                return '\t';
            case 'n':
                return '\n';
            case 'r':
                return '\r';
            case 'f':
                return '\f';
            case 'a':
                return '\u0007';
            case 'e':
                return '\u001B';
            default:
                if (('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z')) {
                    throw new RegExpException(
                        RegExpException.ErrorType.UNSUPPORTED_ESCAPE_CHARACTER,
                        pos -2,
                        pattern);
                }
                return c;
            }
        }

        /**
         * Reads a number from the pattern. If the next character isn't a
         * numeric character, an exception is thrown. This method reads
         * several consecutive numeric characters.
         *
         * @return the numeric value read
         *
         * @throws RegExpException if an error was encountered in the
         *             pattern string
         */
        private int ReadNumber() {
            StringBuilder  buf = new StringBuilder();
            int            c;

            c = PeekChar(0);
            while ('0' <= c && c <= '9') {
                buf.Append(ReadChar());
                c = PeekChar(0);
            }
            if (buf.Length <= 0) {
                throw new RegExpException(
                    RegExpException.ErrorType.UNEXPECTED_CHARACTER,
                    pos,
                    pattern);
            }
            return Int32.Parse(buf.ToString());
        }

        /**
         * Reads the next character in the pattern. If no next character
         * exists, an exception is thrown.
         *
         * @return the character read
         *
         * @throws RegExpException if no next character was available in
         *             the pattern string
         */
        private char ReadChar() {
            int  c = PeekChar(0);

            if (c < 0) {
                throw new RegExpException(
                    RegExpException.ErrorType.UNTERMINATED_PATTERN,
                    pos,
                    pattern);
            } else {
                pos++;
                return (char) c;
            }
        }

        /**
         * Reads the next character in the pattern. If the character
         * wasn't the specified one, an exception is thrown.
         *
         * @param c              the character to read
         *
         * @return the character read
         *
         * @throws RegExpException if the character read didn't match the
         *             specified one, or if no next character was
         *             available in the pattern string
         */
        private char ReadChar(char c) {
            if (c != ReadChar()) {
                throw new RegExpException(
                    RegExpException.ErrorType.UNEXPECTED_CHARACTER,
                    pos -1,
                    pattern);
            }
            return c;
        }

        /**
         * Returns a character that has not yet been read from the
         * pattern. If the requested position is beyond the end of the
         * pattern string, -1 is returned.
         *
         * @param count          the preview position, from zero (0)
         *
         * @return the character found, or
         *         -1 if beyond the end of the pattern string
         */
        private int PeekChar(int count) {
            if (pos + count < pattern.Length) {
                return pattern[pos + count];
            } else {
                return -1;
            }
        }
    }
}