/*
* TokenStringDFA.cs
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* as published by the Free Software Foundation; either version 3
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307, USA.
*
* Copyright (c) 2004-2009 Per Cederberg. All rights reserved.
*/
using System;
using System.Text;
namespace PerCederberg.Grammatica.Runtime {
/**
* A deterministic finite state automaton for matching exact strings.
* It uses a sorted binary tree representation of the state
* transitions in order to enable quick matches with a minimal memory
* footprint. It only supports a single character transition between
* states, but may be run in an all case-insensitive mode.
*
* @author Per Cederberg, <per at percederberg dot net>
* @version 1.5
* @since 1.5
*/
internal class TokenStringDFA {
/**
* The lookup table for root states, indexed by the first ASCII
* character. This array is used to for speed optimizing the
* first step in the match.
*/
private DFAState[] ascii = new DFAState[128];
/**
* The automaton state transition tree for non-ASCII characters.
* Each transition from one state to another is added to the tree
* with the corresponding character.
*/
private DFAState nonAscii = new DFAState();
/**
* Creates a new empty string automaton.
*/
public TokenStringDFA() {
}
/**
* Adds a string match to this automaton. New states and
* transitions will be added to extend this automaton to
* support the specified string.
*
* @param str the string to match
* @param caseInsensitive the case-insensitive flag
* @param value the match value
*/
public void AddMatch(string str, bool caseInsensitive, TokenPattern value) {
DFAState state;
DFAState next;
char c = str[0];
int start = 0;
if (caseInsensitive) {
c = Char.ToLower(c);
}
if (c < 128) {
state = ascii[c];
if (state == null) {
state = ascii[c] = new DFAState();
}
start++;
} else {
state = nonAscii;
}
for (int i = start; i < str.Length; i++) {
next = state.tree.Find(str[i], caseInsensitive);
if (next == null) {
next = new DFAState();
state.tree.Add(str[i], caseInsensitive, next);
}
state = next;
}
state.value = value;
}
/**
* Checks if the automaton matches an input stream. The
* matching will be performed from a specified position. This
* method will not read any characters from the stream, just
* peek ahead. The comparison can be done either in
* case-sensitive or case-insensitive mode.
*
* @param input the input stream to check
* @param pos the starting position
* @param caseInsensitive the case-insensitive flag
*
* @return the match value, or
* null if no match was found
*
* @throws IOException if an I/O error occurred
*/
public TokenPattern Match(ReaderBuffer buffer, bool caseInsensitive) {
TokenPattern result = null;
DFAState state;
int pos = 0;
int c;
c = buffer.Peek(0);
if (c < 0) {
return null;
}
if (caseInsensitive) {
c = Char.ToLower((char) c);
}
if (c < 128) {
state = ascii[c];
if (state == null) {
return null;
} else if (state.value != null) {
result = state.value;
}
pos++;
} else {
state = nonAscii;
}
while ((c = buffer.Peek(pos)) >= 0) {
state = state.tree.Find((char) c, caseInsensitive);
if (state == null) {
break;
} else if (state.value != null) {
result = state.value;
}
pos++;
}
return result;
}
/**
* Returns a detailed string representation of this automaton.
*
* @return a detailed string representation of this automaton
*/
public override string ToString() {
StringBuilder buffer = new StringBuilder();
for (int i = 0; i < ascii.Length; i++) {
if (ascii[i] != null) {
buffer.Append((char) i);
if (ascii[i].value != null) {
buffer.Append(": ");
buffer.Append(ascii[i].value);
buffer.Append("\n");
}
ascii[i].tree.PrintTo(buffer, " ");
}
}
nonAscii.tree.PrintTo(buffer, "");
return buffer.ToString();
}
}
/**
* An automaton state. This class represents a state in the DFA
* graph.
*
* @author Per Cederberg, <per at percederberg dot net>
* @version 1.5
* @since 1.5
*/
internal class DFAState {
/**
* The token pattern matched at this state.
*/
internal TokenPattern value = null;
/**
* The automaton state transition tree. Each transition from one
* state to another is added to the tree with the corresponding
* character.
*/
internal TransitionTree tree = new TransitionTree();
}
/**
* An automaton state transition tree. This class contains a
* binary search tree for the automaton transitions from one
* state to another. All transitions are linked to a single
* character.
*
* @author Per Cederberg, <per at percederberg dot net>
* @version 1.5
* @since 1.5
*/
internal class TransitionTree {
/**
* The transition character. If this value is set to the zero
* character ('\0'), this tree is empty.
*/
private char value = '\0';
/**
* The transition target state.
*/
private DFAState state = null;
/**
* The left subtree.
*/
private TransitionTree left = null;
/**
* The right subtree.
*/
private TransitionTree right = null;
/**
* Creates a new empty automaton transition tree.
*/
public TransitionTree() {
}
/**
* Finds an automaton state from the specified transition
* character. This method searches this transition tree for a
* matching transition. The comparison can optionally be done
* with a lower-case conversion of the character.
*
* @param c the character to search for
* @param lowerCase the lower-case conversion flag
*
* @return the automaton state found, or
* null if no transition exists
*/
public DFAState Find(char c, bool lowerCase) {
if (lowerCase) {
c = Char.ToLower(c);
}
if (value == '\0' || value == c) {
return state;
} else if (value > c) {
return left.Find(c, false);
} else {
return right.Find(c, false);
}
}
/**
* Adds a transition to this tree. If the lower-case flag is
* set, the character will be converted to lower-case before
* being added.
*
* @param c the character to transition for
* @param lowerCase the lower-case conversion flag
* @param state the state to transition to
*/
public void Add(char c, bool lowerCase, DFAState state) {
if (lowerCase) {
c = Char.ToLower(c);
}
if (value == '\0') {
this.value = c;
this.state = state;
this.left = new TransitionTree();
this.right = new TransitionTree();
} else if (value > c) {
left.Add(c, false, state);
} else {
right.Add(c, false, state);
}
}
/**
* Prints the automaton tree to the specified string buffer.
*
* @param buffer the string buffer
* @param indent the current indentation
*/
public void PrintTo(StringBuilder buffer, String indent) {
if (this.left != null) {
this.left.PrintTo(buffer, indent);
}
if (this.value != '\0') {
if (buffer.Length > 0 && buffer[buffer.Length -1] == '\n') {
buffer.Append(indent);
}
buffer.Append(this.value);
if (this.state.value != null) {
buffer.Append(": ");
buffer.Append(this.state.value);
buffer.Append("\n");
}
this.state.tree.PrintTo(buffer, indent + " ");
}
if (this.right != null) {
this.right.PrintTo(buffer, indent);
}
}
}
}