#include <Utf8Tokenizer.hpp>
Inheritance diagram for Ionflux::Tools::Utf8Tokenizer:
Public Member Functions | |
Utf8Tokenizer () | |
Constructor. | |
Utf8Tokenizer (const std::string &initInput) | |
Constructor. | |
Utf8Tokenizer (const std::vector< Utf8TokenType > &initTokenTypes, const std::string &initInput="") | |
Constructor. | |
virtual | ~Utf8Tokenizer () |
Destructor. | |
virtual void | reset () |
Reset. | |
virtual void | clearTokenTypes () |
Clear token types. | |
virtual void | useDefaultTokenTypes () |
Use default token types. | |
virtual void | addDefaultTokenType () |
Add default token type. | |
virtual void | setTokenTypes (const std::vector< Utf8TokenType > &newTokenTypes) |
Set token types. | |
virtual void | addTokenTypes (const std::vector< Utf8TokenType > &newTokenTypes) |
Add token types. | |
virtual void | addTokenType (const Utf8TokenType &newTokenType) |
Add token type. | |
virtual void | setInput (const std::string &newInput) |
Set input. | |
virtual void | setInput (const std::vector< unsigned int > &newInput) |
Set input. | |
virtual Utf8Token | getNextToken (Utf8TokenTypeMap *otherTypeMap=0) |
Get next token. | |
virtual Utf8Token | getCurrentToken () |
Get current token. | |
virtual int | getCurrentTokenType () |
Get current token type. | |
virtual unsigned int | getCurrentPos () |
Get current position. | |
virtual unsigned int | getCurrentTokenPos () |
Get current token position. | |
virtual unsigned int | getQuoteChar () |
Get quote character. | |
virtual void | setExtractQuoted (bool newExtractQuoted) |
Set extract quoted strings flag. | |
virtual bool | getExtractQuoted () const |
Get extract quoted strings flag. | |
virtual void | setExtractEscaped (bool newExtractEscaped) |
Set extract escaped characters flag. | |
virtual bool | getExtractEscaped () const |
Get extract escaped characters flag. | |
Static Public Member Functions | |
static bool | isValid (const Utf8Token &checkToken) |
Validate token. | |
Static Public Attributes | |
static const Utf8TokenType | TT_INVALID |
Token type: invalid (special). | |
static const Utf8TokenType | TT_NONE |
Token type: none (special). | |
static const Utf8TokenType | TT_DEFAULT = {1, "", 0} |
Token type: default (special). | |
static const Utf8TokenType | TT_QUOTED = {2, "", 0} |
Token type: quoted (special). | |
static const Utf8TokenType | TT_ESCAPED = {3, "", 0} |
Token type: escaped (special). | |
static const Utf8TokenType | TT_LINEAR_WHITESPACE = {4, " \t", 0} |
Token type: linear whitespace. | |
static const Utf8TokenType | TT_LINETERM = {5, "\n\r", 1} |
Token type: line terminator. | |
static const Utf8TokenType | TT_IDENTIFIER |
Token type: identifier. | |
static const Utf8TokenType | TT_NUMBER = {7, "0123456789", 0} |
Token type: number. | |
static const Utf8TokenType | TT_ALPHA |
Token type: latin alphabet. | |
static const Utf8TokenType | TT_DEFAULT_SEP = {9, "_-.", 0} |
Token type: default separators. | |
static const Utf8TokenType | TT_LATIN |
Token type: lots of latin characters. | |
static const Utf8Token | TOK_INVALID |
Token: invalid (special). | |
static const Utf8Token | TOK_NONE |
Token: none (special). | |
static const std::string | QUOTE_CHARS = "'\"" |
Quote characters. | |
static const unsigned int | ESCAPE_CHAR = '\\' |
Escape character. | |
static const Utf8TokenizerClassInfo | utf8TokenizerClassInfo |
Class information instance. | |
static const Ionflux::Tools::ClassInfo * | CLASS_INFO |
Class information. | |
Protected Attributes | |
std::vector< unsigned int > | theInput |
Input characters to be tokenized. | |
std::vector< unsigned int > | quoteChars |
Quote characters. | |
unsigned int | currentPos |
Current position in the input character string. | |
unsigned int | currentTokenPos |
Position of the current token in the input character string. | |
unsigned int | currentQuoteChar |
The current quote character. | |
Utf8TokenTypeMap * | typeMap |
Token type map. | |
Utf8Token | currentToken |
Current token. | |
bool | extractQuoted |
Extract quoted strings flag. | |
bool | extractEscaped |
Extract escaped characters flag. |
A generic tokenizer for parsing UTF-8 strings. To set up a tokenizer, first create a Utf8Tokenizer object. This will be set up using the default token types Utf8Tokenizer::TT_LINEAR_WHITESPACE, Utf8Tokenizer::TT_LINETERM and Utf8Tokenizer::TT_IDENTIFIER. You may then add your own custom token types and optionally set up the Utf8Tokenizer::TT_DEFAULT token type (which will match anything not matched by previously defined token types). To enable extraction of quoted strings, call Utf8Tokenizer::setExtractQuoted() with true as an argument; to enable extraction of escaped characters, call Utf8Tokenizer::setExtractEscaped() with true
as an argument.
To get a token from the token stream, call Utf8Tokenizer::getNextToken(). Make sure your code handles the Utf8Tokenizer::TT_NONE and Utf8Tokenizer::TT_INVALID special token types (which cannot be disabled). Utf8Tokenizer::getNextToken() will always return a token of type Utf8Tokenizer::TT_NONE at the end of the token stream and a token of type Utf8Tokenizer::TT_INVALID if an invalid token is encountered.
|
Constructor. Construct new Utf8Tokenizer object. |
|
Constructor. Construct new Utf8Tokenizer object.
|
|
Constructor. Construct new Utf8Tokenizer object.
|
|
Destructor. Destruct Utf8Tokenizer object. |
|
Add default token type. Add a special token type TT_DEFAULT which will be returned if a token is not recognized. |
|
Add token type. Add the specified token type.
|
|
Add token types. Add the specified token types.
|
|
Clear token types. Remove all token types. |
|
Get current position. Get the current position in the input string.
|
|
Get current token. Get the current token from the input string.
|
|
Get current token position. Get the position of the current token in the input string.
|
|
Get current token type. Get the type ID of the current token.
|
|
Get extract escaped characters flag.
|
|
Get extract quoted strings flag.
|
|
Get next token.
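Get the next token from the input string. If the optional otherTypeMap argument is specified (i.e. it is not null), that token type map is presumably used for this call instead of the tokenizer's own token type map.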
|
|
Get quote character. Get the quote character for the current token.
|
|
Validate token. Check whether the specified token is valid (i.e. it is not invalid or empty).
|
|
Reset. Reset the tokenizer. |
|
Set extract escaped characters flag. Set new value of extract escaped characters flag.
|
|
Set extract quoted strings flag. Set new value of extract quoted strings flag.
|
|
Set input. Set the unicode input characters.
|
|
Set input. Set the UTF-8 encoded input string.
|
|
Set token types. Set the token types for the tokenizer.
|
|
Use default token types. Use default token types (TT_LINEAR_WHITESPACE, TT_IDENTIFIER, TT_LINETERM). |
|
Initial value: Class information.
Reimplemented from Ionflux::Tools::ManagedObject. |
|
Current position in the input character string.
|
|
The current quote character.
|
|
Current token.
|
|
Position of the current token in the input character string.
|
|
Escape character.
|
|
Extract escaped characters flag.
|
|
Extract quoted strings flag.
|
|
Quote characters.
|
|
Quote characters.
|
|
Input characters to be tokenized.
|
|
Initial value: { Utf8TokenType::INVALID_ID, ""}
|
|
Initial value: { Utf8TokenType::EMPTY_ID, ""}
|
|
Initial value: {8, "abcdefghijklmnopqrstuvwxyz" "ABCDEFGHIJKLMNOPQRSTUVWXYZ", 0}
|
|
Token type: default (special).
|
|
Token type: default separators.
|
|
Token type: escaped (special).
|
|
Initial value: {6, "abcdefghijklmnopqrstuvwxyz" "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_", 0}
|
|
Initial value: { Utf8TokenType::INVALID_ID, "", 0}
|
|
Initial value: {10, "abcdefghijklmnopqrstuvwxyz" "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíî" "ïðñòóôõöøùúûüýþÿĀāĂăĄąĆćĈĉĊċČčĎďĐđĒēĔĕĖėĘęĚěĜ" "ĞğĠġĢģĤĥĦħĨĩĪīĬĭĮįİıĲĳĴĵĶķĸĹĺĻļĽľĿŀŁłŃńŅņŇňŉŊŋŌ" "ōŎŏŐőŒœŔŕŖŗŘřŚśŜŝŞşŠšŢţŤťŦŧŨũŪūŬŭŮůŰűŲųŴŵŶŷŸŹźŻżŽžſ", 0}
|
|
Token type: linear whitespace.
|
|
Token type: line terminator.
|
|
Initial value: { Utf8TokenType::EMPTY_ID, "", 0}
|
|
Token type: number.
|
|
Token type: quoted (special).
|
|
Token type map.
|
|
Class information instance.
|