00001 #ifndef IONFLUX_TOOLS_UTF8TOKENIZER 00002 #define IONFLUX_TOOLS_UTF8TOKENIZER 00003 /* ========================================================================== 00004 * Ionflux Tools 00005 * Copyright (c) 2005 Joern P. Meier 00006 * mail@ionflux.org 00007 * -------------------------------------------------------------------------- 00008 * Utf8Tokenizer.hpp Tokenizer with UTF-8 support. 00009 * ========================================================================== 00010 * 00011 * This file is part of Ionflux Tools. 00012 * 00013 * Ionflux Tools is free software; you can redistribute it and/or modify it 00014 * under the terms of the GNU General Public License as published by the 00015 * Free Software Foundation; either version 2 of the License, or (at your 00016 * option) any later version. 00017 * 00018 * Ionflux Tools is distributed in the hope that it will be useful, but 00019 * WITHOUT ANY WARRANTY; without even the implied warranty of 00020 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00021 * General Public License for more details. 00022 * 00023 * You should have received a copy of the GNU General Public License along 00024 * with Ionflux Tools; if not, write to the Free Software Foundation, Inc., 00025 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 00026 * 00027 * ========================================================================== */ 00028 00029 #include "ionflux/Utf8TokenTypeMap.hpp" 00030 #include "ionflux/ManagedObject.hpp" 00031 00032 namespace Ionflux 00033 { 00034 00035 namespace Tools 00036 { 00037 00041 struct Utf8Token 00042 { 00044 int typeID; 00046 std::string value; 00047 }; 00048 00050 class Utf8TokenizerClassInfo 00051 : public Ionflux::Tools::ClassInfo 00052 { 00053 public: 00055 Utf8TokenizerClassInfo(); 00057 virtual ~Utf8TokenizerClassInfo() { }; 00058 }; 00059 00079 class Utf8Tokenizer 00080 : public Ionflux::Tools::ManagedObject 00081 { 00082 private: 00083 00084 protected: 00086 std::vector<unsigned int> theInput; 00088 std::vector<unsigned int> quoteChars; 00090 unsigned int currentPos; 00092 unsigned int currentTokenPos; 00094 unsigned int currentQuoteChar; 00096 Utf8TokenTypeMap* typeMap; 00098 Utf8Token currentToken; 00100 bool extractQuoted; 00102 bool extractEscaped; 00103 00104 public: 00106 static const Utf8TokenType TT_INVALID; 00108 static const Utf8TokenType TT_NONE; 00110 static const Utf8TokenType TT_DEFAULT; 00112 static const Utf8TokenType TT_QUOTED; 00114 static const Utf8TokenType TT_ESCAPED; 00116 static const Utf8TokenType TT_LINEAR_WHITESPACE; 00118 static const Utf8TokenType TT_LINETERM; 00120 static const Utf8TokenType TT_IDENTIFIER; 00122 static const Utf8TokenType TT_NUMBER; 00124 static const Utf8TokenType TT_ALPHA; 00126 static const Utf8TokenType TT_DEFAULT_SEP; 00128 static const Utf8TokenType TT_LATIN; 00130 static const Utf8Token TOK_INVALID; 00132 static const Utf8Token TOK_NONE; 00134 static const std::string QUOTE_CHARS; 00136 static const unsigned int ESCAPE_CHAR; 00138 static const Utf8TokenizerClassInfo utf8TokenizerClassInfo; 00140 static const Ionflux::Tools::ClassInfo* CLASS_INFO; 00141 00146 Utf8Tokenizer(); 00147 00154 Utf8Tokenizer(const std::string& initInput); 00155 00163 Utf8Tokenizer(const std::vector<Utf8TokenType>& initTokenTypes, const 00164 std::string& initInput = ""); 00165 00170 virtual ~Utf8Tokenizer(); 00171 00176 virtual void reset(); 00177 00182 virtual void clearTokenTypes(); 00183 00189 virtual void useDefaultTokenTypes(); 00190 00196 virtual void addDefaultTokenType(); 00197 00204 virtual void setTokenTypes(const std::vector<Utf8TokenType>& 00205 newTokenTypes); 00206 00213 virtual void addTokenTypes(const std::vector<Utf8TokenType>& 00214 newTokenTypes); 00215 00222 virtual void addTokenType(const Utf8TokenType& newTokenType); 00223 00230 virtual void setInput(const std::string& newInput); 00231 00238 virtual void setInput(const std::vector<unsigned int>& newInput); 00239 00250 virtual Utf8Token getNextToken(Utf8TokenTypeMap* otherTypeMap = 0); 00251 00258 virtual Utf8Token getCurrentToken(); 00259 00266 virtual int getCurrentTokenType(); 00267 00274 virtual unsigned int getCurrentPos(); 00275 00282 virtual unsigned int getCurrentTokenPos(); 00283 00290 virtual unsigned int getQuoteChar(); 00291 00301 static bool isValid(const Utf8Token& checkToken); 00302 00309 virtual void setExtractQuoted(bool newExtractQuoted); 00310 00315 virtual bool getExtractQuoted() const; 00316 00323 virtual void setExtractEscaped(bool newExtractEscaped); 00324 00329 virtual bool getExtractEscaped() const; 00330 }; 00331 00332 } 00333 00334 } 00335 00339 #endif