PixelLightAPI
/*********************************************************\
 *  File: Tokenizer.h                                    *
 *
 *  Copyright (C) 2002-2012 The PixelLight Team (http://www.pixellight.org/)
 *
 *  This file is part of PixelLight.
 *
 *  PixelLight is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  PixelLight is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with PixelLight. If not, see <http://www.gnu.org/licenses/>.
\*********************************************************/


#ifndef __PLCORE_TOKENIZER_H__
#define __PLCORE_TOKENIZER_H__
#pragma once


//[-------------------------------------------------------]
//[ Includes                                               ]
//[-------------------------------------------------------]
#include "PLCore/String/String.h"
#include "PLCore/Container/Stack.h"
#include "PLCore/Container/Array.h"


//[-------------------------------------------------------]
//[ Namespace                                              ]
//[-------------------------------------------------------]
namespace PLCore {


//[-------------------------------------------------------]
//[ Forward declarations                                   ]
//[-------------------------------------------------------]
class File;
class BufferedReader;


//[-------------------------------------------------------]
//[ Classes                                                ]
//[-------------------------------------------------------]
/**
*  @brief
*    Tokenizer class for parsing texts (also called 'scanner' or 'lexer' -> lexical analysis)
*
*  @remarks
*    The tokenizer parses an input stream (e.g. a file or string) and produces
*    a subsequent list of tokens by filtering out whitespace, comments and such.
*    There are many settings that can be manipulated to serve your needs, like choosing
*    the type of comment tags or the characters that shall be considered whitespace.
*    In general, the tokenizer should only be used to produce a list of tokens;
*    after that, a real parser is used to check the syntax and semantics of the
*    parsed code. In addition to this basic functionality the class also provides
*    some advanced functions to parse typical syntax like equations (a=b) or vectors
*    (a b c). These functions can be used if no strict syntax is needed, e.g. for
*    config files, but they are not intended to replace a decent parser.
*
*  @verbatim
*    Usage example:
*      Tokenizer cTokenizer;                  // Tokenizer instance
*      String sText = "This is a test";       // Text to be parsed
*      cTokenizer.Start(sText);               // Start tokenizer
*      String sToken;                         // String for the current token
*      sToken = cTokenizer.GetNextToken();    // Will return 'This'
*      sToken = cTokenizer.GetNextToken();    // Will return 'is'
*      sToken = cTokenizer.GetNextToken();    // Will return 'a'
*      sToken = cTokenizer.GetNextToken();    // Will return 'test'
*  @endverbatim
*/
class Tokenizer {


	//[-------------------------------------------------------]
	//[ Public functions                                      ]
	//[-------------------------------------------------------]
	public:
		/**
		*  @brief
		*    Constructor
		*/
		PLCORE_API Tokenizer();

		/**
		*  @brief
		*    Destructor
		*/
		inline ~Tokenizer();

		/**
		*  @brief
		*    Get delimiters (characters that are treated as whitespace)
		*
		*  @return
		*    Delimiter characters
		*/
		inline String GetDelimiters() const;

		/**
		*  @brief
		*    Set delimiters (characters that are treated as whitespace)
		*
		*  @param[in] sDelimiters
		*    Delimiter characters
		*
		*  @remarks
		*    Default: " \t\r\n"
		*/
		inline void SetDelimiters(const String &sDelimiters);

		/**
		*  @brief
		*    Get single characters
		*
		*  @return
		*    Single characters
		*/
		inline String GetSingleChars() const;

		/**
		*  @brief
		*    Set single characters
		*
		*  @param[in] sSingleChars
		*    Single characters
		*
		*  @remarks
		*    Default: "{}()[]<*>/=,;\""
		*/
		inline void SetSingleChars(const String &sSingleChars);

		/**
		*  @brief
		*    Get characters that are used for quotes
		*
		*  @return
		*    Quote characters
		*/
		inline String GetQuotes() const;

		/**
		*  @brief
		*    Set characters that are used for quotes
		*
		*  @param[in] sQuotes
		*    Quote characters
		*
		*  @remarks
		*    Default: "\"\'"
		*/
		inline void SetQuotes(const String &sQuotes);

		/**
		*  @brief
		*    Get the string that starts a multi-line comment
		*
		*  @return
		*    Comment start tag
		*/
		inline String GetCommentStartTag() const;

		/**
		*  @brief
		*    Set the string that starts a multi-line comment
		*
		*  @param[in] sCommentStartTag
		*    Comment start tag
		*
		*  @remarks
		*    Default is slash-star ("/ *" without the space in the middle, just written with a space to keep C++ compilers happy).
		*/
		inline void SetCommentStartTag(const String &sCommentStartTag);

		/**
		*  @brief
		*    Get the string that ends a multi-line comment
		*
		*  @return
		*    Comment end tag
		*/
		inline String GetCommentEndTag() const;

		/**
		*  @brief
		*    Set the string that ends a multi-line comment
		*
		*  @param[in] sCommentEndTag
		*    Comment end tag
		*
		*  @remarks
		*    Default: "*\/"
		*/
		inline void SetCommentEndTag(const String &sCommentEndTag);

		/**
		*  @brief
		*    Get the string that starts a single-line comment
		*
		*  @return
		*    Comment start tag
		*/
		inline String GetSingleLineComment() const;

		/**
		*  @brief
		*    Set the string that starts a single-line comment
		*
		*  @param[in] sSingleLineComment
		*    Comment start tag
		*
		*  @remarks
		*    Default: "//"
		*/
		inline void SetSingleLineComment(const String &sSingleLineComment);

		/**
		*  @brief
		*    Get case sensitivity flag
		*
		*  @return
		*    'true' if the text is parsed case sensitive
		*/
		inline bool IsCaseSensitive() const;

		/**
		*  @brief
		*    Set case sensitivity flag
		*
		*  @param[in] bCaseSensitive
		*    'true' if the text is parsed case sensitive (default is false)
		*
		*  @remarks
		*    Default: false
		*/
		inline void SetCaseSensitive(bool bCaseSensitive);

		/**
		*  @brief
		*    Starts the tokenizer on a string
		*
		*  @param[in] sBuffer
		*    String buffer
		*/
		PLCORE_API void Start(const String &sBuffer);

		/**
		*  @brief
		*    Starts the tokenizer on a file
		*
		*  @param[in] cFile
		*    File to read, released automatically if Stop() is called
		*/
		PLCORE_API void Start(File &cFile);

		/**
		*  @brief
		*    Stops the tokenizer
		*/
		PLCORE_API void Stop();

		/**
		*  @brief
		*    Reads all tokens until the end of the stream
		*
		*  @return
		*    Array of all tokens of the stream
		*/
		PLCORE_API Array<String> GetTokens();

		/**
		*  @brief
		*    Reads the next token from the stream
		*
		*  @return
		*    Next token
		*
		*  @note
		*    - After the token has been read, this function goes to the next token in the stream
		*    - To get the read token again, use GetToken()
		*
		*  @see
		*    - GetToken()
		*/
		PLCORE_API String GetNextToken();

		/**
		*  @brief
		*    Expects the next token to be equal to a given string
		*
		*  @param[in] sExpected
		*    Expected token
		*
		*  @return
		*    'true' if the next token is equal to the string
		*
		*  @note
		*    - If the expected token has been found, the tokenizer goes to the next token in the stream,
		*      otherwise it stays at the current token
		*    - The comparison of strings is done according to the settings given in SetCaseSensitive()
		*
		*  @see
		*    - IsCaseSensitive()
		*    - SetCaseSensitive()
		*/
		PLCORE_API bool ExpectToken(const String &sExpected);

		/**
		*  @brief
		*    Finds a given token in the stream
		*
		*  @param[in] sExpected
		*    Token to find
		*
		*  @return
		*    'true' if the token has been found
		*
		*  @note
		*    - Reads the next token until the expected token has been found or the end of the stream has been reached
		*    - If the function has succeeded, the next call of GetNextToken() will return the desired token
		*    - The comparison of strings is done according to the settings given in SetCaseSensitive()
		*
		*  @see
		*    - IsCaseSensitive()
		*    - SetCaseSensitive()
		*/
		PLCORE_API bool FindToken(const String &sExpected);

		/**
		*  @brief
		*    Returns the current token
		*
		*  @return
		*    Current token
		*
		*  @note
		*    - Does not go to the next token in the stream, so multiple calls of this function will
		*      always return the same token
		*/
		inline String GetToken() const;

		/**
		*  @brief
		*    Compares the current token with a given string
		*
		*  @param[in] sExpected
		*    String to compare the current token with
		*
		*  @return
		*    'true' if the token is equal to the string
		*
		*  @note
		*    - Does not go to the next token in the stream, so multiple calls of this function will
		*      always return the same token
		*    - The comparison of strings is done according to the settings given in SetCaseSensitive()
		*
		*  @see
		*    - IsCaseSensitive()
		*    - SetCaseSensitive()
		*/
		inline bool CompareToken(const String &sExpected);

		/**
		*  @brief
		*    Returns the current position in the stream
		*
		*  @return
		*    Position in the stream
		*/
		inline uint32 GetPosition() const;

		/**
		*  @brief
		*    Returns the current line (counted by '\n' occurrences)
		*
		*  @return
		*    Line in the file
		*/
		inline uint32 GetLine() const;

		/**
		*  @brief
		*    Saves the current state of the tokenizer on a state stack
		*/
		PLCORE_API void PushState();

		/**
		*  @brief
		*    Restores the last saved state from the stack
		*/
		PLCORE_API void PopState();

		/**
		*  @brief
		*    Deletes the last saved state from the stack
		*/
		inline void DropState();

		/**
		*  @brief
		*    Expects the next token to be a number and returns it as an integer value
		*
		*  @param[out] nNumber
		*    Receives the number
		*
		*  @return
		*    'true' on success, else 'false'
		*
		*  @note
		*    - If a number has been found, the tokenizer goes to the next token in the stream,
		*      otherwise it stays at the current token
		*/
		inline bool ParseNumber(int &nNumber);

		/**
		*  @brief
		*    Expects the next token to be a floating point number and returns it as a float value
		*
		*  @param[out] fNumber
		*    Receives the number
		*
		*  @return
		*    'true' on success, else 'false'
		*
		*  @note
		*    - If a number has been found, the tokenizer goes to the next token in the stream,
		*      otherwise it stays at the current token
		*/
		inline bool ParseNumber(float &fNumber);

		/**
		*  @brief
		*    Expects the next token to be a floating point number and returns it as a double value
		*
		*  @param[out] dNumber
		*    Receives the number
		*
		*  @return
		*    'true' on success, else 'false'
		*
		*  @note
		*    - If a number has been found, the tokenizer goes to the next token in the stream,
		*      otherwise it stays at the current token
		*/
		inline bool ParseNumber(double &dNumber);

		/**
		*  @brief
		*    Expects the next tokens to be a vector and returns it as an array of strings
		*
		*  @param[out] cVector
		*    Receives the vector elements
		*  @param[in] sStart
		*    Open bracket (e.g. "[")
		*  @param[in] sEnd
		*    Closed bracket (e.g. "]")
		*  @param[in] sSeparator
		*    Separator between the elements (e.g. ","). Can also be ""
		*
		*  @return
		*    'true' on success, else 'false'
		*
		*  @remarks
		*    Example: [one, two, three]
		*
		*  @note
		*    - If a vector has been found, the tokenizer goes to the next token in the stream,
		*      otherwise it stays at the current token
		*/
		PLCORE_API bool ParseVector(Array<String> &cVector, const String &sStart = "[", const String &sEnd = "]", const String &sSeparator = ",");

		/**
		*  @brief
		*    Expects the next tokens to be a vector and returns it as an array of ints
		*
		*  @param[out] cVector
		*    Receives the vector elements
		*  @param[in] sStart
		*    Open bracket (e.g. "[")
		*  @param[in] sEnd
		*    Closed bracket (e.g. "]")
		*  @param[in] sSeparator
		*    Separator between the elements (e.g. ","). Can also be ""
		*
		*  @return
		*    'true' on success, else 'false'
		*
		*  @remarks
		*    Example: [1, 2, 3]
		*
		*  @note
		*    - If a vector has been found, the tokenizer goes to the next token in the stream,
		*      otherwise it stays at the current token
		*/
		PLCORE_API bool ParseVector(Array<int> &cVector, const String &sStart = "[", const String &sEnd = "]", const String &sSeparator = ",");

		/**
		*  @brief
		*    Expects the next tokens to be a vector and returns it as an array of floats
		*
		*  @param[out] cVector
		*    Receives the vector elements
		*  @param[in] sStart
		*    Open bracket (e.g. "[")
		*  @param[in] sEnd
		*    Closed bracket (e.g. "]")
		*  @param[in] sSeparator
		*    Separator between the elements (e.g. ","). Can also be ""
		*
		*  @return
		*    'true' on success, else 'false'
		*
		*  @remarks
		*    Example: [1.0, 2.1, 3.2]
		*
		*  @note
		*    - If a vector has been found, the tokenizer goes to the next token in the stream,
		*      otherwise it stays at the current token
		*/
		PLCORE_API bool ParseVector(Array<float> &cVector, const String &sStart = "[", const String &sEnd = "]", const String &sSeparator = ",");

		/**
		*  @brief
		*    Expects the next tokens to be a vector and returns it as an array of doubles
		*
		*  @param[out] cVector
		*    Receives the vector elements
		*  @param[in] sStart
		*    Open bracket (e.g. "[")
		*  @param[in] sEnd
		*    Closed bracket (e.g. "]")
		*  @param[in] sSeparator
		*    Separator between the elements (e.g. ","). Can also be ""
		*
		*  @return
		*    'true' on success, else 'false'
		*
		*  @remarks
		*    Example: [1.0, 2.1, 3.2]
		*
		*  @note
		*    - If a vector has been found, the tokenizer goes to the next token in the stream,
		*      otherwise it stays at the current token
		*/
		PLCORE_API bool ParseVector(Array<double> &cVector, const String &sStart = "[", const String &sEnd = "]", const String &sSeparator = ",");

		/**
		*  @brief
		*    Expects the next tokens to be an equation and returns it
		*
		*  @param[out] sName
		*    Name of the element
		*  @param[out] sValue
		*    Value as a string
		*  @param[in] sEquation
		*    Equation sign (e.g. "=")
		*
		*  @return
		*    'true' on success, else 'false'
		*
		*  @remarks
		*    Example: Console = On
		*
		*  @note
		*    - If an equation has been found, the tokenizer goes to the next token in the stream,
		*      otherwise it stays at the current token
		*/
		PLCORE_API bool ParseEquation(String &sName, String &sValue, const String &sEquation = "");

		/**
		*  @brief
		*    Expects the next tokens to be an equation and returns it
		*
		*  @param[out] sName
		*    Name of the element
		*  @param[out] nValue
		*    Value as an int
		*  @param[in] sEquation
		*    Equation sign (e.g. "=")
		*
		*  @return
		*    'true' on success, else 'false'
		*
		*  @remarks
		*    Example: Health = 100
		*
		*  @note
		*    - If an equation has been found, the tokenizer goes to the next token in the stream,
		*      otherwise it stays at the current token
		*/
		inline bool ParseEquation(String &sName, int &nValue, const String &sEquation = "");

		/**
		*  @brief
		*    Expects the next tokens to be an equation and returns it
		*
		*  @param[out] sName
		*    Name of the element
		*  @param[out] fValue
		*    Value as a float
		*  @param[in] sEquation
		*    Equation sign (e.g. "=")
		*
		*  @return
		*    'true' on success, else 'false'
		*
		*  @remarks
		*    Example: Gamma = 2.1
		*
		*  @note
		*    - If an equation has been found, the tokenizer goes to the next token in the stream,
		*      otherwise it stays at the current token
		*/
		inline bool ParseEquation(String &sName, float &fValue, const String &sEquation = "");

		/**
		*  @brief
		*    Expects the next tokens to be an equation and returns it
		*
		*  @param[out] sName
		*    Name of the element
		*  @param[out] dValue
		*    Value as a double
		*  @param[in] sEquation
		*    Equation sign (e.g. "=")
		*
		*  @return
		*    'true' on success, else 'false'
		*
		*  @remarks
		*    Example: Speed = 3.25
		*
		*  @note
		*    - If an equation has been found, the tokenizer goes to the next token in the stream,
		*      otherwise it stays at the current token
		*/
		inline bool ParseEquation(String &sName, double &dValue, const String &sEquation = "");


	//[-------------------------------------------------------]
	//[ Private functions                                     ]
	//[-------------------------------------------------------]
	private:
		/**
		*  @brief
		*    Copy constructor
		*
		*  @param[in] cSource
		*    Source to copy from
		*/
		Tokenizer(const Tokenizer &cSource);

		/**
		*  @brief
		*    Copy operator
		*
		*  @param[in] cSource
		*    Source to copy from
		*
		*  @return
		*    Reference to this instance
		*/
		Tokenizer &operator =(const Tokenizer &cSource);

		/**
		*  @brief
		*    Checks if the next string in the stream equals the given one
		*
		*  @param[in] sString
		*    String to compare with
		*
		*  @return
		*    'true' if the next string in the stream equals the given one, else 'false'
		*
		*  @note
		*    - 'm_pStream' must be valid!
		*/
		inline bool StreamIsString(const String &sString);

		/**
		*  @brief
		*    Reads the next characters of the stream
		*
		*  @param[in] nSize
		*    Number of characters to read from the stream
		*
		*  @note
		*    - 'm_pStream' must be valid!
		*/
		void StreamRead(uint32 nSize);


	//[-------------------------------------------------------]
	//[ Private static data                                   ]
	//[-------------------------------------------------------]
	private:
		static const String EndOfLine;	/**< End of line ("\n") */


	//[-------------------------------------------------------]
	//[ Private data                                           ]
	//[-------------------------------------------------------]
	private:
		/**
		*  @brief
		*    Internal parse mode of the state machine
		*/
		enum EParseMode {
			eEatGarbage,		/**< Eat garbage until the first character of a word has been found */
			eReadWord,			/**< Read a word until a delimiter has been found */
			eSingleChar,		/**< Read a single character word */
			eReadQuote,			/**< Read the beginning of a quote */
			eReadQuoteInside,	/**< Read until the end quote has been found */
			eReadQuoteEnd,		/**< Read the end of a quote */
			eSkipComment		/**< Skip a comment until the comment end-tag has been found */
		};

		/**
		*  @brief
		*    Data structure to store the state of the tokenizer
		*/
		struct STokenizerState {
			String     m_sToken;		/**< The current token */
			uint32     m_nPosition;		/**< The current position */
			uint32     m_nLine;			/**< The current line */
			EParseMode m_nParseMode;	/**< Current parse mode */
			String     m_sEndTag;		/**< End tag of the current mode, used for quotes and comments */
		};

		// Tokenizer options
		String m_sDelimiters;			/**< List of delimiters (whitespace) */
		String m_sSingleChars;			/**< List of characters that count as words (e.g. : or ,) */
		String m_sQuotes;				/**< List of characters that start quotes (e.g. " or ') */
		String m_sCommentStartTag;		/**< Start tag for multi-line comments (e.g. "(*") */
		String m_sCommentEndTag;		/**< End tag for multi-line comments (e.g. "*)") */
		String m_sSingleLineComment;	/**< Tag for single-line comments (e.g. "//") */
		bool   m_bCaseSensitive;		/**< Are comparisons case sensitive? */

		// Tokenizer status
		BufferedReader         *m_pStream;		/**< Character stream to read from, can be a null pointer */
		String                  m_sToken;		/**< The current token */
		uint32                  m_nPosition;	/**< The current position */
		uint32                  m_nLine;		/**< The current line */
		EParseMode              m_nParseMode;	/**< Current parse mode */
		String                  m_sEndTag;		/**< End tag of the current mode, used for quotes and comments */
		Stack<STokenizerState>  m_cStateStack;	/**< Stack of tokenizer states */


};


//[-------------------------------------------------------]
//[ Namespace                                              ]
//[-------------------------------------------------------]
} // PLCore


//[-------------------------------------------------------]
//[ Implementation                                         ]
//[-------------------------------------------------------]
#include "PLCore/String/Tokenizer.inl"


#endif // __PLCORE_TOKENIZER_H__
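The listing already contains a minimal tokenizing example in its @verbatim block. The sketch below goes one step further and illustrates how the advanced helpers declared above (ParseEquation(), ParseVector(), and the PushState()/PopState()/DropState() state stack) might be combined to read a loose, config-file-like text. It is written only against the interface declared in this header and relies on the documented defaults ("//" single-line comments; '=', '[', ']' and ',' as single-character tokens); the sample input and the function ParseConfigExample() are made up for illustration and are not part of the PixelLight sources.

// Illustrative usage sketch, not part of Tokenizer.h
#include "PLCore/String/Tokenizer.h"
using namespace PLCore;

void ParseConfigExample()
{
	// Loose, config-file-like input: "//" comments are skipped by default,
	// and '=', '[', ']', ',' are single-character tokens by default
	String sText = "// Example settings\n"
	               "Gamma = 2.1\n"
	               "Color = [1.0, 0.5, 0.25]\n";

	Tokenizer cTokenizer;
	cTokenizer.SetCaseSensitive(false);	// Default anyway: string comparisons ignore case
	cTokenizer.Start(sText);

	// "Gamma = 2.1" -> sName receives "Gamma", fGamma receives 2.1
	String sName;
	float fGamma = 0.0f;
	const bool bHasGamma = cTokenizer.ParseEquation(sName, fGamma, "=");

	// Parse "Color = [1.0, 0.5, 0.25]", but roll back cleanly if the expected
	// tokens are not there: save the state first, then commit or restore it
	Array<float> cColor;
	cTokenizer.PushState();
	if (cTokenizer.ExpectToken("Color") && cTokenizer.ExpectToken("=") &&
		cTokenizer.ParseVector(cColor, "[", "]", ",")) {
		cTokenizer.DropState();	// Keep the new position, discard the saved state
	} else {
		cTokenizer.PopState();	// Restore the saved position, no tokens are lost
	}

	cTokenizer.Stop();	// Release the internal stream
	(void)bHasGamma;	// The parsed values would now be used by the application
}

The PushState()/PopState() pair is what makes the speculative matching safe: if the expected "Color = [...]" sequence is not present, the tokenizer is restored to the saved position and parsing can continue from there.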