PixelLightAPI
/*********************************************************\
 *  File: Tokenizer.h                                    *
 *
 *  Copyright (C) 2002-2012 The PixelLight Team (http://www.pixellight.org/)
 *
 *  This file is part of PixelLight.
 *
 *  PixelLight is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  PixelLight is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with PixelLight. If not, see <http://www.gnu.org/licenses/>.
\*********************************************************/


#ifndef __PLCORE_TOKENIZER_H__
#define __PLCORE_TOKENIZER_H__
#pragma once


//[-------------------------------------------------------]
//[ Includes                                               ]
//[-------------------------------------------------------]
#include "PLCore/String/String.h"
#include "PLCore/Container/Stack.h"
#include "PLCore/Container/Array.h"


//[-------------------------------------------------------]
//[ Namespace                                              ]
//[-------------------------------------------------------]
namespace PLCore {


//[-------------------------------------------------------]
//[ Forward declarations                                   ]
//[-------------------------------------------------------]
class File;
class BufferedReader;


//[-------------------------------------------------------]
//[ Classes                                                ]
//[-------------------------------------------------------]
/**
*  @brief
*    Tokenizer class for parsing texts (also called 'scanner' or 'lexer' -> lexical analysis)
*
*  @remarks
*    The tokenizer parses an input stream (e.g. a file or string) and produces
*    a subsequent list of tokens by filtering out whitespace, comments and such.
*    There are many settings that can be manipulated to serve your needs, like choosing
*    the type of comment tags or the characters that shall be considered whitespace.
*    In general, the tokenizer should only be used to produce a list of tokens;
*    after that, a real parser is used to check the syntax and semantics of the
*    parsed code. In addition to this basic functionality the class also provides
*    some advanced functions to parse typical syntax like equations (a=b) or vectors
*    (a b c). These functions can be used if no strict syntax is needed, e.g. for
*    config files, but they are not intended to replace a decent parser.
*
*  @verbatim
*    Usage example:
*      Tokenizer cTokenizer;                  // Tokenizer instance
*      String sText = "This is a test";       // Text to be parsed
*      cTokenizer.Start(sText);               // Start tokenizer
*      String sToken;                         // String for the current token
*      sToken = cTokenizer.GetNextToken();    // Will return 'This'
*      sToken = cTokenizer.GetNextToken();    // Will return 'is'
*      sToken = cTokenizer.GetNextToken();    // Will return 'a'
*      sToken = cTokenizer.GetNextToken();    // Will return 'test'
*  @endverbatim
*/
class Tokenizer {


	//[-------------------------------------------------------]
	//[ Public functions                                      ]
	//[-------------------------------------------------------]
	public:
		/**
		*  @brief
		*    Constructor
		*/
		PLCORE_API Tokenizer();

		/**
		*  @brief
		*    Destructor
		*/
		inline ~Tokenizer();

		/**
		*  @brief
		*    Get delimiters (characters that are treated as whitespace)
		*
		*  @return
		*    Delimiter characters
		*/
		inline String GetDelimiters() const;

		/**
		*  @brief
		*    Set delimiters (characters that are treated as whitespace)
		*
		*  @param[in] sDelimiters
		*    Delimiter characters
		*
		*  @remarks
		*    Default: " \t\r\n"
		*/
		inline void SetDelimiters(const String &sDelimiters);

		/**
		*  @brief
		*    Get single characters
		*
		*  @return
		*    Single characters
		*/
		inline String GetSingleChars() const;

		/**
		*  @brief
		*    Set single characters
		*
		*  @param[in] sSingleChars
		*    Single characters
		*
		*  @remarks
		*    Default: "{}()[]<*>/=,;\""
		*/
		inline void SetSingleChars(const String &sSingleChars);

		/**
		*  @brief
		*    Get characters that are used for quotes
		*
		*  @return
		*    Quote characters
		*/
		inline String GetQuotes() const;

		/**
		*  @brief
		*    Set characters that are used for quotes
		*
		*  @param[in] sQuotes
		*    Quote characters
		*
		*  @remarks
		*    Default: "\"\'"
		*/
		inline void SetQuotes(const String &sQuotes);

		/**
		*  @brief
		*    Get the string that starts a multi-line comment
		*
		*  @return
		*    Comment start tag
		*/
		inline String GetCommentStartTag() const;

		/**
		*  @brief
		*    Set the string that starts a multi-line comment
		*
		*  @param[in] sCommentStartTag
		*    Comment start tag
		*
		*  @remarks
		*    Default is slash-star ("/ *" without the space in the middle, just written with a space to keep C++ compilers happy).
		*/
		inline void SetCommentStartTag(const String &sCommentStartTag);

		/**
		*  @brief
		*    Get the string that ends a multi-line comment
		*
		*  @return
		*    Comment end tag
		*/
		inline String GetCommentEndTag() const;

		/**
		*  @brief
		*    Set the string that ends a multi-line comment
		*
		*  @param[in] sCommentEndTag
		*    Comment end tag
		*
		*  @remarks
		*    Default: "*\/"
		*/
		inline void SetCommentEndTag(const String &sCommentEndTag);

		/**
		*  @brief
		*    Get the string that starts a single-line comment
		*
		*  @return
		*    Comment start tag
		*/
		inline String GetSingleLineComment() const;

		/**
		*  @brief
		*    Set the string that starts a single-line comment
		*
		*  @param[in] sSingleLineComment
		*    Comment start tag
		*
		*  @remarks
		*    Default: "//"
		*/
		inline void SetSingleLineComment(const String &sSingleLineComment);

		/**
		*  @brief
		*    Get case sensitivity flag
		*
		*  @return
		*    'true' if the text is parsed case sensitive
		*/
		inline bool IsCaseSensitive() const;

		/**
		*  @brief
		*    Set case sensitivity flag
		*
		*  @param[in] bCaseSensitive
		*    'true' if the text is parsed case sensitive (default is false)
		*
		*  @remarks
		*    Default: false
		*/
		inline void SetCaseSensitive(bool bCaseSensitive);

		/**
		*  @brief
		*    Starts the tokenizer on a string
		*
		*  @param[in] sBuffer
		*    String buffer
		*/
		PLCORE_API void Start(const String &sBuffer);

		/**
		*  @brief
		*    Starts the tokenizer on a file
		*
		*  @param[in] cFile
		*    File to read, released automatically if Stop() is called
		*/
		PLCORE_API void Start(File &cFile);

		/**
		*  @brief
		*    Stops the tokenizer
		*/
		PLCORE_API void Stop();

		/**
		*  @brief
		*    Reads all tokens until the end of the stream
		*
		*  @return
		*    Array of all tokens of the stream
		*/
		PLCORE_API Array<String> GetTokens();

		/**
		*  @brief
		*    Reads the next token from the stream
		*
		*  @return
		*    Next token
		*
		*  @note
		*    - After the token has been read, this function goes to the next token in the stream
		*    - To get the read token again, use GetToken()
		*
		*  @see
		*    - GetToken()
		*/
		PLCORE_API String GetNextToken();

		/**
		*  @brief
		*    Expects the next token to be equal to a given string
		*
		*  @param[in] sExpected
		*    Expected token
		*
		*  @return
		*    'true' if the next token is equal to the string
		*
		*  @note
		*    - If the expected token has been found, the tokenizer goes to the next token in the stream,
		*      otherwise it stays at the current token
		*    - The comparison of strings is done according to the settings given in SetCaseSensitive()
		*
		*  @see
		*    - IsCaseSensitive()
		*    - SetCaseSensitive()
		*/
		PLCORE_API bool ExpectToken(const String &sExpected);

		/**
		*  @brief
		*    Finds a given token in the stream
		*
		*  @param[in] sExpected
		*    Token to find
		*
		*  @return
		*    'true' if the token has been found
		*
		*  @note
		*    - Reads the next token until the expected token has been found or the end of the stream has been reached
		*    - If the function has succeeded, the next call of GetNextToken() will return the desired token
		*    - The comparison of strings is done according to the settings given in SetCaseSensitive()
		*
		*  @see
		*    - IsCaseSensitive()
		*    - SetCaseSensitive()
		*/
		PLCORE_API bool FindToken(const String &sExpected);

		/**
		*  @brief
		*    Returns the current token
		*
		*  @return
		*    Current token
		*
		*  @note
		*    - Does not go to the next token in the stream, so multiple calls of this function will
		*      always return the same token
		*/
		inline String GetToken() const;

		/**
		*  @brief
		*    Compares the current token with a given string
		*
		*  @param[in] sExpected
		*    String to compare the current token with
		*
		*  @return
		*    'true' if the token is equal to the string
		*
		*  @note
		*    - Does not go to the next token in the stream, so multiple calls of this function will
		*      always return the same token
		*    - The comparison of strings is done according to the settings given in SetCaseSensitive()
		*
		*  @see
		*    - IsCaseSensitive()
		*    - SetCaseSensitive()
		*/
		inline bool CompareToken(const String &sExpected);

		/**
		*  @brief
		*    Returns the current position in the stream
		*
		*  @return
		*    Position in the stream
		*/
		inline uint32 GetPosition() const;

		/**
		*  @brief
		*    Returns the current line (counted by '\n' occurrences)
		*
		*  @return
		*    Line in the file
		*/
		inline uint32 GetLine() const;

		/**
		*  @brief
		*    Saves the current state of the tokenizer on a state stack
		*/
		PLCORE_API void PushState();

		/**
		*  @brief
		*    Restores the last saved state from the stack
		*/
		PLCORE_API void PopState();

		/**
		*  @brief
		*    Deletes the last saved state from the stack
		*/
		inline void DropState();

		/**
		*  @brief
		*    Expects the next token to be a number and returns it as an integer value
		*
		*  @param[out] nNumber
		*    Receives the number
		*
		*  @return
		*    'true' on success, else 'false'
		*
		*  @note
		*    - If a number has been found, the tokenizer goes to the next token in the stream,
		*      otherwise it stays at the current token
		*/
		inline bool ParseNumber(int &nNumber);

		/**
		*  @brief
		*    Expects the next token to be a floating point number and returns it as a float value
		*
		*  @param[out] fNumber
		*    Receives the number
		*
		*  @return
		*    'true' on success, else 'false'
		*
		*  @note
		*    - If a number has been found, the tokenizer goes to the next token in the stream,
		*      otherwise it stays at the current token
		*/
		inline bool ParseNumber(float &fNumber);

		/**
		*  @brief
		*    Expects the next token to be a floating point number and returns it as a double value
		*
		*  @param[out] dNumber
		*    Receives the number
		*
		*  @return
		*    'true' on success, else 'false'
		*
		*  @note
		*    - If a number has been found, the tokenizer goes to the next token in the stream,
		*      otherwise it stays at the current token
		*/
		inline bool ParseNumber(double &dNumber);

		/**
		*  @brief
		*    Expects the next tokens to be a vector and returns it as an array of strings
		*
		*  @param[out] cVector
		*    Receives the vector elements
		*  @param[in] sStart
		*    Open bracket (e.g. "[")
		*  @param[in] sEnd
		*    Closed bracket (e.g. "]")
		*  @param[in] sSeparator
		*    Separator between the elements (e.g. ","). Can also be ""
		*
		*  @return
		*    'true' on success, else 'false'
		*
		*  @remarks
		*    Example: [one, two, three]
		*
		*  @note
		*    - If a vector has been found, the tokenizer goes to the next token in the stream,
		*      otherwise it stays at the current token
		*/
		PLCORE_API bool ParseVector(Array<String> &cVector, const String &sStart = "[", const String &sEnd = "]", const String &sSeparator = ",");

		/**
		*  @brief
		*    Expects the next tokens to be a vector and returns it as an array of ints
		*
		*  @param[out] cVector
		*    Receives the vector elements
		*  @param[in] sStart
		*    Open bracket (e.g. "[")
		*  @param[in] sEnd
		*    Closed bracket (e.g. "]")
		*  @param[in] sSeparator
		*    Separator between the elements (e.g. ","). Can also be ""
		*
		*  @return
		*    'true' on success, else 'false'
		*
		*  @remarks
		*    Example: [1, 2, 3]
		*
		*  @note
		*    - If a vector has been found, the tokenizer goes to the next token in the stream,
		*      otherwise it stays at the current token
		*/
		PLCORE_API bool ParseVector(Array<int> &cVector, const String &sStart = "[", const String &sEnd = "]", const String &sSeparator = ",");

		/**
		*  @brief
		*    Expects the next tokens to be a vector and returns it as an array of floats
		*
		*  @param[out] cVector
		*    Receives the vector elements
		*  @param[in] sStart
		*    Open bracket (e.g. "[")
		*  @param[in] sEnd
		*    Closed bracket (e.g. "]")
		*  @param[in] sSeparator
		*    Separator between the elements (e.g. ","). Can also be ""
		*
		*  @return
		*    'true' on success, else 'false'
		*
		*  @remarks
		*    Example: [1.0, 2.1, 3.2]
		*
		*  @note
		*    - If a vector has been found, the tokenizer goes to the next token in the stream,
		*      otherwise it stays at the current token
		*/
		PLCORE_API bool ParseVector(Array<float> &cVector, const String &sStart = "[", const String &sEnd = "]", const String &sSeparator = ",");

		/**
		*  @brief
		*    Expects the next tokens to be a vector and returns it as an array of doubles
		*
		*  @param[out] cVector
		*    Receives the vector elements
		*  @param[in] sStart
		*    Open bracket (e.g. "[")
		*  @param[in] sEnd
		*    Closed bracket (e.g. "]")
		*  @param[in] sSeparator
		*    Separator between the elements (e.g. ","). Can also be ""
		*
		*  @return
		*    'true' on success, else 'false'
		*
		*  @remarks
		*    Example: [1.0, 2.1, 3.2]
		*
		*  @note
		*    - If a vector has been found, the tokenizer goes to the next token in the stream,
		*      otherwise it stays at the current token
		*/
		PLCORE_API bool ParseVector(Array<double> &cVector, const String &sStart = "[", const String &sEnd = "]", const String &sSeparator = ",");

		/**
		*  @brief
		*    Expects the next tokens to be an equation and returns it
		*
		*  @param[out] sName
		*    Name of the element
		*  @param[out] sValue
		*    Value as a string
		*  @param[in] sEquation
		*    Equation sign (e.g. "=")
		*
		*  @return
		*    'true' on success, else 'false'
		*
		*  @remarks
		*    Example: Console = On
		*
		*  @note
		*    - If an equation has been found, the tokenizer goes to the next token in the stream,
		*      otherwise it stays at the current token
		*/
		PLCORE_API bool ParseEquation(String &sName, String &sValue, const String &sEquation = "");

		/**
		*  @brief
		*    Expects the next tokens to be an equation and returns it
		*
		*  @param[out] sName
		*    Name of the element
		*  @param[out] nValue
		*    Value as an int
		*  @param[in] sEquation
		*    Equation sign (e.g. "=")
		*
		*  @return
		*    'true' on success, else 'false'
		*
		*  @remarks
		*    Example: Health = 100
		*
		*  @note
		*    - If an equation has been found, the tokenizer goes to the next token in the stream,
		*      otherwise it stays at the current token
		*/
		inline bool ParseEquation(String &sName, int &nValue, const String &sEquation = "");

		/**
		*  @brief
		*    Expects the next tokens to be an equation and returns it
		*
		*  @param[out] sName
		*    Name of the element
		*  @param[out] fValue
		*    Value as a float
		*  @param[in] sEquation
		*    Equation sign (e.g. "=")
		*
		*  @return
		*    'true' on success, else 'false'
		*
		*  @remarks
		*    Example: Gamma = 2.1
		*
		*  @note
		*    - If an equation has been found, the tokenizer goes to the next token in the stream,
		*      otherwise it stays at the current token
		*/
		inline bool ParseEquation(String &sName, float &fValue, const String &sEquation = "");

		/**
		*  @brief
		*    Expects the next tokens to be an equation and returns it
		*
		*  @param[out] sName
		*    Name of the element
		*  @param[out] dValue
		*    Value as a double
		*  @param[in] sEquation
		*    Equation sign (e.g. "=")
		*
		*  @return
		*    'true' on success, else 'false'
		*
		*  @remarks
		*    Example: Speed = 3.25
		*
		*  @note
		*    - If an equation has been found, the tokenizer goes to the next token in the stream,
		*      otherwise it stays at the current token
		*/
		inline bool ParseEquation(String &sName, double &dValue, const String &sEquation = "");


	//[-------------------------------------------------------]
	//[ Private functions                                     ]
	//[-------------------------------------------------------]
	private:
		/**
		*  @brief
		*    Copy constructor
		*
		*  @param[in] cSource
		*    Source to copy from
		*/
		Tokenizer(const Tokenizer &cSource);

		/**
		*  @brief
		*    Copy operator
		*
		*  @param[in] cSource
		*    Source to copy from
		*
		*  @return
		*    Reference to this instance
		*/
		Tokenizer &operator =(const Tokenizer &cSource);

		/**
		*  @brief
		*    Checks if the next string in the stream equals the given one
		*
		*  @param[in] sString
		*    String to compare with
		*
		*  @return
		*    'true' if the next string in the stream equals the given one, else 'false'
		*
		*  @note
		*    - 'm_pStream' must be valid!
		*/
		inline bool StreamIsString(const String &sString);

		/**
		*  @brief
		*    Reads the next characters of the stream
		*
		*  @param[in] nSize
		*    Number of characters to read from the stream
		*
		*  @note
		*    - 'm_pStream' must be valid!
		*/
		void StreamRead(uint32 nSize);


	//[-------------------------------------------------------]
	//[ Private static data                                   ]
	//[-------------------------------------------------------]
	private:
		static const String EndOfLine;	/**< End of line ("\n") */


	//[-------------------------------------------------------]
	//[ Private data                                           ]
	//[-------------------------------------------------------]
	private:
		/**
		*  @brief
		*    Internal parse mode of the state machine
		*/
		enum EParseMode {
			eEatGarbage,		/**< Eat garbage until the first character of a word has been found */
			eReadWord,			/**< Read a word until a delimiter has been found */
			eSingleChar,		/**< Read a single character word */
			eReadQuote,			/**< Read the beginning of a quote */
			eReadQuoteInside,	/**< Read until the end quote has been found */
			eReadQuoteEnd,		/**< Read the end of a quote */
			eSkipComment		/**< Skip a comment until the comment end-tag has been found */
		};

		/**
		*  @brief
		*    Data structure to store the state of the tokenizer
		*/
		struct STokenizerState {
			String     m_sToken;		/**< The current token */
			uint32     m_nPosition;		/**< The current position */
			uint32     m_nLine;			/**< The current line */
			EParseMode m_nParseMode;	/**< Current parse mode */
			String     m_sEndTag;		/**< End tag of the current mode, used for quotes and comments */
		};

		// Tokenizer options
		String m_sDelimiters;			/**< List of delimiters (whitespace) */
		String m_sSingleChars;			/**< List of characters that count as words (e.g. : or ,) */
		String m_sQuotes;				/**< List of characters that start quotes (e.g. " or ') */
		String m_sCommentStartTag;		/**< Start tag for multi-line comments (e.g. "(*") */
		String m_sCommentEndTag;		/**< End tag for multi-line comments (e.g. "*)") */
		String m_sSingleLineComment;	/**< Tag for single-line comments (e.g. "//") */
		bool   m_bCaseSensitive;		/**< Are comparisons case sensitive? */

		// Tokenizer status
		BufferedReader         *m_pStream;		/**< Character stream to read from, can be a null pointer */
		String                  m_sToken;		/**< The current token */
		uint32                  m_nPosition;	/**< The current position */
		uint32                  m_nLine;		/**< The current line */
		EParseMode              m_nParseMode;	/**< Current parse mode */
		String                  m_sEndTag;		/**< End tag of the current mode, used for quotes and comments */
		Stack<STokenizerState>  m_cStateStack;	/**< Stack of tokenizer states */


};


//[-------------------------------------------------------]
//[ Namespace                                              ]
//[-------------------------------------------------------]
} // PLCore


//[-------------------------------------------------------]
//[ Implementation                                         ]
//[-------------------------------------------------------]
#include "PLCore/String/Tokenizer.inl"


#endif // __PLCORE_TOKENIZER_H__
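The listing already contains a minimal tokenizing example in its @verbatim block. The sketch below goes one step further and illustrates how the advanced helpers declared above (ParseEquation(), ParseVector(), and the PushState()/PopState()/DropState() state stack) might be combined to read a loose, config-file-like text. It is written only against the interface declared in this header and relies on the documented defaults ("//" single-line comments; '=', '[', ']' and ',' as single-character tokens); the sample input and the function ParseConfigExample() are made up for illustration and are not part of the PixelLight sources.

// Illustrative usage sketch, not part of Tokenizer.h
#include "PLCore/String/Tokenizer.h"
using namespace PLCore;

void ParseConfigExample()
{
	// Loose, config-file-like input: "//" comments are skipped by default,
	// and '=', '[', ']', ',' are single-character tokens by default
	String sText = "// Example settings\n"
	               "Gamma = 2.1\n"
	               "Color = [1.0, 0.5, 0.25]\n";

	Tokenizer cTokenizer;
	cTokenizer.SetCaseSensitive(false);	// Default anyway: string comparisons ignore case
	cTokenizer.Start(sText);

	// "Gamma = 2.1" -> sName receives "Gamma", fGamma receives 2.1
	String sName;
	float fGamma = 0.0f;
	const bool bHasGamma = cTokenizer.ParseEquation(sName, fGamma, "=");

	// Parse "Color = [1.0, 0.5, 0.25]", but roll back cleanly if the expected
	// tokens are not there: save the state first, then commit or restore it
	Array<float> cColor;
	cTokenizer.PushState();
	if (cTokenizer.ExpectToken("Color") && cTokenizer.ExpectToken("=") &&
		cTokenizer.ParseVector(cColor, "[", "]", ",")) {
		cTokenizer.DropState();	// Keep the new position, discard the saved state
	} else {
		cTokenizer.PopState();	// Restore the saved position, no tokens are lost
	}

	cTokenizer.Stop();	// Release the internal stream
	(void)bHasGamma;	// The parsed values would now be used by the application
}

The PushState()/PopState() pair is what makes the speculative matching safe: if the expected "Color = [...]" sequence is not present, the tokenizer is restored to the saved position and parsing can continue from there.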