PixelLightAPI: HTMLParser.h Source File

Go to the documentation of this file.
00001 /*********************************************************\
00002  *  File: HTMLParser.h                                   *
00003  *
00004  *  Copyright (C) 2002-2012 The PixelLight Team (http://www.pixellight.org/)
00005  *
00006  *  This file is part of PixelLight.
00007  *
00008  *  PixelLight is free software: you can redistribute it and/or modify
00009  *  it under the terms of the GNU Lesser General Public License as published by
00010  *  the Free Software Foundation, either version 3 of the License, or
00011  *  (at your option) any later version.
00012  *
00013  *  PixelLight is distributed in the hope that it will be useful,
00014  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
00016  *  GNU Lesser General Public License for more details.
00017  *
00018  *  You should have received a copy of the GNU Lesser General Public License
00019  *  along with PixelLight. If not, see <http://www.gnu.org/licenses/>.
00020 \*********************************************************/
00021 
00022 
00023 #ifndef __PLCORE_HTMLPARSER_H__
00024 #define __PLCORE_HTMLPARSER_H__
00025 #pragma once
00026 
00027 
00028 //[-------------------------------------------------------]
00029 //[ Includes                                              ]
00030 //[-------------------------------------------------------]
00031 #include "PLCore/String/String.h"
00032 
00033 
00034 //[-------------------------------------------------------]
00035 //[ Namespace                                             ]
00036 //[-------------------------------------------------------]
00037 namespace PLCore {
00038 
00039 
00040 //[-------------------------------------------------------]
00041 //[ Forward declarations                                  ]
00042 //[-------------------------------------------------------]
00043 class XmlDocument;
00044 
00045 
00046 //[-------------------------------------------------------]
00047 //[ Classes                                               ]
00048 //[-------------------------------------------------------]
00049 /**
00050 *  @brief
00051 *    HTML parser
00052 *
00053 *  @remarks
00054 *    This class provides a simple HTML parser. It does NOT comply with any existing HTML standard,
00055 *    but it tries to be as flexible as possible in tolerating many typical errors such as mixed HTML
00056 *    and XML style as well as tags that are not properly closed. So don't expect this class to be capable of parsing
00057 *    any existing homepage out there. However, the class can be used to read in an HTML file and
00058 *    produce an XML syntax tree as if it were an XML file. You can then use standard XML classes to
00059 *    access the parsed document tree.
00060 */
00061 class HTMLParser {
00062 
00063 
00064     //[-------------------------------------------------------]
00065     //[ Public functions                                      ]
00066     //[-------------------------------------------------------]
00067     public:
00068         /**
00069         *  @brief
00070         *    Constructor
00071         */
00072         inline HTMLParser();
00073 
00074         /**
00075         *  @brief
00076         *    Destructor
00077         */
00078         inline ~HTMLParser();
00079 
00080         /**
00081         *  @brief
00082         *    Clear all data
00083         */
00084         PLCORE_API void Clear();
00085 
00086         /**
00087         *  @brief
00088         *    Load file
00089         *
00090         *  @param[in] sFilename
00091         *    Filename of the HTML document to load
00092         */
00093         PLCORE_API void Load(const String &sFilename);
00094 
00095         /**
00096         *  @brief
00097         *    Get parsed HTML as an XML document
00098         *
00099         *  @return
00100         *    Pointer to XML document, a null pointer if there's currently no document
00101         */
00102         inline XmlDocument *GetXML() const;
00103 
00104         /**
00105         *  @brief
00106         *    Get the number of errors that occurred while parsing the HTML file
00107         *
00108         *  @return
00109         *    Number of errors
00110         */
00111         inline uint32 GetNumOfErrors() const;
00112 
00113 
00114     //[-------------------------------------------------------]
00115     //[ Private functions                                     ]
00116     //[-------------------------------------------------------]
00117     private:
00118         /**
00119         *  @brief
00120         *    Parse the whole HTML file and generate an XML parsing tree
00121         *
00122         *  @return
00123         *    'true' if HTML file could be parsed, 'false' on error
00124         */
00125         bool Parse();
00126 
00127         /**
00128         *  @brief
00129         *    Check if there is another token waiting
00130         *
00131         *  @return
00132         *    'true' if a next token is available, else 'false'
00133         */
00134         bool HasNextToken();
00135 
00136         /**
00137         *  @brief
00138         *    Get next token from HTML parser
00139         *
00140         *  @return
00141         *    Next token, or "" when there are no more tokens
00142         */
00143         String GetNextToken();
00144 
00145         /**
00146         *  @brief
00147         *    Returns whether or not the given string is a single tag (no closing tag required to be well-formed!)
00148         *
00149         *  @param[in] sTag
00150         *    Tag to check
00151         *
00152         *  @return
00153         *    'true' if the given string is a single tag, else 'false'
00154         */
00155         inline bool IsSingleTag(const String &sTag) const;
00156 
00157 
00158     //[-------------------------------------------------------]
00159     //[ Private data                                          ]
00160     //[-------------------------------------------------------]
00161     private:
00162         String       m_sFilename;   /**< File name */
00163         String       m_sText;       /**< Content of HTML file */
00164         String       m_sTextLower;  /**< Text version in lower case */
00165         uint32       m_nPos;        /**< Parsing position */
00166         String       m_sToken;      /**< Currently parsed token */
00167         XmlDocument *m_pXML;        /**< Parsed HTML, can be a null pointer. NOTE(review): raw pointer, and this class declares a destructor but no copy constructor/assignment operator - confirm ownership and copy safety against HTMLParser.inl */
00168         uint32       m_nErrors;     /**< Number of errors */
00169 
00170 
00171 };
00172 
00173 
00174 //[-------------------------------------------------------]
00175 //[ Namespace                                             ]
00176 //[-------------------------------------------------------]
00177 } // PLCore
00178 
00179 
00180 //[-------------------------------------------------------]
00181 //[ Implementation                                        ]
00182 //[-------------------------------------------------------]
00183 #include "PLCore/Tools/HTMLParser.inl"
00184 
00185 
00186 #endif // __PLCORE_HTMLPARSER_H__