Reactos
at master 343 lines 10 kB view raw
1/* 2 * Summary: interface for an HTML 4.0 non-verifying parser 3 * Description: this module implements an HTML 4.0 non-verifying parser 4 * with API compatible with the XML parser ones. It should 5 * be able to parse "real world" HTML, even if severely 6 * broken from a specification point of view. 7 * 8 * Copy: See Copyright for the status of this software. 9 * 10 * Author: Daniel Veillard 11 */ 12 13#ifndef __HTML_PARSER_H__ 14#define __HTML_PARSER_H__ 15#include <libxml/xmlversion.h> 16#include <libxml/parser.h> 17 18#ifdef LIBXML_HTML_ENABLED 19 20#ifdef __cplusplus 21extern "C" { 22#endif 23 24/* 25 * Most of the back-end structures from XML and HTML are shared. 26 */ 27typedef xmlParserCtxt htmlParserCtxt; 28typedef xmlParserCtxtPtr htmlParserCtxtPtr; 29typedef xmlParserNodeInfo htmlParserNodeInfo; 30typedef xmlSAXHandler htmlSAXHandler; 31typedef xmlSAXHandlerPtr htmlSAXHandlerPtr; 32typedef xmlParserInput htmlParserInput; 33typedef xmlParserInputPtr htmlParserInputPtr; 34typedef xmlDocPtr htmlDocPtr; 35typedef xmlNodePtr htmlNodePtr; 36 37/* 38 * Internal description of an HTML element, representing HTML 4.01 39 * and XHTML 1.0 (which share the same structure). 40 */ 41typedef struct _htmlElemDesc htmlElemDesc; 42typedef htmlElemDesc *htmlElemDescPtr; 43struct _htmlElemDesc { 44 const char *name; /* The tag name */ 45 char startTag; /* Whether the start tag can be implied */ 46 char endTag; /* Whether the end tag can be implied */ 47 char saveEndTag; /* Whether the end tag should be saved */ 48 char empty; /* Is this an empty element ? */ 49 char depr; /* Is this a deprecated element ? */ 50 char dtd; /* 1: only in Loose DTD, 2: only Frameset one */ 51 char isinline; /* is this a block 0 or inline 1 element */ 52 const char *desc; /* the description */ 53 54/* NRK Jan.2003 55 * New fields encapsulating HTML structure 56 * 57 * Bugs: 58 * This is a very limited representation. It fails to tell us when 59 * an element *requires* subelements (we only have whether they're 60 * allowed or not), and it doesn't tell us where CDATA and PCDATA 61 * are allowed. Some element relationships are not fully represented: 62 * these are flagged with the word MODIFIER 63 */ 64 const char** subelts; /* allowed sub-elements of this element */ 65 const char* defaultsubelt; /* subelement for suggested auto-repair 66 if necessary or NULL */ 67 const char** attrs_opt; /* Optional Attributes */ 68 const char** attrs_depr; /* Additional deprecated attributes */ 69 const char** attrs_req; /* Required attributes */ 70}; 71 72/* 73 * Internal description of an HTML entity. 74 */ 75typedef struct _htmlEntityDesc htmlEntityDesc; 76typedef htmlEntityDesc *htmlEntityDescPtr; 77struct _htmlEntityDesc { 78 unsigned int value; /* the UNICODE value for the character */ 79 const char *name; /* The entity name */ 80 const char *desc; /* the description */ 81}; 82 83/** DOC_DISABLE */ 84#ifdef LIBXML_SAX1_ENABLED 85 #define XML_GLOBALS_HTML \ 86 XML_OP(htmlDefaultSAXHandler, xmlSAXHandlerV1, XML_DEPRECATED) 87#else 88 #define XML_GLOBALS_HTML 89#endif 90 91#define XML_OP XML_DECLARE_GLOBAL 92XML_GLOBALS_HTML 93#undef XML_OP 94 95#if defined(LIBXML_THREAD_ENABLED) && !defined(XML_GLOBALS_NO_REDEFINITION) 96 #define htmlDefaultSAXHandler XML_GLOBAL_MACRO(htmlDefaultSAXHandler) 97#endif 98/** DOC_ENABLE */ 99 100/* 101 * There is only few public functions. 102 */ 103XML_DEPRECATED 104XMLPUBFUN void 105 htmlInitAutoClose (void); 106XMLPUBFUN const htmlElemDesc * 107 htmlTagLookup (const xmlChar *tag); 108XMLPUBFUN const htmlEntityDesc * 109 htmlEntityLookup(const xmlChar *name); 110XMLPUBFUN const htmlEntityDesc * 111 htmlEntityValueLookup(unsigned int value); 112 113XMLPUBFUN int 114 htmlIsAutoClosed(htmlDocPtr doc, 115 htmlNodePtr elem); 116XMLPUBFUN int 117 htmlAutoCloseTag(htmlDocPtr doc, 118 const xmlChar *name, 119 htmlNodePtr elem); 120XML_DEPRECATED 121XMLPUBFUN const htmlEntityDesc * 122 htmlParseEntityRef(htmlParserCtxtPtr ctxt, 123 const xmlChar **str); 124XML_DEPRECATED 125XMLPUBFUN int 126 htmlParseCharRef(htmlParserCtxtPtr ctxt); 127XML_DEPRECATED 128XMLPUBFUN void 129 htmlParseElement(htmlParserCtxtPtr ctxt); 130 131XMLPUBFUN htmlParserCtxtPtr 132 htmlNewParserCtxt(void); 133XMLPUBFUN htmlParserCtxtPtr 134 htmlNewSAXParserCtxt(const htmlSAXHandler *sax, 135 void *userData); 136 137XMLPUBFUN htmlParserCtxtPtr 138 htmlCreateMemoryParserCtxt(const char *buffer, 139 int size); 140 141XMLPUBFUN int 142 htmlParseDocument(htmlParserCtxtPtr ctxt); 143XML_DEPRECATED 144XMLPUBFUN htmlDocPtr 145 htmlSAXParseDoc (const xmlChar *cur, 146 const char *encoding, 147 htmlSAXHandlerPtr sax, 148 void *userData); 149XMLPUBFUN htmlDocPtr 150 htmlParseDoc (const xmlChar *cur, 151 const char *encoding); 152XMLPUBFUN htmlParserCtxtPtr 153 htmlCreateFileParserCtxt(const char *filename, 154 const char *encoding); 155XML_DEPRECATED 156XMLPUBFUN htmlDocPtr 157 htmlSAXParseFile(const char *filename, 158 const char *encoding, 159 htmlSAXHandlerPtr sax, 160 void *userData); 161XMLPUBFUN htmlDocPtr 162 htmlParseFile (const char *filename, 163 const char *encoding); 164XMLPUBFUN int 165 UTF8ToHtml (unsigned char *out, 166 int *outlen, 167 const unsigned char *in, 168 int *inlen); 169XMLPUBFUN int 170 htmlEncodeEntities(unsigned char *out, 171 int *outlen, 172 const unsigned char *in, 173 int *inlen, int quoteChar); 174XMLPUBFUN int 175 htmlIsScriptAttribute(const xmlChar *name); 176XMLPUBFUN int 177 htmlHandleOmittedElem(int val); 178 179#ifdef LIBXML_PUSH_ENABLED 180/** 181 * Interfaces for the Push mode. 182 */ 183XMLPUBFUN htmlParserCtxtPtr 184 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, 185 void *user_data, 186 const char *chunk, 187 int size, 188 const char *filename, 189 xmlCharEncoding enc); 190XMLPUBFUN int 191 htmlParseChunk (htmlParserCtxtPtr ctxt, 192 const char *chunk, 193 int size, 194 int terminate); 195#endif /* LIBXML_PUSH_ENABLED */ 196 197XMLPUBFUN void 198 htmlFreeParserCtxt (htmlParserCtxtPtr ctxt); 199 200/* 201 * New set of simpler/more flexible APIs 202 */ 203/** 204 * xmlParserOption: 205 * 206 * This is the set of XML parser options that can be passed down 207 * to the xmlReadDoc() and similar calls. 208 */ 209typedef enum { 210 HTML_PARSE_RECOVER = 1<<0, /* Relaxed parsing */ 211 HTML_PARSE_NODEFDTD = 1<<2, /* do not default a doctype if not found */ 212 HTML_PARSE_NOERROR = 1<<5, /* suppress error reports */ 213 HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */ 214 HTML_PARSE_PEDANTIC = 1<<7, /* pedantic error reporting */ 215 HTML_PARSE_NOBLANKS = 1<<8, /* remove blank nodes */ 216 HTML_PARSE_NONET = 1<<11,/* Forbid network access */ 217 HTML_PARSE_NOIMPLIED= 1<<13,/* Do not add implied html/body... elements */ 218 HTML_PARSE_COMPACT = 1<<16,/* compact small text nodes */ 219 HTML_PARSE_IGNORE_ENC=1<<21 /* ignore internal document encoding hint */ 220} htmlParserOption; 221 222XMLPUBFUN void 223 htmlCtxtReset (htmlParserCtxtPtr ctxt); 224XMLPUBFUN int 225 htmlCtxtUseOptions (htmlParserCtxtPtr ctxt, 226 int options); 227XMLPUBFUN htmlDocPtr 228 htmlReadDoc (const xmlChar *cur, 229 const char *URL, 230 const char *encoding, 231 int options); 232XMLPUBFUN htmlDocPtr 233 htmlReadFile (const char *URL, 234 const char *encoding, 235 int options); 236XMLPUBFUN htmlDocPtr 237 htmlReadMemory (const char *buffer, 238 int size, 239 const char *URL, 240 const char *encoding, 241 int options); 242XMLPUBFUN htmlDocPtr 243 htmlReadFd (int fd, 244 const char *URL, 245 const char *encoding, 246 int options); 247XMLPUBFUN htmlDocPtr 248 htmlReadIO (xmlInputReadCallback ioread, 249 xmlInputCloseCallback ioclose, 250 void *ioctx, 251 const char *URL, 252 const char *encoding, 253 int options); 254XMLPUBFUN htmlDocPtr 255 htmlCtxtReadDoc (xmlParserCtxtPtr ctxt, 256 const xmlChar *cur, 257 const char *URL, 258 const char *encoding, 259 int options); 260XMLPUBFUN htmlDocPtr 261 htmlCtxtReadFile (xmlParserCtxtPtr ctxt, 262 const char *filename, 263 const char *encoding, 264 int options); 265XMLPUBFUN htmlDocPtr 266 htmlCtxtReadMemory (xmlParserCtxtPtr ctxt, 267 const char *buffer, 268 int size, 269 const char *URL, 270 const char *encoding, 271 int options); 272XMLPUBFUN htmlDocPtr 273 htmlCtxtReadFd (xmlParserCtxtPtr ctxt, 274 int fd, 275 const char *URL, 276 const char *encoding, 277 int options); 278XMLPUBFUN htmlDocPtr 279 htmlCtxtReadIO (xmlParserCtxtPtr ctxt, 280 xmlInputReadCallback ioread, 281 xmlInputCloseCallback ioclose, 282 void *ioctx, 283 const char *URL, 284 const char *encoding, 285 int options); 286 287/* NRK/Jan2003: further knowledge of HTML structure 288 */ 289typedef enum { 290 HTML_NA = 0 , /* something we don't check at all */ 291 HTML_INVALID = 0x1 , 292 HTML_DEPRECATED = 0x2 , 293 HTML_VALID = 0x4 , 294 HTML_REQUIRED = 0xc /* VALID bit set so ( & HTML_VALID ) is TRUE */ 295} htmlStatus ; 296 297/* Using htmlElemDesc rather than name here, to emphasise the fact 298 that otherwise there's a lookup overhead 299*/ 300XMLPUBFUN htmlStatus htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ; 301XMLPUBFUN int htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ; 302XMLPUBFUN htmlStatus htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ; 303XMLPUBFUN htmlStatus htmlNodeStatus(const htmlNodePtr, int) ; 304/** 305 * htmlDefaultSubelement: 306 * @elt: HTML element 307 * 308 * Returns the default subelement for this element 309 */ 310#define htmlDefaultSubelement(elt) elt->defaultsubelt 311/** 312 * htmlElementAllowedHereDesc: 313 * @parent: HTML parent element 314 * @elt: HTML element 315 * 316 * Checks whether an HTML element description may be a 317 * direct child of the specified element. 318 * 319 * Returns 1 if allowed; 0 otherwise. 320 */ 321#define htmlElementAllowedHereDesc(parent,elt) \ 322 htmlElementAllowedHere((parent), (elt)->name) 323/** 324 * htmlRequiredAttrs: 325 * @elt: HTML element 326 * 327 * Returns the attributes required for the specified element. 328 */ 329#define htmlRequiredAttrs(elt) (elt)->attrs_req 330 331 332#ifdef __cplusplus 333} 334#endif 335 336#else /* LIBXML_HTML_ENABLED */ 337 338/** DOC_DISABLE */ 339#define XML_GLOBALS_HTML 340/** DOC_ENABLE */ 341 342#endif /* LIBXML_HTML_ENABLED */ 343#endif /* __HTML_PARSER_H__ */