Reactos
at master 1199 lines 34 kB view raw
1/* 2 * HTMLtree.c : implementation of access function for an HTML tree. 3 * 4 * See Copyright for the status of this software. 5 * 6 * daniel@veillard.com 7 */ 8 9 10#define IN_LIBXML 11#include "libxml.h" 12#ifdef LIBXML_HTML_ENABLED 13 14#include <string.h> /* for memset() only ! */ 15#include <ctype.h> 16#include <stdlib.h> 17 18#include <libxml/xmlmemory.h> 19#include <libxml/HTMLparser.h> 20#include <libxml/HTMLtree.h> 21#include <libxml/entities.h> 22#include <libxml/xmlerror.h> 23#include <libxml/parserInternals.h> 24#include <libxml/uri.h> 25 26#include "private/buf.h" 27#include "private/error.h" 28#include "private/io.h" 29#include "private/save.h" 30 31/************************************************************************ 32 * * 33 * Getting/Setting encoding meta tags * 34 * * 35 ************************************************************************/ 36 37/** 38 * htmlGetMetaEncoding: 39 * @doc: the document 40 * 41 * Encoding definition lookup in the Meta tags 42 * 43 * Returns the current encoding as flagged in the HTML source 44 */ 45const xmlChar * 46htmlGetMetaEncoding(htmlDocPtr doc) { 47 htmlNodePtr cur; 48 const xmlChar *content; 49 const xmlChar *encoding; 50 51 if (doc == NULL) 52 return(NULL); 53 cur = doc->children; 54 55 /* 56 * Search the html 57 */ 58 while (cur != NULL) { 59 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 60 if (xmlStrEqual(cur->name, BAD_CAST"html")) 61 break; 62 if (xmlStrEqual(cur->name, BAD_CAST"head")) 63 goto found_head; 64 if (xmlStrEqual(cur->name, BAD_CAST"meta")) 65 goto found_meta; 66 } 67 cur = cur->next; 68 } 69 if (cur == NULL) 70 return(NULL); 71 cur = cur->children; 72 73 /* 74 * Search the head 75 */ 76 while (cur != NULL) { 77 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 78 if (xmlStrEqual(cur->name, BAD_CAST"head")) 79 break; 80 if (xmlStrEqual(cur->name, BAD_CAST"meta")) 81 goto found_meta; 82 } 83 cur = cur->next; 84 } 85 if (cur == NULL) 86 return(NULL); 87found_head: 88 cur = cur->children; 89 90 /* 91 * Search the meta elements 92 */ 93found_meta: 94 while (cur != NULL) { 95 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 96 if (xmlStrEqual(cur->name, BAD_CAST"meta")) { 97 xmlAttrPtr attr = cur->properties; 98 int http; 99 const xmlChar *value; 100 101 content = NULL; 102 http = 0; 103 while (attr != NULL) { 104 if ((attr->children != NULL) && 105 (attr->children->type == XML_TEXT_NODE) && 106 (attr->children->next == NULL)) { 107 value = attr->children->content; 108 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) 109 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 110 http = 1; 111 else if ((value != NULL) 112 && (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) 113 content = value; 114 if ((http != 0) && (content != NULL)) 115 goto found_content; 116 } 117 attr = attr->next; 118 } 119 } 120 } 121 cur = cur->next; 122 } 123 return(NULL); 124 125found_content: 126 encoding = xmlStrstr(content, BAD_CAST"charset="); 127 if (encoding == NULL) 128 encoding = xmlStrstr(content, BAD_CAST"Charset="); 129 if (encoding == NULL) 130 encoding = xmlStrstr(content, BAD_CAST"CHARSET="); 131 if (encoding != NULL) { 132 encoding += 8; 133 } else { 134 encoding = xmlStrstr(content, BAD_CAST"charset ="); 135 if (encoding == NULL) 136 encoding = xmlStrstr(content, BAD_CAST"Charset ="); 137 if (encoding == NULL) 138 encoding = xmlStrstr(content, BAD_CAST"CHARSET ="); 139 if (encoding != NULL) 140 encoding += 9; 141 } 142 if (encoding != NULL) { 143 while ((*encoding == ' ') || (*encoding == '\t')) encoding++; 144 } 145 return(encoding); 146} 147 148/** 149 * htmlSetMetaEncoding: 150 * @doc: the document 151 * @encoding: the encoding string 152 * 153 * Sets the current encoding in the Meta tags 154 * NOTE: this will not change the document content encoding, just 155 * the META flag associated. 156 * 157 * Returns 0 in case of success and -1 in case of error 158 */ 159int 160htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) { 161 htmlNodePtr cur, meta = NULL, head = NULL; 162 const xmlChar *content = NULL; 163 char newcontent[100]; 164 165 newcontent[0] = 0; 166 167 if (doc == NULL) 168 return(-1); 169 170 /* html isn't a real encoding it's just libxml2 way to get entities */ 171 if (!xmlStrcasecmp(encoding, BAD_CAST "html")) 172 return(-1); 173 174 if (encoding != NULL) { 175 snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s", 176 (char *)encoding); 177 newcontent[sizeof(newcontent) - 1] = 0; 178 } 179 180 cur = doc->children; 181 182 /* 183 * Search the html 184 */ 185 while (cur != NULL) { 186 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 187 if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0) 188 break; 189 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) 190 goto found_head; 191 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) 192 goto found_meta; 193 } 194 cur = cur->next; 195 } 196 if (cur == NULL) 197 return(-1); 198 cur = cur->children; 199 200 /* 201 * Search the head 202 */ 203 while (cur != NULL) { 204 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 205 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) 206 break; 207 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { 208 head = cur->parent; 209 goto found_meta; 210 } 211 } 212 cur = cur->next; 213 } 214 if (cur == NULL) 215 return(-1); 216found_head: 217 head = cur; 218 if (cur->children == NULL) 219 goto create; 220 cur = cur->children; 221 222found_meta: 223 /* 224 * Search and update all the remaining the meta elements carrying 225 * encoding information 226 */ 227 while (cur != NULL) { 228 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 229 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { 230 xmlAttrPtr attr = cur->properties; 231 int http; 232 const xmlChar *value; 233 234 content = NULL; 235 http = 0; 236 while (attr != NULL) { 237 if ((attr->children != NULL) && 238 (attr->children->type == XML_TEXT_NODE) && 239 (attr->children->next == NULL)) { 240 value = attr->children->content; 241 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) 242 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 243 http = 1; 244 else 245 { 246 if ((value != NULL) && 247 (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) 248 content = value; 249 } 250 if ((http != 0) && (content != NULL)) 251 break; 252 } 253 attr = attr->next; 254 } 255 if ((http != 0) && (content != NULL)) { 256 meta = cur; 257 break; 258 } 259 260 } 261 } 262 cur = cur->next; 263 } 264create: 265 if (meta == NULL) { 266 if ((encoding != NULL) && (head != NULL)) { 267 /* 268 * Create a new Meta element with the right attributes 269 */ 270 271 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL); 272 if (head->children == NULL) 273 xmlAddChild(head, meta); 274 else 275 xmlAddPrevSibling(head->children, meta); 276 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type"); 277 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent); 278 } 279 } else { 280 /* remove the meta tag if NULL is passed */ 281 if (encoding == NULL) { 282 xmlUnlinkNode(meta); 283 xmlFreeNode(meta); 284 } 285 /* change the document only if there is a real encoding change */ 286 else if (xmlStrcasestr(content, encoding) == NULL) { 287 xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent); 288 } 289 } 290 291 292 return(0); 293} 294 295/** 296 * booleanHTMLAttrs: 297 * 298 * These are the HTML attributes which will be output 299 * in minimized form, i.e. <option selected="selected"> will be 300 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method" 301 * 302 */ 303static const char* const htmlBooleanAttrs[] = { 304 "checked", "compact", "declare", "defer", "disabled", "ismap", 305 "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly", 306 "selected", NULL 307}; 308 309 310/** 311 * htmlIsBooleanAttr: 312 * @name: the name of the attribute to check 313 * 314 * Determine if a given attribute is a boolean attribute. 315 * 316 * returns: false if the attribute is not boolean, true otherwise. 317 */ 318int 319htmlIsBooleanAttr(const xmlChar *name) 320{ 321 int i = 0; 322 323 while (htmlBooleanAttrs[i] != NULL) { 324 if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0) 325 return 1; 326 i++; 327 } 328 return 0; 329} 330 331#ifdef LIBXML_OUTPUT_ENABLED 332/************************************************************************ 333 * * 334 * Output error handlers * 335 * * 336 ************************************************************************/ 337/** 338 * htmlSaveErrMemory: 339 * @extra: extra information 340 * 341 * Handle an out of memory condition 342 */ 343static void 344htmlSaveErrMemory(const char *extra) 345{ 346 __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra); 347} 348 349/** 350 * htmlSaveErr: 351 * @code: the error number 352 * @node: the location of the error. 353 * @extra: extra information 354 * 355 * Handle an out of memory condition 356 */ 357static void 358htmlSaveErr(int code, xmlNodePtr node, const char *extra) 359{ 360 const char *msg = NULL; 361 362 switch(code) { 363 case XML_SAVE_NOT_UTF8: 364 msg = "string is not in UTF-8\n"; 365 break; 366 case XML_SAVE_CHAR_INVALID: 367 msg = "invalid character value\n"; 368 break; 369 case XML_SAVE_UNKNOWN_ENCODING: 370 msg = "unknown encoding %s\n"; 371 break; 372 case XML_SAVE_NO_DOCTYPE: 373 msg = "HTML has no DOCTYPE\n"; 374 break; 375 default: 376 msg = "unexpected error number\n"; 377 } 378 __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra); 379} 380 381/************************************************************************ 382 * * 383 * Dumping HTML tree content to a simple buffer * 384 * * 385 ************************************************************************/ 386 387/** 388 * htmlBufNodeDumpFormat: 389 * @buf: the xmlBufPtr output 390 * @doc: the document 391 * @cur: the current node 392 * @format: should formatting spaces been added 393 * 394 * Dump an HTML node, recursive behaviour,children are printed too. 395 * 396 * Returns the number of byte written or -1 in case of error 397 */ 398static size_t 399htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur, 400 int format) { 401 size_t use; 402 int ret; 403 xmlOutputBufferPtr outbuf; 404 405 if (cur == NULL) { 406 return (-1); 407 } 408 if (buf == NULL) { 409 return (-1); 410 } 411 outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer)); 412 if (outbuf == NULL) { 413 htmlSaveErrMemory("allocating HTML output buffer"); 414 return (-1); 415 } 416 memset(outbuf, 0, sizeof(xmlOutputBuffer)); 417 outbuf->buffer = buf; 418 outbuf->encoder = NULL; 419 outbuf->writecallback = NULL; 420 outbuf->closecallback = NULL; 421 outbuf->context = NULL; 422 outbuf->written = 0; 423 424 use = xmlBufUse(buf); 425 htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format); 426 xmlFree(outbuf); 427 ret = xmlBufUse(buf) - use; 428 return (ret); 429} 430 431/** 432 * htmlNodeDump: 433 * @buf: the HTML buffer output 434 * @doc: the document 435 * @cur: the current node 436 * 437 * Dump an HTML node, recursive behaviour,children are printed too, 438 * and formatting returns are added. 439 * 440 * Returns the number of byte written or -1 in case of error 441 */ 442int 443htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) { 444 xmlBufPtr buffer; 445 size_t ret; 446 447 if ((buf == NULL) || (cur == NULL)) 448 return(-1); 449 450 xmlInitParser(); 451 buffer = xmlBufFromBuffer(buf); 452 if (buffer == NULL) 453 return(-1); 454 455 ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1); 456 457 xmlBufBackToBuffer(buffer); 458 459 if (ret > INT_MAX) 460 return(-1); 461 return((int) ret); 462} 463 464/** 465 * htmlNodeDumpFileFormat: 466 * @out: the FILE pointer 467 * @doc: the document 468 * @cur: the current node 469 * @encoding: the document encoding 470 * @format: should formatting spaces been added 471 * 472 * Dump an HTML node, recursive behaviour,children are printed too. 473 * 474 * TODO: if encoding == NULL try to save in the doc encoding 475 * 476 * returns: the number of byte written or -1 in case of failure. 477 */ 478int 479htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc, 480 xmlNodePtr cur, const char *encoding, int format) { 481 xmlOutputBufferPtr buf; 482 xmlCharEncodingHandlerPtr handler = NULL; 483 int ret; 484 485 xmlInitParser(); 486 487 if (encoding != NULL) { 488 xmlCharEncoding enc; 489 490 enc = xmlParseCharEncoding(encoding); 491 if (enc != XML_CHAR_ENCODING_UTF8) { 492 handler = xmlFindCharEncodingHandler(encoding); 493 if (handler == NULL) 494 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 495 } 496 } else { 497 /* 498 * Fallback to HTML or ASCII when the encoding is unspecified 499 */ 500 if (handler == NULL) 501 handler = xmlFindCharEncodingHandler("HTML"); 502 if (handler == NULL) 503 handler = xmlFindCharEncodingHandler("ascii"); 504 } 505 506 /* 507 * save the content to a temp buffer. 508 */ 509 buf = xmlOutputBufferCreateFile(out, handler); 510 if (buf == NULL) return(0); 511 512 htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format); 513 514 ret = xmlOutputBufferClose(buf); 515 return(ret); 516} 517 518/** 519 * htmlNodeDumpFile: 520 * @out: the FILE pointer 521 * @doc: the document 522 * @cur: the current node 523 * 524 * Dump an HTML node, recursive behaviour,children are printed too, 525 * and formatting returns are added. 526 */ 527void 528htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) { 529 htmlNodeDumpFileFormat(out, doc, cur, NULL, 1); 530} 531 532/** 533 * htmlDocDumpMemoryFormat: 534 * @cur: the document 535 * @mem: OUT: the memory pointer 536 * @size: OUT: the memory length 537 * @format: should formatting spaces been added 538 * 539 * Dump an HTML document in memory and return the xmlChar * and it's size. 540 * It's up to the caller to free the memory. 541 */ 542void 543htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) { 544 xmlOutputBufferPtr buf; 545 xmlCharEncodingHandlerPtr handler = NULL; 546 const char *encoding; 547 548 xmlInitParser(); 549 550 if ((mem == NULL) || (size == NULL)) 551 return; 552 if (cur == NULL) { 553 *mem = NULL; 554 *size = 0; 555 return; 556 } 557 558 encoding = (const char *) htmlGetMetaEncoding(cur); 559 560 if (encoding != NULL) { 561 xmlCharEncoding enc; 562 563 enc = xmlParseCharEncoding(encoding); 564 if (enc != XML_CHAR_ENCODING_UTF8) { 565 handler = xmlFindCharEncodingHandler(encoding); 566 if (handler == NULL) 567 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 568 569 } 570 } else { 571 /* 572 * Fallback to HTML or ASCII when the encoding is unspecified 573 */ 574 if (handler == NULL) 575 handler = xmlFindCharEncodingHandler("HTML"); 576 if (handler == NULL) 577 handler = xmlFindCharEncodingHandler("ascii"); 578 } 579 580 buf = xmlAllocOutputBufferInternal(handler); 581 if (buf == NULL) { 582 *mem = NULL; 583 *size = 0; 584 return; 585 } 586 587 htmlDocContentDumpFormatOutput(buf, cur, NULL, format); 588 589 xmlOutputBufferFlush(buf); 590 if (buf->conv != NULL) { 591 *size = xmlBufUse(buf->conv); 592 *mem = xmlStrndup(xmlBufContent(buf->conv), *size); 593 } else { 594 *size = xmlBufUse(buf->buffer); 595 *mem = xmlStrndup(xmlBufContent(buf->buffer), *size); 596 } 597 (void)xmlOutputBufferClose(buf); 598} 599 600/** 601 * htmlDocDumpMemory: 602 * @cur: the document 603 * @mem: OUT: the memory pointer 604 * @size: OUT: the memory length 605 * 606 * Dump an HTML document in memory and return the xmlChar * and it's size. 607 * It's up to the caller to free the memory. 608 */ 609void 610htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) { 611 htmlDocDumpMemoryFormat(cur, mem, size, 1); 612} 613 614 615/************************************************************************ 616 * * 617 * Dumping HTML tree content to an I/O output buffer * 618 * * 619 ************************************************************************/ 620 621/** 622 * htmlDtdDumpOutput: 623 * @buf: the HTML buffer output 624 * @doc: the document 625 * @encoding: the encoding string 626 * 627 * TODO: check whether encoding is needed 628 * 629 * Dump the HTML document DTD, if any. 630 */ 631static void 632htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 633 const char *encoding ATTRIBUTE_UNUSED) { 634 xmlDtdPtr cur = doc->intSubset; 635 636 if (cur == NULL) { 637 htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL); 638 return; 639 } 640 xmlOutputBufferWriteString(buf, "<!DOCTYPE "); 641 xmlOutputBufferWriteString(buf, (const char *)cur->name); 642 if (cur->ExternalID != NULL) { 643 xmlOutputBufferWriteString(buf, " PUBLIC "); 644 xmlBufWriteQuotedString(buf->buffer, cur->ExternalID); 645 if (cur->SystemID != NULL) { 646 xmlOutputBufferWriteString(buf, " "); 647 xmlBufWriteQuotedString(buf->buffer, cur->SystemID); 648 } 649 } else if (cur->SystemID != NULL && 650 xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) { 651 xmlOutputBufferWriteString(buf, " SYSTEM "); 652 xmlBufWriteQuotedString(buf->buffer, cur->SystemID); 653 } 654 xmlOutputBufferWriteString(buf, ">\n"); 655} 656 657/** 658 * htmlAttrDumpOutput: 659 * @buf: the HTML buffer output 660 * @doc: the document 661 * @cur: the attribute pointer 662 * 663 * Dump an HTML attribute 664 */ 665static void 666htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) { 667 xmlChar *value; 668 669 /* 670 * The html output method should not escape a & character 671 * occurring in an attribute value immediately followed by 672 * a { character (see Section B.7.1 of the HTML 4.0 Recommendation). 673 * This is implemented in xmlEncodeEntitiesReentrant 674 */ 675 676 if (cur == NULL) { 677 return; 678 } 679 xmlOutputBufferWriteString(buf, " "); 680 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 681 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 682 xmlOutputBufferWriteString(buf, ":"); 683 } 684 xmlOutputBufferWriteString(buf, (const char *)cur->name); 685 if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) { 686 value = xmlNodeListGetString(doc, cur->children, 0); 687 if (value) { 688 xmlOutputBufferWriteString(buf, "="); 689 if ((cur->ns == NULL) && (cur->parent != NULL) && 690 (cur->parent->ns == NULL) && 691 ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) || 692 (!xmlStrcasecmp(cur->name, BAD_CAST "action")) || 693 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) || 694 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) && 695 (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) { 696 xmlChar *escaped; 697 xmlChar *tmp = value; 698 699 while (IS_BLANK_CH(*tmp)) tmp++; 700 701 /* 702 * Angle brackets are technically illegal in URIs, but they're 703 * used in server side includes, for example. Curly brackets 704 * are illegal as well and often used in templates. 705 * Don't escape non-whitespace, printable ASCII chars for 706 * improved interoperability. Only escape space, control 707 * and non-ASCII chars. 708 */ 709 escaped = xmlURIEscapeStr(tmp, 710 BAD_CAST "\"#$%&+,/:;<=>?@[\\]^`{|}"); 711 if (escaped != NULL) { 712 xmlBufWriteQuotedString(buf->buffer, escaped); 713 xmlFree(escaped); 714 } else { 715 xmlBufWriteQuotedString(buf->buffer, value); 716 } 717 } else { 718 xmlBufWriteQuotedString(buf->buffer, value); 719 } 720 xmlFree(value); 721 } else { 722 xmlOutputBufferWriteString(buf, "=\"\""); 723 } 724 } 725} 726 727/** 728 * htmlNodeDumpFormatOutput: 729 * @buf: the HTML buffer output 730 * @doc: the document 731 * @cur: the current node 732 * @encoding: the encoding string (unused) 733 * @format: should formatting spaces been added 734 * 735 * Dump an HTML node, recursive behaviour,children are printed too. 736 */ 737void 738htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 739 xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED, 740 int format) { 741 xmlNodePtr root, parent; 742 xmlAttrPtr attr; 743 const htmlElemDesc * info; 744 745 xmlInitParser(); 746 747 if ((cur == NULL) || (buf == NULL)) { 748 return; 749 } 750 751 root = cur; 752 parent = cur->parent; 753 while (1) { 754 switch (cur->type) { 755 case XML_HTML_DOCUMENT_NODE: 756 case XML_DOCUMENT_NODE: 757 if (((xmlDocPtr) cur)->intSubset != NULL) { 758 htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL); 759 } 760 if (cur->children != NULL) { 761 /* Always validate cur->parent when descending. */ 762 if (cur->parent == parent) { 763 parent = cur; 764 cur = cur->children; 765 continue; 766 } 767 } else { 768 xmlOutputBufferWriteString(buf, "\n"); 769 } 770 break; 771 772 case XML_ELEMENT_NODE: 773 /* 774 * Some users like lxml are known to pass nodes with a corrupted 775 * tree structure. Fall back to a recursive call to handle this 776 * case. 777 */ 778 if ((cur->parent != parent) && (cur->children != NULL)) { 779 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format); 780 break; 781 } 782 783 /* 784 * Get specific HTML info for that node. 785 */ 786 if (cur->ns == NULL) 787 info = htmlTagLookup(cur->name); 788 else 789 info = NULL; 790 791 xmlOutputBufferWriteString(buf, "<"); 792 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 793 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 794 xmlOutputBufferWriteString(buf, ":"); 795 } 796 xmlOutputBufferWriteString(buf, (const char *)cur->name); 797 if (cur->nsDef) 798 xmlNsListDumpOutput(buf, cur->nsDef); 799 attr = cur->properties; 800 while (attr != NULL) { 801 htmlAttrDumpOutput(buf, doc, attr); 802 attr = attr->next; 803 } 804 805 if ((info != NULL) && (info->empty)) { 806 xmlOutputBufferWriteString(buf, ">"); 807 } else if (cur->children == NULL) { 808 if ((info != NULL) && (info->saveEndTag != 0) && 809 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) && 810 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) { 811 xmlOutputBufferWriteString(buf, ">"); 812 } else { 813 xmlOutputBufferWriteString(buf, "></"); 814 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 815 xmlOutputBufferWriteString(buf, 816 (const char *)cur->ns->prefix); 817 xmlOutputBufferWriteString(buf, ":"); 818 } 819 xmlOutputBufferWriteString(buf, (const char *)cur->name); 820 xmlOutputBufferWriteString(buf, ">"); 821 } 822 } else { 823 xmlOutputBufferWriteString(buf, ">"); 824 if ((format) && (info != NULL) && (!info->isinline) && 825 (cur->children->type != HTML_TEXT_NODE) && 826 (cur->children->type != HTML_ENTITY_REF_NODE) && 827 (cur->children != cur->last) && 828 (cur->name != NULL) && 829 (cur->name[0] != 'p')) /* p, pre, param */ 830 xmlOutputBufferWriteString(buf, "\n"); 831 parent = cur; 832 cur = cur->children; 833 continue; 834 } 835 836 if ((format) && (cur->next != NULL) && 837 (info != NULL) && (!info->isinline)) { 838 if ((cur->next->type != HTML_TEXT_NODE) && 839 (cur->next->type != HTML_ENTITY_REF_NODE) && 840 (parent != NULL) && 841 (parent->name != NULL) && 842 (parent->name[0] != 'p')) /* p, pre, param */ 843 xmlOutputBufferWriteString(buf, "\n"); 844 } 845 846 break; 847 848 case XML_ATTRIBUTE_NODE: 849 htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur); 850 break; 851 852 case HTML_TEXT_NODE: 853 if (cur->content == NULL) 854 break; 855 if (((cur->name == (const xmlChar *)xmlStringText) || 856 (cur->name != (const xmlChar *)xmlStringTextNoenc)) && 857 ((parent == NULL) || 858 ((xmlStrcasecmp(parent->name, BAD_CAST "script")) && 859 (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) { 860 xmlChar *buffer; 861 862 buffer = xmlEncodeEntitiesReentrant(doc, cur->content); 863 if (buffer != NULL) { 864 xmlOutputBufferWriteString(buf, (const char *)buffer); 865 xmlFree(buffer); 866 } 867 } else { 868 xmlOutputBufferWriteString(buf, (const char *)cur->content); 869 } 870 break; 871 872 case HTML_COMMENT_NODE: 873 if (cur->content != NULL) { 874 xmlOutputBufferWriteString(buf, "<!--"); 875 xmlOutputBufferWriteString(buf, (const char *)cur->content); 876 xmlOutputBufferWriteString(buf, "-->"); 877 } 878 break; 879 880 case HTML_PI_NODE: 881 if (cur->name != NULL) { 882 xmlOutputBufferWriteString(buf, "<?"); 883 xmlOutputBufferWriteString(buf, (const char *)cur->name); 884 if (cur->content != NULL) { 885 xmlOutputBufferWriteString(buf, " "); 886 xmlOutputBufferWriteString(buf, 887 (const char *)cur->content); 888 } 889 xmlOutputBufferWriteString(buf, ">"); 890 } 891 break; 892 893 case HTML_ENTITY_REF_NODE: 894 xmlOutputBufferWriteString(buf, "&"); 895 xmlOutputBufferWriteString(buf, (const char *)cur->name); 896 xmlOutputBufferWriteString(buf, ";"); 897 break; 898 899 case HTML_PRESERVE_NODE: 900 if (cur->content != NULL) { 901 xmlOutputBufferWriteString(buf, (const char *)cur->content); 902 } 903 break; 904 905 default: 906 break; 907 } 908 909 while (1) { 910 if (cur == root) 911 return; 912 if (cur->next != NULL) { 913 cur = cur->next; 914 break; 915 } 916 917 cur = parent; 918 /* cur->parent was validated when descending. */ 919 parent = cur->parent; 920 921 if ((cur->type == XML_HTML_DOCUMENT_NODE) || 922 (cur->type == XML_DOCUMENT_NODE)) { 923 xmlOutputBufferWriteString(buf, "\n"); 924 } else { 925 if ((format) && (cur->ns == NULL)) 926 info = htmlTagLookup(cur->name); 927 else 928 info = NULL; 929 930 if ((format) && (info != NULL) && (!info->isinline) && 931 (cur->last->type != HTML_TEXT_NODE) && 932 (cur->last->type != HTML_ENTITY_REF_NODE) && 933 (cur->children != cur->last) && 934 (cur->name != NULL) && 935 (cur->name[0] != 'p')) /* p, pre, param */ 936 xmlOutputBufferWriteString(buf, "\n"); 937 938 xmlOutputBufferWriteString(buf, "</"); 939 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 940 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 941 xmlOutputBufferWriteString(buf, ":"); 942 } 943 xmlOutputBufferWriteString(buf, (const char *)cur->name); 944 xmlOutputBufferWriteString(buf, ">"); 945 946 if ((format) && (info != NULL) && (!info->isinline) && 947 (cur->next != NULL)) { 948 if ((cur->next->type != HTML_TEXT_NODE) && 949 (cur->next->type != HTML_ENTITY_REF_NODE) && 950 (parent != NULL) && 951 (parent->name != NULL) && 952 (parent->name[0] != 'p')) /* p, pre, param */ 953 xmlOutputBufferWriteString(buf, "\n"); 954 } 955 } 956 } 957 } 958} 959 960/** 961 * htmlNodeDumpOutput: 962 * @buf: the HTML buffer output 963 * @doc: the document 964 * @cur: the current node 965 * @encoding: the encoding string (unused) 966 * 967 * Dump an HTML node, recursive behaviour,children are printed too, 968 * and formatting returns/spaces are added. 969 */ 970void 971htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 972 xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) { 973 htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1); 974} 975 976/** 977 * htmlDocContentDumpFormatOutput: 978 * @buf: the HTML buffer output 979 * @cur: the document 980 * @encoding: the encoding string (unused) 981 * @format: should formatting spaces been added 982 * 983 * Dump an HTML document. 984 */ 985void 986htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, 987 const char *encoding ATTRIBUTE_UNUSED, 988 int format) { 989 int type = 0; 990 if (cur) { 991 type = cur->type; 992 cur->type = XML_HTML_DOCUMENT_NODE; 993 } 994 htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format); 995 if (cur) 996 cur->type = (xmlElementType) type; 997} 998 999/** 1000 * htmlDocContentDumpOutput: 1001 * @buf: the HTML buffer output 1002 * @cur: the document 1003 * @encoding: the encoding string (unused) 1004 * 1005 * Dump an HTML document. Formatting return/spaces are added. 1006 */ 1007void 1008htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, 1009 const char *encoding ATTRIBUTE_UNUSED) { 1010 htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1); 1011} 1012 1013/************************************************************************ 1014 * * 1015 * Saving functions front-ends * 1016 * * 1017 ************************************************************************/ 1018 1019/** 1020 * htmlDocDump: 1021 * @f: the FILE* 1022 * @cur: the document 1023 * 1024 * Dump an HTML document to an open FILE. 1025 * 1026 * returns: the number of byte written or -1 in case of failure. 1027 */ 1028int 1029htmlDocDump(FILE *f, xmlDocPtr cur) { 1030 xmlOutputBufferPtr buf; 1031 xmlCharEncodingHandlerPtr handler = NULL; 1032 const char *encoding; 1033 int ret; 1034 1035 xmlInitParser(); 1036 1037 if ((cur == NULL) || (f == NULL)) { 1038 return(-1); 1039 } 1040 1041 encoding = (const char *) htmlGetMetaEncoding(cur); 1042 1043 if (encoding != NULL) { 1044 xmlCharEncoding enc; 1045 1046 enc = xmlParseCharEncoding(encoding); 1047 if (enc != XML_CHAR_ENCODING_UTF8) { 1048 handler = xmlFindCharEncodingHandler(encoding); 1049 if (handler == NULL) 1050 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 1051 } 1052 } else { 1053 /* 1054 * Fallback to HTML or ASCII when the encoding is unspecified 1055 */ 1056 if (handler == NULL) 1057 handler = xmlFindCharEncodingHandler("HTML"); 1058 if (handler == NULL) 1059 handler = xmlFindCharEncodingHandler("ascii"); 1060 } 1061 1062 buf = xmlOutputBufferCreateFile(f, handler); 1063 if (buf == NULL) return(-1); 1064 htmlDocContentDumpOutput(buf, cur, NULL); 1065 1066 ret = xmlOutputBufferClose(buf); 1067 return(ret); 1068} 1069 1070/** 1071 * htmlSaveFile: 1072 * @filename: the filename (or URL) 1073 * @cur: the document 1074 * 1075 * Dump an HTML document to a file. If @filename is "-" the stdout file is 1076 * used. 1077 * returns: the number of byte written or -1 in case of failure. 1078 */ 1079int 1080htmlSaveFile(const char *filename, xmlDocPtr cur) { 1081 xmlOutputBufferPtr buf; 1082 xmlCharEncodingHandlerPtr handler = NULL; 1083 const char *encoding; 1084 int ret; 1085 1086 if ((cur == NULL) || (filename == NULL)) 1087 return(-1); 1088 1089 xmlInitParser(); 1090 1091 encoding = (const char *) htmlGetMetaEncoding(cur); 1092 1093 if (encoding != NULL) { 1094 xmlCharEncoding enc; 1095 1096 enc = xmlParseCharEncoding(encoding); 1097 if (enc != XML_CHAR_ENCODING_UTF8) { 1098 handler = xmlFindCharEncodingHandler(encoding); 1099 if (handler == NULL) 1100 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 1101 } 1102 } else { 1103 /* 1104 * Fallback to HTML or ASCII when the encoding is unspecified 1105 */ 1106 if (handler == NULL) 1107 handler = xmlFindCharEncodingHandler("HTML"); 1108 if (handler == NULL) 1109 handler = xmlFindCharEncodingHandler("ascii"); 1110 } 1111 1112 /* 1113 * save the content to a temp buffer. 1114 */ 1115 buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression); 1116 if (buf == NULL) return(0); 1117 1118 htmlDocContentDumpOutput(buf, cur, NULL); 1119 1120 ret = xmlOutputBufferClose(buf); 1121 return(ret); 1122} 1123 1124/** 1125 * htmlSaveFileFormat: 1126 * @filename: the filename 1127 * @cur: the document 1128 * @format: should formatting spaces been added 1129 * @encoding: the document encoding 1130 * 1131 * Dump an HTML document to a file using a given encoding. 1132 * 1133 * returns: the number of byte written or -1 in case of failure. 1134 */ 1135int 1136htmlSaveFileFormat(const char *filename, xmlDocPtr cur, 1137 const char *encoding, int format) { 1138 xmlOutputBufferPtr buf; 1139 xmlCharEncodingHandlerPtr handler = NULL; 1140 int ret; 1141 1142 if ((cur == NULL) || (filename == NULL)) 1143 return(-1); 1144 1145 xmlInitParser(); 1146 1147 if (encoding != NULL) { 1148 xmlCharEncoding enc; 1149 1150 enc = xmlParseCharEncoding(encoding); 1151 if (enc != XML_CHAR_ENCODING_UTF8) { 1152 handler = xmlFindCharEncodingHandler(encoding); 1153 if (handler == NULL) 1154 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 1155 } 1156 htmlSetMetaEncoding(cur, (const xmlChar *) encoding); 1157 } else { 1158 htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8"); 1159 1160 /* 1161 * Fallback to HTML or ASCII when the encoding is unspecified 1162 */ 1163 if (handler == NULL) 1164 handler = xmlFindCharEncodingHandler("HTML"); 1165 if (handler == NULL) 1166 handler = xmlFindCharEncodingHandler("ascii"); 1167 } 1168 1169 /* 1170 * save the content to a temp buffer. 1171 */ 1172 buf = xmlOutputBufferCreateFilename(filename, handler, 0); 1173 if (buf == NULL) return(0); 1174 1175 htmlDocContentDumpFormatOutput(buf, cur, encoding, format); 1176 1177 ret = xmlOutputBufferClose(buf); 1178 return(ret); 1179} 1180 1181/** 1182 * htmlSaveFileEnc: 1183 * @filename: the filename 1184 * @cur: the document 1185 * @encoding: the document encoding 1186 * 1187 * Dump an HTML document to a file using a given encoding 1188 * and formatting returns/spaces are added. 1189 * 1190 * returns: the number of byte written or -1 in case of failure. 1191 */ 1192int 1193htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) { 1194 return(htmlSaveFileFormat(filename, cur, encoding, 1)); 1195} 1196 1197#endif /* LIBXML_OUTPUT_ENABLED */ 1198 1199#endif /* LIBXML_HTML_ENABLED */