sdk/lib/3rdparty/libxml2/xmlstring.c at master

huwcampbell.com / reactos
fork atom
Reactos
fork atom
reactos / sdk / lib / 3rdparty / libxml2 / xmlstring.c
at master 1045 lines 27 kB view raw
wrap content
Timo Kreuzer [LIBXML2] Update to release 2.12.8 (from Wine-10.0) 4mo ago
8b7ade14
   1/*
   2 * string.c : an XML string utilities module
   3 *
   4 * This module provides various utility functions for manipulating
   5 * the xmlChar* type. All functions named xmlStr* have been moved here
   6 * from the parser.c file (their original home).
   7 *
   8 * See Copyright for the status of this software.
   9 *
  10 * UTF8 string routines from:
  11 * William Brack <wbrack@mmm.com.hk>
  12 *
  13 * daniel@veillard.com
  14 */
  15
  16#define IN_LIBXML
  17#include "libxml.h"
  18
  19#include <stdlib.h>
  20#include <string.h>
  21#include <limits.h>
  22#include <libxml/xmlmemory.h>
  23#include <libxml/parserInternals.h>
  24#include <libxml/xmlstring.h>
  25
  26#include "private/parser.h"
  27#include "private/string.h"
  28
  29/************************************************************************
  30 *                                                                      *
  31 *                Commodity functions to handle xmlChars                *
  32 *                                                                      *
  33 ************************************************************************/
  34
  35/**
  36 * xmlStrndup:
  37 * @cur:  the input xmlChar *
  38 * @len:  the len of @cur
  39 *
  40 * a strndup for array of xmlChar's
  41 *
  42 * Returns a new xmlChar * or NULL
  43 */
  44xmlChar *
  45xmlStrndup(const xmlChar *cur, int len) {
  46    xmlChar *ret;
  47
  48    if ((cur == NULL) || (len < 0)) return(NULL);
  49    ret = (xmlChar *) xmlMallocAtomic((size_t) len + 1);
  50    if (ret == NULL) {
  51        return(NULL);
  52    }
  53    memcpy(ret, cur, len);
  54    ret[len] = 0;
  55    return(ret);
  56}
  57
  58/**
  59 * xmlStrdup:
  60 * @cur:  the input xmlChar *
  61 *
  62 * a strdup for array of xmlChar's. Since they are supposed to be
  63 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
  64 * a termination mark of '0'.
  65 *
  66 * Returns a new xmlChar * or NULL
  67 */
  68xmlChar *
  69xmlStrdup(const xmlChar *cur) {
  70    const xmlChar *p = cur;
  71
  72    if (cur == NULL) return(NULL);
  73    while (*p != 0) p++; /* non input consuming */
  74    return(xmlStrndup(cur, p - cur));
  75}
  76
  77/**
  78 * xmlCharStrndup:
  79 * @cur:  the input char *
  80 * @len:  the len of @cur
  81 *
  82 * a strndup for char's to xmlChar's
  83 *
  84 * Returns a new xmlChar * or NULL
  85 */
  86
  87xmlChar *
  88xmlCharStrndup(const char *cur, int len) {
  89    int i;
  90    xmlChar *ret;
  91
  92    if ((cur == NULL) || (len < 0)) return(NULL);
  93    ret = (xmlChar *) xmlMallocAtomic((size_t) len + 1);
  94    if (ret == NULL) {
  95        return(NULL);
  96    }
  97    for (i = 0;i < len;i++) {
  98        /* Explicit sign change */
  99        ret[i] = (xmlChar) cur[i];
 100        if (ret[i] == 0) return(ret);
 101    }
 102    ret[len] = 0;
 103    return(ret);
 104}
 105
 106/**
 107 * xmlCharStrdup:
 108 * @cur:  the input char *
 109 *
 110 * a strdup for char's to xmlChar's
 111 *
 112 * Returns a new xmlChar * or NULL
 113 */
 114
 115xmlChar *
 116xmlCharStrdup(const char *cur) {
 117    const char *p = cur;
 118
 119    if (cur == NULL) return(NULL);
 120    while (*p != '\0') p++; /* non input consuming */
 121    return(xmlCharStrndup(cur, p - cur));
 122}
 123
 124/**
 125 * xmlStrcmp:
 126 * @str1:  the first xmlChar *
 127 * @str2:  the second xmlChar *
 128 *
 129 * a strcmp for xmlChar's
 130 *
 131 * Returns the integer result of the comparison
 132 */
 133
 134int
 135xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
 136    if (str1 == str2) return(0);
 137    if (str1 == NULL) return(-1);
 138    if (str2 == NULL) return(1);
 139#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
 140    return(strcmp((const char *)str1, (const char *)str2));
 141#else
 142    do {
 143        int tmp = *str1++ - *str2;
 144        if (tmp != 0) return(tmp);
 145    } while (*str2++ != 0);
 146    return 0;
 147#endif
 148}
 149
 150/**
 151 * xmlStrEqual:
 152 * @str1:  the first xmlChar *
 153 * @str2:  the second xmlChar *
 154 *
 155 * Check if both strings are equal of have same content.
 156 * Should be a bit more readable and faster than xmlStrcmp()
 157 *
 158 * Returns 1 if they are equal, 0 if they are different
 159 */
 160
 161int
 162xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
 163    if (str1 == str2) return(1);
 164    if (str1 == NULL) return(0);
 165    if (str2 == NULL) return(0);
 166#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
 167    return(strcmp((const char *)str1, (const char *)str2) == 0);
 168#else
 169    do {
 170        if (*str1++ != *str2) return(0);
 171    } while (*str2++);
 172    return(1);
 173#endif
 174}
 175
 176/**
 177 * xmlStrQEqual:
 178 * @pref:  the prefix of the QName
 179 * @name:  the localname of the QName
 180 * @str:  the second xmlChar *
 181 *
 182 * Check if a QName is Equal to a given string
 183 *
 184 * Returns 1 if they are equal, 0 if they are different
 185 */
 186
 187int
 188xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
 189    if (pref == NULL) return(xmlStrEqual(name, str));
 190    if (name == NULL) return(0);
 191    if (str == NULL) return(0);
 192
 193    do {
 194        if (*pref++ != *str) return(0);
 195    } while ((*str++) && (*pref));
 196    if (*str++ != ':') return(0);
 197    do {
 198        if (*name++ != *str) return(0);
 199    } while (*str++);
 200    return(1);
 201}
 202
 203/**
 204 * xmlStrncmp:
 205 * @str1:  the first xmlChar *
 206 * @str2:  the second xmlChar *
 207 * @len:  the max comparison length
 208 *
 209 * a strncmp for xmlChar's
 210 *
 211 * Returns the integer result of the comparison
 212 */
 213
 214int
 215xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
 216    if (len <= 0) return(0);
 217    if (str1 == str2) return(0);
 218    if (str1 == NULL) return(-1);
 219    if (str2 == NULL) return(1);
 220#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
 221    return(strncmp((const char *)str1, (const char *)str2, len));
 222#else
 223    do {
 224        int tmp = *str1++ - *str2;
 225        if (tmp != 0 || --len == 0) return(tmp);
 226    } while (*str2++ != 0);
 227    return 0;
 228#endif
 229}
 230
 231static const xmlChar casemap[256] = {
 232    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
 233    0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
 234    0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
 235    0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
 236    0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
 237    0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
 238    0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
 239    0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
 240    0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
 241    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
 242    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
 243    0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
 244    0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
 245    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
 246    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
 247    0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
 248    0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
 249    0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
 250    0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
 251    0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
 252    0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
 253    0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
 254    0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
 255    0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
 256    0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
 257    0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
 258    0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
 259    0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
 260    0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
 261    0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
 262    0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
 263    0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
 264};
 265
 266/**
 267 * xmlStrcasecmp:
 268 * @str1:  the first xmlChar *
 269 * @str2:  the second xmlChar *
 270 *
 271 * a strcasecmp for xmlChar's
 272 *
 273 * Returns the integer result of the comparison
 274 */
 275
 276int
 277xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
 278    register int tmp;
 279
 280    if (str1 == str2) return(0);
 281    if (str1 == NULL) return(-1);
 282    if (str2 == NULL) return(1);
 283    do {
 284        tmp = casemap[*str1++] - casemap[*str2];
 285        if (tmp != 0) return(tmp);
 286    } while (*str2++ != 0);
 287    return 0;
 288}
 289
 290/**
 291 * xmlStrncasecmp:
 292 * @str1:  the first xmlChar *
 293 * @str2:  the second xmlChar *
 294 * @len:  the max comparison length
 295 *
 296 * a strncasecmp for xmlChar's
 297 *
 298 * Returns the integer result of the comparison
 299 */
 300
 301int
 302xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
 303    register int tmp;
 304
 305    if (len <= 0) return(0);
 306    if (str1 == str2) return(0);
 307    if (str1 == NULL) return(-1);
 308    if (str2 == NULL) return(1);
 309    do {
 310        tmp = casemap[*str1++] - casemap[*str2];
 311        if (tmp != 0 || --len == 0) return(tmp);
 312    } while (*str2++ != 0);
 313    return 0;
 314}
 315
 316/**
 317 * xmlStrchr:
 318 * @str:  the xmlChar * array
 319 * @val:  the xmlChar to search
 320 *
 321 * a strchr for xmlChar's
 322 *
 323 * Returns the xmlChar * for the first occurrence or NULL.
 324 */
 325
 326const xmlChar *
 327xmlStrchr(const xmlChar *str, xmlChar val) {
 328    if (str == NULL) return(NULL);
 329    while (*str != 0) { /* non input consuming */
 330        if (*str == val) return((xmlChar *) str);
 331        str++;
 332    }
 333    return(NULL);
 334}
 335
 336/**
 337 * xmlStrstr:
 338 * @str:  the xmlChar * array (haystack)
 339 * @val:  the xmlChar to search (needle)
 340 *
 341 * a strstr for xmlChar's
 342 *
 343 * Returns the xmlChar * for the first occurrence or NULL.
 344 */
 345
 346const xmlChar *
 347xmlStrstr(const xmlChar *str, const xmlChar *val) {
 348    int n;
 349
 350    if (str == NULL) return(NULL);
 351    if (val == NULL) return(NULL);
 352    n = xmlStrlen(val);
 353
 354    if (n == 0) return(str);
 355    while (*str != 0) { /* non input consuming */
 356        if (*str == *val) {
 357            if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
 358        }
 359        str++;
 360    }
 361    return(NULL);
 362}
 363
 364/**
 365 * xmlStrcasestr:
 366 * @str:  the xmlChar * array (haystack)
 367 * @val:  the xmlChar to search (needle)
 368 *
 369 * a case-ignoring strstr for xmlChar's
 370 *
 371 * Returns the xmlChar * for the first occurrence or NULL.
 372 */
 373
 374const xmlChar *
 375xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
 376    int n;
 377
 378    if (str == NULL) return(NULL);
 379    if (val == NULL) return(NULL);
 380    n = xmlStrlen(val);
 381
 382    if (n == 0) return(str);
 383    while (*str != 0) { /* non input consuming */
 384        if (casemap[*str] == casemap[*val])
 385            if (!xmlStrncasecmp(str, val, n)) return(str);
 386        str++;
 387    }
 388    return(NULL);
 389}
 390
 391/**
 392 * xmlStrsub:
 393 * @str:  the xmlChar * array (haystack)
 394 * @start:  the index of the first char (zero based)
 395 * @len:  the length of the substring
 396 *
 397 * Extract a substring of a given string
 398 *
 399 * Returns the xmlChar * for the first occurrence or NULL.
 400 */
 401
 402xmlChar *
 403xmlStrsub(const xmlChar *str, int start, int len) {
 404    int i;
 405
 406    if (str == NULL) return(NULL);
 407    if (start < 0) return(NULL);
 408    if (len < 0) return(NULL);
 409
 410    for (i = 0;i < start;i++) {
 411        if (*str == 0) return(NULL);
 412        str++;
 413    }
 414    if (*str == 0) return(NULL);
 415    return(xmlStrndup(str, len));
 416}
 417
 418/**
 419 * xmlStrlen:
 420 * @str:  the xmlChar * array
 421 *
 422 * length of a xmlChar's string
 423 *
 424 * Returns the number of xmlChar contained in the ARRAY.
 425 */
 426
 427int
 428xmlStrlen(const xmlChar *str) {
 429    size_t len = str ? strlen((const char *)str) : 0;
 430    return(len > INT_MAX ? 0 : len);
 431}
 432
 433/**
 434 * xmlStrncat:
 435 * @cur:  the original xmlChar * array
 436 * @add:  the xmlChar * array added
 437 * @len:  the length of @add
 438 *
 439 * a strncat for array of xmlChar's, it will extend @cur with the len
 440 * first bytes of @add. Note that if @len < 0 then this is an API error
 441 * and NULL will be returned.
 442 *
 443 * Returns a new xmlChar *, the original @cur is reallocated and should
 444 * not be freed.
 445 */
 446
 447xmlChar *
 448xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
 449    int size;
 450    xmlChar *ret;
 451
 452    if ((add == NULL) || (len == 0))
 453        return(cur);
 454    if (len < 0)
 455	return(NULL);
 456    if (cur == NULL)
 457        return(xmlStrndup(add, len));
 458
 459    size = xmlStrlen(cur);
 460    if ((size < 0) || (size > INT_MAX - len))
 461        return(NULL);
 462    ret = (xmlChar *) xmlRealloc(cur, (size_t) size + len + 1);
 463    if (ret == NULL) {
 464        return(cur);
 465    }
 466    memcpy(&ret[size], add, len);
 467    ret[size + len] = 0;
 468    return(ret);
 469}
 470
 471/**
 472 * xmlStrncatNew:
 473 * @str1:  first xmlChar string
 474 * @str2:  second xmlChar string
 475 * @len:  the len of @str2 or < 0
 476 *
 477 * same as xmlStrncat, but creates a new string.  The original
 478 * two strings are not freed. If @len is < 0 then the length
 479 * will be calculated automatically.
 480 *
 481 * Returns a new xmlChar * or NULL
 482 */
 483xmlChar *
 484xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
 485    int size;
 486    xmlChar *ret;
 487
 488    if (len < 0) {
 489        len = xmlStrlen(str2);
 490        if (len < 0)
 491            return(NULL);
 492    }
 493    if ((str2 == NULL) || (len == 0))
 494        return(xmlStrdup(str1));
 495    if (str1 == NULL)
 496        return(xmlStrndup(str2, len));
 497
 498    size = xmlStrlen(str1);
 499    if ((size < 0) || (size > INT_MAX - len))
 500        return(NULL);
 501    ret = (xmlChar *) xmlMalloc((size_t) size + len + 1);
 502    if (ret == NULL) {
 503        return(xmlStrndup(str1, size));
 504    }
 505    memcpy(ret, str1, size);
 506    memcpy(&ret[size], str2, len);
 507    ret[size + len] = 0;
 508    return(ret);
 509}
 510
 511/**
 512 * xmlStrcat:
 513 * @cur:  the original xmlChar * array
 514 * @add:  the xmlChar * array added
 515 *
 516 * a strcat for array of xmlChar's. Since they are supposed to be
 517 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
 518 * a termination mark of '0'.
 519 *
 520 * Returns a new xmlChar * containing the concatenated string. The original
 521 * @cur is reallocated and should not be freed.
 522 */
 523xmlChar *
 524xmlStrcat(xmlChar *cur, const xmlChar *add) {
 525    const xmlChar *p = add;
 526
 527    if (add == NULL) return(cur);
 528    if (cur == NULL)
 529        return(xmlStrdup(add));
 530
 531    while (*p != 0) p++; /* non input consuming */
 532    return(xmlStrncat(cur, add, p - add));
 533}
 534
 535/**
 536 * xmlStrPrintf:
 537 * @buf:   the result buffer.
 538 * @len:   the result buffer length.
 539 * @msg:   the message with printf formatting.
 540 * @...:   extra parameters for the message.
 541 *
 542 * Formats @msg and places result into @buf.
 543 *
 544 * Returns the number of characters written to @buf or -1 if an error occurs.
 545 */
 546int
 547xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) {
 548    va_list args;
 549    int ret;
 550
 551    if((buf == NULL) || (msg == NULL)) {
 552        return(-1);
 553    }
 554
 555    va_start(args, msg);
 556    ret = vsnprintf((char *) buf, len, (const char *) msg, args);
 557    va_end(args);
 558    buf[len - 1] = 0; /* be safe ! */
 559
 560    return(ret);
 561}
 562
 563/**
 564 * xmlStrVPrintf:
 565 * @buf:   the result buffer.
 566 * @len:   the result buffer length.
 567 * @msg:   the message with printf formatting.
 568 * @ap:    extra parameters for the message.
 569 *
 570 * Formats @msg and places result into @buf.
 571 *
 572 * Returns the number of characters written to @buf or -1 if an error occurs.
 573 */
 574int
 575xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) {
 576    int ret;
 577
 578    if((buf == NULL) || (msg == NULL)) {
 579        return(-1);
 580    }
 581
 582    ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
 583    buf[len - 1] = 0; /* be safe ! */
 584
 585    return(ret);
 586}
 587
 588/************************************************************************
 589 *                                                                      *
 590 *              Generic UTF8 handling routines                          *
 591 *                                                                      *
 592 * From rfc2044: encoding of the Unicode values on UTF-8:               *
 593 *                                                                      *
 594 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
 595 * 0000 0000-0000 007F   0xxxxxxx                                       *
 596 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
 597 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
 598 *                                                                      *
 599 * I hope we won't use values > 0xFFFF anytime soon !                   *
 600 *                                                                      *
 601 ************************************************************************/
 602
 603
 604/**
 605 * xmlUTF8Size:
 606 * @utf: pointer to the UTF8 character
 607 *
 608 * calculates the internal size of a UTF8 character
 609 *
 610 * returns the numbers of bytes in the character, -1 on format error
 611 */
 612int
 613xmlUTF8Size(const xmlChar *utf) {
 614    xmlChar mask;
 615    int len;
 616
 617    if (utf == NULL)
 618        return -1;
 619    if (*utf < 0x80)
 620        return 1;
 621    /* check valid UTF8 character */
 622    if (!(*utf & 0x40))
 623        return -1;
 624    /* determine number of bytes in char */
 625    len = 2;
 626    for (mask=0x20; mask != 0; mask>>=1) {
 627        if (!(*utf & mask))
 628            return len;
 629        len++;
 630    }
 631    return -1;
 632}
 633
 634/**
 635 * xmlUTF8Charcmp:
 636 * @utf1: pointer to first UTF8 char
 637 * @utf2: pointer to second UTF8 char
 638 *
 639 * compares the two UCS4 values
 640 *
 641 * returns result of the compare as with xmlStrncmp
 642 */
 643int
 644xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
 645
 646    if (utf1 == NULL ) {
 647        if (utf2 == NULL)
 648            return 0;
 649        return -1;
 650    }
 651    return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
 652}
 653
 654/**
 655 * xmlUTF8Strlen:
 656 * @utf:  a sequence of UTF-8 encoded bytes
 657 *
 658 * compute the length of an UTF8 string, it doesn't do a full UTF8
 659 * checking of the content of the string.
 660 *
 661 * Returns the number of characters in the string or -1 in case of error
 662 */
 663int
 664xmlUTF8Strlen(const xmlChar *utf) {
 665    size_t ret = 0;
 666
 667    if (utf == NULL)
 668        return(-1);
 669
 670    while (*utf != 0) {
 671        if (utf[0] & 0x80) {
 672            if ((utf[1] & 0xc0) != 0x80)
 673                return(-1);
 674            if ((utf[0] & 0xe0) == 0xe0) {
 675                if ((utf[2] & 0xc0) != 0x80)
 676                    return(-1);
 677                if ((utf[0] & 0xf0) == 0xf0) {
 678                    if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
 679                        return(-1);
 680                    utf += 4;
 681                } else {
 682                    utf += 3;
 683                }
 684            } else {
 685                utf += 2;
 686            }
 687        } else {
 688            utf++;
 689        }
 690        ret++;
 691    }
 692    return(ret > INT_MAX ? 0 : ret);
 693}
 694
 695/**
 696 * xmlGetUTF8Char:
 697 * @utf:  a sequence of UTF-8 encoded bytes
 698 * @len:  a pointer to the minimum number of bytes present in
 699 *        the sequence.  This is used to assure the next character
 700 *        is completely contained within the sequence.
 701 *
 702 * Read the first UTF8 character from @utf
 703 *
 704 * Returns the char value or -1 in case of error, and sets *len to
 705 *        the actual number of bytes consumed (0 in case of error)
 706 */
 707int
 708xmlGetUTF8Char(const unsigned char *utf, int *len) {
 709    unsigned int c;
 710
 711    if (utf == NULL)
 712        goto error;
 713    if (len == NULL)
 714        goto error;
 715
 716    c = utf[0];
 717    if (c < 0x80) {
 718        if (*len < 1)
 719            goto error;
 720        /* 1-byte code */
 721        *len = 1;
 722    } else {
 723        if ((*len < 2) || ((utf[1] & 0xc0) != 0x80))
 724            goto error;
 725        if (c < 0xe0) {
 726            if (c < 0xc2)
 727                goto error;
 728            /* 2-byte code */
 729            *len = 2;
 730            c = (c & 0x1f) << 6;
 731            c |= utf[1] & 0x3f;
 732        } else {
 733            if ((*len < 3) || ((utf[2] & 0xc0) != 0x80))
 734                goto error;
 735            if (c < 0xf0) {
 736                /* 3-byte code */
 737                *len = 3;
 738                c = (c & 0xf) << 12;
 739                c |= (utf[1] & 0x3f) << 6;
 740                c |= utf[2] & 0x3f;
 741                if ((c < 0x800) || ((c >= 0xd800) && (c < 0xe000)))
 742                    goto error;
 743            } else {
 744                if ((*len < 4) || ((utf[3] & 0xc0) != 0x80))
 745                    goto error;
 746                *len = 4;
 747                /* 4-byte code */
 748                c = (c & 0x7) << 18;
 749                c |= (utf[1] & 0x3f) << 12;
 750                c |= (utf[2] & 0x3f) << 6;
 751                c |= utf[3] & 0x3f;
 752                if ((c < 0x10000) || (c >= 0x110000))
 753                    goto error;
 754            }
 755        }
 756    }
 757    return(c);
 758
 759error:
 760    if (len != NULL)
 761	*len = 0;
 762    return(-1);
 763}
 764
 765/**
 766 * xmlCheckUTF8:
 767 * @utf: Pointer to putative UTF-8 encoded string.
 768 *
 769 * Checks @utf for being valid UTF-8. @utf is assumed to be
 770 * null-terminated. This function is not super-strict, as it will
 771 * allow longer UTF-8 sequences than necessary. Note that Java is
 772 * capable of producing these sequences if provoked. Also note, this
 773 * routine checks for the 4-byte maximum size, but does not check for
 774 * 0x10ffff maximum value.
 775 *
 776 * Return value: true if @utf is valid.
 777 **/
 778int
 779xmlCheckUTF8(const unsigned char *utf)
 780{
 781    int ix;
 782    unsigned char c;
 783
 784    if (utf == NULL)
 785        return(0);
 786    /*
 787     * utf is a string of 1, 2, 3 or 4 bytes.  The valid strings
 788     * are as follows (in "bit format"):
 789     *    0xxxxxxx                                      valid 1-byte
 790     *    110xxxxx 10xxxxxx                             valid 2-byte
 791     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
 792     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
 793     */
 794    while ((c = utf[0])) {      /* string is 0-terminated */
 795        ix = 0;
 796        if ((c & 0x80) == 0x00) {	/* 1-byte code, starts with 10 */
 797            ix = 1;
 798	} else if ((c & 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */
 799	    if ((utf[1] & 0xc0 ) != 0x80)
 800	        return 0;
 801	    ix = 2;
 802	} else if ((c & 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */
 803	    if (((utf[1] & 0xc0) != 0x80) ||
 804	        ((utf[2] & 0xc0) != 0x80))
 805		    return 0;
 806	    ix = 3;
 807	} else if ((c & 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */
 808	    if (((utf[1] & 0xc0) != 0x80) ||
 809	        ((utf[2] & 0xc0) != 0x80) ||
 810		((utf[3] & 0xc0) != 0x80))
 811		    return 0;
 812	    ix = 4;
 813	} else				/* unknown encoding */
 814	    return 0;
 815        utf += ix;
 816      }
 817      return(1);
 818}
 819
 820/**
 821 * xmlUTF8Strsize:
 822 * @utf:  a sequence of UTF-8 encoded bytes
 823 * @len:  the number of characters in the array
 824 *
 825 * storage size of an UTF8 string
 826 * the behaviour is not guaranteed if the input string is not UTF-8
 827 *
 828 * Returns the storage size of
 829 * the first 'len' characters of ARRAY
 830 */
 831
 832int
 833xmlUTF8Strsize(const xmlChar *utf, int len) {
 834    const xmlChar *ptr=utf;
 835    int ch;
 836    size_t ret;
 837
 838    if (utf == NULL)
 839        return(0);
 840
 841    if (len <= 0)
 842        return(0);
 843
 844    while ( len-- > 0) {
 845        if ( !*ptr )
 846            break;
 847        if ( (ch = *ptr++) & 0x80)
 848            while ((ch<<=1) & 0x80 ) {
 849		if (*ptr == 0) break;
 850                ptr++;
 851	    }
 852    }
 853    ret = ptr - utf;
 854    return (ret > INT_MAX ? 0 : ret);
 855}
 856
 857
 858/**
 859 * xmlUTF8Strndup:
 860 * @utf:  the input UTF8 *
 861 * @len:  the len of @utf (in chars)
 862 *
 863 * a strndup for array of UTF8's
 864 *
 865 * Returns a new UTF8 * or NULL
 866 */
 867xmlChar *
 868xmlUTF8Strndup(const xmlChar *utf, int len) {
 869    xmlChar *ret;
 870    int i;
 871
 872    if ((utf == NULL) || (len < 0)) return(NULL);
 873    i = xmlUTF8Strsize(utf, len);
 874    ret = (xmlChar *) xmlMallocAtomic((size_t) i + 1);
 875    if (ret == NULL) {
 876        return(NULL);
 877    }
 878    memcpy(ret, utf, i);
 879    ret[i] = 0;
 880    return(ret);
 881}
 882
 883/**
 884 * xmlUTF8Strpos:
 885 * @utf:  the input UTF8 *
 886 * @pos:  the position of the desired UTF8 char (in chars)
 887 *
 888 * a function to provide the equivalent of fetching a
 889 * character from a string array
 890 *
 891 * Returns a pointer to the UTF8 character or NULL
 892 */
 893const xmlChar *
 894xmlUTF8Strpos(const xmlChar *utf, int pos) {
 895    int ch;
 896
 897    if (utf == NULL) return(NULL);
 898    if (pos < 0)
 899        return(NULL);
 900    while (pos--) {
 901        if ((ch=*utf++) == 0) return(NULL);
 902        if ( ch & 0x80 ) {
 903            /* if not simple ascii, verify proper format */
 904            if ( (ch & 0xc0) != 0xc0 )
 905                return(NULL);
 906            /* then skip over remaining bytes for this char */
 907            while ( (ch <<= 1) & 0x80 )
 908                if ( (*utf++ & 0xc0) != 0x80 )
 909                    return(NULL);
 910        }
 911    }
 912    return((xmlChar *)utf);
 913}
 914
 915/**
 916 * xmlUTF8Strloc:
 917 * @utf:  the input UTF8 *
 918 * @utfchar:  the UTF8 character to be found
 919 *
 920 * a function to provide the relative location of a UTF8 char
 921 *
 922 * Returns the relative character position of the desired char
 923 * or -1 if not found
 924 */
 925int
 926xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
 927    size_t i;
 928    int size;
 929    int ch;
 930
 931    if (utf==NULL || utfchar==NULL) return -1;
 932    size = xmlUTF8Strsize(utfchar, 1);
 933        for(i=0; (ch=*utf) != 0; i++) {
 934            if (xmlStrncmp(utf, utfchar, size)==0)
 935                return(i > INT_MAX ? 0 : i);
 936            utf++;
 937            if ( ch & 0x80 ) {
 938                /* if not simple ascii, verify proper format */
 939                if ( (ch & 0xc0) != 0xc0 )
 940                    return(-1);
 941                /* then skip over remaining bytes for this char */
 942                while ( (ch <<= 1) & 0x80 )
 943                    if ( (*utf++ & 0xc0) != 0x80 )
 944                        return(-1);
 945            }
 946        }
 947
 948    return(-1);
 949}
 950/**
 951 * xmlUTF8Strsub:
 952 * @utf:  a sequence of UTF-8 encoded bytes
 953 * @start: relative pos of first char
 954 * @len:   total number to copy
 955 *
 956 * Create a substring from a given UTF-8 string
 957 * Note:  positions are given in units of UTF-8 chars
 958 *
 959 * Returns a pointer to a newly created string
 960 * or NULL if any problem
 961 */
 962
 963xmlChar *
 964xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
 965    int i;
 966    int ch;
 967
 968    if (utf == NULL) return(NULL);
 969    if (start < 0) return(NULL);
 970    if (len < 0) return(NULL);
 971
 972    /*
 973     * Skip over any leading chars
 974     */
 975    for (i = 0;i < start;i++) {
 976        if ((ch=*utf++) == 0) return(NULL);
 977        if ( ch & 0x80 ) {
 978            /* if not simple ascii, verify proper format */
 979            if ( (ch & 0xc0) != 0xc0 )
 980                return(NULL);
 981            /* then skip over remaining bytes for this char */
 982            while ( (ch <<= 1) & 0x80 )
 983                if ( (*utf++ & 0xc0) != 0x80 )
 984                    return(NULL);
 985        }
 986    }
 987
 988    return(xmlUTF8Strndup(utf, len));
 989}
 990
 991/**
 992 * xmlEscapeFormatString:
 993 * @msg:  a pointer to the string in which to escape '%' characters.
 994 * Must be a heap-allocated buffer created by libxml2 that may be
 995 * returned, or that may be freed and replaced.
 996 *
 997 * Replaces the string pointed to by 'msg' with an escaped string.
 998 * Returns the same string with all '%' characters escaped.
 999 */
1000xmlChar *
1001xmlEscapeFormatString(xmlChar **msg)
1002{
1003    xmlChar *msgPtr = NULL;
1004    xmlChar *result = NULL;
1005    xmlChar *resultPtr = NULL;
1006    size_t count = 0;
1007    size_t msgLen = 0;
1008    size_t resultLen = 0;
1009
1010    if (!msg || !*msg)
1011        return(NULL);
1012
1013    for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) {
1014        ++msgLen;
1015        if (*msgPtr == '%')
1016            ++count;
1017    }
1018
1019    if (count == 0)
1020        return(*msg);
1021
1022    if ((count > INT_MAX) || (msgLen > INT_MAX - count))
1023        return(NULL);
1024    resultLen = msgLen + count + 1;
1025    result = (xmlChar *) xmlMallocAtomic(resultLen);
1026    if (result == NULL) {
1027        /* Clear *msg to prevent format string vulnerabilities in
1028           out-of-memory situations. */
1029        xmlFree(*msg);
1030        *msg = NULL;
1031        return(NULL);
1032    }
1033
1034    for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) {
1035        *resultPtr = *msgPtr;
1036        if (*msgPtr == '%')
1037            *(++resultPtr) = '%';
1038    }
1039    result[resultLen - 1] = '\0';
1040
1041    xmlFree(*msg);
1042    *msg = result;
1043
1044    return *msg;
1045}