an experiment in making a cocoa webkit browser manageable under X11
at master 522 lines 14 kB view raw
//
//  GTMNSString+HTML.m
//  Dealing with NSStrings that contain HTML
//
//  Copyright 2006-2008 Google Inc.
//
//  Licensed under the Apache License, Version 2.0 (the "License"); you may not
//  use this file except in compliance with the License.  You may obtain a copy
//  of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
//  Unless required by applicable law or agreed to in writing, software
//  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
//  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
//  License for the specific language governing permissions and limitations under
//  the License.
//

//#import "GTMDefines.h"
#import "GTMNSString+HTML.h"

// One entry of an escape table: a named entity (including the leading '&' and
// trailing ';') and the UTF-16 code unit it stands for.
typedef struct {
  NSString *escapeSequence;
  unichar uchar;
} HTMLEscapeMap;

// Taken from http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Special_characters
// Ordered by uchar lowest to highest for bsearching
static HTMLEscapeMap gAsciiHTMLEscapeMap[] = {
  // A.2.2. Special characters
  { @"&quot;", 34 },
  { @"&amp;", 38 },
  { @"&apos;", 39 },
  { @"&lt;", 60 },
  { @"&gt;", 62 },

  // A.2.1. Latin-1 characters
  { @"&nbsp;", 160 },
  { @"&iexcl;", 161 },
  { @"&cent;", 162 },
  { @"&pound;", 163 },
  { @"&curren;", 164 },
  { @"&yen;", 165 },
  { @"&brvbar;", 166 },
  { @"&sect;", 167 },
  { @"&uml;", 168 },
  { @"&copy;", 169 },
  { @"&ordf;", 170 },
  { @"&laquo;", 171 },
  { @"&not;", 172 },
  { @"&shy;", 173 },
  { @"&reg;", 174 },
  { @"&macr;", 175 },
  { @"&deg;", 176 },
  { @"&plusmn;", 177 },
  { @"&sup2;", 178 },
  { @"&sup3;", 179 },
  { @"&acute;", 180 },
  { @"&micro;", 181 },
  { @"&para;", 182 },
  { @"&middot;", 183 },
  { @"&cedil;", 184 },
  { @"&sup1;", 185 },
  { @"&ordm;", 186 },
  { @"&raquo;", 187 },
  { @"&frac14;", 188 },
  { @"&frac12;", 189 },
  { @"&frac34;", 190 },
  { @"&iquest;", 191 },
  { @"&Agrave;", 192 },
  { @"&Aacute;", 193 },
  { @"&Acirc;", 194 },
  { @"&Atilde;", 195 },
  { @"&Auml;", 196 },
  { @"&Aring;", 197 },
  { @"&AElig;", 198 },
  { @"&Ccedil;", 199 },
  { @"&Egrave;", 200 },
  { @"&Eacute;", 201 },
  { @"&Ecirc;", 202 },
  { @"&Euml;", 203 },
  { @"&Igrave;", 204 },
  { @"&Iacute;", 205 },
  { @"&Icirc;", 206 },
  { @"&Iuml;", 207 },
  { @"&ETH;", 208 },
  { @"&Ntilde;", 209 },
  { @"&Ograve;", 210 },
  { @"&Oacute;", 211 },
  { @"&Ocirc;", 212 },
  { @"&Otilde;", 213 },
  { @"&Ouml;", 214 },
  { @"&times;", 215 },
  { @"&Oslash;", 216 },
  { @"&Ugrave;", 217 },
  { @"&Uacute;", 218 },
  { @"&Ucirc;", 219 },
  { @"&Uuml;", 220 },
  { @"&Yacute;", 221 },
  { @"&THORN;", 222 },
  { @"&szlig;", 223 },
  { @"&agrave;", 224 },
  { @"&aacute;", 225 },
  { @"&acirc;", 226 },
  { @"&atilde;", 227 },
  { @"&auml;", 228 },
  { @"&aring;", 229 },
  { @"&aelig;", 230 },
  { @"&ccedil;", 231 },
  { @"&egrave;", 232 },
  { @"&eacute;", 233 },
  { @"&ecirc;", 234 },
  { @"&euml;", 235 },
  { @"&igrave;", 236 },
  { @"&iacute;", 237 },
  { @"&icirc;", 238 },
  { @"&iuml;", 239 },
  { @"&eth;", 240 },
  { @"&ntilde;", 241 },
  { @"&ograve;", 242 },
  { @"&oacute;", 243 },
  { @"&ocirc;", 244 },
  { @"&otilde;", 245 },
  { @"&ouml;", 246 },
  { @"&divide;", 247 },
  { @"&oslash;", 248 },
  { @"&ugrave;", 249 },
  { @"&uacute;", 250 },
  { @"&ucirc;", 251 },
  { @"&uuml;", 252 },
  { @"&yacute;", 253 },
  { @"&thorn;", 254 },
  { @"&yuml;", 255 },

  // A.2.2. Special characters cont'd
  { @"&OElig;", 338 },
  { @"&oelig;", 339 },
  { @"&Scaron;", 352 },
  { @"&scaron;", 353 },
  { @"&Yuml;", 376 },

  // A.2.3. Symbols
  { @"&fnof;", 402 },

  // A.2.2. Special characters cont'd
  { @"&circ;", 710 },
  { @"&tilde;", 732 },

  // A.2.3. Symbols cont'd
  { @"&Alpha;", 913 },
  { @"&Beta;", 914 },
  { @"&Gamma;", 915 },
  { @"&Delta;", 916 },
  { @"&Epsilon;", 917 },
  { @"&Zeta;", 918 },
  { @"&Eta;", 919 },
  { @"&Theta;", 920 },
  { @"&Iota;", 921 },
  { @"&Kappa;", 922 },
  { @"&Lambda;", 923 },
  { @"&Mu;", 924 },
  { @"&Nu;", 925 },
  { @"&Xi;", 926 },
  { @"&Omicron;", 927 },
  { @"&Pi;", 928 },
  { @"&Rho;", 929 },
  { @"&Sigma;", 931 },
  { @"&Tau;", 932 },
  { @"&Upsilon;", 933 },
  { @"&Phi;", 934 },
  { @"&Chi;", 935 },
  { @"&Psi;", 936 },
  { @"&Omega;", 937 },
  { @"&alpha;", 945 },
  { @"&beta;", 946 },
  { @"&gamma;", 947 },
  { @"&delta;", 948 },
  { @"&epsilon;", 949 },
  { @"&zeta;", 950 },
  { @"&eta;", 951 },
  { @"&theta;", 952 },
  { @"&iota;", 953 },
  { @"&kappa;", 954 },
  { @"&lambda;", 955 },
  { @"&mu;", 956 },
  { @"&nu;", 957 },
  { @"&xi;", 958 },
  { @"&omicron;", 959 },
  { @"&pi;", 960 },
  { @"&rho;", 961 },
  { @"&sigmaf;", 962 },
  { @"&sigma;", 963 },
  { @"&tau;", 964 },
  { @"&upsilon;", 965 },
  { @"&phi;", 966 },
  { @"&chi;", 967 },
  { @"&psi;", 968 },
  { @"&omega;", 969 },
  { @"&thetasym;", 977 },
  { @"&upsih;", 978 },
  { @"&piv;", 982 },

  // A.2.2. Special characters cont'd
  { @"&ensp;", 8194 },
  { @"&emsp;", 8195 },
  { @"&thinsp;", 8201 },
  { @"&zwnj;", 8204 },
  { @"&zwj;", 8205 },
  { @"&lrm;", 8206 },
  { @"&rlm;", 8207 },
  { @"&ndash;", 8211 },
  { @"&mdash;", 8212 },
  { @"&lsquo;", 8216 },
  { @"&rsquo;", 8217 },
  { @"&sbquo;", 8218 },
  { @"&ldquo;", 8220 },
  { @"&rdquo;", 8221 },
  { @"&bdquo;", 8222 },
  { @"&dagger;", 8224 },
  { @"&Dagger;", 8225 },
  // A.2.3. Symbols cont'd
  { @"&bull;", 8226 },
  { @"&hellip;", 8230 },

  // A.2.2. Special characters cont'd
  { @"&permil;", 8240 },

  // A.2.3. Symbols cont'd
  { @"&prime;", 8242 },
  { @"&Prime;", 8243 },

  // A.2.2. Special characters cont'd
  { @"&lsaquo;", 8249 },
  { @"&rsaquo;", 8250 },

  // A.2.3. Symbols cont'd
  { @"&oline;", 8254 },
  { @"&frasl;", 8260 },

  // A.2.2. Special characters cont'd
  { @"&euro;", 8364 },

  // A.2.3. Symbols cont'd
  { @"&image;", 8465 },
  { @"&weierp;", 8472 },
  { @"&real;", 8476 },
  { @"&trade;", 8482 },
  { @"&alefsym;", 8501 },
  { @"&larr;", 8592 },
  { @"&uarr;", 8593 },
  { @"&rarr;", 8594 },
  { @"&darr;", 8595 },
  { @"&harr;", 8596 },
  { @"&crarr;", 8629 },
  { @"&lArr;", 8656 },
  { @"&uArr;", 8657 },
  { @"&rArr;", 8658 },
  { @"&dArr;", 8659 },
  { @"&hArr;", 8660 },
  { @"&forall;", 8704 },
  { @"&part;", 8706 },
  { @"&exist;", 8707 },
  { @"&empty;", 8709 },
  { @"&nabla;", 8711 },
  { @"&isin;", 8712 },
  { @"&notin;", 8713 },
  { @"&ni;", 8715 },
  { @"&prod;", 8719 },
  { @"&sum;", 8721 },
  { @"&minus;", 8722 },
  { @"&lowast;", 8727 },
  { @"&radic;", 8730 },
  { @"&prop;", 8733 },
  { @"&infin;", 8734 },
  { @"&ang;", 8736 },
  { @"&and;", 8743 },
  { @"&or;", 8744 },
  { @"&cap;", 8745 },
  { @"&cup;", 8746 },
  { @"&int;", 8747 },
  { @"&there4;", 8756 },
  { @"&sim;", 8764 },
  { @"&cong;", 8773 },
  { @"&asymp;", 8776 },
  { @"&ne;", 8800 },
  { @"&equiv;", 8801 },
  { @"&le;", 8804 },
  { @"&ge;", 8805 },
  { @"&sub;", 8834 },
  { @"&sup;", 8835 },
  { @"&nsub;", 8836 },
  { @"&sube;", 8838 },
  { @"&supe;", 8839 },
  { @"&oplus;", 8853 },
  { @"&otimes;", 8855 },
  { @"&perp;", 8869 },
  { @"&sdot;", 8901 },
  { @"&lceil;", 8968 },
  { @"&rceil;", 8969 },
  { @"&lfloor;", 8970 },
  { @"&rfloor;", 8971 },
  { @"&lang;", 9001 },
  { @"&rang;", 9002 },
  { @"&loz;", 9674 },
  { @"&spades;", 9824 },
  { @"&clubs;", 9827 },
  { @"&hearts;", 9829 },
  { @"&diams;", 9830 }
};

// Taken from http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Special_characters
// This is table A.2.2 Special Characters
// Like the table above, it is ordered by uchar lowest to highest because it is
// searched with bsearch.
static HTMLEscapeMap gUnicodeHTMLEscapeMap[] = {
  // C0 Controls and Basic Latin
  { @"&quot;", 34 },
  { @"&amp;", 38 },
  { @"&apos;", 39 },
  { @"&lt;", 60 },
  { @"&gt;", 62 },

  // Latin Extended-A
  { @"&OElig;", 338 },
  { @"&oelig;", 339 },
  { @"&Scaron;", 352 },
  { @"&scaron;", 353 },
  { @"&Yuml;", 376 },

  // Spacing Modifier Letters
  { @"&circ;", 710 },
  { @"&tilde;", 732 },

  // General Punctuation
  { @"&ensp;", 8194 },
  { @"&emsp;", 8195 },
  { @"&thinsp;", 8201 },
  { @"&zwnj;", 8204 },
  { @"&zwj;", 8205 },
  { @"&lrm;", 8206 },
  { @"&rlm;", 8207 },
  { @"&ndash;", 8211 },
  { @"&mdash;", 8212 },
  { @"&lsquo;", 8216 },
  { @"&rsquo;", 8217 },
  { @"&sbquo;", 8218 },
  { @"&ldquo;", 8220 },
  { @"&rdquo;", 8221 },
  { @"&bdquo;", 8222 },
  { @"&dagger;", 8224 },
  { @"&Dagger;", 8225 },
  { @"&permil;", 8240 },
  { @"&lsaquo;", 8249 },
  { @"&rsaquo;", 8250 },
  { @"&euro;", 8364 },
};


// Utility function for Bsearching table above.
// ucharVoid points at the unichar being looked up, mapVoid at the table entry
// it is compared against. Returns <0, 0 or >0 as bsearch expects.
static int EscapeMapCompare(const void *ucharVoid, const void *mapVoid) {
  const unichar *uchar = (const unichar*)ucharVoid;
  const HTMLEscapeMap *map = (const HTMLEscapeMap*)mapVoid;
  int val;
  if (*uchar > map->uchar) {
    val = 1;
  } else if (*uchar < map->uchar) {
    val = -1;
  } else {
    val = 0;
  }
  return val;
}

// Returns a string holding the single character |codePoint|, or nil if the
// value is not accepted. Values above U+FFFF are encoded as a UTF-16 surrogate
// pair. 0 and U+FFFF are rejected, which matches the historical behaviour of
// the numeric unescaping code; anything above U+10FFFF is not a code point.
static NSString *GTMStringForCodePoint(unsigned int codePoint) {
  if (codePoint == 0 || codePoint == 0xFFFF || codePoint > 0x10FFFF) {
    return nil;
  }
  if (codePoint < 0xFFFF) {
    unichar uchar = (unichar)codePoint;
    return [NSString stringWithCharacters:&uchar length:1];
  }
  unsigned int offset = codePoint - 0x10000;
  unichar pair[2];
  pair[0] = (unichar)(0xD800 + (offset >> 10));
  pair[1] = (unichar)(0xDC00 + (offset & 0x3FF));
  return [NSString stringWithCharacters:pair length:2];
}

// Decodes one candidate escape sequence. |escapeString| runs from an '&' up to
// and including the first following ';'. Returns the text it stands for, or nil
// if it is not a sequence we understand (the caller then leaves it untouched).
static NSString *GTMStringForEscapeSequence(NSString *escapeString) {
  NSUInteger length = [escapeString length];
  // a sequence must be longer than 3 (&lt;) and less than 11 (&thetasym;)
  if (length <= 3 || length >= 11) {
    return nil;
  }

  if ([escapeString characterAtIndex:1] == '#') {
    unichar char2 = [escapeString characterAtIndex:2];
    if (char2 == 'x' || char2 == 'X') {
      // Hex escape sequences &#xa3;
      NSUInteger digitCount = length - 4;  // strip "&#x" and ";"
      NSString *hexSequence =
        [escapeString substringWithRange:NSMakeRange(3, digitCount)];
      NSScanner *scanner = [NSScanner scannerWithString:hexSequence];
      unsigned value;
      // The scanner must consume every character, otherwise there is junk
      // after the digits.
      if ([scanner scanHexInt:&value] &&
          [scanner scanLocation] == digitCount) {
        return GTMStringForCodePoint(value);
      }
      return nil;
    }

    // Decimal Sequences &#123;
    NSUInteger digitCount = length - 3;  // strip "&#" and ";"
    NSString *numberSequence =
      [escapeString substringWithRange:NSMakeRange(2, digitCount)];
    NSScanner *scanner = [NSScanner scannerWithString:numberSequence];
    int value;
    if ([scanner scanInt:&value] &&
        value > 0 &&
        [scanner scanLocation] == digitCount) {
      return GTMStringForCodePoint((unsigned int)value);
    }
    return nil;
  }

  // "standard" sequences
  for (unsigned i = 0;
       i < sizeof(gAsciiHTMLEscapeMap) / sizeof(HTMLEscapeMap);
       ++i) {
    if ([escapeString isEqualToString:gAsciiHTMLEscapeMap[i].escapeSequence]) {
      return [NSString stringWithCharacters:&gAsciiHTMLEscapeMap[i].uchar
                                     length:1];
    }
  }
  return nil;
}

@implementation NSString (GTMNSStringHTMLAdditions)

// Shared implementation of the two escaping methods below.
//
//  Args:
//    table: escape table to use, sorted by uchar (it is bsearched)
//    size: size of |table| in bytes (i.e. sizeof(table), not the entry count)
//    escapeUnicode: if YES, characters above 127 that have no entry in |table|
//                   are written as decimal numeric references (&#NNN;)
//
//  Returns:
//    The escaped string, the receiver itself if it is empty, or nil if a
//    working buffer could not be allocated.
- (NSString *)gtm_stringByEscapingHTMLUsingTable:(HTMLEscapeMap*)table
                                          ofSize:(NSUInteger)size
                                 escapingUnicode:(BOOL)escapeUnicode {
  NSUInteger length = [self length];
  if (!length) {
    return self;
  }

  NSMutableString *finalString = [NSMutableString string];
  // Must be dataWithLength:, not dataWithCapacity:. A capacity is only a hint
  // and leaves the data zero bytes long, in which case mutableBytes is not
  // guaranteed to return a usable buffer.
  NSMutableData *data2 = [NSMutableData dataWithLength:sizeof(unichar) * length];

  // this block is common between GTMNSString+HTML and GTMNSString+XML but
  // it's so short that it isn't really worth trying to share.
  const unichar *buffer = CFStringGetCharactersPtr((CFStringRef)self);
  if (!buffer) {
    // We want this buffer to be autoreleased.
    NSMutableData *data = [NSMutableData dataWithLength:length * sizeof(UniChar)];
    if (!data) {
      // COV_NF_START  - Memory fail case
//      _GTMDevLog(@"couldn't alloc buffer");
      return nil;
      // COV_NF_END
    }
    [self getCharacters:[data mutableBytes] range:NSMakeRange(0, length)];
    buffer = [data bytes];
  }

  if (!buffer || !data2) {
    // COV_NF_START
//    _GTMDevLog(@"Unable to allocate buffer or data2");
    return nil;
    // COV_NF_END
  }

  // buffer2 collects the run of characters that need no escaping; it is
  // flushed into finalString whenever an escape has to be written.
  unichar *buffer2 = (unichar *)[data2 mutableBytes];

  NSUInteger buffer2Length = 0;

  for (NSUInteger i = 0; i < length; ++i) {
    HTMLEscapeMap *val = bsearch(&buffer[i], table,
                                 size / sizeof(HTMLEscapeMap),
                                 sizeof(HTMLEscapeMap), EscapeMapCompare);
    if (val || (escapeUnicode && buffer[i] > 127)) {
      if (buffer2Length) {
        CFStringAppendCharacters((CFMutableStringRef)finalString,
                                 buffer2,
                                 buffer2Length);
        buffer2Length = 0;
      }
      if (val) {
        [finalString appendString:val->escapeSequence];
      } else if (i + 1 < length &&
                 buffer[i] >= 0xD800 && buffer[i] <= 0xDBFF &&
                 buffer[i + 1] >= 0xDC00 && buffer[i + 1] <= 0xDFFF) {
        // A well formed surrogate pair is one character outside the BMP.
        // Emit a single reference to the real code point; references to the
        // individual surrogates are not valid HTML.
        unsigned int codePoint = 0x10000
            + (((unsigned int)buffer[i] - 0xD800) << 10)
            + ((unsigned int)buffer[i + 1] - 0xDC00);
        [finalString appendFormat:@"&#%u;", codePoint];
        ++i;  // the low surrogate has been consumed as well
      } else {
//        _GTMDevAssert(escapeUnicode && buffer[i] > 127, @"Illegal Character");
        [finalString appendFormat:@"&#%d;", buffer[i]];
      }
    } else {
      buffer2[buffer2Length] = buffer[i];
      buffer2Length += 1;
    }
  }
  if (buffer2Length) {
    CFStringAppendCharacters((CFMutableStringRef)finalString,
                             buffer2,
                             buffer2Length);
  }
  return finalString;
}

// Escapes only the characters listed in gUnicodeHTMLEscapeMap (table A.2.2);
// every other character is copied through unchanged.
- (NSString *)gtm_stringByEscapingForHTML {
  return [self gtm_stringByEscapingHTMLUsingTable:gUnicodeHTMLEscapeMap
                                           ofSize:sizeof(gUnicodeHTMLEscapeMap)
                                  escapingUnicode:NO];
} // gtm_stringByEscapingHTML

// Escapes using the full gAsciiHTMLEscapeMap and writes any remaining
// character above 127 as a decimal numeric reference.
- (NSString *)gtm_stringByEscapingForAsciiHTML {
  return [self gtm_stringByEscapingHTMLUsingTable:gAsciiHTMLEscapeMap
                                           ofSize:sizeof(gAsciiHTMLEscapeMap)
                                  escapingUnicode:YES];
} // gtm_stringByEscapingAsciiHTML

// Replaces named (&amp;), decimal (&#38;) and hex (&#x26;) escape sequences
// with the characters they stand for. Sequences that are not recognised are
// left as they are. Returns the receiver itself if it contains no '&'.
- (NSString *)gtm_stringByUnescapingFromHTML {
  NSRange range = NSMakeRange(0, [self length]);
  NSRange subrange = [self rangeOfString:@"&" options:NSBackwardsSearch range:range];

  // if no ampersands, we've got a quick way out
  if (subrange.length == 0) return self;
  NSMutableString *finalString = [NSMutableString stringWithString:self];
  // We search |self| back to front and patch |finalString|. Replacements only
  // ever happen behind the part still to be searched, so ranges found in
  // |self| stay valid for |finalString|.
  do {
    NSRange semiColonRange = NSMakeRange(subrange.location, NSMaxRange(range) - subrange.location);
    semiColonRange = [self rangeOfString:@";" options:0 range:semiColonRange];
    range = NSMakeRange(0, subrange.location);
    // if we don't find a semicolon in the range, we don't have a sequence
    if (semiColonRange.location == NSNotFound) {
      continue;
    }
    NSRange escapeRange = NSMakeRange(subrange.location, semiColonRange.location - subrange.location + 1);
    NSString *escapeString = [self substringWithRange:escapeRange];
    NSString *charString = GTMStringForEscapeSequence(escapeString);
    if (charString) {
      [finalString replaceCharactersInRange:escapeRange withString:charString];
    }
  } while ((subrange = [self rangeOfString:@"&" options:NSBackwardsSearch range:range]).length != 0);
  return finalString;
} // gtm_stringByUnescapingHTML



@end