// An experiment in making a Cocoa WebKit browser manageable under X11.
//
//  GTMNSString+HTML.m
//  Dealing with NSStrings that contain HTML
//
//  Copyright 2006-2008 Google Inc.
//
//  Licensed under the Apache License, Version 2.0 (the "License"); you may not
//  use this file except in compliance with the License.  You may obtain a copy
//  of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
//  Unless required by applicable law or agreed to in writing, software
//  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
//  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
//  License for the specific language governing permissions and limitations under
//  the License.
//

20//#import "GTMDefines.h"
21#import "GTMNSString+HTML.h"
22
// One row of an HTML escape table: pairs a UTF-16 code unit with the escape
// sequence (e.g. "&amp;") that represents it. The field order matters because
// the tables in this file use positional initializers.
typedef struct {
  NSString *escapeSequence;  // Full escape text, including '&' and ';'.
  unichar uchar;             // The character the sequence stands for.
} HTMLEscapeMap;
27
// Taken from http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Special_characters
// Ordered by uchar lowest to highest for bsearching.
// Each entry maps a character to its named HTML entity. This table is used
// when producing ASCII-only HTML and when unescaping named entities, so the
// escapeSequence strings must be the literal entity text ("&name;"), never
// the character itself.
static HTMLEscapeMap gAsciiHTMLEscapeMap[] = {
  // A.2.2. Special characters
  { @"&quot;", 34 },
  { @"&amp;", 38 },
  { @"&apos;", 39 },
  { @"&lt;", 60 },
  { @"&gt;", 62 },

  // A.2.1. Latin-1 characters
  { @"&nbsp;", 160 },
  { @"&iexcl;", 161 },
  { @"&cent;", 162 },
  { @"&pound;", 163 },
  { @"&curren;", 164 },
  { @"&yen;", 165 },
  { @"&brvbar;", 166 },
  { @"&sect;", 167 },
  { @"&uml;", 168 },
  { @"&copy;", 169 },
  { @"&ordf;", 170 },
  { @"&laquo;", 171 },
  { @"&not;", 172 },
  { @"&shy;", 173 },
  { @"&reg;", 174 },
  { @"&macr;", 175 },
  { @"&deg;", 176 },
  { @"&plusmn;", 177 },
  { @"&sup2;", 178 },
  { @"&sup3;", 179 },
  { @"&acute;", 180 },
  { @"&micro;", 181 },
  { @"&para;", 182 },
  { @"&middot;", 183 },
  { @"&cedil;", 184 },
  { @"&sup1;", 185 },
  { @"&ordm;", 186 },
  { @"&raquo;", 187 },
  { @"&frac14;", 188 },
  { @"&frac12;", 189 },
  { @"&frac34;", 190 },
  { @"&iquest;", 191 },
  { @"&Agrave;", 192 },
  { @"&Aacute;", 193 },
  { @"&Acirc;", 194 },
  { @"&Atilde;", 195 },
  { @"&Auml;", 196 },
  { @"&Aring;", 197 },
  { @"&AElig;", 198 },
  { @"&Ccedil;", 199 },
  { @"&Egrave;", 200 },
  { @"&Eacute;", 201 },
  { @"&Ecirc;", 202 },
  { @"&Euml;", 203 },
  { @"&Igrave;", 204 },
  { @"&Iacute;", 205 },
  { @"&Icirc;", 206 },
  { @"&Iuml;", 207 },
  { @"&ETH;", 208 },
  { @"&Ntilde;", 209 },
  { @"&Ograve;", 210 },
  { @"&Oacute;", 211 },
  { @"&Ocirc;", 212 },
  { @"&Otilde;", 213 },
  { @"&Ouml;", 214 },
  { @"&times;", 215 },
  { @"&Oslash;", 216 },
  { @"&Ugrave;", 217 },
  { @"&Uacute;", 218 },
  { @"&Ucirc;", 219 },
  { @"&Uuml;", 220 },
  { @"&Yacute;", 221 },
  { @"&THORN;", 222 },
  { @"&szlig;", 223 },
  { @"&agrave;", 224 },
  { @"&aacute;", 225 },
  { @"&acirc;", 226 },
  { @"&atilde;", 227 },
  { @"&auml;", 228 },
  { @"&aring;", 229 },
  { @"&aelig;", 230 },
  { @"&ccedil;", 231 },
  { @"&egrave;", 232 },
  { @"&eacute;", 233 },
  { @"&ecirc;", 234 },
  { @"&euml;", 235 },
  { @"&igrave;", 236 },
  { @"&iacute;", 237 },
  { @"&icirc;", 238 },
  { @"&iuml;", 239 },
  { @"&eth;", 240 },
  { @"&ntilde;", 241 },
  { @"&ograve;", 242 },
  { @"&oacute;", 243 },
  { @"&ocirc;", 244 },
  { @"&otilde;", 245 },
  { @"&ouml;", 246 },
  { @"&divide;", 247 },
  { @"&oslash;", 248 },
  { @"&ugrave;", 249 },
  { @"&uacute;", 250 },
  { @"&ucirc;", 251 },
  { @"&uuml;", 252 },
  { @"&yacute;", 253 },
  { @"&thorn;", 254 },
  { @"&yuml;", 255 },

  // A.2.2. Special characters cont'd
  { @"&OElig;", 338 },
  { @"&oelig;", 339 },
  { @"&Scaron;", 352 },
  { @"&scaron;", 353 },
  { @"&Yuml;", 376 },

  // A.2.3. Symbols
  { @"&fnof;", 402 },

  // A.2.2. Special characters cont'd
  { @"&circ;", 710 },
  { @"&tilde;", 732 },

  // A.2.3. Symbols cont'd
  { @"&Alpha;", 913 },
  { @"&Beta;", 914 },
  { @"&Gamma;", 915 },
  { @"&Delta;", 916 },
  { @"&Epsilon;", 917 },
  { @"&Zeta;", 918 },
  { @"&Eta;", 919 },
  { @"&Theta;", 920 },
  { @"&Iota;", 921 },
  { @"&Kappa;", 922 },
  { @"&Lambda;", 923 },
  { @"&Mu;", 924 },
  { @"&Nu;", 925 },
  { @"&Xi;", 926 },
  { @"&Omicron;", 927 },
  { @"&Pi;", 928 },
  { @"&Rho;", 929 },
  { @"&Sigma;", 931 },
  { @"&Tau;", 932 },
  { @"&Upsilon;", 933 },
  { @"&Phi;", 934 },
  { @"&Chi;", 935 },
  { @"&Psi;", 936 },
  { @"&Omega;", 937 },
  { @"&alpha;", 945 },
  { @"&beta;", 946 },
  { @"&gamma;", 947 },
  { @"&delta;", 948 },
  { @"&epsilon;", 949 },
  { @"&zeta;", 950 },
  { @"&eta;", 951 },
  { @"&theta;", 952 },
  { @"&iota;", 953 },
  { @"&kappa;", 954 },
  { @"&lambda;", 955 },
  { @"&mu;", 956 },
  { @"&nu;", 957 },
  { @"&xi;", 958 },
  { @"&omicron;", 959 },
  { @"&pi;", 960 },
  { @"&rho;", 961 },
  { @"&sigmaf;", 962 },
  { @"&sigma;", 963 },
  { @"&tau;", 964 },
  { @"&upsilon;", 965 },
  { @"&phi;", 966 },
  { @"&chi;", 967 },
  { @"&psi;", 968 },
  { @"&omega;", 969 },
  { @"&thetasym;", 977 },
  { @"&upsih;", 978 },
  { @"&piv;", 982 },

  // A.2.2. Special characters cont'd
  { @"&ensp;", 8194 },
  { @"&emsp;", 8195 },
  { @"&thinsp;", 8201 },
  { @"&zwnj;", 8204 },
  { @"&zwj;", 8205 },
  { @"&lrm;", 8206 },
  { @"&rlm;", 8207 },
  { @"&ndash;", 8211 },
  { @"&mdash;", 8212 },
  { @"&lsquo;", 8216 },
  { @"&rsquo;", 8217 },
  { @"&sbquo;", 8218 },
  { @"&ldquo;", 8220 },
  { @"&rdquo;", 8221 },
  { @"&bdquo;", 8222 },
  { @"&dagger;", 8224 },
  { @"&Dagger;", 8225 },
  // A.2.3. Symbols cont'd
  { @"&bull;", 8226 },
  { @"&hellip;", 8230 },

  // A.2.2. Special characters cont'd
  { @"&permil;", 8240 },

  // A.2.3. Symbols cont'd
  { @"&prime;", 8242 },
  { @"&Prime;", 8243 },

  // A.2.2. Special characters cont'd
  { @"&lsaquo;", 8249 },
  { @"&rsaquo;", 8250 },

  // A.2.3. Symbols cont'd
  { @"&oline;", 8254 },
  { @"&frasl;", 8260 },

  // A.2.2. Special characters cont'd
  { @"&euro;", 8364 },

  // A.2.3. Symbols cont'd
  { @"&image;", 8465 },
  { @"&weierp;", 8472 },
  { @"&real;", 8476 },
  { @"&trade;", 8482 },
  { @"&alefsym;", 8501 },
  { @"&larr;", 8592 },
  { @"&uarr;", 8593 },
  { @"&rarr;", 8594 },
  { @"&darr;", 8595 },
  { @"&harr;", 8596 },
  { @"&crarr;", 8629 },
  { @"&lArr;", 8656 },
  { @"&uArr;", 8657 },
  { @"&rArr;", 8658 },
  { @"&dArr;", 8659 },
  { @"&hArr;", 8660 },
  { @"&forall;", 8704 },
  { @"&part;", 8706 },
  { @"&exist;", 8707 },
  { @"&empty;", 8709 },
  { @"&nabla;", 8711 },
  { @"&isin;", 8712 },
  { @"&notin;", 8713 },
  { @"&ni;", 8715 },
  { @"&prod;", 8719 },
  { @"&sum;", 8721 },
  { @"&minus;", 8722 },
  { @"&lowast;", 8727 },
  { @"&radic;", 8730 },
  { @"&prop;", 8733 },
  { @"&infin;", 8734 },
  { @"&ang;", 8736 },
  { @"&and;", 8743 },
  { @"&or;", 8744 },
  { @"&cap;", 8745 },
  { @"&cup;", 8746 },
  { @"&int;", 8747 },
  { @"&there4;", 8756 },
  { @"&sim;", 8764 },
  { @"&cong;", 8773 },
  { @"&asymp;", 8776 },
  { @"&ne;", 8800 },
  { @"&equiv;", 8801 },
  { @"&le;", 8804 },
  { @"&ge;", 8805 },
  { @"&sub;", 8834 },
  { @"&sup;", 8835 },
  { @"&nsub;", 8836 },
  { @"&sube;", 8838 },
  { @"&supe;", 8839 },
  { @"&oplus;", 8853 },
  { @"&otimes;", 8855 },
  { @"&perp;", 8869 },
  { @"&sdot;", 8901 },
  { @"&lceil;", 8968 },
  { @"&rceil;", 8969 },
  { @"&lfloor;", 8970 },
  { @"&rfloor;", 8971 },
  { @"&lang;", 9001 },
  { @"&rang;", 9002 },
  { @"&loz;", 9674 },
  { @"&spades;", 9824 },
  { @"&clubs;", 9827 },
  { @"&hearts;", 9829 },
  { @"&diams;", 9830 }
};
311
// Taken from http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Special_characters
// This is table A.2.2 Special Characters.
// Kept in ascending uchar order because it is searched with bsearch(). It is
// the table used when escaping without forcing the output to ASCII, so only
// these "special" characters get replaced by their named entities.
static HTMLEscapeMap gUnicodeHTMLEscapeMap[] = {
  // C0 Controls and Basic Latin
  { @"&quot;", 34 },
  { @"&amp;", 38 },
  { @"&apos;", 39 },
  { @"&lt;", 60 },
  { @"&gt;", 62 },

  // Latin Extended-A
  { @"&OElig;", 338 },
  { @"&oelig;", 339 },
  { @"&Scaron;", 352 },
  { @"&scaron;", 353 },
  { @"&Yuml;", 376 },

  // Spacing Modifier Letters
  { @"&circ;", 710 },
  { @"&tilde;", 732 },

  // General Punctuation
  { @"&ensp;", 8194 },
  { @"&emsp;", 8195 },
  { @"&thinsp;", 8201 },
  { @"&zwnj;", 8204 },
  { @"&zwj;", 8205 },
  { @"&lrm;", 8206 },
  { @"&rlm;", 8207 },
  { @"&ndash;", 8211 },
  { @"&mdash;", 8212 },
  { @"&lsquo;", 8216 },
  { @"&rsquo;", 8217 },
  { @"&sbquo;", 8218 },
  { @"&ldquo;", 8220 },
  { @"&rdquo;", 8221 },
  { @"&bdquo;", 8222 },
  { @"&dagger;", 8224 },
  { @"&Dagger;", 8225 },
  { @"&permil;", 8240 },
  { @"&lsaquo;", 8249 },
  { @"&rsaquo;", 8250 },
  { @"&euro;", 8364 },
};
356
357
// bsearch() comparator for the escape tables.
// ucharVoid points at the unichar being looked up (the search key); mapVoid
// points at the HTMLEscapeMap entry being probed. Returns 1, -1 or 0 when the
// key is greater than, less than, or equal to the entry's uchar.
static int EscapeMapCompare(const void *ucharVoid, const void *mapVoid) {
  const unichar key = *(const unichar *)ucharVoid;
  const unichar entry = ((const HTMLEscapeMap *)mapVoid)->uchar;
  if (key == entry) {
    return 0;
  }
  return (key > entry) ? 1 : -1;
}
372
@implementation NSString (GTMNSStringHTMLAdditions)

// Builds an autoreleased string holding the character for a numeric character
// reference value, or returns nil when the value is not accepted.
// Values below USHRT_MAX become a single UTF-16 unit, values in
// 0x10000..0x10FFFF become a surrogate pair, and 0, 0xFFFF and anything above
// 0x10FFFF are rejected.
static NSString *GTMStringFromCodePoint(unsigned long codePoint) {
  if (codePoint == 0 || codePoint == USHRT_MAX || codePoint > 0x10FFFFUL) {
    return nil;
  }
  if (codePoint < USHRT_MAX) {
    unichar uchar = (unichar)codePoint;
    return [NSString stringWithCharacters:&uchar length:1];
  }
  // Supplementary plane: split into a high/low surrogate pair.
  unsigned long offset = codePoint - 0x10000UL;
  unichar pair[2];
  pair[0] = (unichar)(0xD800UL + (offset >> 10));
  pair[1] = (unichar)(0xDC00UL + (offset & 0x3FFUL));
  return [NSString stringWithCharacters:pair length:2];
}

// Core escaping routine shared by the public escaping methods.
//
//  table         - escape table sorted ascending by uchar (it is bsearch()ed).
//  size          - size of |table| in BYTES (i.e. sizeof(theTable)).
//  escapeUnicode - when YES, any character above 127 that is not in |table|
//                  is written as a decimal numeric reference ("&#NNN;").
//
// Returns the receiver itself when it is empty, a new escaped string
// otherwise, or nil if a working buffer could not be obtained.
- (NSString *)gtm_stringByEscapingHTMLUsingTable:(HTMLEscapeMap*)table
                                          ofSize:(NSUInteger)size
                                 escapingUnicode:(BOOL)escapeUnicode {
  NSUInteger length = [self length];
  if (!length) {
    return self;
  }

  NSMutableString *finalString = [NSMutableString string];
  // Scratch space for runs of characters that need no escaping. This must be
  // created with dataWithLength: -- a capacity is only a hint, and the
  // mutableBytes of a zero-length data object may be NULL or too small.
  NSMutableData *data2 = [NSMutableData dataWithLength:sizeof(unichar) * length];

  // this block is common between GTMNSString+HTML and GTMNSString+XML but
  // it's so short that it isn't really worth trying to share.
  const unichar *buffer = CFStringGetCharactersPtr((CFStringRef)self);
  if (!buffer) {
    // We want this buffer to be autoreleased.
    NSMutableData *data = [NSMutableData dataWithLength:length * sizeof(UniChar)];
    if (!data) {
      // COV_NF_START  - Memory fail case
      return nil;
      // COV_NF_END
    }
    [self getCharacters:[data mutableBytes] range:NSMakeRange(0, length)];
    buffer = [data bytes];
  }

  if (!buffer || !data2) {
    // COV_NF_START
    return nil;
    // COV_NF_END
  }

  unichar *buffer2 = (unichar *)[data2 mutableBytes];
  NSUInteger buffer2Length = 0;
  const size_t entryCount = size / sizeof(HTMLEscapeMap);

  for (NSUInteger i = 0; i < length; ++i) {
    const unichar current = buffer[i];
    HTMLEscapeMap *val = bsearch(&current, table, entryCount,
                                 sizeof(HTMLEscapeMap), EscapeMapCompare);
    if (val || (escapeUnicode && current > 127)) {
      // Flush the pending run of unescaped characters first so that output
      // order matches input order.
      if (buffer2Length) {
        CFStringAppendCharacters((CFMutableStringRef)finalString,
                                 buffer2,
                                 buffer2Length);
        buffer2Length = 0;
      }
      if (val) {
        [finalString appendString:val->escapeSequence];
      } else {
        // Numeric references name code points, not UTF-16 units, so a valid
        // surrogate pair is combined into a single reference. An unpaired
        // surrogate is emitted as-is, as before.
        unsigned long codePoint = current;
        if (current >= 0xD800 && current <= 0xDBFF && (i + 1) < length) {
          const unichar low = buffer[i + 1];
          if (low >= 0xDC00 && low <= 0xDFFF) {
            codePoint = 0x10000UL
                + (((unsigned long)current - 0xD800UL) << 10)
                + ((unsigned long)low - 0xDC00UL);
            ++i;  // the low surrogate has been consumed
          }
        }
        [finalString appendFormat:@"&#%lu;", codePoint];
      }
    } else {
      buffer2[buffer2Length] = current;
      buffer2Length += 1;
    }
  }
  if (buffer2Length) {
    CFStringAppendCharacters((CFMutableStringRef)finalString,
                             buffer2,
                             buffer2Length);
  }
  return finalString;
}

// Escapes only the characters listed in gUnicodeHTMLEscapeMap; every other
// character is passed through unchanged.
- (NSString *)gtm_stringByEscapingForHTML {
  return [self gtm_stringByEscapingHTMLUsingTable:gUnicodeHTMLEscapeMap
                                           ofSize:sizeof(gUnicodeHTMLEscapeMap)
                                  escapingUnicode:NO];
} // gtm_stringByEscapingHTML

// Escapes using gAsciiHTMLEscapeMap and writes every remaining character
// above 127 as a numeric reference, so the result contains only ASCII.
- (NSString *)gtm_stringByEscapingForAsciiHTML {
  return [self gtm_stringByEscapingHTMLUsingTable:gAsciiHTMLEscapeMap
                                           ofSize:sizeof(gAsciiHTMLEscapeMap)
                                  escapingUnicode:YES];
} // gtm_stringByEscapingAsciiHTML

// Replaces HTML escape sequences in the receiver with the characters they
// stand for. Handles hex ("&#xa3;") and decimal ("&#123;") numeric
// references as well as the named entities in gAsciiHTMLEscapeMap.
// Unrecognized or malformed sequences are left untouched. Returns the
// receiver itself when it contains no '&'.
- (NSString *)gtm_stringByUnescapingFromHTML {
  NSRange range = NSMakeRange(0, [self length]);
  NSRange subrange = [self rangeOfString:@"&" options:NSBackwardsSearch range:range];

  // if no ampersands, we've got a quick way out
  if (subrange.length == 0) return self;
  NSMutableString *finalString = [NSMutableString stringWithString:self];
  // The string is walked from the end towards the start: replacements only
  // ever touch text after the next search window, so ranges computed against
  // |self| stay valid for |finalString|.
  do {
    NSRange semiColonRange = NSMakeRange(subrange.location, NSMaxRange(range) - subrange.location);
    semiColonRange = [self rangeOfString:@";" options:0 range:semiColonRange];
    // Narrow the window for the next backwards search before any early exit.
    range = NSMakeRange(0, subrange.location);
    // if we don't find a semicolon in the range, we don't have a sequence
    if (semiColonRange.location == NSNotFound) {
      continue;
    }
    NSRange escapeRange = NSMakeRange(subrange.location, semiColonRange.location - subrange.location + 1);
    NSString *escapeString = [self substringWithRange:escapeRange];
    NSUInteger length = [escapeString length];
    // a sequence must be longer than 3 (&lt;) and less than 11 (&thetasym;)
    if (length <= 3 || length >= 11) {
      continue;
    }

    NSString *replacement = nil;
    if ([escapeString characterAtIndex:1] == '#') {
      unichar char2 = [escapeString characterAtIndex:2];
      if (char2 == 'x' || char2 == 'X') {
        // Hex escape sequences &#xa3;
        NSString *hexSequence = [escapeString substringWithRange:NSMakeRange(3, length - 4)];
        NSScanner *scanner = [NSScanner scannerWithString:hexSequence];
        unsigned value;
        // The scanner must consume every digit, otherwise it is not a
        // well-formed reference.
        if ([scanner scanHexInt:&value] &&
            [scanner scanLocation] == length - 4) {
          replacement = GTMStringFromCodePoint(value);
        }
      } else {
        // Decimal sequences &#123;
        NSString *numberSequence = [escapeString substringWithRange:NSMakeRange(2, length - 3)];
        NSScanner *scanner = [NSScanner scannerWithString:numberSequence];
        int value;
        if ([scanner scanInt:&value] &&
            value > 0 &&
            [scanner scanLocation] == length - 3) {
          replacement = GTMStringFromCodePoint((unsigned long)value);
        }
      }
    } else {
      // "standard" sequences
      for (unsigned i = 0; i < sizeof(gAsciiHTMLEscapeMap) / sizeof(HTMLEscapeMap); ++i) {
        if ([escapeString isEqualToString:gAsciiHTMLEscapeMap[i].escapeSequence]) {
          replacement = [NSString stringWithCharacters:&gAsciiHTMLEscapeMap[i].uchar length:1];
          break;
        }
      }
    }

    if (replacement) {
      [finalString replaceCharactersInRange:escapeRange withString:replacement];
    }
  } while ((subrange = [self rangeOfString:@"&" options:NSBackwardsSearch range:range]).length != 0);
  return finalString;
} // gtm_stringByUnescapingHTML

@end