an experiment in making a cocoa webkit browser manageable under X11
at master 342 lines 11 kB view raw
//
// NSString+HTML.m
// MWFeedParser
//
// Copyright (c) 2010 Michael Waterfall
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// 1. The above copyright notice and this permission notice shall be included
//    in all copies or substantial portions of the Software.
//
// 2. This Software cannot be used to archive or collect data such as (but not
//    limited to) that of events, news, experiences and activities, for the
//    purpose of any concept relating to diary/journal keeping.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//

#import "NSString+HTML.h"
#import "GTMNSString+HTML.h"

// Returns the characters this category treats as line breaks:
// LF, CR, Next Line (U+0085), Form Feed (U+000C), Line Separator (U+2028)
// and Paragraph Separator (U+2029).
// The code points are passed through %C (cast to unichar) rather than written
// as \u escapes because C does not allow universal character names for
// control characters.
static NSString *MWHTMLNewLineCharacters(void) {
    return [NSString stringWithFormat:@"\n\r%C%C%C%C",
            (unichar)0x0085, (unichar)0x000C, (unichar)0x2028, (unichar)0x2029];
}

// Returns space and tab followed by the line-break characters above.
static NSString *MWHTMLNewLineAndWhitespaceCharacters(void) {
    return [@" \t" stringByAppendingString:MWHTMLNewLineCharacters()];
}

@implementation NSString (HTML)

#pragma mark - Instance Methods

// Strips HTML comments and tags from the receiver, collapses every run of
// whitespace/new lines into a single space, and finally decodes HTML entities
// via -stringByDecodingHTMLEntities.
// Closing tags are replaced with a space unless they belong to a known inline
// element (a, b, i, q, span, em, strong, cite, abbr, acronym, label).
// Opening tags are removed without adding a space.
// Returns an autoreleased string.
- (NSString *)stringByConvertingHTMLToPlainText {

    // Pool - keeps the temporaries created while scanning short-lived
    NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];

    // Character sets
    NSString *whitespaceCharacters = MWHTMLNewLineAndWhitespaceCharacters();
    NSCharacterSet *newLineAndWhitespaceCharacters =
        [NSCharacterSet characterSetWithCharactersInString:whitespaceCharacters];
    NSCharacterSet *stopCharacters =
        [NSCharacterSet characterSetWithCharactersInString:[@"<" stringByAppendingString:whitespaceCharacters]];
    NSCharacterSet *tagNameCharacters =
        [NSCharacterSet characterSetWithCharactersInString:@"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"];

    // Closing tags of these elements do not get replaced with a space
    NSSet *inlineTagNames = [NSSet setWithObjects:@"a", @"b", @"i", @"q", @"span", @"em", @"strong",
                             @"cite", @"abbr", @"acronym", @"label", nil];

    // Scan and find all tags
    NSMutableString *result = [[NSMutableString alloc] initWithCapacity:self.length];
    NSScanner *scanner = [[NSScanner alloc] initWithString:self];
    [scanner setCharactersToBeSkipped:nil];
    [scanner setCaseSensitive:YES];
    do {

        // Copy text up to the start of a tag or a run of whitespace
        NSString *text = nil;
        if ([scanner scanUpToCharactersFromSet:stopCharacters intoString:&text]) {
            [result appendString:text];
        }

        // Check if we've stopped at a tag/comment or whitespace
        if ([scanner scanString:@"<" intoString:NULL]) {

            if ([scanner scanString:@"!--" intoString:NULL]) {

                // Comment - skip it entirely
                [scanner scanUpToString:@"-->" intoString:NULL];
                [scanner scanString:@"-->" intoString:NULL];

            } else {

                // Tag - a closing tag is replaced with a space unless it is inline
                BOOL replaceTagWithSpace = NO;
                if ([scanner scanString:@"/" intoString:NULL]) {
                    NSString *tagName = nil;
                    BOOL isInlineTag = NO;
                    if ([scanner scanCharactersFromSet:tagNameCharacters intoString:&tagName]) {
                        isInlineTag = [inlineTagNames containsObject:[tagName lowercaseString]];
                    }
                    replaceTagWithSpace = !isInlineTag;
                }

                // Scan past tag
                [scanner scanUpToString:@">" intoString:NULL];
                [scanner scanString:@">" intoString:NULL];

                // Append the space only after the tag has been consumed so that
                // the end-of-input check really prevents a trailing space
                if (replaceTagWithSpace && result.length > 0 && ![scanner isAtEnd]) {
                    [result appendString:@" "];
                }

            }

        } else {

            // Stopped at whitespace - replace all whitespace and new lines with a space
            if ([scanner scanCharactersFromSet:newLineAndWhitespaceCharacters intoString:NULL]) {
                // Don't append a space to the beginning or end of the result
                if (result.length > 0 && ![scanner isAtEnd]) [result appendString:@" "];
            }

        }

    } while (![scanner isAtEnd]);

    // Cleanup
    [scanner release];

    // Decode HTML entities; retain so the value survives the pool drain
    NSString *retString = [[result stringByDecodingHTMLEntities] retain];
    [result release];

    // Drain
    [pool drain];

    // Return
    return [retString autorelease];

}

// Returns the result of -gtm_stringByUnescapingFromHTML (presumably declared
// in GTMNSString+HTML.h) wrapped in a new NSString.
- (NSString *)stringByDecodingHTMLEntities {
    // Can return self so create new string if we're a mutable string
    return [NSString stringWithString:[self gtm_stringByUnescapingFromHTML]];
}


// Returns the result of -gtm_stringByEscapingForAsciiHTML wrapped in a new NSString.
- (NSString *)stringByEncodingHTMLEntities {
    // Can return self so create new string if we're a mutable string
    return [NSString stringWithString:[self gtm_stringByEscapingForAsciiHTML]];
}

// Returns the result of -gtm_stringByEscapingForHTML when isUnicode is YES,
// otherwise of -gtm_stringByEscapingForAsciiHTML, wrapped in a new NSString.
- (NSString *)stringByEncodingHTMLEntities:(BOOL)isUnicode {
    // Can return self so create new string if we're a mutable string
    NSString *escaped = isUnicode ? [self gtm_stringByEscapingForHTML]
                                  : [self gtm_stringByEscapingForAsciiHTML];
    return [NSString stringWithString:escaped];
}

// Returns a copy of the receiver in which every line break is replaced with
// "<br />". A "\r\n" pair counts as a single line break wherever it occurs;
// every other new line character (LF, CR, U+0085, U+000C, U+2028, U+2029)
// produces its own "<br />".
- (NSString *)stringWithNewLinesAsBRs {

    // Pool
    NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];

    // Scanner
    NSScanner *scanner = [[NSScanner alloc] initWithString:self];
    [scanner setCharactersToBeSkipped:nil];
    NSMutableString *result = [[NSMutableString alloc] init];
    NSCharacterSet *newLineCharacters =
        [NSCharacterSet characterSetWithCharactersInString:MWHTMLNewLineCharacters()];

    // Scan - every pass consumes either some text or a run of new lines
    while (![scanner isAtEnd]) {

        // Copy non new line characters
        NSString *text = nil;
        if ([scanner scanUpToCharactersFromSet:newLineCharacters intoString:&text]) {
            [result appendString:text];
        }

        // Add one <br /> per line break in the following run of new line characters
        NSString *newLines = nil;
        if ([scanner scanCharactersFromSet:newLineCharacters intoString:&newLines]) {
            NSUInteger runLength = newLines.length;
            for (NSUInteger i = 0; i < runLength; i++) {
                // Combine \r\n into just 1 <br /> by skipping the \n
                if ([newLines characterAtIndex:i] == '\r' &&
                    i + 1 < runLength &&
                    [newLines characterAtIndex:i + 1] == '\n') {
                    i++;
                }
                [result appendString:@"<br />"];
            }
        }

    }

    // Cleanup; retain the immutable copy so it survives the pool drain
    [scanner release];
    NSString *retString = [[NSString stringWithString:result] retain];
    [result release];

    // Drain
    [pool drain];

    // Return
    return [retString autorelease];

}

// Returns a copy of the receiver in which every run of spaces, tabs and new
// line characters is collapsed into a single space. Runs at the very start or
// very end of the receiver are removed altogether.
- (NSString *)stringByRemovingNewLinesAndWhitespace {

    // Pool
    NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];

    // Scanner
    NSScanner *scanner = [[NSScanner alloc] initWithString:self];
    [scanner setCharactersToBeSkipped:nil];
    NSMutableString *result = [[NSMutableString alloc] init];
    NSCharacterSet *newLineAndWhitespaceCharacters =
        [NSCharacterSet characterSetWithCharactersInString:MWHTMLNewLineAndWhitespaceCharacters()];

    // Scan
    while (![scanner isAtEnd]) {

        // Copy non new line or whitespace characters
        NSString *text = nil;
        if ([scanner scanUpToCharactersFromSet:newLineAndWhitespaceCharacters intoString:&text]) {
            [result appendString:text];
        }

        // Replace the following run with a single space
        if ([scanner scanCharactersFromSet:newLineAndWhitespaceCharacters intoString:NULL]) {
            // Don't append a space to the beginning or end of the result
            if (result.length > 0 && ![scanner isAtEnd]) {
                [result appendString:@" "];
            }
        }

    }

    // Cleanup
    [scanner release];

    // Retain the immutable copy so it survives the pool drain
    NSString *retString = [[NSString stringWithString:result] retain];
    [result release];

    // Drain
    [pool drain];

    // Return
    return [retString autorelease];

}

// Returns a copy of the receiver in which every http/https URL that is not
// directly preceded by =" is wrapped in <a href="..." class="linkified">...</a>.
// Returns the receiver itself when NSRegularExpression is not available at
// runtime or the expression could not be created.
- (NSString *)stringByLinkifyingURLs {
    if (!NSClassFromString(@"NSRegularExpression")) return self;
    NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
    // NOTE(review): "%%" and "&amp;" inside the character classes are taken
    // literally by the regex engine (they look like leftovers from a format
    // string / HTML escaping); kept unchanged so matching stays the same.
    NSString *pattern = @"(?<!=\")\\b((http|https):\\/\\/[\\w\\-_]+(\\.[\\w\\-_]+)+([\\w\\-\\.,@?^=%%&amp;:/~\\+#]*[\\w\\-\\@?^=%%&amp;/~\\+#])?)";
    NSRegularExpression *regex = [NSRegularExpression regularExpressionWithPattern:pattern options:0 error:nil];
    NSString *modifiedString = nil;
    if (regex) {
        // Retain so the value survives the pool drain
        modifiedString = [[regex stringByReplacingMatchesInString:self
                                                          options:0
                                                            range:NSMakeRange(0, [self length])
                                                     withTemplate:@"<a href=\"$1\" class=\"linkified\">$1</a>"] retain];
    }
    [pool drain];
    // Fall back to the receiver instead of returning nil if no regex could be built
    return modifiedString ? [modifiedString autorelease] : self;
}

// Returns a copy of the receiver with everything between "<" and ">" removed.
// Each distinct tag is replaced with a space, except <a>, <span>, <strong>,
// <em> and their closing tags (exact matches without attributes), which are
// removed without a replacement. The result is then passed through
// -stringByRemovingNewLinesAndWhitespace.
// When the receiver contains no "<" a plain copy is returned unmodified.
- (NSString *)stringByStrippingTags {

    // Find first < and short-cut if we can. This is done before the pool is
    // created so the early return cannot leave an undrained pool behind.
    if ([self rangeOfString:@"<" options:NSLiteralSearch].location == NSNotFound) {
        return [NSString stringWithString:self]; // return copy of string as no tags found
    }

    // Pool
    NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];

    // Scan and collect all distinct tags
    NSScanner *scanner = [NSScanner scannerWithString:self];
    [scanner setCharactersToBeSkipped:nil];
    NSMutableSet *tags = [[NSMutableSet alloc] init];
    do {

        // Skip to the next < and capture everything up to (not including) >
        NSString *tag = nil;
        [scanner scanUpToString:@"<" intoString:NULL];
        [scanner scanUpToString:@">" intoString:&tag];

        // Add to set with the closing > restored
        if (tag) {
            [tags addObject:[tag stringByAppendingString:@">"]];
        }

    } while (![scanner isAtEnd]);

    // Tags that are removed without leaving a space behind
    NSSet *inlineTags = [NSSet setWithObjects:@"<a>", @"</a>", @"<span>", @"</span>",
                         @"<strong>", @"</strong>", @"<em>", @"</em>", nil];

    // Replace tags
    NSMutableString *result = [[NSMutableString alloc] initWithString:self];
    for (NSString *tag in tags) {

        // Replace tag with space unless it's an inline element
        NSString *replacement = [inlineTags containsObject:tag] ? @"" : @" ";
        [result replaceOccurrencesOfString:tag
                                withString:replacement
                                   options:NSLiteralSearch
                                     range:NSMakeRange(0, result.length)];
    }

    // Remove multi-spaces and line breaks; retain so the value survives the pool drain
    NSString *finalString = [[result stringByRemovingNewLinesAndWhitespace] retain];

    // Cleanup
    [result release];
    [tags release];

    // Drain
    [pool drain];

    // Return
    return [finalString autorelease];

}

@end