// An experiment in making a Cocoa WebKit browser manageable under X11
//
// NSString+HTML.m
// MWFeedParser
//
// Copyright (c) 2010 Michael Waterfall
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// 1. The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// 2. This Software cannot be used to archive or collect data such as (but not
// limited to) that of events, news, experiences and activities, for the
// purpose of any concept relating to diary/journal keeping.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
29
30#import "NSString+HTML.h"
31#import "GTMNSString+HTML.h"
32
@implementation NSString (HTML)

#pragma mark - Private Helpers

// Builds a character set containing `characters` plus every new-line character
// these methods recognise:
//   Line Feed (\n), Carriage Return (\r), Next Line (U+0085),
//   Form Feed (U+000C), Line Separator (U+2028), Paragraph Separator (U+2029).
// The code points live in a unichar array so that no format-string varargs
// (and no C universal character names for control characters) are needed.
static NSCharacterSet *MWHTMLNewLineCharacterSetIncluding(NSString *characters) {
    static const unichar newLines[] = { '\n', '\r', 0x0085, 0x000C, 0x2028, 0x2029 };
    NSMutableString *members = [NSMutableString stringWithString:characters];
    [members appendString:[NSString stringWithCharacters:newLines
                                                  length:sizeof(newLines) / sizeof(newLines[0])]];
    return [NSCharacterSet characterSetWithCharactersInString:members];
}

// Appends one separating space to `text`, unless `text` is still empty or
// already ends with a space. Keeps separators from stacking up when a closing
// tag is directly followed by whitespace or by another closing tag.
static void MWHTMLAppendSeparator(NSMutableString *text) {
    if (text.length > 0 && [text characterAtIndex:text.length - 1] != ' ') {
        [text appendString:@" "];
    }
}

#pragma mark - Instance Methods

// Converts an HTML fragment to plain text.
//  - Comments and tags are removed.
//  - A closing tag becomes a single space unless it closes one of the inline
//    elements a, b, i, q, span, em, strong, cite, abbr, acronym or label.
//    Opening tags are removed without inserting anything.
//  - Every run of whitespace / new lines becomes a single space.
//  - The result never starts or ends with a separator space and never
//    contains two separator spaces in a row.
//  - HTML entities are decoded last, via -stringByDecodingHTMLEntities.
// Returns a new autoreleased string.
- (NSString *)stringByConvertingHTMLToPlainText {

    // Pool for the temporaries created while scanning
    NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];

    // Character sets
    NSCharacterSet *whitespaceCharacters = MWHTMLNewLineCharacterSetIncluding(@" \t");
    NSCharacterSet *stopCharacters = MWHTMLNewLineCharacterSetIncluding(@"< \t");
    NSCharacterSet *tagNameCharacters = [NSCharacterSet characterSetWithCharactersInString:
                                         @"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"];

    // Closing tags of these elements must not introduce a space
    NSSet *inlineTagNames = [NSSet setWithObjects:
                             @"a", @"b", @"i", @"q", @"span", @"em", @"strong",
                             @"cite", @"abbr", @"acronym", @"label", nil];

    // Scanner that skips nothing implicitly, so whitespace is seen explicitly
    NSMutableString *result = [[NSMutableString alloc] initWithCapacity:self.length];
    NSScanner *scanner = [[NSScanner alloc] initWithString:self];
    [scanner setCharactersToBeSkipped:nil];
    [scanner setCaseSensitive:YES];

    NSString *text = nil, *tagName = nil;
    do {

        // Copy everything up to the next tag start or whitespace. Because the
        // stop set contains every whitespace character, copied text never
        // contains a space - all spaces in `result` are separators we added.
        text = nil;
        if ([scanner scanUpToCharactersFromSet:stopCharacters intoString:&text]) {
            [result appendString:text];
        }

        if ([scanner scanString:@"<" intoString:NULL]) {

            if ([scanner scanString:@"!--" intoString:NULL]) {

                // Comment - skip it entirely
                [scanner scanUpToString:@"-->" intoString:NULL];
                [scanner scanString:@"-->" intoString:NULL];

            } else {

                if ([scanner scanString:@"/" intoString:NULL]) {

                    // Closing tag - separate the surrounding text unless the
                    // element is an inline one
                    BOOL isInlineTag = NO;
                    tagName = nil;
                    if ([scanner scanCharactersFromSet:tagNameCharacters intoString:&tagName]) {
                        isInlineTag = [inlineTagNames containsObject:[tagName lowercaseString]];
                    }
                    if (!isInlineTag) MWHTMLAppendSeparator(result);

                }

                // Skip the remainder of the tag
                [scanner scanUpToString:@">" intoString:NULL];
                [scanner scanString:@">" intoString:NULL];

            }

        } else if ([scanner scanCharactersFromSet:whitespaceCharacters intoString:NULL]) {

            // Whitespace run - collapse to one separator
            MWHTMLAppendSeparator(result);

        }

    } while (![scanner isAtEnd]);
    [scanner release];

    // A separator is only useful between two pieces of text. The closing-tag
    // branch cannot know whether more text follows (the scanner is still in
    // front of the tag's ">"), so drop a dangling separator here instead.
    if (result.length > 0 && [result characterAtIndex:result.length - 1] == ' ') {
        [result deleteCharactersInRange:NSMakeRange(result.length - 1, 1)];
    }

    // Decode HTML entities; retain across the drain, hand back autoreleased
    NSString *plainText = [[result stringByDecodingHTMLEntities] retain];
    [result release];
    [pool drain];
    return [plainText autorelease];

}

// Returns a new string with HTML entities decoded by
// -gtm_stringByUnescapingFromHTML.
- (NSString *)stringByDecodingHTMLEntities {
    // The GTM method can return self, so create a new string in case we're mutable
    return [NSString stringWithString:[self gtm_stringByUnescapingFromHTML]];
}

// Returns a new string escaped with -gtm_stringByEscapingForAsciiHTML.
// Equivalent to -stringByEncodingHTMLEntities:NO.
- (NSString *)stringByEncodingHTMLEntities {
    return [self stringByEncodingHTMLEntities:NO];
}

// Returns a new string escaped for HTML. When isUnicode is YES
// -gtm_stringByEscapingForHTML is used, otherwise
// -gtm_stringByEscapingForAsciiHTML.
- (NSString *)stringByEncodingHTMLEntities:(BOOL)isUnicode {
    // The GTM methods can return self, so create a new string in case we're mutable
    NSString *escaped = isUnicode ? [self gtm_stringByEscapingForHTML]
                                  : [self gtm_stringByEscapingForAsciiHTML];
    return [NSString stringWithString:escaped];
}

// Returns a new string in which every line terminator is replaced by "<br />".
// A "\r\n" pair counts as ONE terminator wherever it occurs; each of \n, \r,
// U+0085, U+000C, U+2028 and U+2029 otherwise produces one "<br />".
- (NSString *)stringWithNewLinesAsBRs {

    // Pool for the temporaries created while scanning
    NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];

    NSCharacterSet *newLineCharacters = MWHTMLNewLineCharacterSetIncluding(@"");
    NSScanner *scanner = [[NSScanner alloc] initWithString:self];
    [scanner setCharactersToBeSkipped:nil];
    NSMutableString *result = [[NSMutableString alloc] initWithCapacity:self.length];

    NSString *line = nil;
    while (![scanner isAtEnd]) {

        // Copy the text of the current line
        line = nil;
        if ([scanner scanUpToCharactersFromSet:newLineCharacters intoString:&line]) {
            [result appendString:line];
        }
        if ([scanner isAtEnd]) break;

        // The scanner now sits on a line terminator. Consume exactly one:
        // either a "\r\n" pair or a single new-line character. Handling one
        // terminator per iteration is what lets "\r\n" be recognised in the
        // middle of a run such as "\n\r\n". Every character in the set is a
        // single UTF-16 unit, so advancing by one is safe.
        if (![scanner scanString:@"\r\n" intoString:NULL]) {
            [scanner setScanLocation:[scanner scanLocation] + 1];
        }
        [result appendString:@"<br />"];

    }
    [scanner release];

    // Immutable copy; retain across the drain, hand back autoreleased
    NSString *converted = [[NSString stringWithString:result] retain];
    [result release];
    [pool drain];
    return [converted autorelease];

}

// Returns a new string in which every run of spaces, tabs and new-line
// characters is collapsed to a single space. Leading and trailing runs are
// removed entirely.
- (NSString *)stringByRemovingNewLinesAndWhitespace {

    // Pool for the temporaries created while scanning
    NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];

    NSCharacterSet *whitespaceCharacters = MWHTMLNewLineCharacterSetIncluding(@" \t");
    NSScanner *scanner = [[NSScanner alloc] initWithString:self];
    [scanner setCharactersToBeSkipped:nil];
    NSMutableString *result = [[NSMutableString alloc] init];

    NSString *word = nil;
    while (![scanner isAtEnd]) {

        // Copy the next run of non-whitespace characters
        word = nil;
        if ([scanner scanUpToCharactersFromSet:whitespaceCharacters intoString:&word]) {
            [result appendString:word];
        }

        // Replace the following whitespace run with one space, but never at
        // the very beginning or end of the result
        if ([scanner scanCharactersFromSet:whitespaceCharacters intoString:NULL] &&
            result.length > 0 &&
            ![scanner isAtEnd]) {
            [result appendString:@" "];
        }

    }
    [scanner release];

    // Immutable copy; retain across the drain, hand back autoreleased
    NSString *collapsed = [[NSString stringWithString:result] retain];
    [result release];
    [pool drain];
    return [collapsed autorelease];

}

// Returns a new string in which every http:// or https:// URL that is not
// directly preceded by =" (i.e. not already an attribute value) is wrapped in
// <a href="URL" class="linkified">URL</a>.
// When NSRegularExpression is not available at runtime an unchanged copy of
// the receiver is returned.
- (NSString *)stringByLinkifyingURLs {

    // Return a copy rather than self so a mutable receiver is never handed
    // back, matching the other methods of this category
    if (!NSClassFromString(@"NSRegularExpression")) return [NSString stringWithString:self];

    NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
    NSString *pattern = @"(?<!=\")\\b((http|https):\\/\\/[\\w\\-_]+(\\.[\\w\\-_]+)+([\\w\\-\\.,@?^=%%&:/~\\+#]*[\\w\\-\\@?^=%%&/~\\+#])?)";
    NSRegularExpression *regex = [NSRegularExpression regularExpressionWithPattern:pattern
                                                                           options:0
                                                                             error:nil];
    NSString *linkified = [regex stringByReplacingMatchesInString:self
                                                          options:0
                                                            range:NSMakeRange(0, [self length])
                                                     withTemplate:@"<a href=\"$1\" class=\"linkified\">$1</a>"];

    // Retain across the drain, hand back autoreleased
    [linkified retain];
    [pool drain];
    return [linkified autorelease];

}

// Returns a new string with everything of the form "<...>" removed.
// The exact tags <a>, </a>, <span>, </span>, <strong>, </strong>, <em> and
// </em> are removed without a trace; every other tag (including those inline
// tags when they carry attributes) is replaced by a space. The result is then
// passed through -stringByRemovingNewLinesAndWhitespace.
// A receiver containing no "<" is returned as an unchanged copy.
- (NSString *)stringByStrippingTags {

    // Short-cut when there is no "<" at all. This check runs BEFORE the
    // autorelease pool is created so the early return cannot leave an
    // undrained pool behind.
    if ([self rangeOfString:@"<" options:NSLiteralSearch].location == NSNotFound) {
        return [NSString stringWithString:self];
    }

    // Pool for the temporaries created while scanning
    NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];

    // Collect the distinct tags that occur in the receiver
    NSScanner *scanner = [NSScanner scannerWithString:self];
    [scanner setCharactersToBeSkipped:nil];
    NSMutableSet *tags = [[NSMutableSet alloc] init];
    NSString *openTag = nil;
    do {

        // Skip to the next "<", then read up to (not including) the next ">"
        openTag = nil;
        [scanner scanUpToString:@"<" intoString:NULL];
        if ([scanner scanUpToString:@">" intoString:&openTag]) {
            [tags addObject:[openTag stringByAppendingString:@">"]];
        }

    } while (![scanner isAtEnd]);

    // Tags that are removed without leaving a space behind
    NSSet *inlineTags = [NSSet setWithObjects:
                         @"<a>", @"</a>", @"<span>", @"</span>",
                         @"<strong>", @"</strong>", @"<em>", @"</em>", nil];

    // Replace every occurrence of every collected tag
    NSMutableString *result = [[NSMutableString alloc] initWithString:self];
    for (NSString *tag in tags) {
        NSString *replacement = [inlineTags containsObject:tag] ? @"" : @" ";
        [result replaceOccurrencesOfString:tag
                                withString:replacement
                                   options:NSLiteralSearch
                                     range:NSMakeRange(0, result.length)];
    }

    // Remove multi-spaces and line breaks
    NSString *stripped = [[result stringByRemovingNewLinesAndWhitespace] retain];

    // Cleanup
    [result release];
    [tags release];
    [pool drain];

    return [stripped autorelease];

}

@end
342@end