| // |
| // GTMNSString+HTML.m |
| // Dealing with NSStrings that contain HTML |
| // |
| // Copyright 2006-2008 Google Inc. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); you may not |
| // use this file except in compliance with the License. You may obtain a copy |
| // of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
| // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
| // License for the specific language governing permissions and limitations under |
| // the License. |
| // |
| |
| #import "GTMDefines.h" |
| #import "GTMNSString+HTML.h" |
| |
// This translation unit otherwise contributes only category methods. When it
// is the sole member of a static library, libtool warns about that, so a
// throwaway global with default visibility is exported to keep it quiet.
__attribute__((visibility("default"))) char GTMNSString_HTMLExportToSuppressLibToolWarning = 0;
| |
// One row of an HTML escape table: the entity text paired with the UTF-16
// code unit it stands for.
typedef struct {
  // Full entity text, including the leading '&' and trailing ';'. Marked
  // __unsafe_unretained so the struct is a plain C struct under ARC; the
  // tables in this file only ever store string literals, which are never
  // deallocated. The qualifier has no effect under manual reference counting.
  __unsafe_unretained NSString *escapeSequence;
  // The character the entity represents. The tables are sorted and searched
  // on this field.
  unichar uchar;
} HTMLEscapeMap;
| |
// Escape table used when the output must be pure ASCII, and as the lookup
// table for named entities when unescaping.
//
// Taken from http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Special_characters
// (entity sets A.2.1 Latin-1, A.2.2 Special and A.2.3 Symbols).
//
// Each escapeSequence is the literal entity text ("&amp;", not "&"), and the
// rows MUST stay ordered by uchar, lowest to highest, because the table is
// searched with bsearch().
static HTMLEscapeMap gAsciiHTMLEscapeMap[] = {
  // A.2.2. Special characters
  { @"&quot;", 34 },
  { @"&amp;", 38 },
  { @"&apos;", 39 },
  { @"&lt;", 60 },
  { @"&gt;", 62 },

  // A.2.1. Latin-1 characters
  { @"&nbsp;", 160 },
  { @"&iexcl;", 161 },
  { @"&cent;", 162 },
  { @"&pound;", 163 },
  { @"&curren;", 164 },
  { @"&yen;", 165 },
  { @"&brvbar;", 166 },
  { @"&sect;", 167 },
  { @"&uml;", 168 },
  { @"&copy;", 169 },
  { @"&ordf;", 170 },
  { @"&laquo;", 171 },
  { @"&not;", 172 },
  { @"&shy;", 173 },
  { @"&reg;", 174 },
  { @"&macr;", 175 },
  { @"&deg;", 176 },
  { @"&plusmn;", 177 },
  { @"&sup2;", 178 },
  { @"&sup3;", 179 },
  { @"&acute;", 180 },
  { @"&micro;", 181 },
  { @"&para;", 182 },
  { @"&middot;", 183 },
  { @"&cedil;", 184 },
  { @"&sup1;", 185 },
  { @"&ordm;", 186 },
  { @"&raquo;", 187 },
  { @"&frac14;", 188 },
  { @"&frac12;", 189 },
  { @"&frac34;", 190 },
  { @"&iquest;", 191 },
  { @"&Agrave;", 192 },
  { @"&Aacute;", 193 },
  { @"&Acirc;", 194 },
  { @"&Atilde;", 195 },
  { @"&Auml;", 196 },
  { @"&Aring;", 197 },
  { @"&AElig;", 198 },
  { @"&Ccedil;", 199 },
  { @"&Egrave;", 200 },
  { @"&Eacute;", 201 },
  { @"&Ecirc;", 202 },
  { @"&Euml;", 203 },
  { @"&Igrave;", 204 },
  { @"&Iacute;", 205 },
  { @"&Icirc;", 206 },
  { @"&Iuml;", 207 },
  { @"&ETH;", 208 },
  { @"&Ntilde;", 209 },
  { @"&Ograve;", 210 },
  { @"&Oacute;", 211 },
  { @"&Ocirc;", 212 },
  { @"&Otilde;", 213 },
  { @"&Ouml;", 214 },
  { @"&times;", 215 },
  { @"&Oslash;", 216 },
  { @"&Ugrave;", 217 },
  { @"&Uacute;", 218 },
  { @"&Ucirc;", 219 },
  { @"&Uuml;", 220 },
  { @"&Yacute;", 221 },
  { @"&THORN;", 222 },
  { @"&szlig;", 223 },
  { @"&agrave;", 224 },
  { @"&aacute;", 225 },
  { @"&acirc;", 226 },
  { @"&atilde;", 227 },
  { @"&auml;", 228 },
  { @"&aring;", 229 },
  { @"&aelig;", 230 },
  { @"&ccedil;", 231 },
  { @"&egrave;", 232 },
  { @"&eacute;", 233 },
  { @"&ecirc;", 234 },
  { @"&euml;", 235 },
  { @"&igrave;", 236 },
  { @"&iacute;", 237 },
  { @"&icirc;", 238 },
  { @"&iuml;", 239 },
  { @"&eth;", 240 },
  { @"&ntilde;", 241 },
  { @"&ograve;", 242 },
  { @"&oacute;", 243 },
  { @"&ocirc;", 244 },
  { @"&otilde;", 245 },
  { @"&ouml;", 246 },
  { @"&divide;", 247 },
  { @"&oslash;", 248 },
  { @"&ugrave;", 249 },
  { @"&uacute;", 250 },
  { @"&ucirc;", 251 },
  { @"&uuml;", 252 },
  { @"&yacute;", 253 },
  { @"&thorn;", 254 },
  { @"&yuml;", 255 },

  // A.2.2. Special characters cont'd
  { @"&OElig;", 338 },
  { @"&oelig;", 339 },
  { @"&Scaron;", 352 },
  { @"&scaron;", 353 },
  { @"&Yuml;", 376 },

  // A.2.3. Symbols
  { @"&fnof;", 402 },

  // A.2.2. Special characters cont'd
  { @"&circ;", 710 },
  { @"&tilde;", 732 },

  // A.2.3. Symbols cont'd
  { @"&Alpha;", 913 },
  { @"&Beta;", 914 },
  { @"&Gamma;", 915 },
  { @"&Delta;", 916 },
  { @"&Epsilon;", 917 },
  { @"&Zeta;", 918 },
  { @"&Eta;", 919 },
  { @"&Theta;", 920 },
  { @"&Iota;", 921 },
  { @"&Kappa;", 922 },
  { @"&Lambda;", 923 },
  { @"&Mu;", 924 },
  { @"&Nu;", 925 },
  { @"&Xi;", 926 },
  { @"&Omicron;", 927 },
  { @"&Pi;", 928 },
  { @"&Rho;", 929 },
  { @"&Sigma;", 931 },
  { @"&Tau;", 932 },
  { @"&Upsilon;", 933 },
  { @"&Phi;", 934 },
  { @"&Chi;", 935 },
  { @"&Psi;", 936 },
  { @"&Omega;", 937 },
  { @"&alpha;", 945 },
  { @"&beta;", 946 },
  { @"&gamma;", 947 },
  { @"&delta;", 948 },
  { @"&epsilon;", 949 },
  { @"&zeta;", 950 },
  { @"&eta;", 951 },
  { @"&theta;", 952 },
  { @"&iota;", 953 },
  { @"&kappa;", 954 },
  { @"&lambda;", 955 },
  { @"&mu;", 956 },
  { @"&nu;", 957 },
  { @"&xi;", 958 },
  { @"&omicron;", 959 },
  { @"&pi;", 960 },
  { @"&rho;", 961 },
  { @"&sigmaf;", 962 },
  { @"&sigma;", 963 },
  { @"&tau;", 964 },
  { @"&upsilon;", 965 },
  { @"&phi;", 966 },
  { @"&chi;", 967 },
  { @"&psi;", 968 },
  { @"&omega;", 969 },
  { @"&thetasym;", 977 },
  { @"&upsih;", 978 },
  { @"&piv;", 982 },

  // A.2.2. Special characters cont'd
  { @"&ensp;", 8194 },
  { @"&emsp;", 8195 },
  { @"&thinsp;", 8201 },
  { @"&zwnj;", 8204 },
  { @"&zwj;", 8205 },
  { @"&lrm;", 8206 },
  { @"&rlm;", 8207 },
  { @"&ndash;", 8211 },
  { @"&mdash;", 8212 },
  { @"&lsquo;", 8216 },
  { @"&rsquo;", 8217 },
  { @"&sbquo;", 8218 },
  { @"&ldquo;", 8220 },
  { @"&rdquo;", 8221 },
  { @"&bdquo;", 8222 },
  { @"&dagger;", 8224 },
  { @"&Dagger;", 8225 },
  // A.2.3. Symbols cont'd
  { @"&bull;", 8226 },
  { @"&hellip;", 8230 },

  // A.2.2. Special characters cont'd
  { @"&permil;", 8240 },

  // A.2.3. Symbols cont'd
  { @"&prime;", 8242 },
  { @"&Prime;", 8243 },

  // A.2.2. Special characters cont'd
  { @"&lsaquo;", 8249 },
  { @"&rsaquo;", 8250 },

  // A.2.3. Symbols cont'd
  { @"&oline;", 8254 },
  { @"&frasl;", 8260 },

  // A.2.2. Special characters cont'd
  { @"&euro;", 8364 },

  // A.2.3. Symbols cont'd
  { @"&image;", 8465 },
  { @"&weierp;", 8472 },
  { @"&real;", 8476 },
  { @"&trade;", 8482 },
  { @"&alefsym;", 8501 },
  { @"&larr;", 8592 },
  { @"&uarr;", 8593 },
  { @"&rarr;", 8594 },
  { @"&darr;", 8595 },
  { @"&harr;", 8596 },
  { @"&crarr;", 8629 },
  { @"&lArr;", 8656 },
  { @"&uArr;", 8657 },
  { @"&rArr;", 8658 },
  { @"&dArr;", 8659 },
  { @"&hArr;", 8660 },
  { @"&forall;", 8704 },
  { @"&part;", 8706 },
  { @"&exist;", 8707 },
  { @"&empty;", 8709 },
  { @"&nabla;", 8711 },
  { @"&isin;", 8712 },
  { @"&notin;", 8713 },
  { @"&ni;", 8715 },
  { @"&prod;", 8719 },
  { @"&sum;", 8721 },
  { @"&minus;", 8722 },
  { @"&lowast;", 8727 },
  { @"&radic;", 8730 },
  { @"&prop;", 8733 },
  { @"&infin;", 8734 },
  { @"&ang;", 8736 },
  { @"&and;", 8743 },
  { @"&or;", 8744 },
  { @"&cap;", 8745 },
  { @"&cup;", 8746 },
  { @"&int;", 8747 },
  { @"&there4;", 8756 },
  { @"&sim;", 8764 },
  { @"&cong;", 8773 },
  { @"&asymp;", 8776 },
  { @"&ne;", 8800 },
  { @"&equiv;", 8801 },
  { @"&le;", 8804 },
  { @"&ge;", 8805 },
  { @"&sub;", 8834 },
  { @"&sup;", 8835 },
  { @"&nsub;", 8836 },
  { @"&sube;", 8838 },
  { @"&supe;", 8839 },
  { @"&oplus;", 8853 },
  { @"&otimes;", 8855 },
  { @"&perp;", 8869 },
  { @"&sdot;", 8901 },
  { @"&lceil;", 8968 },
  { @"&rceil;", 8969 },
  { @"&lfloor;", 8970 },
  { @"&rfloor;", 8971 },
  { @"&lang;", 9001 },
  { @"&rang;", 9002 },
  { @"&loz;", 9674 },
  { @"&spades;", 9824 },
  { @"&clubs;", 9827 },
  { @"&hearts;", 9829 },
  { @"&diams;", 9830 }
};
| |
// Escape table used when the output may contain non-ASCII characters, so
// only the "special" characters are replaced by named entities.
//
// Taken from http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Special_characters
// This is table A.2.2 Special Characters.
//
// Each escapeSequence is the literal entity text ("&amp;", not "&"), and the
// rows MUST stay ordered by uchar, lowest to highest, because the table is
// searched with bsearch().
static HTMLEscapeMap gUnicodeHTMLEscapeMap[] = {
  // C0 Controls and Basic Latin
  { @"&quot;", 34 },
  { @"&amp;", 38 },
  { @"&apos;", 39 },
  { @"&lt;", 60 },
  { @"&gt;", 62 },

  // Latin Extended-A
  { @"&OElig;", 338 },
  { @"&oelig;", 339 },
  { @"&Scaron;", 352 },
  { @"&scaron;", 353 },
  { @"&Yuml;", 376 },

  // Spacing Modifier Letters
  { @"&circ;", 710 },
  { @"&tilde;", 732 },

  // General Punctuation
  { @"&ensp;", 8194 },
  { @"&emsp;", 8195 },
  { @"&thinsp;", 8201 },
  { @"&zwnj;", 8204 },
  { @"&zwj;", 8205 },
  { @"&lrm;", 8206 },
  { @"&rlm;", 8207 },
  { @"&ndash;", 8211 },
  { @"&mdash;", 8212 },
  { @"&lsquo;", 8216 },
  { @"&rsquo;", 8217 },
  { @"&sbquo;", 8218 },
  { @"&ldquo;", 8220 },
  { @"&rdquo;", 8221 },
  { @"&bdquo;", 8222 },
  { @"&dagger;", 8224 },
  { @"&Dagger;", 8225 },
  { @"&permil;", 8240 },
  { @"&lsaquo;", 8249 },
  { @"&rsaquo;", 8250 },
  { @"&euro;", 8364 },
};
| |
| |
// bsearch() comparator for the escape tables. The key (first argument) points
// at a bare unichar; the element (second argument) points at an HTMLEscapeMap
// row. Returns -1, 0 or 1 according to how the key orders against the row's
// uchar field.
static int EscapeMapCompare(const void *ucharVoid, const void *mapVoid) {
  const unichar key = *(const unichar *)ucharVoid;
  const unichar rowChar = ((const HTMLEscapeMap *)mapVoid)->uchar;
  if (key == rowChar) {
    return 0;
  }
  return (key > rowChar) ? 1 : -1;
}
| |
// Returns the UTF-16 string for a code point taken from a numeric character
// reference, or nil when the reference should be left untouched (zero, or a
// value beyond U+10FFFF).
static NSString *GTMStringFromHTMLCodePoint(unsigned long codePoint) {
  if (codePoint == 0 || codePoint > 0x10FFFF) {
    return nil;
  }
  if (codePoint <= 0xFFFF) {
    unichar uchar = (unichar)codePoint;
    return [NSString stringWithCharacters:&uchar length:1];
  }
  // Code points in the supplementary planes are stored as a surrogate pair.
  unsigned long offset = codePoint - 0x10000;
  unichar uchars[2];
  uchars[0] = (unichar)(0xD800 + (offset >> 10));
  uchars[1] = (unichar)(0xDC00 + (offset & 0x3FF));
  return [NSString stringWithCharacters:uchars length:2];
}

// Resolves one candidate escape sequence (everything from '&' through ';',
// inclusive) to the text it stands for. Returns nil when the candidate is not
// a recognised named entity or a well-formed numeric reference, in which case
// the caller leaves the original text alone.
static NSString *GTMStringForHTMLEscapeSequence(NSString *escapeString) {
  NSUInteger length = [escapeString length];
  // A sequence must be longer than 3 (&lt;) and shorter than 11 (&thetasym;).
  if (length <= 3 || length >= 11) {
    return nil;
  }

  if ([escapeString characterAtIndex:1] != '#') {
    // "Standard" named sequences; the comparison is case sensitive.
    const size_t entryCount = sizeof(gAsciiHTMLEscapeMap) / sizeof(HTMLEscapeMap);
    for (size_t i = 0; i < entryCount; ++i) {
      if ([escapeString isEqualToString:gAsciiHTMLEscapeMap[i].escapeSequence]) {
        return [NSString stringWithCharacters:&gAsciiHTMLEscapeMap[i].uchar length:1];
      }
    }
    return nil;
  }

  // Numeric sequences: hex (&#xa3;) or decimal (&#123;).
  unichar marker = [escapeString characterAtIndex:2];
  BOOL isHex = (marker == 'x' || marker == 'X');
  NSUInteger digitsStart = isHex ? 3 : 2;
  NSUInteger digitsLength = length - digitsStart - 1;  // drop the trailing ';'
  NSString *digits = [escapeString substringWithRange:NSMakeRange(digitsStart, digitsLength)];
  NSScanner *scanner = [NSScanner scannerWithString:digits];

  unsigned long codePoint = 0;
  if (isHex) {
    unsigned hexValue = 0;
    if (![scanner scanHexInt:&hexValue]) {
      return nil;
    }
    codePoint = hexValue;
  } else {
    int decimalValue = 0;
    if (![scanner scanInt:&decimalValue] || decimalValue <= 0) {
      return nil;
    }
    codePoint = (unsigned long)decimalValue;
  }
  // Reject sequences with trailing junk after the number.
  if ([scanner scanLocation] != digitsLength) {
    return nil;
  }
  return GTMStringFromHTMLCodePoint(codePoint);
}

@implementation NSString (GTMNSStringHTMLAdditions)

// Shared worker for the two public escaping methods.
//
//   table         - escape table sorted ascending by uchar (it is searched
//                   with bsearch()).
//   size          - size of |table| in BYTES (i.e. sizeof(theArray)), not the
//                   number of entries.
//   escapeUnicode - when YES, any character above 127 that has no entry in
//                   |table| is written as a decimal numeric reference.
//
// Returns the receiver itself when it is empty, nil if a working buffer could
// not be obtained, and otherwise a new string with the replacements applied.
- (NSString *)gtm_stringByEscapingHTMLUsingTable:(HTMLEscapeMap*)table
                                          ofSize:(NSUInteger)size
                                 escapingUnicode:(BOOL)escapeUnicode {
  NSUInteger length = [self length];
  if (!length) {
    return self;
  }

  NSMutableString *finalString = [NSMutableString string];
  // Scratch space for runs of characters that need no escaping. It must be
  // created with a real length: a capacity is only a hint and does not make
  // -mutableBytes a buffer that is safe to write into.
  NSMutableData *data2 = [NSMutableData dataWithLength:sizeof(unichar) * length];

  // this block is common between GTMNSString+HTML and GTMNSString+XML but
  // it's so short that it isn't really worth trying to share.
  const unichar *buffer = CFStringGetCharactersPtr((__bridge CFStringRef)self);
  NSMutableData *data = nil;
  if (!buffer) {
    // No direct access to the backing store, so copy the characters out.
    data = [NSMutableData dataWithLength:length * sizeof(UniChar)];
    if (!data) {
      // COV_NF_START  - Memory fail case
      _GTMDevLog(@"couldn't alloc buffer");
      return nil;
      // COV_NF_END
    }
    [self getCharacters:[data mutableBytes] range:NSMakeRange(0, length)];
    buffer = [data bytes];
  }

  if (!buffer || !data2) {
    // COV_NF_START
    _GTMDevLog(@"Unable to allocate buffer or data2");
    return nil;
    // COV_NF_END
  }

  unichar *buffer2 = (unichar *)[data2 mutableBytes];
  NSUInteger buffer2Length = 0;
  const NSUInteger entryCount = size / sizeof(HTMLEscapeMap);

  for (NSUInteger i = 0; i < length; ++i) {
    unichar current = buffer[i];
    HTMLEscapeMap *val = bsearch(&current, table, entryCount,
                                 sizeof(HTMLEscapeMap), EscapeMapCompare);
    BOOL needsNumericEscape = (!val && escapeUnicode && current > 127);

    if (!val && !needsNumericEscape) {
      // Plain character: queue it up and append the whole run in one go.
      buffer2[buffer2Length++] = current;
      continue;
    }

    // Flush the pending run of plain characters before the escape.
    if (buffer2Length) {
      CFStringAppendCharacters((__bridge CFMutableStringRef)finalString,
                               buffer2,
                               buffer2Length);
      buffer2Length = 0;
    }

    if (val) {
      [finalString appendString:val->escapeSequence];
      continue;
    }

    // Numeric reference. A valid surrogate pair is folded into the single
    // code point it encodes, because a reference to a lone surrogate value is
    // not valid HTML. An unpaired surrogate is written out as-is.
    unsigned long codePoint = current;
    if (current >= 0xD800 && current <= 0xDBFF && i + 1 < length) {
      unichar next = buffer[i + 1];
      if (next >= 0xDC00 && next <= 0xDFFF) {
        codePoint = 0x10000UL
            + (((unsigned long)current - 0xD800) << 10)
            + ((unsigned long)next - 0xDC00);
        ++i;  // the low surrogate has been consumed too
      }
    }
    [finalString appendFormat:@"&#%lu;", codePoint];
  }

  if (buffer2Length) {
    CFStringAppendCharacters((__bridge CFMutableStringRef)finalString,
                             buffer2,
                             buffer2Length);
  }
  return finalString;
}

// Escapes the receiver for HTML whose encoding can carry Unicode: only the
// characters listed in gUnicodeHTMLEscapeMap are replaced by entities.
- (NSString *)gtm_stringByEscapingForHTML {
  return [self gtm_stringByEscapingHTMLUsingTable:gUnicodeHTMLEscapeMap
                                           ofSize:sizeof(gUnicodeHTMLEscapeMap)
                                  escapingUnicode:NO];
}

// Escapes the receiver for pure-ASCII HTML: characters listed in
// gAsciiHTMLEscapeMap become named entities and every other character above
// 127 becomes a decimal numeric reference.
- (NSString *)gtm_stringByEscapingForAsciiHTML {
  return [self gtm_stringByEscapingHTMLUsingTable:gAsciiHTMLEscapeMap
                                           ofSize:sizeof(gAsciiHTMLEscapeMap)
                                  escapingUnicode:YES];
}

// Replaces named entities (those in gAsciiHTMLEscapeMap) and hex/decimal
// numeric references in the receiver with the characters they represent.
// Anything that is not a recognised sequence is left as it was. Returns the
// receiver itself when it contains no '&'.
- (NSString *)gtm_stringByUnescapingFromHTML {
  NSRange searchRange = NSMakeRange(0, [self length]);
  NSRange ampRange = [self rangeOfString:@"&"
                                 options:NSBackwardsSearch
                                   range:searchRange];

  // if no ampersands, we've got a quick way out
  if (ampRange.length == 0) return self;

  // Work from the end of the string towards the start. Ranges are computed
  // against the (unchanged) receiver, and because every replacement happens
  // at or after the current '&', the same ranges remain valid in finalString
  // for everything still to be processed.
  NSMutableString *finalString = [NSMutableString stringWithString:self];
  while (ampRange.length != 0) {
    NSUInteger ampLocation = ampRange.location;

    // Only look for the ';' between this '&' and the previously handled one.
    NSRange tailRange = NSMakeRange(ampLocation,
                                    NSMaxRange(searchRange) - ampLocation);
    NSRange semiColonRange = [self rangeOfString:@";" options:0 range:tailRange];
    searchRange = NSMakeRange(0, ampLocation);

    // if we don't find a semicolon in the range, we don't have a sequence
    if (semiColonRange.location != NSNotFound) {
      NSRange escapeRange = NSMakeRange(ampLocation,
                                        semiColonRange.location - ampLocation + 1);
      NSString *escapeString = [self substringWithRange:escapeRange];
      NSString *replacement = GTMStringForHTMLEscapeSequence(escapeString);
      if (replacement) {
        [finalString replaceCharactersInRange:escapeRange withString:replacement];
      }
    }

    ampRange = [self rangeOfString:@"&"
                           options:NSBackwardsSearch
                             range:searchRange];
  }
  return finalString;
}

@end