blob: 11080415e22a102d4201de0f02494bb1d4f4721f [file] [log] [blame]
//
// GTMRegex.h
//
// Copyright 2007-2008 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.
//
#import <Foundation/Foundation.h>
#import <regex.h>
#import "GTMDefines.h"
/// Options for controlling the behavior of the matches
enum {
kGTMRegexOptionIgnoreCase = 0x01,
// Ignore case in matching, ie: 'a' matches 'a' or 'A'
kGTMRegexOptionSupressNewlineSupport = 0x02,
// By default (without this option), regular expressions are implicitly
// processed on a line by line basis, where "lines" are delimited by newline
// characters. In this mode '.' (dot) does NOT match newline characters, and
// '^' and '$' match at the beginning and end of the string as well as
// around newline characters. This behavior matches the default behavior for
// regular expressions in other languages including Perl and Python. For
// example,
// foo.*bar
// would match
// fooAAAbar
// but would NOT match
// fooAAA\nbar
// With the kGTMRegexOptionSupressNewlineSupport option, newlines are treated
// just like any other character which means that '.' will match them. In
// this mode, ^ and $ only match the beginning and end of the input string
// and do NOT match around the newline characters. For example,
// foo.*bar
// would match
// fooAAAbar
// and would also match
// fooAAA\nbar
};
typedef NSUInteger GTMRegexOptions;
/// Global contants needed for errors from consuming patterns
// Ignore the "Macro name is a reserved identifier" warning in this section
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreserved-id-macro"
#undef _EXTERN
#undef _INITIALIZE_AS
#if GTMREGEX_DEFINE_GLOBALS
#define _EXTERN
#define _INITIALIZE_AS(x) =x
#else
#define _EXTERN GTM_EXTERN
#define _INITIALIZE_AS(x)
#endif
#pragma clang diagnostic pop
_EXTERN NSString* kGTMRegexErrorDomain _INITIALIZE_AS(@"com.google.mactoolbox.RegexDomain");
enum {
kGTMRegexPatternParseFailedError = -100
};
// Keys for the userInfo from a kGTMRegexErrorDomain/kGTMRegexPatternParseFailedError error
_EXTERN NSString* kGTMRegexPatternErrorPattern _INITIALIZE_AS(@"pattern");
_EXTERN NSString* kGTMRegexPatternErrorErrorString _INITIALIZE_AS(@"patternError");
/// Class for doing Extended Regex operations w/ libregex (see re_format(7)).
//
// NOTE: the docs for recomp/regexec make *no* claims about i18n. All work
// within this class is done w/ UTF-8 so Unicode should move through it safely,
// however, the character classes described in re_format(7) might not really
// be unicode "savvy", so use them and this class w/ that in mind.
//
// Example usage:
//
// NSArray *inputArrayOfStrings = ...
// NSArray *matches = [NSMutableArray array];
//
// GTMRegex *regex = [GTMRegex regexWithPattern:@"foo.*bar"];
// for (NSString *curStr in inputArrayOfStrings) {
// if ([regex matchesString:curStr])
// [matches addObject:curStr];
// }
// ....
//
// -------------
//
// If you need to include something dynamic in a pattern:
//
// NSString *pattern =
// [NSString stringWithFormat:@"^foo:%@bar",
// [GTMRegex escapedPatternForString:inputStr]];
// GTMRegex *regex = [GTMRegex regexWithPattern:pattern];
// ....
//
// -------------
//
// GTMRegex *regex = [GTMRegex regexWithPattern:@"(foo+)(bar)"];
// NSString *highlighted =
// [regex stringByReplacingMatchesInString:inputString
// withReplacement:@"<i>\\1</i><b>\\2</b>"];
// ....
//
// Use NSRegularExpression instead
NS_DEPRECATED(10_0, 10_7, 1_0, 4_0)
@interface GTMRegex : NSObject {
@private
NSString *pattern_;
GTMRegexOptions options_;
regex_t regexData_;
}
/// Create a new, autoreleased object w/ the given regex pattern with the default options
+ (id)regexWithPattern:(NSString *)pattern;
/// Create a new, autoreleased object w/ the given regex pattern and specify the matching options
+ (id)regexWithPattern:(NSString *)pattern options:(GTMRegexOptions)options;
/// Create a new, autoreleased object w/ the given regex pattern, specify the matching options and receive any error consuming the pattern.
+ (id)regexWithPattern:(NSString *)pattern
options:(GTMRegexOptions)options
withError:(NSError **)outErrorOrNULL;
/// Returns a new, autoreleased copy of |str| w/ any pattern chars in it escaped so they have no meaning when used w/in a pattern.
+ (NSString *)escapedPatternForString:(NSString *)str;
/// Initialize a new object w/ the given regex pattern with the default options
- (id)initWithPattern:(NSString *)pattern;
/// Initialize a new object w/ the given regex pattern and specify the matching options
- (id)initWithPattern:(NSString *)pattern options:(GTMRegexOptions)options;
/// Initialize a new object w/ the given regex pattern, specify the matching options, and receive any error consuming the pattern.
- (id)initWithPattern:(NSString *)pattern
options:(GTMRegexOptions)options
withError:(NSError **)outErrorOrNULL;
/// Returns the number of sub patterns in the pattern
//
// Sub Patterns are basically the number of parenthesis blocks w/in the pattern.
// ie: The pattern "foo((bar)|(baz))" has 3 sub patterns.
//
- (NSUInteger)subPatternCount;
/// Returns YES if the whole string |str| matches the pattern.
- (BOOL)matchesString:(NSString *)str;
/// Returns a new, autoreleased array of string that contain the subpattern matches for the string.
//
// If the whole string does not match the pattern, nil is returned.
//
// The api follows the conventions of most regex engines, and index 0 (zero) is
// the full match, then the subpatterns are index 1, 2, ... going left to right.
// If the pattern has optional subpatterns, then anything that didn't match
// will have NSNull at that index.
// ie: The pattern "(fo(o+))((bar)|(baz))" has five subpatterns, and when
// applied to the string "foooooobaz" you'd get an array of:
// 0: "foooooobaz"
// 1: "foooooo"
// 2: "ooooo"
// 3: "baz"
// 4: NSNull
// 5: "baz"
//
- (NSArray *)subPatternsOfString:(NSString *)str;
/// Returns the first match for this pattern in |str|.
- (NSString *)firstSubStringMatchedInString:(NSString *)str;
/// Returns YES if this pattern some substring of |str|.
- (BOOL)matchesSubStringInString:(NSString *)str;
/// Returns a new, autoreleased enumerator that will walk segments (GTMRegexStringSegment) of |str| based on the pattern.
//
// This will split the string into "segments" using the given pattern. You get
// both the matches and parts that are inbetween matches. ie-the entire string
// will eventually be returned.
//
// See GTMRegexStringSegment for more infomation and examples.
//
- (NSEnumerator *)segmentEnumeratorForString:(NSString *)str;
/// Returns a new, autoreleased enumerator that will walk only the matching segments (GTMRegexStringSegment) of |str| based on the pattern.
//
// This extracts the "segments" of the string that used the pattern. So it can
// be used to collect all of the matching substrings from within a string.
//
// See GTMRegexStringSegment for more infomation and examples.
//
- (NSEnumerator *)matchSegmentEnumeratorForString:(NSString *)str;
/// Returns a new, autoreleased string with all matches of the pattern in |str| replaced with |replacementPattern|.
//
// Replacement uses the SED substitution like syntax w/in |replacementPattern|
// to allow the use of matches in the replacment. The replacement pattern can
// make use of any number of match references by using a backslash followed by
// the match subexpression number (ie-"\2", "\0", ...), see subPatternsOfString:
// for details on the subexpression indexing.
//
// REMINDER: you need to double-slash since the slash has meaning to the
// compiler/preprocessor. ie: "\\0"
//
- (NSString *)stringByReplacingMatchesInString:(NSString *)str
withReplacement:(NSString *)replacementPattern;
@end
/// Class returned by the nextObject for the enumerators from GTMRegex
//
// The two enumerators on from GTMRegex return objects of this type. This object
// represents a "piece" of the string the enumerator is walking. It's the apis
// on this object allow you to figure out why each segment was returned and to
// act on it.
//
// The easiest way to under stand this how the enumerators and this class works
// is through and examples ::
// Pattern: "foo+"
// String: "fo bar foobar foofooo baz"
// If you walk this w/ -segmentEnumeratorForString you'll get:
// # nextObjects Calls -isMatch -string
// 1 NO "fo bar "
// 2 YES "foo"
// 3 NO "bar "
// 4 YES "foo"
// 5 YES "fooo"
// 6 NO " baz"
// And if you walk this w/ -matchSegmentEnumeratorForString you'll get:
// # nextObjects Calls -isMatch -string
// 1 YES "foo"
// 2 YES "foo"
// 3 YES "fooo"
// (see the comments on subPatternString for how it works)
//
// Example usage:
//
// NSMutableString processedStr = [NSMutableString string];
// NSEnumerator *enumerator =
// [inputStr segmentEnumeratorForPattern:@"foo+((ba+r)|(ba+z))"];
// GTMRegexStringSegment *segment = nil;
// while ((segment = [enumerator nextObject]) != nil) {
// if ([segment isMatch]) {
// if ([segment subPatterString:2] != nil) {
// // matched: "(ba+r)"
// [processStr appendFormat:@"<b>%@</b>", [segment string]];
// } else {
// // matched: "(ba+z)"
// [processStr appendFormat:@"<i>%@</i>", [segment string]];
// }
// } else {
// [processStr appendString:[segment string]];
// }
// }
// // proccessedStr now has all the versions of foobar wrapped in bold tags,
// // and all the versons of foobaz in italics tags.
// // ie: " fooobar foobaaz " ==> " <b>fooobar</b> <i>foobaaz</i> "
//
@interface GTMRegexStringSegment : NSObject {
@private
NSData *utf8StrBuf_;
regmatch_t *regMatches_; // STRONG: ie-we call free
NSUInteger numRegMatches_;
BOOL isMatch_;
}
/// Returns YES if this segment from from a match of the regex, false if it was a segment between matches.
//
// Use -isMatch to see if the segment from from a match of the pattern or if the
// segment is some text between matches. (NOTE: isMatch is always YES for
// matchSegmentEnumeratorForString)
//
- (BOOL)isMatch;
/// Returns a new, autoreleased string w/ the full text segment from the original string.
- (NSString *)string;
/// Returns a new, autoreleased string w/ the |index| sub pattern from this segment of the original string.
//
// This api follows the conventions of most regex engines, and index 0 (zero) is
// the full match, then the subpatterns are index 1, 2, ... going left to right.
// If the pattern has optional subpatterns, then anything that didn't match
// will return nil.
// ie: When using the pattern "(fo(o+))((bar)|(baz))" the following indexes
// fetch these values for a segment where -string is @"foooooobaz":
// 0: "foooooobaz"
// 1: "foooooo"
// 2: "ooooo"
// 3: "baz"
// 4: nil
// 5: "baz"
//
- (NSString *)subPatternString:(NSUInteger)index;
@end
/// Some helpers to streamline usage of GTMRegex
//
// Example usage:
//
// if ([inputStr matchesPattern:@"foo.*bar"]) {
// // act on match
// ....
// }
//
// -------------
//
// NSString *subStr = [inputStr firstSubStringMatchedByPattern:@"^foo:.*$"];
// if (subStr != nil) {
// // act on subStr
// ....
// }
//
// -------------
//
// NSArray *headingList =
// [inputStr allSubstringsMatchedByPattern:@"^Heading:.*$"];
// // act on the list of headings
// ....
//
// -------------
//
// NSString *highlightedString =
// [inputString stringByReplacingMatchesOfPattern:@"(foo+)(bar)"
// withReplacement:@"<i>\\1</i><b>\\2</b>"];
// ....
//
@interface NSString (GTMRegexAdditions)
/// Returns YES if the full string matches regex |pattern| using the default match options
- (BOOL)gtm_matchesPattern:(NSString *)pattern;
/// Returns a new, autoreleased array of strings that contain the subpattern matches of |pattern| using the default match options
//
// See [GTMRegex subPatternsOfString:] for information about the returned array.
//
- (NSArray *)gtm_subPatternsOfPattern:(NSString *)pattern;
/// Returns a new, autoreleased string w/ the first substring that matched the regex |pattern| using the default match options
- (NSString *)gtm_firstSubStringMatchedByPattern:(NSString *)pattern;
/// Returns YES if a substring string matches regex |pattern| using the default match options
- (BOOL)gtm_subStringMatchesPattern:(NSString *)pattern;
/// Returns a new, autoreleased array of substrings in the string that match the regex |pattern| using the default match options
//
// Note: if the string has no matches, you get an empty array.
- (NSArray *)gtm_allSubstringsMatchedByPattern:(NSString *)pattern;
/// Returns a new, autoreleased segment enumerator that will break the string using pattern w/ the default match options
//
// The enumerator returns GTMRegexStringSegment options, see that class for more
// details and examples.
//
- (NSEnumerator *)gtm_segmentEnumeratorForPattern:(NSString *)pattern;
/// Returns a new, autoreleased segment enumerator that will only return matching segments from the string using pattern w/ the default match options
//
// The enumerator returns GTMRegexStringSegment options, see that class for more
// details and examples.
//
- (NSEnumerator *)gtm_matchSegmentEnumeratorForPattern:(NSString *)pattern;
/// Returns a new, autoreleased string with all matches for pattern |pattern| are replaced w/ |replacementPattern|. Uses the default match options.
//
// |replacemetPattern| has support for using any subExpression that matched,
// see [GTMRegex stringByReplacingMatchesInString:withReplacement:] above
// for details.
//
- (NSString *)gtm_stringByReplacingMatchesOfPattern:(NSString *)pattern
withReplacement:(NSString *)replacementPattern;
@end