common/string.h - external/omaha - Git at Google

 // Copyright 2003-2009 Google Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // ========================================================================

 #ifndef OMAHA_COMMON_STRING_H__
 #define OMAHA_COMMON_STRING_H__

 #include <windows.h>
 #include <vector>
 #include "base/basictypes.h"
 #include "omaha/common/constants.h"
 #include "omaha/common/debug.h"

 namespace omaha {

 #define STR_SIZE(str) (arraysize(str)-1)  // number of characters in char array (only for single-byte string literals!!!)
 #define TSTR_SIZE(tstr) (arraysize(tstr)-1)  // like STR_SIZE but works on _T("string literal") ONLY!!!

 #define kEllipsis L".."

 // The number of replacement matches we expect before we start allocating extra
 // memory to process them. This is an optimization constant.
 #define kExpectedMaxReplaceMatches 100

 // TODO(omaha): above each of these function names, we should
 // define what we expect the implementation to do. that way,
 // implementers will know what is desired. an example would probably
 // make things easiest.
 CString AbbreviateString (const CString & title, int32 max_len);
 CString AbbreviateUri (const CString & uri, int32 max_len);
 CString NormalizeUri (const CString & uri);

 // removes "http://", "ftp://", "mailto:" or "file://" (note that the "file" protocol is
 // like: "file:///~/calendar"; this method removes only the first two slashes)
 CString RemoveInternetProtocolHeader (const CString& url);

 void RemoveFromStart (CString & s, const TCHAR* remove, bool ignore_case);
 void RemoveFromEnd (CString & s, const TCHAR* remove);

 // Limit string to max length, truncating and adding ellipsis if needed
 // Attempts to not leave a partial word at the end, unless min_len is reached
 CString ElideIfNeeded (const CString & input_string, int max_len, int min_len);

 // The ability to clean up a string for relevant target audiences. Add flags accordingly

 // Sanitizes for insertion in an HTML document, uses the basic literals [<>&]
 #define kSanHtml 0x1

 // XML is the HTML replacements, and a few more
 #define kSanXml (kSanHtml | 0x2)

 // Javascript has a separate set of encodings [which is a superset of HTML replacements]
 #define kSanJs (kSanHtml | 0x4)

 // For input fields on HTML documents
 #define kSanHtmlInput 0x8

 // TODO(omaha): be consistent on use of int/uint32/int32 for lengths

 // The input length of the string does not include the null terminator.
 // Caller deletes the returned buffer.
 WCHAR *ToWide (const char *s, int len);

 // returns pointer to data if found otherwise NULL
 const byte *BufferContains (const byte *buf, uint32 buf_len, const byte *data, uint32 data_len);

 // Given a string, 'protect' the characters that are invalid for a given mode.
 // For instance, kSanHtml will replace < with the HTML literal equivalent.
 // If kSanHtml is used, and bold_periods is true, then periods used for url abbreviation are bolded.
 // NOTE(review): the declaration below takes only (in, mode) and has no bold_periods
 // parameter, so the sentence above looks stale - confirm against the implementation.
 // NOTE: If you call AbbreviateLinkForDisplay before this function, then there might be periods
 // used for abbreviation.  BoldAbbreviationPeriods should be called after HighlightTerms.
 CString SanitizeString(const CString & in, DWORD mode);

 // Bolds the periods used for abbreviation.  Call this after HighlightTerms.
 CString BoldAbbreviationPeriods(const CString & in);

 // Unencode a URL encoded string
 CString Unencode(const CString & input);

 CString GetTextInbetween(const CString &input, const CString &start, const CString &end);

 // Given a ? separated string, extract a particular segment, and URL-Unencode it
 CString GetParam(const CString & input, const CString & key);

 // Given an XML style string, extract the contents of a <INPUT>...</INPUT> pair
 CString GetField (const CString & input, const CString & field);

 // Finds a whole word match in the query, followed by a ":".
 // If not found, return -1.
 //
 // Note: this is case sensitive.
 int FindWholeWordMatch (const CString &query,
   const CString &word_to_match,
   const bool end_with_colon,
   const int index_begin);

 // Do whole-word replacement in "str".
 // This does not do partial matches (unlike CString::Replace),
 //   e.g.  CString::Replace will replace "ie" within "pie" and
 // this function will not.
 //
 // Note: this is case sensitive.
 void ReplaceWholeWord (const CString &string_to_replace,
   const CString &replacement,
   const bool trim_whitespace,
   CString *str);

 // Convert Wide to ANSI directly. Use only when it is all ANSI
 CStringA WideToAnsiDirect(const CString & in);

 // Transform a unicode string into UTF8, used primarily by the webserver
 CStringA WideToUtf8(const CString& w);

 // Converts the UTF-8 encoded buffer to an in-memory Unicode (wide character)
 // string.
 // @param utf8 A non-NULL pointer to a UTF-8 encoded buffer that has at
 // least num_bytes valid characters.  If num_bytes is 0, the buffer must be
 // NULL-terminated.
 // @param num_bytes Number of bytes to process from utf8, or 0 if utf8 is
 // NULL-terminated and should be processed in its entirety.
 // @return The Unicode string represented by utf8 (or that part of it
 // specified by num_bytes).  If the UTF-8 representation of the string started
 // with a byte-order marker (BOM), it will be ignored and not included in the
 // returned string.  On failure, the function returns the empty string.
 CString Utf8ToWideChar(const char* utf8, uint32 num_bytes);
 CString Utf8BufferToWideChar(const std::vector<uint8>& buffer);

 // Dealing with Unicode BOM
 bool StartsWithBOM(const TCHAR* string);
 const TCHAR* StringAfterBOM(const TCHAR* string);

 // Convert an ANSI string into Widechar string, according to the specified
 // codepage. The input length can be -1, if the string is null terminated, and
 // the actual length will be used internally.
 BOOL AnsiToWideString(const char *from, int length, UINT codepage, CString *to);

 // Convert char to Wchar directly
 CString AnsiToWideString(const char *from, int length);

 // these functions untested
 // they should not be used unless tested
 // HRESULT AnsiToUTF8 (char * src, int src_len, char * dest, int *dest_len);
 // HRESULT UTF8ToAnsi (char * src, int src_len, char * dest, int *dest_len);
 // HRESULT UCS2ToUTF8 (LPCWSTR src, int src_len, char * dest, int *dest_len);
 // HRESULT UTF8ToUCS2 (char * src, int src_len, LPWSTR dest, int *dest_len);

 // "Absolute" is perhaps not the right term, this normalizes the Uri
 // given http://www.google.com changes to correct http://www.google.com/
 // given http://www.google.com// changes to correct http://www.google.com/
 // given http://www.google.com/home.html returns the same
 CString GetAbsoluteUri(const CString& uri);

 // Reverse (big-endian<->little-endian) the shorts that make up
 // Unicode characters in a byte array of Unicode chars
 HRESULT ReverseUnicodeByteOrder(byte* unicode_string, int size_in_bytes);

 // given http://google.com/bobby this returns http://google.com/
 // If strip_leading is specified, it will turn
 // http://www.google.com into http://google.com
 #define kStrLeadingWww _T("www.")
 // TODO(omaha): no default parameters
 CString GetUriHostName(const CString& uri, bool strip_leading = false);
 CString GetUriHostNameHostOnly(const CString& uri, bool strip_leading_www);

 const char *stristr(const char *string, const char *pattern);
 const WCHAR *stristrW(const WCHAR *string, const WCHAR *pattern);
 const WCHAR *strstrW(const WCHAR *string, const WCHAR *pattern);

 // Add len_to_add to len_so_far, assuming that if it exceeds the
 // length of the line, it will word wrap onto the next line.  Returns
 // the total length of all the lines summed together.
 float GetLenWithWordWrap (const float len_so_far,
   const float len_to_add,
   const uint32 len_line);

 // ----------------------------------------------------------------------
 // QuotedPrintableUnescape()
 //    Copies "src" to "dest", rewriting quoted printable escape sequences
 //    =XX to their ASCII equivalents. src is not null terminated, instead
 //    specify len. I recommend that slen<len_dest, but we honour len_dest
 //    anyway.
 //    RETURNS the length of dest.
 // ----------------------------------------------------------------------
 int QuotedPrintableUnescape(const WCHAR *src, int slen, WCHAR *dest, int len_dest);

 // Return the length to use for the output buffer given to the base64 escape
 // routines. Make sure to use the same value for do_padding in both.
 // This function may return incorrect results if given input_len values that
 // are extremely high, which should happen rarely.
 int CalculateBase64EscapedLen(int input_len, bool do_padding);
 // Use this version when calling Base64Escape without a do_padding arg.
 int CalculateBase64EscapedLen(int input_len);

 // ----------------------------------------------------------------------
 // Base64Escape()
 // WebSafeBase64Escape()
 //    Encode "src" to "dest" using base64 encoding.
 //    src is not null terminated, instead specify len.
 //    'dest' should have at least CalculateBase64EscapedLen() length.
 //    RETURNS the length of dest.
 //    The WebSafe variation uses '-' instead of '+' and '_' instead of '/'
 //    so that we can place the output in the URL or cookies without having
 //    to escape them.  It also has an extra parameter "do_padding",
 //    which when set to false will prevent padding with "=".
 // ----------------------------------------------------------------------
 int Base64Escape(const char *src, int slen, char *dest, int szdest);
 int WebSafeBase64Escape(const char *src, int slen, char *dest,
                         int szdest, bool do_padding);
 void WebSafeBase64Escape(const CStringA& src, CStringA* dest);

 void Base64Escape(const char *src, int szsrc,
                   CStringA* dest, bool do_padding);
 void WebSafeBase64Escape(const char *src, int szsrc,
                          CStringA* dest, bool do_padding);

 // ----------------------------------------------------------------------
 // Base64Unescape()
 //    Copies "src" to "dest", where src is in base64 and is written to its
 //    ASCII equivalents. src is not null terminated, instead specify len.
 //    I recommend that slen<len_dest, but we honour len_dest anyway.
 //    RETURNS the length of dest.
 //    The WebSafe variation uses '-' instead of '+' and '_' instead of '/'.
 // ----------------------------------------------------------------------
 int Base64Unescape(const char *src, int slen, char *dest, int len_dest);
 int WebSafeBase64Unescape(const char *src, int slen, char *dest, int szdest);

 #ifdef UNICODE
 #define IsSpace IsSpaceW
 #else
 #define IsSpace IsSpaceA
 #endif

 bool IsSpaceW(WCHAR c);
 bool IsSpaceA(char c);

 // Remove all leading and trailing whitespace from s.
 // Returns the new length of the string (not including 0-terminator)
 int TrimCString(CString &s);
 int Trim(TCHAR *s);

 // Trims all characters in the delimiter string from both ends of the
 // string s
 void TrimString(CString& s, const TCHAR* delimiters);

 // Strip the first token from the front of argument s.  A token is a
 // series of consecutive non-blank characters - unless the first
 // character is a double-quote ("), in that case the token is the full
 // quoted string
 CString StripFirstQuotedToken(const CString& s);

 // A block of text to separate lines, and back
 void TextToLines(const CString& text, const TCHAR* delimiter, std::vector<CString>* lines);
 // (LinesToText puts a delimiter at the end of the last line too)
 void LinesToText(const std::vector<CString>& lines, const TCHAR* delimiter, CString* text);

 // Make a CString lower case
 void MakeLowerCString(CString & s);

 // Clean up the string: replace all whitespace with spaces, and
 // replace consecutive spaces with one.
 // Returns the new length of the string (not including 0-terminator)
 int CleanupWhitespaceCString(CString &s);
 int CleanupWhitespace(TCHAR *s);

 int HexDigitToInt (WCHAR c);
 bool IsHexDigit (WCHAR c);

 // Converts to lower, but does so much faster if the string is ANSI
 TCHAR * String_FastToLower(TCHAR * str);

 // Replacement for the CRT toupper(c)
 int String_ToUpper(int c);

 // Replacement for the CRT toupper(c)
 char String_ToUpperA(char c);

 // Converts str to lowercase in place.
 void String_ToLower(TCHAR* str);

 // Converts str to uppercase in place.
 void String_ToUpper(TCHAR* str);

 bool String_IsUpper(TCHAR c);

 // String comparison based on length
 // Replacement for the CRT strncmp(i)
 int String_StrNCmp(const TCHAR * str1, const TCHAR * str2, uint32 len, bool ignore_case);

 // Replacement for strncpy() - except ALWAYS ends string with null
 TCHAR* String_StrNCpy(TCHAR* destination, const TCHAR* source, uint32 len);

 // check if str starts with start_str
 bool String_StartsWith(const TCHAR *str, const TCHAR *start_str, bool ignore_case);

 // check if str starts with start_str, for char *
 bool String_StartsWithA(const char *str, const char *start_str, bool ignore_case);

 // check if str ends with end_str
 bool String_EndsWith(const TCHAR *str, const TCHAR *end_str, bool ignore_case);

 // If the input string str doesn't already end with the string end_str,
 // make it end with the string end_str.
 CString String_MakeEndWith(const TCHAR *str, const TCHAR* end_str, bool ignore_case);

 // converts an int to a string
 CString String_Int64ToString(int64 value, int radix);

 // converts a uint64 to a string
 CString String_Uint64ToString(uint64 value, int radix);

 // Convert numeric types to CString
 CString sizet_to_str(const size_t & i);
 CString itostr(const int i);
 CString itostr(const uint32 i);

 // converts a large number to an approximate value, like "1.2G" or "900M"
 // base_ten = true if based on powers of 10 (like disk space) otherwise based
 // on powers of two.  power = 0 for *10^0, 1 for *10^3 or 2^10, 2 for *10^6
 // or 2^20, and 3 for *10^9 or 2^30, in other words: no units, K, M, or G.
 CString String_LargeIntToApproximateString(uint64 value, bool base_ten, int* power);

 // converts a string to an int
 // Does not check for overflow
 int32 String_StringToInt(const TCHAR * str);

 int64 String_StringToInt64(const TCHAR * str);

 // converts a double to a string
 // specifies the number of digits after the decimal point
 // TODO(omaha): Make this work for negative values
 CString String_DoubleToString(double value, int point_digits);

 // convert string to double
 double String_StringToDouble (const TCHAR *s);

 // Converts a character to a digit
 // if the character is not a digit return -1
 int32 String_CharToDigit(const TCHAR c);

 // returns true if ASCII digit
 bool String_IsDigit(const TCHAR c);

 // Converts the digit to a character.
 TCHAR String_DigitToChar(unsigned int n);

 // Returns true if an identifier character: letter, digit, or "_"
 bool String_IsIdentifierChar(const TCHAR c);

 // Returns true if the string has letters in it.
 // This is used by the keyword extractor to downweight numbers,
 // IDs (sequences of numbers like social security numbers), etc.
 bool String_HasAlphabetLetters (const TCHAR *str);

 // Return the index of the first occurrence of s2 in s1, or -1 if none.
 int String_FindString(const TCHAR *s1, const TCHAR *s2);
 int String_FindString(const TCHAR *s1, const TCHAR *s2, int start_pos);

 // Return the index of the first occurrence of c in str, or -1 if none.
 int String_FindChar(const TCHAR *str, const TCHAR c);
 // start from index start_pos
 int String_FindChar(const TCHAR *str, const TCHAR c, int start_pos);

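 // Return the index of the first occurrence of c in string, or -1 if none.
 // NOTE(review): given the function name, this presumably reports the LAST
 // occurrence (searching from the end) - confirm against the implementation.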
 int String_ReverseFindChar(const TCHAR * str, TCHAR c);

 bool String_Contains(const TCHAR *s1, const TCHAR *s2);

 // Replace old_char with new_char in str.
 void String_ReplaceChar(TCHAR *str, TCHAR old_char, TCHAR new_char);
 void String_ReplaceChar(CString & str, TCHAR old_char, TCHAR new_char);

 // Append the given character to the string if it doesn't already end with it.
 // There must be room in the string to append the character if necessary.
 void String_EndWithChar(TCHAR *str, TCHAR c);

 // A special version of the replace function which takes advantage of CString properties
 // to make it much faster when the string grows

 // NOTE: it CANNOT match more than kMaxReplaceMatches instances within the string
 // do not use this function if that is a possibility

 // The maximum number of replacements to perform. Essentially infinite
 #define kRepMax kUint32Max
 int ReplaceCString (CString & src, const TCHAR *from, unsigned int from_len,
                                    const TCHAR *to, unsigned int to_len,
                                    unsigned int max_matches);

 // replace from with to in src
 // on memory allocation error, returns the original string
 int ReplaceString (TCHAR *src, const TCHAR *from, const TCHAR *to, TCHAR **out, int *out_len);

 // replace from with to in src
 // will replace in place if length(to) <= length(from) and return *out == src
 // WILL CREATE NEW OUTPUT BUFFER OTHERWISE and set created_new_string to true
 // on memory allocation error, returns the original string
 int ReplaceStringMaybeInPlace (TCHAR *src, const TCHAR *from, const TCHAR *to, TCHAR **out, int *out_len, bool *created_new_string);

 // you really want to use the straight TCHAR version above. you know it
 // on memory allocation error, returns the original string
 int ReplaceCString (CString & src, const TCHAR *from, const TCHAR *to);

 long __cdecl Wcstol (const wchar_t *nptr, wchar_t **endptr, int ibase);
 unsigned long __cdecl Wcstoul (const wchar_t *nptr, wchar_t **endptr, int ibase);

 // Functions on arrays of strings

 // Returns true iff s is in the array strings (case-insensitive compare)
 bool String_MemberOf(const TCHAR* const* strings, const TCHAR* s);
 // Returns index of s in the array of strings (or -1 for missing) (case-insensitive compare)
 int String_IndexOf(const TCHAR* const* strings, const TCHAR* s);

 // Serializes a time64 to a string, and then loads it out again; this string is not for human consumption
 time64 StringToTime(const CString & time);
 CString TimeToString(const time64 & time);

 // looks for string A followed by any number of spaces/tabs followed by string b
 // returns starting position of a if found, NULL if not
 // case insensitive
 const TCHAR *FindStringASpaceStringB (const TCHAR *s, const TCHAR *a, const TCHAR *b);

 bool IsAlphaA (const char c);
 bool IsDigitA (const char c);

 // TODO(omaha): deprecate since we have secure CRT now.
 // dest_buffer_len includes the NULL
 // always NULL terminates
 // dest must be a valid string with length < dest_buffer_len
 void SafeStrCat (TCHAR *dest, const TCHAR *src, int dest_buffer_len);

 const TCHAR *ExtractNextDouble (const TCHAR *s, double *f);

 TCHAR *String_PathFindExtension(const TCHAR *path);

 // Lowercases a single character using the Win32 ::CharLower API.
 // The character is passed to ::CharLower in the pointer argument itself
 // (not as a pointer to a string), and the converted character is read back
 // out of the returned pointer value, hence the two reinterpret_casts.
 //
 // @param c  the character to convert.
 // @returns the value ::CharLower produced for c, truncated back to TCHAR.
 //
 // NOTE(review): in a non-UNICODE build TCHAR is presumably a (signed) char;
 // a negative value may sign-extend when cast to a pointer and then be treated
 // by ::CharLower as a string address - confirm before relying on this for
 // non-ASCII ANSI characters.
 inline TCHAR Char_ToLower(TCHAR c) {
 // C4302: truncation from 'type 1' to 'type 2'.
 // Use push/pop so the warning state of the including file is restored
 // exactly, rather than forcing C4302 back to the compiler default for
 // everything that follows this header.
 #pragma warning(push)
 #pragma warning(disable : 4302)
   return reinterpret_cast<TCHAR>(::CharLower(reinterpret_cast<TCHAR*>(c)));
 #pragma warning(pop)
 }

 // @returns the lowercase character (type is int to be consistent with the CRT)
 int String_ToLowerChar(int c);

 // Replacement for the CRT tolower(c)
 char String_ToLowerCharAnsi(char c);

 bool String_PathRemoveFileSpec(TCHAR *path);

 // Escapes and unescapes strings (shlwapi-based implementation).
 // The intended usage for these APIs is escaping strings to make up
 // URLs, for example building query strings.
 //
 // Pass false to the flag segment_only to escape the url. This will not
 // cause the conversion of the # (%23), ? (%3F), and / (%2F) characters.
 HRESULT StringEscape(const CString& str_in,
                      bool segment_only,
                      CString* str_out);

 HRESULT StringUnescape(const CString& str_in, CString* str_out);

 // Converts a string to an int, performs all the necessary
 // checks to ensure that the string is correct.
 // Tests for overflow and non-int strings.
 bool String_StringToDecimalIntChecked(const TCHAR* str, int* value);

 // Converts CLSID to a string.
 bool CLSIDToCString(const GUID& guid, CString* str);

 // Converts a string to a bool.
 HRESULT String_StringToBool(const TCHAR* str, bool* value);

 // Convert boolean to its string representation.
 HRESULT String_BoolToString(bool value, CString* string);

 // Converts a string to a Tristate enum.
 bool String_StringToTristate(const TCHAR* str, Tristate* value);

 // Extracts the name and value from a string that contains a name/value pair.
 bool ParseNameValuePair(const CString& token, TCHAR separator,
                         CString* name, CString* value);

 // Splits a command line buffer into two parts in place:
 // first argument (which could be path to executable) and remaining arguments.
 // Note that the same pointer can be used for both command_line and
 // either of the remaining parameters.
 bool SplitCommandLineInPlace(TCHAR *command_line,
                              TCHAR **first_argument,
                              TCHAR **remaining_arguments);

 // Returns true if the unicode string only contains ascii values.
 bool ContainsOnlyAsciiChars(const CString& str);
 // Converts a buffer of bytes to a hex string.
 CString BytesToHex(const uint8* bytes, size_t num_bytes);

 // Converts a vector of bytes to a hex string.
 CString BytesToHex(const std::vector<uint8>& bytes);

 void JoinStrings(const std::vector<CString>& components,
                  const TCHAR* delim,
                  CString* result);

 void JoinStringsInArray(const TCHAR* components[],
                         int num_components,
                         const TCHAR* delim,
                         CString* result);

 // Formats the specified message ID.
 // It is similar to CStringT::FormatMessage() but it returns an empty string
 // instead of throwing when the message ID cannot be loaded.
 CString FormatResourceMessage(uint32 resource_id, ...);

 // Formats an error code as an 8-digit HRESULT-style hex number or an unsigned
 // integer depending on whether it matches the HRESULT failure format.
 CString FormatErrorCode(DWORD error_code);

 // Converts the unicode string into a utf8 encoded, urlencoded string.
 // The resulting ascii string is returned in a wide CString.
 HRESULT WideStringToUtf8UrlEncodedString(const CString& str, CString* out);

 // Converts a string that is in the utf8 representation and is urlencoded
 // into a unicode string.
 HRESULT Utf8UrlEncodedStringToWideString(const CString& str, CString* out);

 }  // namespace omaha

 #endif  // OMAHA_COMMON_STRING_H__