// blob: 4dbb8ec5df1fb15681f9c60970c0e10c6fac7311 [file] [log] [blame]
/*
* Copyright (C) 2011 Daniel Bates (dbates@intudata.com). All Rights Reserved.
* Copyright (c) 2012 Google, inc. All Rights Reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of Google Inc. nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef THIRD_PARTY_BLINK_RENDERER_PLATFORM_TEXT_DECODE_ESCAPE_SEQUENCES_H_
#define THIRD_PARTY_BLINK_RENDERER_PLATFORM_TEXT_DECODE_ESCAPE_SEQUENCES_H_
#include "third_party/blink/renderer/platform/wtf/allocator/allocator.h"
#include "third_party/blink/renderer/platform/wtf/assertions.h"
#include "third_party/blink/renderer/platform/wtf/text/ascii_ctype.h"
#include "third_party/blink/renderer/platform/wtf/text/string_builder.h"
#include "third_party/blink/renderer/platform/wtf/text/text_encoding.h"
#include "third_party/blink/renderer/platform/wtf/wtf_size_t.h"
namespace blink {
// Policy type for the non-standard "%uXXXX" escape, in which XXXX is four
// hexadecimal digits. See
// <http://en.wikipedia.org/wiki/Percent-encoding#Non-standard_implementations>.
struct Unicode16BitEscapeSequence {
  STATIC_ONLY(Unicode16BitEscapeSequence);

  // Number of characters occupied by one complete sequence, e.g. %u26C4.
  enum { kSequenceSize = 6 };

  // Looks for the introducer "%u" at or after |start_position| and returns
  // whatever String::Find() reports for it.
  static wtf_size_t FindInString(const String& string,
                                 wtf_size_t start_position) {
    return string.Find("%u", start_position);
  }

  // Starting at |start_position|, walks over back-to-back, well-formed
  // "%uXXXX" sequences and returns the index just past the last one. The
  // result equals |start_position| when no complete sequence starts there.
  // A sequence is only accepted if all of it lies before |end_position|.
  static wtf_size_t FindEndOfRun(const String& string,
                                 wtf_size_t start_position,
                                 wtf_size_t end_position) {
    wtf_size_t cursor = start_position;
    for (;;) {
      // Not enough room left for a whole sequence.
      if (end_position - cursor < kSequenceSize)
        break;
      // The two-character introducer must be present.
      if (string[cursor] != '%')
        break;
      if (string[cursor + 1] != 'u')
        break;
      // All four payload characters must be hexadecimal digits.
      if (!IsASCIIHexDigit(string[cursor + 2]))
        break;
      if (!IsASCIIHexDigit(string[cursor + 3]))
        break;
      if (!IsASCIIHexDigit(string[cursor + 4]))
        break;
      if (!IsASCIIHexDigit(string[cursor + 5]))
        break;
      cursor += kSequenceSize;
    }
    return cursor;
  }

  // Converts a run of |run_length| characters beginning at |run| into a
  // String holding one code unit per sequence. The text encoding argument is
  // unused by this policy.
  template <typename CharType>
  static String DecodeRun(const CharType* run,
                          wtf_size_t run_length,
                          const WTF::TextEncoding&) {
    // Each %u-escape sequence represents a UTF-16 code unit. See
    // <http://www.w3.org/International/iri-edit/draft-duerst-iri.html#anchor29>.
    // The run is expected to come from FindEndOfRun(), i.e. to consist solely
    // of adjacent, already validated sequences, so nothing is re-checked here.
    const wtf_size_t sequence_count = run_length / kSequenceSize;
    StringBuilder decoded;
    decoded.ReserveCapacity(sequence_count);
    const CharType* sequence = run;
    for (wtf_size_t remaining = sequence_count; remaining != 0; --remaining) {
      // sequence[0] and sequence[1] are "%u"; the digits follow, most
      // significant nibble first.
      const auto nibble3 = ToASCIIHexValue(sequence[2]);
      const auto nibble2 = ToASCIIHexValue(sequence[3]);
      const auto nibble1 = ToASCIIHexValue(sequence[4]);
      const auto nibble0 = ToASCIIHexValue(sequence[5]);
      UChar code_unit =
          (nibble3 << 12) | (nibble2 << 8) | (nibble1 << 4) | nibble0;
      decoded.Append(code_unit);
      sequence += kSequenceSize;
    }
    return decoded.ToString();
  }
};
// Policy type for ordinary percent-escapes of the form "%XX", where XX is two
// hexadecimal digits naming one byte.
struct URLEscapeSequence {
  // Number of characters occupied by one complete sequence, e.g. %41.
  enum { kSequenceSize = 3 };

  // Looks for a '%' at or after |start_position| and returns whatever
  // String::find() reports for it.
  static wtf_size_t FindInString(const String& string,
                                 wtf_size_t start_position) {
    return string.find('%', start_position);
  }

  // Returns the index just past the run that begins at |start_position| and
  // does not extend beyond |end_position|.
  //
  // Simplifying assumption: a supported encoding may use up to two unescaped
  // characters in the range 0x40 - 0x7F as trailing bytes of a multi-byte
  // sequence, and those have to reach the decoder together with the escaped
  // bytes. The run therefore stops at the first character outside
  // 0x40 - 0x7F, after two consecutive characters inside that range, or at a
  // '%' that is not followed by two hexadecimal digits.
  static wtf_size_t FindEndOfRun(const String& string,
                                 wtf_size_t start_position,
                                 wtf_size_t end_position) {
    wtf_size_t cursor = start_position;
    wtf_size_t trailing_count = 0;
    while (cursor < end_position) {
      if (string[cursor] == '%') {
        const bool is_complete_escape =
            end_position - cursor >= kSequenceSize &&
            IsASCIIHexDigit(string[cursor + 1]) &&
            IsASCIIHexDigit(string[cursor + 2]);
        if (!is_complete_escape)
          break;
        cursor += kSequenceSize;
        // A fresh escape restarts the allowance of trailing characters.
        trailing_count = 0;
        continue;
      }

      const bool in_trailing_range =
          string[cursor] >= 0x40 && string[cursor] <= 0x7F;
      if (!in_trailing_range || trailing_count >= 2)
        break;
      cursor += 1;
      trailing_count += 1;
    }
    return cursor;
  }

  // Turns the |run_length| characters at |run| into bytes (escapes become the
  // byte they name, every other character is copied as is) and decodes those
  // bytes with |encoding|, or with UTF8Encoding() when |encoding| is not
  // valid.
  template <typename CharType>
  static String DecodeRun(const CharType* run,
                          wtf_size_t run_length,
                          const WTF::TextEncoding& encoding) {
    // The run is expected to come from FindEndOfRun(): every '%' in it starts
    // a valid escape, though literal characters may sit between escapes.
    //
    // Unescaping can only shrink the text, so |run_length| bytes are enough.
    Vector<char, 512> bytes;
    bytes.resize(run_length);
    char* out = bytes.data();
    const CharType* const input_end = run + run_length;
    for (const CharType* in = run; in < input_end;) {
      if (in[0] != '%') {
        *out++ = in[0];
        in += 1;
        continue;
      }
      *out++ = (ToASCIIHexValue(in[1]) << 4) | ToASCIIHexValue(in[2]);
      in += kSequenceSize;
    }
    // The write pointer must still be inside the buffer.
    DCHECK_GE(bytes.size(), static_cast<size_t>(out - bytes.data()));
    const WTF::TextEncoding& decoder =
        encoding.IsValid() ? encoding : UTF8Encoding();
    return decoder.Decode(bytes.data(),
                          static_cast<wtf_size_t>(out - bytes.data()));
  }
};
// Returns a copy of |string| in which every run of escape sequences
// recognised by the |EscapeSequence| policy has been replaced by its decoded
// text. |EscapeSequence| supplies FindInString(), FindEndOfRun() and
// DecodeRun(); |encoding| is handed through to DecodeRun(). Text outside of
// runs, and any run whose decoded form is empty, is copied verbatim.
template <typename EscapeSequence>
String DecodeEscapeSequences(const String& string,
                             const WTF::TextEncoding& encoding) {
  const wtf_size_t length = string.length();
  StringBuilder output;
  // Everything before |copied_up_to| has already been emitted to |output|.
  wtf_size_t copied_up_to = 0;
  // Where the next search for an escape introducer begins.
  wtf_size_t search_from = 0;

  for (;;) {
    const wtf_size_t run_start =
        EscapeSequence::FindInString(string, search_from);
    if (run_start == kNotFound)
      break;

    const wtf_size_t run_end =
        EscapeSequence::FindEndOfRun(string, run_start, length);
    if (run_end == run_start) {
      // The introducer did not start a usable run; step over one character so
      // the search makes progress.
      search_from = run_end + 1;
      continue;
    }
    search_from = run_end;

    const wtf_size_t run_length = run_end - run_start;
    String decoded;
    if (string.Is8Bit()) {
      decoded = EscapeSequence::DecodeRun(string.Characters8() + run_start,
                                          run_length, encoding);
    } else {
      decoded = EscapeSequence::DecodeRun(string.Characters16() + run_start,
                                          run_length, encoding);
    }
    // An empty result leaves the run in place; it is copied untouched later
    // because |copied_up_to| is not advanced.
    if (decoded.IsEmpty())
      continue;

    // Emit the pending literal text, then the decoded replacement.
    output.Append(string, copied_up_to, run_start - copied_up_to);
    output.Append(decoded);
    copied_up_to = run_end;
  }

  // Emit whatever follows the last replaced run.
  output.Append(string, copied_up_to, length - copied_up_to);
  return output.ToString();
}
} // namespace blink
#endif // THIRD_PARTY_BLINK_RENDERER_PLATFORM_TEXT_DECODE_ESCAPE_SEQUENCES_H_