// Source blob: 6b24fe7359c40b01fa7ef2cebb2217b175490699
// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <algorithm>
#include <unicode/ucnv.h>
#include <unicode/uidna.h>
#include <unicode/ulocdata.h>
#include <unicode/uniset.h>
#include <unicode/uscript.h>
#include <unicode/uset.h>
#include "build/build_config.h"
#if defined(OS_WIN)
#include <windows.h>
#include <winsock2.h>
#elif defined(OS_POSIX)
#include <sys/socket.h>
#include <fcntl.h>
#endif
#include "net/base/net_util.h"
#include "base/basictypes.h"
#include "base/file_path.h"
#include "base/file_util.h"
#include "base/logging.h"
#include "base/path_service.h"
#include "base/scoped_ptr.h"
#include "base/string_escape.h"
#include "base/string_piece.h"
#include "base/string_tokenizer.h"
#include "base/string_util.h"
#include "base/sys_string_conversions.h"
#include "base/time.h"
#include "base/time_format.h"
#include "googleurl/src/gurl.h"
#include "googleurl/src/url_canon.h"
#include "googleurl/src/url_parse.h"
#include "net/base/escape.h"
#include "net/base/net_module.h"
#include "net/base/base64.h"
#include "unicode/datefmt.h"
#if !defined(OS_MACOSX)
#include "net_resources.h"
#endif
using base::Time;
namespace {
// Prefix that FilePathToFileURL() prepends to a file path to form a file URL.
// It is a FilePath::CharType array so it can seed a FilePath::StringType.
static const FilePath::CharType kFileURLPrefix[] =
FILE_PATH_LITERAL("file:///");
// The general list of blocked ports. A port in this list is rejected by
// IsPortAllowedByDefault() unless a specific protocol overrides it
// (Ex: FTP is allowed to use ports 21 and 22; see kAllowedFtpPorts).
static const int kRestrictedPorts[] = {
1, // tcpmux
7, // echo
9, // discard
11, // systat
13, // daytime
15, // netstat
17, // qotd
19, // chargen
20, // ftp data
21, // ftp access
22, // ssh
23, // telnet
25, // smtp
37, // time
42, // name
43, // nicname
53, // domain
77, // priv-rjs
79, // finger
87, // ttylink
95, // supdup
101, // hostname
102, // iso-tsap
103, // gppitnp
104, // acr-nema
109, // pop2
110, // pop3
111, // sunrpc
113, // auth
115, // sftp
117, // uucp-path
119, // nntp
123, // NTP
135, // loc-srv /epmap
139, // netbios
143, // imap2
179, // BGP
389, // ldap
465, // smtp+ssl
512, // print / exec
513, // login
514, // shell
515, // printer
526, // tempo
530, // courier
531, // chat
532, // netnews
540, // uucp
556, // remotefs
563, // nntp+ssl
587, // smtp (mail submission)
601, // syslog-conn
636, // ldap+ssl
993, // imap+ssl
995, // pop3+ssl
2049, // nfs
4045, // lockd
6000, // X11
};
// Ports from kRestrictedPorts that FTP is nevertheless allowed to use.
// IsPortAllowedByFtp() consults this list before falling back to the default
// restrictions.
static const int kAllowedFtpPorts[] = {
21, // ftp control
22, // ssh
};
template<typename STR>
STR GetSpecificHeaderT(const STR& headers, const STR& name) {
// We want to grab the Value from the "Key: Value" pairs in the headers,
// which should look like this (no leading spaces, \n-separated) (we format
// them this way in url_request_inet.cc):
// HTTP/1.1 200 OK\n
// ETag: "6d0b8-947-24f35ec0"\n
// Content-Length: 2375\n
// Content-Type: text/html; charset=UTF-8\n
// Last-Modified: Sun, 03 Sep 2006 04:34:43 GMT\n
if (headers.empty())
return STR();
STR match;
match.push_back('\n');
match.append(name);
match.push_back(':');
typename STR::const_iterator begin =
search(headers.begin(), headers.end(), match.begin(), match.end(),
CaseInsensitiveCompareASCII<typename STR::value_type>());
if (begin == headers.end())
return STR();
begin += match.length();
typename STR::const_iterator end = find(begin, headers.end(), '\n');
STR ret;
TrimWhitespace(STR(begin, end), TRIM_ALL, &ret);
return ret;
}
// TODO(jungshik): We have almost identical hex-decoding code elsewhere.
// Consider refactoring and moving it somewhere (base?). Bug 1224311
//
// Returns true if |c| is an ASCII hexadecimal digit (0-9, A-F or a-f).
inline bool IsHexDigit(unsigned char c) {
  if (c >= '0' && c <= '9')
    return true;
  if (c >= 'A' && c <= 'F')
    return true;
  return c >= 'a' && c <= 'f';
}
// Converts the ASCII hex digit |c| to its numeric value (0-15). |c| must
// satisfy IsHexDigit().
inline unsigned char HexToInt(unsigned char c) {
  DCHECK(IsHexDigit(c));
  // Bits 5-6 of the character select its range: 1 for '0'-'9', 2 for
  // 'A'-'F' and 3 for 'a'-'f'. Subtracting the matching base yields the
  // digit's value.
  static const unsigned char kRangeBase[4] = {0, 0x30u, 0x37u, 0x57u};
  const unsigned int range = (c >> 5) & 3;
  return c - kRangeBase[range];
}
// Similar to Base64Decode. Decodes a Q-encoded string to a sequence of
// bytes, which are stored in |output| on success. Returns false, leaving
// |output| alone, if the input is invalid.
bool QPDecode(const std::string& input, std::string* output) {
  std::string decoded;
  decoded.reserve(input.size());

  const size_t size = input.size();
  for (size_t pos = 0; pos < size; ++pos) {
    const char current = input[pos];

    // '_' stands for a space.
    if (current == '_') {
      decoded.push_back(' ');
      continue;
    }

    // "=XX" is a byte given as two hex digits.
    if (current == '=') {
      if (size - pos < 3)
        return false;
      const unsigned char high = static_cast<unsigned char>(input[pos + 1]);
      const unsigned char low = static_cast<unsigned char>(input[pos + 2]);
      if (!IsHexDigit(high) || !IsHexDigit(low))
        return false;
      const unsigned char byte = HexToInt(high) * 16 + HexToInt(low);
      decoded.push_back(static_cast<char>(byte));
      pos += 2;  // The loop increment consumes the '=' itself.
      continue;
    }

    // In a Q-encoded word, only printable ASCII characters represent
    // themselves.
    if (!(0x20 < current && current < 0x7F))
      return false;

    // Space, '=', '_' and '?' are not allowed either, but they're already
    // filtered out.
    DCHECK(current != 0x3D && current != 0x5F && current != 0x3F);
    decoded.push_back(current);
  }

  output->swap(decoded);
  return true;
}
// The two content encodings of an RFC 2047 encoded-word: 'Q' (handled by
// QPDecode) and 'B' (handled by net::Base64Decode).
enum RFC2047EncodingType {Q_ENCODING, B_ENCODING};
bool DecodeBQEncoding(const std::string& part, RFC2047EncodingType enc_type,
const std::string& charset, std::string* output) {
std::string decoded;
if (enc_type == B_ENCODING) {
if (!net::Base64Decode(part, &decoded)) {
return false;
}
} else {
if (!QPDecode(part, &decoded)) {
return false;
}
}
UErrorCode err = U_ZERO_ERROR;
UConverter* converter(ucnv_open(charset.c_str(), &err));
if (U_FAILURE(err)) {
return false;
}
// A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8.
// A 'two-byte character' in a legacy encoding can be expanded to 4 bytes
// in UTF-8. Therefore, the expansion ratio is 3 at most.
int length = static_cast<int>(decoded.length());
char* buf = WriteInto(output, length * 3);
length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, length * 3,
decoded.data(), length, &err);
ucnv_close(converter);
if (U_FAILURE(err)) {
return false;
}
output->resize(length);
return true;
}
// Decodes one word of a header parameter value into UTF-8.
//
// Three kinds of input are handled, in this order:
//  1. Non-ASCII input: kept as is when it is valid UTF-8, otherwise converted
//     from the native multibyte encoding.
//  2. An RFC 2047 encoded-word ("=?charset?B|Q?text?="), decoded with
//     DecodeBQEncoding().
//  3. Anything else: %-unescaped and accepted only if the result is UTF-8.
//
// |*is_rfc2047| is left true only when the word was treated as an RFC 2047
// encoded-word. Returns true and stores the decoded text in |output| on
// success; returns false without writing |output| otherwise.
bool DecodeWord(const std::string& encoded_word,
bool *is_rfc2047,
std::string* output) {
// TODO(jungshik) : Revisit this later. Do we want to pass through non-ASCII
// strings which can be mojibake? WinHTTP converts a raw 8bit string to
// UTF-16 assuming it's in the OS default encoding.
if (!IsStringASCII(encoded_word)) {
// Try falling back to the NativeMB encoding if the raw input is not UTF-8.
if (IsStringUTF8(encoded_word)) {
*output = encoded_word;
} else {
*output = WideToUTF8(base::SysNativeMBToWide(encoded_word));
}
*is_rfc2047 = false;
return true;
}
// RFC 2047 : one of encoding methods supported by Firefox and relatively
// widely used by web servers.
// =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.
// We don't care about the length restriction (72 bytes) because
// many web servers generate encoded words longer than the limit.
std::string tmp;
*is_rfc2047 = true;
// Index of the '?'-separated part being examined:
// 0: leading "=", 1: charset, 2: encoding letter, 3: payload, 4: trailing "=".
int part_index = 0;
std::string charset;
StringTokenizer t(encoded_word, "?");
RFC2047EncodingType enc_type = Q_ENCODING;
while (*is_rfc2047 && t.GetNext()) {
std::string part = t.token();
switch (part_index) {
case 0:
if (part != "=") {
*is_rfc2047 = false;
break;
}
++part_index;
break;
case 1:
// Do we need charset validity check here?
charset = part;
++part_index;
break;
case 2:
// The encoding must be exactly one of the letters b, B, q or Q.
if (part.size() > 1 ||
part.find_first_of("bBqQ") == std::string::npos) {
*is_rfc2047 = false;
break;
}
if (part[0] == 'b' || part[0] == 'B') {
enc_type = B_ENCODING;
}
++part_index;
break;
case 3:
*is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp);
if (!*is_rfc2047) {
// Last minute failure. Invalid B/Q encoding. Rather than
// passing it through, return now.
return false;
}
++part_index;
break;
case 4:
if (part != "=") {
// Another last minute failure !
// Likely to be a case of two encoded-words in a row or
// an encoded word followed by a non-encoded word. We can be
// generous, but it does not help much in terms of compatibility,
// I believe. Return immediately.
*is_rfc2047 = false;
return false;
}
++part_index;
break;
default:
// More than five parts: not a well-formed encoded-word.
*is_rfc2047 = false;
return false;
}
}
// NOTE(review): |part_index| is not checked here, so a word that runs out of
// tokens early but ends in '=' (e.g. just "=") appears to be accepted as an
// encoded-word with an empty |tmp| -- confirm whether that is intended.
if (*is_rfc2047) {
if (*(encoded_word.end() - 1) == '=') {
output->swap(tmp);
return true;
}
// encoded_word ending prematurely with '?' or extra '?'
*is_rfc2047 = false;
return false;
}
// We're not handling 'especial' characters quoted with '\', but
// it should be Ok because we're not an email client but a
// web browser.
// What IE6/7 does: %-escaped UTF-8. We could extend this to
// support a rudimentary form of RFC 2231 with charset label, but
// it'd gain us little in terms of compatibility.
tmp = UnescapeURLComponent(encoded_word, UnescapeRule::SPACES);
if (IsStringUTF8(tmp)) {
output->swap(tmp);
return true;
// We can try either the OS default charset or 'origin charset' here,
// As far as I can tell, IE does not support it. However, I've seen
// web servers emit %-escaped string in a legacy encoding (usually
// origin charset).
// TODO(jungshik) : Test IE further and consider adding a fallback here.
}
return false;
}
bool DecodeParamValue(const std::string& input, std::string* output) {
std::string tmp;
// Tokenize with whitespace characters.
StringTokenizer t(input, " \t\n\r");
t.set_options(StringTokenizer::RETURN_DELIMS);
bool is_previous_token_rfc2047 = true;
while (t.GetNext()) {
if (t.token_is_delim()) {
// If the previous non-delimeter token is not RFC2047-encoded,
// put in a space in its place. Otheriwse, skip over it.
if (!is_previous_token_rfc2047) {
tmp.push_back(' ');
}
continue;
}
// We don't support a single multibyte character split into
// adjacent encoded words. Some broken mail clients emit headers
// with that problem, but most web servers usually encode a filename
// in a single encoded-word. Firefox/Thunderbird do not support
// it, either.
std::string decoded;
if (!DecodeWord(t.token(), &is_previous_token_rfc2047, &decoded))
return false;
tmp.append(decoded);
}
output->swap(tmp);
return true;
}
// TODO(mpcomplete): This is a quick and dirty implementation for now. I'm
// sure this doesn't properly handle all (most?) cases.
//
// Returns the value of the parameter |param_name| in |header|, or an empty
// string if the parameter is missing or malformed. This assumes args are
// formatted exactly like "bla; arg1=value; arg2=value".
//
// |param_name| is located with an ASCII case-insensitive substring search
// and may be followed by spaces or tabs before the '='. A value that starts
// with '"' extends to the closing quote (quotes are not returned); any other
// value extends to the next ';' or the end of |header|.
template<typename STR>
STR GetHeaderParamValueT(const STR& header, const STR& param_name) {
  typename STR::const_iterator param_begin =
      search(header.begin(), header.end(), param_name.begin(), param_name.end(),
             CaseInsensitiveCompareASCII<typename STR::value_type>());
  if (param_begin == header.end())
    return STR();
  param_begin += param_name.length();

  // Skip optional blanks between the name and the '='.
  STR whitespace;
  whitespace.push_back(' ');
  whitespace.push_back('\t');
  const typename STR::size_type equals_offset =
      header.find_first_not_of(whitespace, param_begin - header.begin());
  if (equals_offset == STR::npos || header.at(equals_offset) != '=')
    return STR();

  param_begin = header.begin() + equals_offset + 1;
  if (param_begin == header.end())
    return STR();

  typename STR::const_iterator param_end;
  if (*param_begin == '"') {
    ++param_begin;  // skip past the opening quote.
    param_end = find(param_begin, header.end(), '"');
    if (param_end == header.end())
      return STR();  // poorly formatted param?
  } else {
    // Start looking for the terminator at the first value character, so an
    // empty value ("arg=;next=1") yields "" rather than swallowing the ';'
    // and the parameters that follow it.
    param_end = find(param_begin, header.end(), ';');
  }

  return STR(param_begin, param_end);
}
// Does some simple normalization of scripts so we can allow certain scripts
// to exist together: Katakana, Hiragana, Katakana-or-Hiragana and Hangul are
// all reported as Han; every other script is returned unchanged.
// TODO(brettw) bug 880223: we should allow some other languages to be
// combined such as Chinese and Latin. We will probably need a more
// complicated system of language pairs to have more fine-grained control.
UScriptCode NormalizeScript(UScriptCode code) {
  const bool folds_into_han =
      code == USCRIPT_KATAKANA ||
      code == USCRIPT_HIRAGANA ||
      code == USCRIPT_KATAKANA_OR_HIRAGANA ||
      code == USCRIPT_HANGUL;  // This one is arguable.
  return folds_into_han ? USCRIPT_HAN : code;
}
bool IsIDNComponentInSingleScript(const char16* str, int str_len) {
UScriptCode first_script = USCRIPT_INVALID_CODE;
bool is_first = true;
int i = 0;
while (i < str_len) {
unsigned code_point;
U16_NEXT(str, i, str_len, code_point);
UErrorCode err = U_ZERO_ERROR;
UScriptCode cur_script = uscript_getScript(code_point, &err);
if (err != U_ZERO_ERROR)
return false; // Report mixed on error.
cur_script = NormalizeScript(cur_script);
// TODO(brettw) We may have to check for USCRIPT_INHERENT as well.
if (is_first && cur_script != USCRIPT_COMMON) {
first_script = cur_script;
is_first = false;
} else {
if (cur_script != USCRIPT_COMMON && cur_script != first_script)
return false;
}
}
return true;
}
// Check if the script of a language can be 'safely' mixed with
// Latin letters in the ASCII range. Only the first two characters of |lang|
// are examined.
bool IsCompatibleWithASCIILetters(const std::string& lang) {
  // For now, just list Chinese, Japanese and Korean (positive list).
  // An alternative is negative-listing (languages using Greek and
  // Cyrillic letters), but it can be more dangerous.
  const std::string primary_tag = lang.substr(0, 2);
  return primary_tag == "zh" || primary_tag == "ja" || primary_tag == "ko";
}
// Returns true if the given Unicode host component is safe to display to the
// user.
//
// |str|/|str_len| is the component as UTF-16; |languages| is a
// comma-separated list of language codes. The component is rejected if it
// contains any character from a fixed set of dangerous characters. With an
// empty |languages| the result is then whether the component is in a single
// script. Otherwise the component is safe if, for at least one listed
// language, all its characters (ignoring a small common set) are in that
// language's exemplar set, extended with ASCII letters when
// IsCompatibleWithASCIILetters() allows it.
bool IsIDNComponentSafe(const char16* str,
                        int str_len,
                        const std::wstring& languages) {
  // Most common cases (non-IDN) do not reach here so that we don't
  // need a fast return path.
  // TODO(jungshik) : Check if there's any character inappropriate
  // (although allowed) for domain names.
  // See http://www.unicode.org/reports/tr39/#IDN_Security_Profiles and
  // http://www.unicode.org/reports/tr39/data/xidmodifications.txt
  // For now, we borrow the list from Mozilla and tweaked it slightly.
  // (e.g. Characters like U+00A0, U+3000, U+3002 are omitted because
  //  they're gonna be canonicalized to U+0020 and full stop before
  //  reaching here.)
  // The original list is available at
  // http://kb.mozillazine.org/Network.IDN.blacklist_chars and
  // at http://mxr.mozilla.org/seamonkey/source/modules/libpref/src/init/all.js#703
  UErrorCode status = U_ZERO_ERROR;
#ifdef U_WCHAR_IS_UTF16
  UnicodeSet dangerous_characters(UnicodeString(
      L"[[\\ \u00bc\u00bd\u01c3\u0337\u0338"
      L"\u05c3\u05f4\u06d4\u0702\u115f\u1160][\u2000-\u200b]"
      L"[\u2024\u2027\u2028\u2029\u2039\u203a\u2044\u205f]"
      L"[\u2154-\u2156][\u2159-\u215b][\u215f\u2215\u23ae"
      L"\u29f6\u29f8\u2afb\u2afd][\u2ff0-\u2ffb][\u3014"
      L"\u3015\u3033\u3164\u321d\u321e\u33ae\u33af\u33c6\u33df\ufe14"
      L"\ufe15\ufe3f\ufe5d\ufe5e\ufeff\uff0e\uff06\uff61\uffa0\ufff9]"
      L"[\ufffa-\ufffd]]"), status);
#else
  UnicodeSet dangerous_characters(UnicodeString(
      "[[\\ \\u0020\\u00bc\\u00bd\\u01c3\\u0337\\u0338"
      "\\u05c3\\u05f4\\u06d4\\u0702\\u115f\\u1160][\\u2000-\\u200b]"
      "[\\u2024\\u2027\\u2028\\u2029\\u2039\\u203a\\u2044\\u205f]"
      "[\\u2154-\\u2156][\\u2159-\\u215b][\\u215f\\u2215\\u23ae"
      "\\u29f6\\u29f8\\u2afb\\u2afd][\\u2ff0-\\u2ffb][\\u3014"
      "\\u3015\\u3033\\u3164\\u321d\\u321e\\u33ae\\u33af\\u33c6\\u33df\\ufe14"
      "\\ufe15\\ufe3f\\ufe5d\\ufe5e\\ufeff\\uff0e\\uff06\\uff61\\uffa0\\ufff9]"
      "[\\ufffa-\\ufffd]]", -1, US_INV), status);
#endif
  DCHECK(U_SUCCESS(status));

  UnicodeSet component_characters;
  component_characters.addAll(UnicodeString(str, str_len));
  if (dangerous_characters.containsSome(component_characters))
    return false;

  // If the language list is empty, the result is completely determined
  // by whether a component is a single script or not. This will block
  // even "safe" script mixing cases like <Chinese, Latin-ASCII> that are
  // allowed with |languages| (while it blocks Chinese + Latin letters with
  // an accent as should be the case), but we want to err on the safe side
  // when |languages| is empty.
  if (languages.empty())
    return IsIDNComponentInSingleScript(str, str_len);

  // |common_characters| is made up of ASCII numbers, hyphen, plus and
  // underscore that are used across scripts and allowed in domain names.
  // (sync'd with characters allowed in url_canon_host with square
  // brackets excluded.) See kHostCharLookup[] array in url_canon_host.cc.
  UnicodeSet common_characters(UNICODE_STRING_SIMPLE("[[0-9]\\-_+\\ ]"),
                               status);
  DCHECK(U_SUCCESS(status));

  // Subtract common characters because they're always allowed so that
  // we just have to check if a language-specific set contains
  // the remainder.
  component_characters.removeAll(common_characters);

  USet *lang_set = uset_open(1, 0);  // create an empty set
  UnicodeSet ascii_letters(0x61, 0x7a);  // [a-z]
  bool safe = false;
  std::string languages_list(WideToASCII(languages));
  StringTokenizer t(languages_list, ",");
  while (t.GetNext()) {
    std::string lang = t.token();
    status = U_ZERO_ERROR;
    // TODO(jungshik) Cache exemplar sets for locales.
    ULocaleData* uld = ulocdata_open(lang.c_str(), &status);
    // TODO(jungshik) Turn this check on when the ICU data file is
    // rebuilt with the minimal subset of locale data for languages
    // to which Chrome is not localized but which we offer in the list
    // of languages selectable for Accept-Languages. With the rebuilt ICU
    // data, ulocdata_open never should fall back to the default locale.
    // (issue 2078)
    // DCHECK(U_SUCCESS(status) && status != U_USING_DEFAULT_WARNING);
    const bool have_locale_data =
        U_SUCCESS(status) && status != U_USING_DEFAULT_WARNING;
    if (have_locale_data) {
      // Should we use auxiliary set, instead?
      ulocdata_getExemplarSet(uld, lang_set, 0, ULOCDATA_ES_STANDARD, &status);
    }
    // Release the locale data on every path. ulocdata_open() also hands back
    // a live object when it merely warns (U_USING_DEFAULT_WARNING), and that
    // object was previously leaked because the close lived inside the
    // success-only branch.
    if (uld)
      ulocdata_close(uld);

    if (have_locale_data && U_SUCCESS(status)) {
      UnicodeSet* allowed_characters =
          reinterpret_cast<UnicodeSet*>(lang_set);
      // If |lang| is compatible with ASCII Latin letters, add them.
      if (IsCompatibleWithASCIILetters(lang))
        allowed_characters->addAll(ascii_letters);
      if (allowed_characters->containsAll(component_characters)) {
        safe = true;
        break;
      }
    }
  }
  uset_close(lang_set);
  return safe;
}
// Converts one component of a host (between dots) to IDN if safe. The result
// will be APPENDED to the given output string and will be the same as the
// input if it is not IDN or the IDN is unsafe to display.
void IDNToUnicodeOneComponent(const char16* comp,
int comp_len,
const std::wstring& languages,
string16* out) {
DCHECK(comp_len >= 0);
if (comp_len == 0)
return;
// Expand the output string to make room for a possibly longer string
// (we'll expand if it's still not big enough below).
int extra_space = 64;
size_t host_begin_in_output = out->size();
// Just copy the input if it can't be an IDN component.
if (comp_len < 4 ||
comp[0] != 'x' || comp[1] != 'n' || comp[2] != '-' || comp[3] != '-') {
out->resize(host_begin_in_output + comp_len);
for (int i = 0; i < comp_len; i++)
(*out)[host_begin_in_output + i] = comp[i];
return;
}
while (true) {
UErrorCode status = U_ZERO_ERROR;
out->resize(out->size() + extra_space);
int output_chars =
uidna_IDNToUnicode(comp, comp_len, &(*out)[host_begin_in_output],
extra_space, UIDNA_DEFAULT, NULL, &status);
if (status == U_ZERO_ERROR) {
// Converted successfully.
out->resize(host_begin_in_output + output_chars);
if (!IsIDNComponentSafe(&out->data()[host_begin_in_output],
output_chars,
languages))
break; // The error handling below will undo the IDN.
return;
}
if (status != U_BUFFER_OVERFLOW_ERROR)
break;
// Need to loop again with a bigger buffer. It looks like ICU will
// return the required size of the buffer, but that's not documented,
// so we'll just grow by 2x. This should be rare and is not on a
// critical path.
extra_space *= 2;
}
// We get here on error, in which case we replace anything that was added
// with the literal input.
out->resize(host_begin_in_output + comp_len);
for (int i = 0; i < comp_len; i++)
(*out)[host_begin_in_output + i] = comp[i];
}
} // namespace
namespace net {
// Converts the literal file path |path| into a file: URL.
//
// Produces a URL like "file:///C:/foo" for a regular file, or
// "file://///server/path" for UNC. The URL canonicalizer will fix up the
// latter case to be the canonical UNC form: "file://server/path"
GURL FilePathToFileURL(const FilePath& path) {
  FilePath::StringType url_string(kFileURLPrefix);
  url_string.append(path.value());

  // Now do replacement of some characters. Since we assume the input is a
  // literal filename, anything the URL parser might consider special should
  // be escaped here.

  // must be the first substitution since others will introduce percents as the
  // escape character
  ReplaceSubstringsAfterOffset(&url_string, 0,
      FILE_PATH_LITERAL("%"), FILE_PATH_LITERAL("%25"));

  // semicolon is supposed to be some kind of separator according to RFC 2396
  ReplaceSubstringsAfterOffset(&url_string, 0,
      FILE_PATH_LITERAL(";"), FILE_PATH_LITERAL("%3B"));

  // '#' would otherwise start the fragment.
  ReplaceSubstringsAfterOffset(&url_string, 0,
      FILE_PATH_LITERAL("#"), FILE_PATH_LITERAL("%23"));

  // '?' would otherwise start the query, so the rest of a file name
  // containing a literal '?' would no longer be part of the URL's path.
  ReplaceSubstringsAfterOffset(&url_string, 0,
      FILE_PATH_LITERAL("?"), FILE_PATH_LITERAL("%3F"));

  return GURL(url_string);
}
// Wide-string convenience overload: wraps |path_str| in a FilePath and
// defers to the FilePath overload.
GURL FilePathToFileURL(const std::wstring& path_str) {
  const FilePath path = FilePath::FromWStringHack(path_str);
  return FilePathToFileURL(path);
}
std::wstring GetSpecificHeader(const std::wstring& headers,
const std::wstring& name) {
return GetSpecificHeaderT(headers, name);
}
std::string GetSpecificHeader(const std::string& headers,
const std::string& name) {
return GetSpecificHeaderT(headers, name);
}
// Extracts the file name carried by a Content-Disposition |header|: the
// "filename" parameter or, failing that, the "name" parameter, decoded with
// DecodeParamValue() and converted to a wide string. Returns an empty string
// when neither parameter has a value or the value cannot be decoded.
std::wstring GetFileNameFromCD(const std::string& header) {
  std::string raw_name = GetHeaderParamValue(header, "filename");
  if (raw_name.empty()) {
    // Some servers use 'name' parameter.
    raw_name = GetHeaderParamValue(header, "name");
  }

  std::string utf8_name;
  if (raw_name.empty() || !DecodeParamValue(raw_name, &utf8_name))
    return std::wstring();
  return UTF8ToWide(utf8_name);
}
std::wstring GetHeaderParamValue(const std::wstring& field,
const std::wstring& param_name) {
return GetHeaderParamValueT(field, param_name);
}
std::string GetHeaderParamValue(const std::string& field,
const std::string& param_name) {
return GetHeaderParamValueT(field, param_name);
}
// TODO(brettw) bug 734373: check the scripts for each host component and
// don't un-IDN-ize if there is more than one. Alternatively, only IDN for
// scripts that the user has installed. For now, just put the entire
// path through IDN. Maybe this feature can be implemented in ICU itself?
//
// We may want to skip this step in the case of file URLs to allow unicode
// UNC hostnames regardless of encodings.
void IDNToUnicode(const char* host,
int host_len,
const std::wstring& languages,
std::wstring* out) {
// Convert the ASCII input to a wide string for ICU.
string16 input16;
input16.reserve(host_len);
for (int i = 0; i < host_len; i++)
input16.push_back(host[i]);
string16 out16;
// The output string is appended to, so convert what's already there if
// needed.
#if defined(WCHAR_T_IS_UTF32)
WideToUTF16(out->data(), out->length(), &out16);
out->clear(); // for equivalence with the swap below
#elif defined(WCHAR_T_IS_UTF16)
out->swap(out16);
#endif
// Do each component of the host separately, since we enforce script matching
// on a per-component basis.
size_t cur_begin = 0; // Beginning of the current component (inclusive).
while (cur_begin < input16.size()) {
// Find the next dot or the end of the string.
size_t next_dot = input16.find_first_of('.', cur_begin);
if (next_dot == std::wstring::npos)
next_dot = input16.size(); // For getting the last component.
if (next_dot > cur_begin) {
// Add the substring that we just found.
IDNToUnicodeOneComponent(&input16[cur_begin],
static_cast<int>(next_dot - cur_begin),
languages,
&out16);
}
// Need to add the dot we just found (if we found one). This needs to be
// done before we break out below in case the URL ends in a dot.
if (next_dot < input16.size())
out16.push_back('.');
else
break; // No more components left.
cur_begin = next_dot + 1;
}
#if defined(WCHAR_T_IS_UTF32)
UTF16ToWide(out16.data(), out16.length(), out);
#elif defined(WCHAR_T_IS_UTF16)
out->swap(out16);
#endif
}
std::string CanonicalizeHost(const std::string& host, bool* is_ip_address) {
// Try to canonicalize the host.
const url_parse::Component raw_host_component(0,
static_cast<int>(host.length()));
std::string canon_host;
url_canon::StdStringCanonOutput canon_host_output(&canon_host);
url_parse::Component canon_host_component;
if (!url_canon::CanonicalizeHost(host.c_str(), raw_host_component,
&canon_host_output, &canon_host_component)) {
if (is_ip_address)
*is_ip_address = false;
return std::string();
}
canon_host_output.Complete();
if (is_ip_address) {
// See if the host is an IP address.
url_canon::RawCanonOutputT<char, 128> ignored_output;
url_parse::Component ignored_component;
*is_ip_address = url_canon::CanonicalizeIPAddress(canon_host.c_str(),
canon_host_component,
&ignored_output,
&ignored_component);
}
// Return the host as a string, stripping any unnecessary bits off the ends.
if ((canon_host_component.begin == 0) &&
(static_cast<size_t>(canon_host_component.len) == canon_host.length()))
return canon_host;
return canon_host.substr(canon_host_component.begin,
canon_host_component.len);
}
// Wide-string overload: converts |host| to UTF-8 and defers to the
// std::string overload.
std::string CanonicalizeHost(const std::wstring& host, bool* is_ip_address) {
  std::string utf8_host;
  WideToUTF8(host.c_str(), host.length(), &utf8_host);
  return CanonicalizeHost(utf8_host, is_ip_address);
}
std::string GetDirectoryListingHeader(const std::string& title) {
#if defined(OS_WIN)
static const StringPiece header(NetModule::GetResource(IDR_DIR_HEADER_HTML));
if (header.empty()) {
NOTREACHED() << "expected resource not found";
}
std::string result(header.data(), header.size());
#elif defined(OS_POSIX)
// TODO(estade): Temporary hack. Remove these platform #ifdefs when we
// have implemented resources for non-Windows platforms.
LOG(INFO) << "FIXME: hacked resource loading";
FilePath path;
PathService::Get(base::DIR_EXE, &path);
path = path.Append("../../net/base/dir_header.html");
std::string result;
file_util::ReadFileToString(path.ToWStringHack(), &result);
#endif
result.append("<script>start(");
string_escape::JavascriptDoubleQuote(title, true, &result);
result.append(");</script>\n");
return result;
}
// Builds the <script> snippet that adds one row to a directory listing page.
// The addRow() call receives, in order: the entry name, its path-escaped
// form, 1 or 0 for |is_dir|, the formatted |size| and the formatted
// |modified| time (an empty string when |modified| is null).
std::string GetDirectoryListingEntry(const std::string& name,
                                     bool is_dir,
                                     int64 size,
                                     const Time& modified) {
  std::string row;
  row.append("<script>addRow(");
  string_escape::JavascriptDoubleQuote(name, true, &row);

  row.append(",");
  string_escape::JavascriptDoubleQuote(EscapePath(name), true, &row);

  row.append(is_dir ? ",1," : ",0,");
  string_escape::JavascriptDoubleQuote(
      FormatBytes(size, GetByteDisplayUnits(size), true), true, &row);

  row.append(",");
  // |modified| can be NULL in FTP listings.
  std::wstring modified_text;
  if (!modified.is_null())
    modified_text = base::TimeFormatShortDateAndTime(modified);
  string_escape::JavascriptDoubleQuote(modified_text, true, &row);

  row.append(");</script>\n");
  return row;
}
// Returns |text| without a leading "www." (matched case-sensitively); any
// other input is returned unchanged.
std::wstring StripWWW(const std::wstring& text) {
  const std::wstring prefix(L"www.");
  if (text.compare(0, prefix.length(), prefix) != 0)
    return text;
  return text.substr(prefix.length());
}
// Picks a file name for saving the resource at |url|. Sources are tried in
// this order: the name in |content_disposition|, the file name part of
// |url|, |default_name|, the host of |url| and finally "download". Leading
// and trailing dots are trimmed and illegal characters are replaced by '-'.
std::wstring GetSuggestedFilename(const GURL& url,
                                  const std::string& content_disposition,
                                  const std::wstring& default_name) {
  std::wstring suggested = GetFileNameFromCD(content_disposition);
  if (!suggested.empty()) {
    // Remove any path information the server may have sent, take the name
    // only.
    suggested = file_util::GetFilenameFromPath(suggested);
    // Next, remove "." from the beginning and end of the file name to avoid
    // tricks with hidden files, "..", and "."
    TrimString(suggested, L".", &suggested);
  }

  // Nothing usable in the header: fall back to the URL's file name.
  if (suggested.empty() && url.is_valid()) {
    suggested = UnescapeAndDecodeUTF8URLComponent(
        url.ExtractFileName(),
        UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS);
  }

  // Trim '.' once more.
  TrimString(suggested, L".", &suggested);

  // If there's no filename or it gets trimmed to be empty, use
  // the URL hostname or default_name
  if (suggested.empty()) {
    if (!default_name.empty()) {
      suggested = default_name;
    } else if (url.is_valid()) {
      // Some schemes (e.g. file) do not have a hostname. Even though it's
      // not likely to reach here, let's hardcode the last fallback name.
      // TODO(jungshik) : Decode a 'punycoded' IDN hostname. (bug 1264451)
      if (url.host().empty())
        suggested = L"download";
      else
        suggested = UTF8ToWide(url.host());
    } else {
      NOTREACHED();
    }
  }

  file_util::ReplaceIllegalCharacters(&suggested, '-');
  return suggested;
}
std::wstring GetSuggestedFilename(const GURL& url,
const std::wstring& content_disposition,
const std::wstring& default_name) {
return GetSuggestedFilename(
url, WideToUTF8(content_disposition), default_name);
}
bool IsPortAllowedByDefault(int port) {
int array_size = arraysize(kRestrictedPorts);
for (int i = 0; i < array_size; i++) {
if (kRestrictedPorts[i] == port) {
return false;
}
}
return true;
}
bool IsPortAllowedByFtp(int port) {
int array_size = arraysize(kAllowedFtpPorts);
for (int i = 0; i < array_size; i++) {
if (kAllowedFtpPorts[i] == port) {
return true;
}
}
// Port not explicitly allowed by FTP, so return the default restrictions.
return IsPortAllowedByDefault(port);
}
// Puts the socket |fd| into non-blocking mode and returns the result of the
// underlying platform call (ioctlsocket on Windows, fcntl on POSIX).
int SetNonBlocking(int fd) {
#if defined(OS_WIN)
  unsigned long nonblocking_mode = 1;
  return ioctlsocket(fd, FIONBIO, &nonblocking_mode);
#elif defined(OS_POSIX)
  const int current_flags = fcntl(fd, F_GETFL, 0);
  // If the current flags cannot be read, start from an empty flag set.
  const int base_flags = (current_flags == -1) ? 0 : current_flags;
  return fcntl(fd, F_SETFL, base_flags | O_NONBLOCK);
#endif
}
// Deprecated.
bool FileURLToFilePath(const GURL& gurl, std::wstring* file_path) {
FilePath path;
bool rv = FileURLToFilePath(gurl, &path);
*file_path = path.ToWStringHack();
return rv;
}
} // namespace net