| // Copyright 2025 The Chromium Authors |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "components/omnibox/browser/omnibox_text_util.h" |
| |
| #include "base/strings/strcat.h" |
| #include "base/strings/string_util.h" |
| #include "base/strings/utf_string_conversions.h" |
| #include "components/dom_distiller/core/url_constants.h" |
| #include "components/dom_distiller/core/url_utils.h" |
| #include "components/omnibox/browser/autocomplete_classifier.h" |
| #include "components/omnibox/browser/autocomplete_match.h" |
| #include "components/omnibox/browser/omnibox_client.h" |
| #include "url/gurl.h" |
| #include "url/url_constants.h" |
| |
| namespace omnibox { |
| |
| std::u16string StripJavascriptSchemas(const std::u16string& text) { |
| const std::u16string kJsPrefix( |
| base::StrCat({url::kJavaScriptScheme16, u":"})); |
| |
| bool found_JavaScript = false; |
| size_t i = 0; |
| // Find the index of the first character that isn't whitespace, a control |
| // character, or a part of a JavaScript: scheme. |
| while (i < text.size()) { |
| if (base::IsUnicodeWhitespace(text[i]) || (text[i] < 0x20)) { |
| ++i; |
| } else { |
| if (!base::EqualsCaseInsensitiveASCII(text.substr(i, kJsPrefix.length()), |
| kJsPrefix)) { |
| break; |
| } |
| |
| // We've found a JavaScript scheme. Continue searching to ensure that |
| // strings like "javascript:javascript:alert()" are fully stripped. |
| found_JavaScript = true; |
| i += kJsPrefix.length(); |
| } |
| } |
| |
| // If we found any "JavaScript:" schemes in the text, return the text starting |
| // at the first non-whitespace/control character after the last instance of |
| // the scheme. |
| if (found_JavaScript) { |
| return text.substr(i); |
| } |
| |
| return text; |
| } |
| |
| std::u16string SanitizeTextForPaste(const std::u16string& text) { |
| if (text.empty()) { |
| return std::u16string(); // Nothing to do. |
| } |
| |
| size_t end = text.find_first_not_of(base::kWhitespaceUTF16); |
| if (end == std::u16string::npos) { |
| return u" "; // Convert all-whitespace to single space. |
| } |
| // Because `end` points at the first non-whitespace character, the loop |
| // below will skip leading whitespace. |
| |
| // Reserve space for the sanitized output. |
| std::u16string output; |
| output.reserve(text.size()); // Guaranteed to be large enough. |
| |
| // Copy all non-whitespace sequences. |
| // Do not copy trailing whitespace. |
| // Copy all other whitespace sequences that do not contain CR/LF. |
| // Convert all other whitespace sequences that do contain CR/LF to either ' ' |
| // or nothing, depending on whether there are any other sequences that do not |
| // contain CR/LF. |
| bool output_needs_lf_conversion = false; |
| bool seen_non_lf_whitespace = false; |
| const auto copy_range = [&text, &output](size_t begin, size_t end) { |
| output += |
| text.substr(begin, (end == std::u16string::npos) ? end : (end - begin)); |
| }; |
| constexpr char16_t kNewline[] = {'\n', 0}; |
| constexpr char16_t kSpace[] = {' ', 0}; |
| while (true) { |
| // Copy this non-whitespace sequence. |
| size_t begin = end; |
| end = text.find_first_of(base::kWhitespaceUTF16, begin + 1); |
| copy_range(begin, end); |
| |
| // Now there is either a whitespace sequence, or the end of the string. |
| if (end != std::u16string::npos) { |
| // There is a whitespace sequence; see if it contains CR/LF. |
| begin = end; |
| end = text.find_first_not_of(base::kWhitespaceNoCrLfUTF16, begin); |
| if ((end != std::u16string::npos) && (text[end] != '\n') && |
| (text[end] != '\r')) { |
| // Found a non-trailing whitespace sequence without CR/LF. Copy it. |
| seen_non_lf_whitespace = true; |
| copy_range(begin, end); |
| continue; |
| } |
| } |
| |
| // `end` either points at the end of the string or a CR/LF. |
| if (end != std::u16string::npos) { |
| end = text.find_first_not_of(base::kWhitespaceUTF16, end + 1); |
| } |
| if (end == std::u16string::npos) { |
| break; // Ignore any trailing whitespace. |
| } |
| |
| // The preceding whitespace sequence contained CR/LF. Convert to a single |
| // LF that we'll fix up below the loop. |
| output_needs_lf_conversion = true; |
| output += '\n'; |
| } |
| |
| // Convert LFs to ' ' or '' depending on whether there were non-LF whitespace |
| // sequences. |
| if (output_needs_lf_conversion) { |
| base::ReplaceChars(output, kNewline, |
| seen_non_lf_whitespace ? kSpace : std::u16string(), |
| &output); |
| } |
| |
| return StripJavascriptSchemas(output); |
| } |
| |
| void AdjustTextForCopy(int sel_min, |
| std::u16string* text, |
| bool has_user_modified_text, |
| bool is_keyword_selected, |
| std::optional<AutocompleteMatch> current_popup_match, |
| OmniboxClient* client, |
| GURL* url_from_text, |
| bool* write_url) { |
| DCHECK(text); |
| DCHECK(url_from_text); |
| DCHECK(write_url); |
| |
| *write_url = false; |
| |
| // Do not adjust if selection did not start at the beginning of the field. |
| if (sel_min != 0) { |
| return; |
| } |
| |
| // If the user has not modified the display text and is copying the whole URL |
| // text (whether it's in the elided or unelided form), copy the omnibox |
| // contents as a hyperlink to the current page. |
| if (!has_user_modified_text) { |
| *url_from_text = client->GetNavigationEntryURL(); |
| *write_url = true; |
| |
| // Don't let users copy Reader Mode page URLs. |
| // We display the original article's URL in the omnibox, so users will |
| // expect that to be what is copied to the clipboard. |
| if (dom_distiller::url_utils::IsDistilledPage(*url_from_text)) { |
| *url_from_text = dom_distiller::url_utils::GetOriginalUrlFromDistillerUrl( |
| *url_from_text); |
| } |
| *text = base::UTF8ToUTF16(url_from_text->spec()); |
| return; |
| } |
| |
| // This code early exits if the copied text looks like a search query. It's |
| // not at the very top of this method, as it would interpret the intranet URL |
| // "printer/path" as a search query instead of a URL. |
| // |
| // We can't use CurrentTextIsURL() or GetDataForURLExport() because right now |
| // the user is probably holding down control to cause the copy, which will |
| // screw up our calculation of the desired_tld. |
| AutocompleteMatch match_from_text; |
| client->GetAutocompleteClassifier()->Classify( |
| *text, is_keyword_selected, true, |
| client->GetPageClassification(/*is_prefetch=*/false), &match_from_text, |
| nullptr); |
| if (AutocompleteMatch::IsSearchType(match_from_text.type)) { |
| return; |
| } |
| |
| // Make our best GURL interpretation of |text|. |
| *url_from_text = match_from_text.destination_url; |
| |
| // Get the current page GURL (or the GURL of the currently selected match). |
| GURL current_page_url = client->GetNavigationEntryURL(); |
| if (current_popup_match) { |
| AutocompleteMatch current_match = *current_popup_match; |
| if (!AutocompleteMatch::IsSearchType(current_match.type) && |
| current_match.destination_url.is_valid()) { |
| // If the popup is open and a valid match is selected, treat that as the |
| // current page, since the URL in the Omnibox will be from that match. |
| current_page_url = current_match.destination_url; |
| } |
| } |
| |
| // If the user has altered the host piece of the omnibox text, then we cannot |
| // guess at user intent, so early exit and leave |text| as-is as plain text. |
| if (!current_page_url.SchemeIsHTTPOrHTTPS() || |
| !url_from_text->SchemeIsHTTPOrHTTPS() || |
| current_page_url.host_piece() != url_from_text->host_piece()) { |
| return; |
| } |
| |
| // Infer the correct scheme for the copied text, and prepend it if necessary. |
| { |
| const std::u16string http = |
| base::StrCat({url::kHttpScheme16, url::kStandardSchemeSeparator16}); |
| const std::u16string https = |
| base::StrCat({url::kHttpsScheme16, url::kStandardSchemeSeparator16}); |
| |
| const std::u16string& current_page_url_prefix = |
| current_page_url.SchemeIs(url::kHttpScheme) ? http : https; |
| |
| // Only prepend a scheme if the text doesn't already have a scheme. |
| if (!base::StartsWith(*text, http, base::CompareCase::INSENSITIVE_ASCII) && |
| !base::StartsWith(*text, https, base::CompareCase::INSENSITIVE_ASCII)) { |
| *text = current_page_url_prefix + *text; |
| |
| // Amend the copied URL to match the prefixed string. |
| GURL::Replacements replace_scheme; |
| replace_scheme.SetSchemeStr(current_page_url.scheme_piece()); |
| *url_from_text = url_from_text->ReplaceComponents(replace_scheme); |
| } |
| } |
| |
| // If the URL derived from |text| is valid, mark |write_url| true, and modify |
| // |text| to contain the canonical URL spec with non-ASCII characters escaped. |
| if (url_from_text->is_valid()) { |
| *write_url = true; |
| *text = base::UTF8ToUTF16(url_from_text->spec()); |
| } |
| } |
| |
| } // namespace omnibox |