// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/url_deduplication/docs_url_strip_handler.h"

#include <string>
#include <string_view>
#include <vector>

#include "base/containers/fixed_flat_set.h"
#include "base/containers/lru_cache.h"
#include "base/no_destructor.h"
#include "base/strings/escape.h"
#include "base/strings/string_util.h"
#include "components/url_formatter/url_formatter.h"
#include "net/base/url_util.h"
#include "third_party/re2/src/re2/re2.h"
#include "url/gurl.h"

// TODO(crbug.com/353966074) There is a plan to avoid/consolidate any
//  duplicated code as this borrows from:
//  components/omnibox/browser/document_provider.cc
namespace {
// Verify if the host could possibly be for a valid doc URL. This is a more
// lightweight check than `ExtractDocIdFromUrl()`. It can be done before
// unescaping the URL as valid hosts don't contain escapable chars; unescaping
// is relatively expensive. E.g., 'docs.google.com' isn't a valid doc URL, but
// its host looks like it could be, so return true. On the other hand,
// 'google.com' is definitely not a doc URL, so return false.
bool ValidHostPrefix(const std::string& host) {
  // There are 66 (6*11) valid hosts, e.g. 'docs5.google.com', so rather than
  // check all 66, we just check the 6 prefixes. Keep these prefixes consistent
  // with those in `ExtractDocIdFromUrl()`.
  constexpr auto kValidHostPrefixes =
      base::MakeFixedFlatSet<std::string_view>({
          "spreadsheets",
          "docs",
          "drive",
          "script",
          "sites",
          "jamboard",
      });
  for (const auto& valid_host_prefix : kValidHostPrefixes) {
    if (base::StartsWith(host, valid_host_prefix,
                         base::CompareCase::INSENSITIVE_ASCII)) {
      return true;
    }
  }
  return false;
}

// Derived from google3/apps/share/util/docs_url_extractor.cc.
std::string ExtractDocIdFromUrl(const std::string& url) {
  static const base::NoDestructor<RE2> docs_url_pattern(
 |       "\\b("  // The first groups matches the whole URL. | 
      // Domain.
      "(?:https?://)?(?:"
      // Keep the hosts consistent with `ValidHostPrefix()`.
      "spreadsheets|docs|drive|script|sites|jamboard"
      ")[0-9]?\\.google\\.com"
 |       "(?::[0-9]+)?\\/"  // Port. | 
 |       "(?:\\S*)"         // Non-whitespace chars. | 
 |       "(?:" | 
 |       // Doc url prefix to match /d/{id}. (?:e/)? deviates from google3. | 
 |       "(?:/d/(?:e/)?(?P<path_docid>[0-9a-zA-Z\\-\\_]+))" | 
 |       "|" | 
 |       // Docs id expr to match a valid id parameter. | 
 |       "(?:(?:\\?|&|&)" | 
 |       "(?:id|docid|key|docID|DocId)=(?P<query_docid>[0-9a-zA-Z\\-\\_]+))" | 
 |       "|" | 
 |       // Folder url prefix to match /folders/{folder_id}. | 
 |       "(?:/folders/(?P<folder_docid>[0-9a-zA-Z\\-\\_]+))" | 
 |       "|" | 
 |       // Sites url prefix. | 
 |       "(?:/?s/)(?P<sites_docid>[0-9a-zA-Z\\-\\_]+)" | 
 |       "(?:/p/[0-9a-zA-Z\\-\\_]+)?/edit" | 
 |       "|" | 
 |       // Jam url. | 
 |       "(?:d/)(?P<jam_docid>[0-9a-zA-Z\\-\\_]+)/(?:edit|viewer)" | 
 |       ")" | 
 |       // Other valid chars. | 
 |       "(?:[0-9a-zA-Z$\\-\\_\\.\\+\\!\\*\'\\,;:@&=/\\?]*)" | 
 |       // Summarization details. | 
 |       "(?:summarizationDetails=[0-9a-zA-Z$\\-\\_\\.\\+\\!\\*\'\\,;:@&=/" | 
 |       "\\?(?:%5B)(?:%5D)]*)?" | 
 |       // Other valid chars. | 
 |       "(?:[0-9a-zA-Z$\\-\\_\\.\\+\\!\\*\'\\,;:@&=/\\?]*)" | 
 |       "(?:(#[0-9a-zA-Z$\\-\\_\\.\\+\\!\\*\'\\,;:@&=/\\?]+)?)"  // Fragment | 
 |       ")"); | 

  std::vector<std::string_view> matched_doc_ids(
      docs_url_pattern->NumberOfCapturingGroups() + 1);
  // ANCHOR_START deviates from google3, which uses UNANCHORED. Using
  // ANCHOR_START prevents incorrectly matching non-drive URLs that contain a
  // drive URL; e.g.,
  // url-parser.com/?url=https://docs.google.com/document/d/(id)/edit.
  if (!docs_url_pattern->Match(url, 0, url.size(), RE2::ANCHOR_START,
                               matched_doc_ids.data(),
                               matched_doc_ids.size())) {
    return std::string();
  }
  for (const auto& doc_id_group : docs_url_pattern->NamedCapturingGroups()) {
    std::string_view identified_doc_id = matched_doc_ids[doc_id_group.second];
    if (!identified_doc_id.empty()) {
      return std::string(identified_doc_id);
    }
  }
  return std::string();
}
}  // namespace

namespace url_deduplication {

GURL DocsURLStripHandler::StripExtraParams(GURL url) {
  if (!url.is_valid()) {
    return GURL();
  }

  // A memoization cache. Only updated if `ExtractDocIdFromUrl()` was
  // attempted. That's the most expensive part of this algorithm, and memoizing
  // the earlier trivial checks would worsen performance by pushing out more
  // useful cache entries.
  static base::NoDestructor<base::LRUCache<GURL, GURL>> cache(10);
  const auto& cached = cache->Get(url);
  if (cached != cache->end()) {
    return cached->second;
  }

  // Early exit to avoid unnecessary and more involved checks. Don't update the
  // cache for trivial cases to avoid pushing out a more useful entry.
  if (!url.DomainIs("google.com")) {
    return GURL();
  }
  // We aim to prevent duplicate Drive URLs from appearing between the Drive
  // document search provider and history/bookmark entries.
  // All URLs are canonicalized to a GURL form only used for deduplication and
  // not guaranteed to be usable for navigation.

  // Drive redirects are already handled by the regex in
  // `ExtractDocIdFromUrl()`. The logic below handles google.com redirects;
  // e.g., google.com/url?q=<url>.
  std::string url_str;
  std::string url_str_host;
  if (url.host() == "www.google.com" && url.path() == "/url") {
    if ((!net::GetValueForKeyInQuery(url, "q", &url_str) || url_str.empty()) &&
        (!net::GetValueForKeyInQuery(url, "url", &url_str) ||
         url_str.empty())) {
      return GURL();
    }
    url_str_host = GURL(url_str).host();
  } else {
    url_str = url.spec();
    url_str_host = url.host();
  }
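
  // E.g., given the illustrative redirect
  // 'https://www.google.com/url?q=https://docs.google.com/document/d/abc123',
  // `url_str` becomes 'https://docs.google.com/document/d/abc123' and
  // `url_str_host` becomes 'docs.google.com'.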

  // Recheck the domain, since a google URL could redirect to a non-google URL.
  if (!base::EndsWith(url_str_host, "google.com",
                      base::CompareCase::INSENSITIVE_ASCII)) {
    return GURL();
  }

  // Filter out non-doc hosts. Do this before unescaping the URL below, as
  // unescaping can be expensive and valid hosts don't contain escapable chars.
  // Do this after simplifying the google.com redirect above, as that changes
  // the host.
  if (!ValidHostPrefix(url_str_host)) {
    return GURL();
  }

  // Unescape `url_str` so escaped doc URLs can still match the pattern.
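  // E.g., an escaped path like '/document/d%2Fabc123%2Fedit' becomes
  // '/document/d/abc123/edit', which `ExtractDocIdFromUrl()` can then match
  // (illustrative input).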
  url_str = base::UnescapeURLComponent(
      url_str,
      base::UnescapeRule::PATH_SEPARATORS |
          base::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);

  const std::string id = ExtractDocIdFromUrl(url_str);

  // Canonicalize to the /open form without any extra args.
  // This is similar to what we expect from the server.
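  // E.g., 'https://docs.google.com/document/d/abc123/edit?usp=sharing' dedupes
  // to 'https://drive.google.com/open?id=abc123' (illustrative pair).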
  GURL deduping_url =
      id.empty() ? GURL() : GURL("https://drive.google.com/open?id=" + id);
  cache->Put(url, deduping_url);
  return deduping_url;
}

}  // namespace url_deduplication
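
// Usage sketch (illustrative; assumes `DocsURLStripHandler` is
// default-constructible as declared in the header):
//
//   url_deduplication::DocsURLStripHandler handler;
//   GURL key = handler.StripExtraParams(
//       GURL("https://docs.google.com/document/d/abc123/edit?usp=sharing"));
//   // `key` is 'https://drive.google.com/open?id=abc123'; an empty GURL
//   // means the input wasn't recognized as a Drive/Docs URL.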