| // Copyright 2017 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "modules/document_metadata/CopylessPasteExtractor.h" |
| |
| #include <algorithm> |
| #include <memory> |
| #include <utility> |
| |
| #include "core/dom/ElementTraversal.h" |
| #include "core/frame/LocalFrame.h" |
| #include "core/html/HTMLElement.h" |
| #include "core/html_names.h" |
| #include "platform/Histogram.h" |
| #include "platform/instrumentation/tracing/TraceEvent.h" |
| #include "platform/json/JSONParser.h" |
| #include "platform/wtf/Vector.h" |
| #include "platform/wtf/text/AtomicString.h" |
| #include "platform/wtf/text/StringBuilder.h" |
| #include "public/platform/modules/document_metadata/copyless_paste.mojom-blink.h" |
| |
| namespace blink { |
| |
| namespace { |
| |
| using mojom::document_metadata::blink::Entity; |
| using mojom::document_metadata::blink::EntityPtr; |
| using mojom::document_metadata::blink::Property; |
| using mojom::document_metadata::blink::PropertyPtr; |
| using mojom::document_metadata::blink::Values; |
| using mojom::document_metadata::blink::ValuesPtr; |
| using mojom::document_metadata::blink::WebPage; |
| using mojom::document_metadata::blink::WebPagePtr; |
| |
// App Indexing allows at most 5 levels of nesting. The WebPage message itself
// takes the top level, which leaves 4 levels for entities. Entities are parsed
// up to this depth and any further nesting is ignored: if an object at the
// maximum depth has a property whose value is an entity, that property is
// dropped. JSON-LD blocks nested deeper than this are still parsed, but the
// excess levels are not passed on to App Indexing.
constexpr int kMaxDepth = 4;
// Very long strings are not currently used, so strings are cut to a
// reasonable length to avoid undue pressure on Icing. App Indexing itself
// supports strings up to length 20k.
constexpr int kMaxStringLength = 200;
// Limit enforced by App Indexing; stop processing early where possible.
constexpr size_t kMaxNumFields = 20;
// Limit enforced by App Indexing; stop processing early where possible.
constexpr size_t kMaxRepeatedSize = 100;

// JSON-LD keys that are looked up by name.
constexpr char kJSONLDKeyType[] = "@type";
constexpr char kJSONLDKeyGraph[] = "@graph";
| bool isWhitelistedType(AtomicString type) { |
| DEFINE_STATIC_LOCAL(HashSet<AtomicString>, elements, |
| ({// Common types that include addresses. |
| "AutoDealer", "Hotel", "LocalBusiness", "Organization", |
| "Person", "Place", "PostalAddress", "Product", |
| "Residence", "Restaurant", "SingleFamilyResidence", |
| // Common types including phone numbers |
| "Store", "ContactPoint", "LodgingBusiness"})); |
| return type && elements.Contains(type); |
| } |
| |
| void extractEntity(const JSONObject&, Entity&, int recursionLevel); |
| |
| bool parseRepeatedValue(const JSONArray& arr, |
| Values& values, |
| int recursionLevel) { |
| if (arr.size() < 1) { |
| return false; |
| } |
| |
| const JSONValue::ValueType type = arr.at(0)->GetType(); |
| switch (type) { |
| case JSONValue::ValueType::kTypeBoolean: |
| values.set_bool_values(Vector<bool>()); |
| break; |
| case JSONValue::ValueType::kTypeInteger: |
| values.set_long_values(Vector<int64_t>()); |
| break; |
| case JSONValue::ValueType::kTypeDouble: |
| // App Indexing doesn't support double type, so just encode its decimal |
| // value as a string instead. |
| values.set_string_values(Vector<String>()); |
| break; |
| case JSONValue::ValueType::kTypeString: |
| values.set_string_values(Vector<String>()); |
| break; |
| case JSONValue::ValueType::kTypeObject: |
| if (recursionLevel + 1 >= kMaxDepth) { |
| return false; |
| } |
| values.set_entity_values(Vector<EntityPtr>()); |
| break; |
| case JSONArray::ValueType::kTypeArray: |
| // App Indexing doesn't support nested arrays. |
| return false; |
| default: |
| break; |
| } |
| for (size_t j = 0; j < std::min(arr.size(), kMaxRepeatedSize); ++j) { |
| const JSONValue* innerVal = arr.at(j); |
| if (innerVal->GetType() != type) { |
| // App Indexing doesn't support mixed types. If there are mixed |
| // types in the parsed object, we will drop the property. |
| return false; |
| } |
| switch (innerVal->GetType()) { |
| case JSONValue::ValueType::kTypeBoolean: { |
| bool v; |
| innerVal->AsBoolean(&v); |
| values.get_bool_values().push_back(v); |
| } break; |
| case JSONValue::ValueType::kTypeInteger: { |
| int v; |
| innerVal->AsInteger(&v); |
| values.get_long_values().push_back(v); |
| } break; |
| case JSONValue::ValueType::kTypeDouble: { |
| // App Indexing doesn't support double type, so just encode its decimal |
| // value as a string instead. |
| double v; |
| innerVal->AsDouble(&v); |
| String s = String::Number(v); |
| s.Truncate(kMaxStringLength); |
| values.get_string_values().push_back(s); |
| } break; |
| case JSONValue::ValueType::kTypeString: { |
| String v; |
| innerVal->AsString(&v); |
| v.Truncate(kMaxStringLength); |
| values.get_string_values().push_back(v); |
| } break; |
| case JSONValue::ValueType::kTypeObject: |
| values.get_entity_values().push_back(Entity::New()); |
| extractEntity(*(JSONObject::Cast(innerVal)), |
| *(values.get_entity_values().at(j)), recursionLevel + 1); |
| break; |
| default: |
| break; |
| } |
| } |
| return true; |
| } |
| |
| void extractEntity(const JSONObject& val, Entity& entity, int recursionLevel) { |
| if (recursionLevel >= kMaxDepth) { |
| return; |
| } |
| |
| String type; |
| val.GetString(kJSONLDKeyType, &type); |
| if (!type) { |
| type = "Thing"; |
| } |
| entity.type = type; |
| for (size_t i = 0; i < std::min(val.size(), kMaxNumFields); ++i) { |
| PropertyPtr property = Property::New(); |
| const JSONObject::Entry& entry = val.at(i); |
| property->name = entry.first; |
| if (property->name == kJSONLDKeyType) { |
| continue; |
| } |
| property->values = Values::New(); |
| |
| bool addProperty = true; |
| |
| switch (entry.second->GetType()) { |
| case JSONValue::ValueType::kTypeBoolean: { |
| bool v; |
| val.GetBoolean(entry.first, &v); |
| property->values->set_bool_values({v}); |
| } break; |
| case JSONValue::ValueType::kTypeInteger: { |
| int v; |
| val.GetInteger(entry.first, &v); |
| property->values->set_long_values({v}); |
| } break; |
| case JSONValue::ValueType::kTypeDouble: { |
| double v; |
| val.GetDouble(entry.first, &v); |
| String s = String::Number(v); |
| s.Truncate(kMaxStringLength); |
| property->values->set_string_values({s}); |
| } break; |
| case JSONValue::ValueType::kTypeString: { |
| String v; |
| val.GetString(entry.first, &v); |
| v.Truncate(kMaxStringLength); |
| property->values->set_string_values({v}); |
| } break; |
| case JSONValue::ValueType::kTypeObject: { |
| if (recursionLevel + 1 >= kMaxDepth) { |
| addProperty = false; |
| break; |
| } |
| property->values->set_entity_values(Vector<EntityPtr>()); |
| property->values->get_entity_values().push_back(Entity::New()); |
| |
| extractEntity(*(val.GetJSONObject(entry.first)), |
| *(property->values->get_entity_values().at(0)), |
| recursionLevel + 1); |
| } break; |
| case JSONValue::ValueType::kTypeArray: |
| addProperty = parseRepeatedValue(*(val.GetArray(entry.first)), |
| *(property->values), recursionLevel); |
| break; |
| default: |
| break; |
| } |
| if (addProperty) |
| entity.properties.push_back(std::move(property)); |
| } |
| } |
| |
| void extractTopLevelEntity(const JSONObject& val, Vector<EntityPtr>& entities) { |
| // Now we have a JSONObject which corresponds to a single (possibly nested) |
| // entity. |
| EntityPtr entity = Entity::New(); |
| String type; |
| val.GetString(kJSONLDKeyType, &type); |
| if (!isWhitelistedType(AtomicString(type))) { |
| return; |
| } |
| extractEntity(val, *entity, 0); |
| entities.push_back(std::move(entity)); |
| } |
| |
| void extractEntitiesFromArray(const JSONArray& arr, |
| Vector<EntityPtr>& entities) { |
| for (size_t i = 0; i < arr.size(); ++i) { |
| const JSONValue* val = arr.at(i); |
| if (val->GetType() == JSONValue::ValueType::kTypeObject) { |
| extractTopLevelEntity(*(JSONObject::Cast(val)), entities); |
| } |
| } |
| } |
| |
| void extractEntityFromTopLevelObject(const JSONObject& val, |
| Vector<EntityPtr>& entities) { |
| const JSONArray* graph = val.GetArray(kJSONLDKeyGraph); |
| if (graph) { |
| extractEntitiesFromArray(*graph, entities); |
| } |
| extractTopLevelEntity(val, entities); |
| } |
| |
// ExtractionStatus is reported to UMA, so the list is append-only: existing
// entries must keep their values and kCount must stay the last entry.
enum ExtractionStatus {
  kOK,
  kEmpty,
  kParseFailure,
  kWrongType,
  kCount
};
| |
| ExtractionStatus extractMetadata(const Element& root, |
| Vector<EntityPtr>& entities) { |
| for (Element& element : ElementTraversal::DescendantsOf(root)) { |
| if (element.HasTagName(HTMLNames::scriptTag) && |
| element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") { |
| std::unique_ptr<JSONValue> json = ParseJSON(element.textContent()); |
| if (!json) { |
| LOG(ERROR) << "Failed to parse json."; |
| return kParseFailure; |
| } |
| switch (json->GetType()) { |
| case JSONValue::ValueType::kTypeArray: |
| extractEntitiesFromArray(*(JSONArray::Cast(json.get())), entities); |
| break; |
| case JSONValue::ValueType::kTypeObject: |
| extractEntityFromTopLevelObject(*(JSONObject::Cast(json.get())), |
| entities); |
| break; |
| default: |
| return kWrongType; |
| } |
| } |
| } |
| if (entities.IsEmpty()) { |
| return kEmpty; |
| } |
| return kOK; |
| } |
| |
| } // namespace |
| |
| WebPagePtr CopylessPasteExtractor::extract(const Document& document) { |
| TRACE_EVENT0("blink", "CopylessPasteExtractor::extract"); |
| |
| if (!document.GetFrame() || !document.GetFrame()->IsMainFrame()) |
| return nullptr; |
| |
| Element* html = document.documentElement(); |
| if (!html) |
| return nullptr; |
| |
| WebPagePtr page = WebPage::New(); |
| |
| // Traverse the DOM tree and extract the metadata. |
| double start_time = MonotonicallyIncreasingTime(); |
| ExtractionStatus status = extractMetadata(*html, page->entities); |
| double elapsed_time = MonotonicallyIncreasingTime() - start_time; |
| |
| DEFINE_STATIC_LOCAL(EnumerationHistogram, status_histogram, |
| ("CopylessPaste.ExtractionStatus", kCount)); |
| status_histogram.Count(status); |
| |
| if (status != kOK) { |
| DEFINE_STATIC_LOCAL( |
| CustomCountHistogram, extractionHistogram, |
| ("CopylessPaste.ExtractionFailedUs", 1, 1000 * 1000, 50)); |
| extractionHistogram.Count(1e6 * elapsed_time); |
| return nullptr; |
| } |
| DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram, |
| ("CopylessPaste.ExtractionUs", 1, 1000 * 1000, 50)); |
| extractionHistogram.Count(1e6 * elapsed_time); |
| |
| page->url = document.Url(); |
| page->title = document.title(); |
| return page; |
| } |
| |
| } // namespace blink |