blob: a18f0001e2d76b30a441ea369ef375d6a39ea38a [file] [log] [blame]
// Copyright 2017 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "third_party/blink/renderer/modules/document_metadata/document_metadata_extractor.h"
#include <algorithm>
#include <memory>
#include <utility>
#include "components/schema_org/common/metadata.mojom-blink.h"
#include "third_party/blink/public/mojom/document_metadata/document_metadata.mojom-blink.h"
#include "third_party/blink/renderer/core/dom/document.h"
#include "third_party/blink/renderer/core/dom/element_traversal.h"
#include "third_party/blink/renderer/core/frame/local_frame.h"
#include "third_party/blink/renderer/core/html/html_element.h"
#include "third_party/blink/renderer/core/html_names.h"
#include "third_party/blink/renderer/platform/instrumentation/tracing/trace_event.h"
#include "third_party/blink/renderer/platform/json/json_parser.h"
#include "third_party/blink/renderer/platform/json/json_values.h"
#include "third_party/blink/renderer/platform/wtf/text/atomic_string.h"
#include "third_party/blink/renderer/platform/wtf/text/string_builder.h"
#include "third_party/blink/renderer/platform/wtf/vector.h"
namespace blink {
namespace {
using mojom::blink::WebPage;
using mojom::blink::WebPagePtr;
using schema_org::mojom::blink::Entity;
using schema_org::mojom::blink::EntityPtr;
using schema_org::mojom::blink::Property;
using schema_org::mojom::blink::PropertyPtr;
using schema_org::mojom::blink::Values;
using schema_org::mojom::blink::ValuesPtr;
// App Indexing enforces a max nesting depth of 5. Our top level message
// corresponds to the WebPage, so this only leaves 4 more levels. We will parse
// entites up to this depth, and ignore any further nesting. If an object at the
// max nesting depth has a property corresponding to an entity, that property
// will be dropped. Note that we will still parse json-ld blocks deeper than
// this, but it won't be passed to App Indexing.
constexpr int kMaxDepth = 4;
// Some strings are very long, and we don't currently use those, so limit string
// length to something reasonable to avoid undue pressure on Icing. Note that
// App Indexing supports strings up to length 20k.
constexpr wtf_size_t kMaxStringLength = 200;
// Enforced by App Indexing, so stop processing early if possible.
constexpr wtf_size_t kMaxNumFields = 20;
// Enforced by App Indexing, so stop processing early if possible.
constexpr wtf_size_t kMaxRepeatedSize = 100;
constexpr char kJSONLDKeyType[] = "@type";
constexpr char kJSONLDKeyGraph[] = "@graph";
bool IsSupportedType(AtomicString type) {
DEFINE_STATIC_LOCAL(
HashSet<AtomicString>, elements,
({// Common types that include addresses.
AtomicString("AutoDealer"), AtomicString("Hotel"),
AtomicString("LocalBusiness"), AtomicString("Organization"),
AtomicString("Person"), AtomicString("Place"),
AtomicString("PostalAddress"), AtomicString("Product"),
AtomicString("Residence"), AtomicString("Restaurant"),
AtomicString("SingleFamilyResidence"),
// Common types including phone numbers
AtomicString("Store"), AtomicString("ContactPoint"),
AtomicString("LodgingBusiness")}));
return type && elements.Contains(type);
}
void ExtractEntity(const JSONObject&, int recursion_level, Entity&);
bool ParseRepeatedValue(const JSONArray& arr,
int recursion_level,
ValuesPtr& values) {
if (arr.size() < 1) {
return false;
}
const JSONValue::ValueType type = arr.at(0)->GetType();
switch (type) {
case JSONValue::ValueType::kTypeNull:
return false;
case JSONValue::ValueType::kTypeBoolean:
values = Values::NewBoolValues({});
break;
case JSONValue::ValueType::kTypeInteger:
values = Values::NewLongValues({});
break;
// App Indexing doesn't support double type, so just encode its decimal
// value as a string instead.
case JSONValue::ValueType::kTypeDouble:
case JSONValue::ValueType::kTypeString:
values = Values::NewStringValues({});
break;
case JSONValue::ValueType::kTypeObject:
if (recursion_level + 1 >= kMaxDepth) {
return false;
}
values = Values::NewEntityValues({});
break;
case JSONArray::ValueType::kTypeArray:
// App Indexing doesn't support nested arrays.
return false;
}
const wtf_size_t arr_size = std::min(arr.size(), kMaxRepeatedSize);
for (wtf_size_t i = 0; i < arr_size; ++i) {
const JSONValue* const element = arr.at(i);
if (element->GetType() != type) {
// App Indexing doesn't support mixed types. If there are mixed
// types in the parsed object, we will drop the property.
return false;
}
switch (type) {
case JSONValue::ValueType::kTypeBoolean: {
bool v;
element->AsBoolean(&v);
values->get_bool_values().push_back(v);
continue;
}
case JSONValue::ValueType::kTypeInteger: {
int v;
element->AsInteger(&v);
values->get_long_values().push_back(v);
continue;
}
case JSONValue::ValueType::kTypeDouble: {
// App Indexing doesn't support double type, so just encode its decimal
// value as a string instead.
double v;
element->AsDouble(&v);
String s = String::Number(v);
s.Truncate(kMaxStringLength);
values->get_string_values().push_back(s);
continue;
}
case JSONValue::ValueType::kTypeString: {
String v;
element->AsString(&v);
v.Truncate(kMaxStringLength);
values->get_string_values().push_back(v);
continue;
}
case JSONValue::ValueType::kTypeObject: {
auto entity = Entity::New();
ExtractEntity(*(JSONObject::Cast(element)), recursion_level + 1,
*entity);
values->get_entity_values().push_back(std::move(entity));
continue;
}
case JSONValue::ValueType::kTypeNull:
case JSONValue::ValueType::kTypeArray:
CHECK(false);
}
}
return true;
}
void ExtractEntity(const JSONObject& val, int recursion_level, Entity& entity) {
if (recursion_level >= kMaxDepth) {
return;
}
String type;
val.GetString(kJSONLDKeyType, &type);
if (!type) {
type = "Thing";
}
entity.type = type;
for (wtf_size_t i = 0; i < std::min(val.size(), kMaxNumFields); ++i) {
PropertyPtr property = Property::New();
const JSONObject::Entry& entry = val.at(i);
property->name = entry.first;
if (property->name == kJSONLDKeyType) {
continue;
}
bool add_property = true;
switch (entry.second->GetType()) {
case JSONValue::ValueType::kTypeBoolean: {
bool v;
val.GetBoolean(entry.first, &v);
property->values = Values::NewBoolValues({v});
} break;
case JSONValue::ValueType::kTypeInteger: {
int v;
val.GetInteger(entry.first, &v);
property->values = Values::NewLongValues({v});
} break;
case JSONValue::ValueType::kTypeDouble: {
double v;
val.GetDouble(entry.first, &v);
String s = String::Number(v);
s.Truncate(kMaxStringLength);
property->values = Values::NewStringValues({s});
} break;
case JSONValue::ValueType::kTypeString: {
String v;
val.GetString(entry.first, &v);
v.Truncate(kMaxStringLength);
property->values = Values::NewStringValues({v});
} break;
case JSONValue::ValueType::kTypeObject: {
if (recursion_level + 1 >= kMaxDepth) {
add_property = false;
break;
}
Vector<EntityPtr> entities;
entities.push_back(Entity::New());
ExtractEntity(*(val.GetJSONObject(entry.first)), recursion_level + 1,
*entities[0]);
property->values = Values::NewEntityValues(std::move(entities));
} break;
case JSONValue::ValueType::kTypeArray:
add_property = ParseRepeatedValue(*(val.GetArray(entry.first)),
recursion_level, property->values);
break;
default:
break;
}
if (add_property)
entity.properties.push_back(std::move(property));
}
}
void ExtractTopLevelEntity(const JSONObject& val, Vector<EntityPtr>& entities) {
// Now we have a JSONObject which corresponds to a single (possibly nested)
// entity.
EntityPtr entity = Entity::New();
String type;
val.GetString(kJSONLDKeyType, &type);
if (!IsSupportedType(AtomicString(type))) {
return;
}
ExtractEntity(val, 0, *entity);
entities.push_back(std::move(entity));
}
void ExtractEntitiesFromArray(const JSONArray& arr,
Vector<EntityPtr>& entities) {
for (wtf_size_t i = 0; i < arr.size(); ++i) {
const JSONValue* val = arr.at(i);
if (val->GetType() == JSONValue::ValueType::kTypeObject) {
ExtractTopLevelEntity(*(JSONObject::Cast(val)), entities);
}
}
}
void ExtractEntityFromTopLevelObject(const JSONObject& val,
Vector<EntityPtr>& entities) {
const JSONArray* graph = val.GetArray(kJSONLDKeyGraph);
if (graph) {
ExtractEntitiesFromArray(*graph, entities);
}
ExtractTopLevelEntity(val, entities);
}
// These values are persisted to logs. Entries should not be renumbered and
// numeric values should never be reused.
enum ExtractionStatus {
kOK,
kEmpty,
kParseFailure,
kWrongType,
kMaxValue = kWrongType,
};
ExtractionStatus ExtractMetadata(const Element& root,
Vector<EntityPtr>& entities) {
for (Element& element : ElementTraversal::DescendantsOf(root)) {
if (element.HasTagName(html_names::kScriptTag) &&
element.FastGetAttribute(html_names::kTypeAttr) ==
"application/ld+json") {
// TODO(crbug.com/1264024): Deprecate JSON comments here, if possible.
std::unique_ptr<JSONValue> json =
ParseJSONWithCommentsDeprecated(element.textContent());
if (!json) {
LOG(ERROR) << "Failed to parse json.";
return ExtractionStatus::kParseFailure;
}
switch (json->GetType()) {
case JSONValue::ValueType::kTypeArray:
ExtractEntitiesFromArray(*(JSONArray::Cast(json.get())), entities);
break;
case JSONValue::ValueType::kTypeObject:
ExtractEntityFromTopLevelObject(*(JSONObject::Cast(json.get())),
entities);
break;
default:
return ExtractionStatus::kWrongType;
}
}
}
if (entities.empty()) {
return ExtractionStatus::kEmpty;
}
return ExtractionStatus::kOK;
}
} // namespace
WebPagePtr DocumentMetadataExtractor::Extract(const Document& document) {
TRACE_EVENT0("blink", "DocumentMetadataExtractor::Extract");
if (!document.GetFrame() || !document.GetFrame()->IsMainFrame())
return nullptr;
Element* html = document.documentElement();
if (!html)
return nullptr;
WebPagePtr page = WebPage::New();
// Traverse the DOM tree and extract the metadata.
ExtractionStatus status = ExtractMetadata(*html, page->entities);
if (status != ExtractionStatus::kOK) {
return nullptr;
}
page->url = document.Url();
page->title = document.title();
return page;
}
} // namespace blink