// blob: 308d751a7ecf3ad23e72bcefb520dfb6ed3ed5ac [file] [log] [blame]
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/schema_org/extractor.h"
#include <algorithm>
#include <memory>
#include <unordered_set>
#include <utility>
#include <vector>
#include "base/no_destructor.h"
#include "base/strings/string_number_conversions.h"
#include "components/schema_org/common/improved_metadata.mojom.h"
#include "components/schema_org/common/time.h"
#include "components/schema_org/schema_org_entity_names.h"
#include "components/schema_org/schema_org_property_configurations.h"
namespace schema_org {
namespace {
// Use a max entity nesting depth of 10. This is higher than AppIndexing allows
// (depth of 5), but we need at least a depth of 6 for MediaFeeds, and this
// extractor is no longer used by AppIndexing. Round up to 10 since feeds could
// conceivably nest deeper than the test cases we have.
// Entities at or beyond this recursion level are not extracted.
constexpr int kMaxDictionaryDepth = 10;
// Maximum amount of nesting of arrays to support, where 0 is a completely flat
// array.
constexpr int kMaxNestedArrayDepth = 1;
// Some strings are very long, and we don't currently use those, so limit string
// length to something reasonable to avoid undue pressure on Icing. Note that
// App Indexing supports strings up to length 20k.
// String values are truncated to this many characters before being parsed.
constexpr size_t kMaxStringLength = 200;
// Enforced by App Indexing, so stop processing early if possible.
// Caps the number of properties collected for a single entity.
constexpr size_t kMaxNumFields = 25;
// Enforced by App Indexing, so stop processing early if possible.
// Caps the number of elements read from a single array.
constexpr size_t kMaxRepeatedSize = 100;
// JSON-LD key whose string value is used as the entity's type. It is not
// emitted as a regular property.
constexpr char kJSONLDKeyType[] = "@type";
// JSON-LD key whose string value, when present, is used as the entity's id.
constexpr char kJSONLDKeyId[] = "@id";
using improved::mojom::Entity;
using improved::mojom::EntityPtr;
using improved::mojom::Property;
using improved::mojom::PropertyPtr;
using improved::mojom::Values;
using improved::mojom::ValuesPtr;
void ExtractEntity(const base::DictionaryValue&, Entity*, int recursion_level);
// Reports whether any of the "thing" types accepted by |config| is the
// schema.org Duration class. Each thing type is expected to be a valid URL
// with a non-empty path (DCHECKed).
bool HasDuration(const property::PropertyConfiguration& config) {
  const auto& candidates = config.thing_types;
  return std::any_of(
      candidates.begin(), candidates.end(), [](const auto& type_spec) {
        const GURL type_url(type_spec);
        DCHECK(type_url.is_valid() && !type_url.path().empty());
        // Skip the first character of the path (presumably the leading '/')
        // to obtain the bare type name.
        return type_url.path().substr(1) == schema_org::entity::kDuration;
      });
}
// Parses a string into a property value. The string may be parsed as a
// double, date, or time, depending on the types that the property supports.
// If the property supports text, uses the string itself.
bool ParseStringValue(const std::string& property_type,
base::StringPiece value,
Values* values) {
value = value.substr(0, kMaxStringLength);
schema_org::property::PropertyConfiguration prop_config =
schema_org::property::GetPropertyConfiguration(property_type);
if (prop_config.number) {
int64_t l;
bool parsed_long = base::StringToInt64(value, &l);
if (parsed_long) {
values->long_values.push_back(l);
return true;
}
double d;
bool parsed_double = base::StringToDouble(value, &d);
if (parsed_double) {
values->double_values.push_back(d);
return true;
}
}
if (prop_config.date_time || prop_config.date) {
base::Time time;
bool parsed_time = base::Time::FromString(value.data(), &time);
if (parsed_time) {
values->date_time_values.push_back(time);
return true;
}
}
if (prop_config.time) {
base::Time time_of_day;
base::Time start_of_day;
bool parsed_time = base::Time::FromString(
("1970-01-01T" + value.as_string()).c_str(), &time_of_day);
bool parsed_day_start =
base::Time::FromString("1970-01-01T00:00:00", &start_of_day);
base::TimeDelta time = time_of_day - start_of_day;
// The string failed to parse as a DateTime, but did parse as a Time. Use
// this value instead.
if (parsed_time && parsed_day_start) {
values->time_values.push_back(time);
return true;
}
}
if (prop_config.boolean) {
if (value == "https://schema.org/True" ||
value == "http://schema.org/True" || value == "true") {
values->bool_values.push_back(true);
return true;
}
if (value == "https://schema.org/False" ||
value == "http://schema.org/False" || value == "false") {
values->bool_values.push_back(false);
return true;
}
}
if (HasDuration(prop_config)) {
auto time = ParseISO8601Duration(value.as_string());
if (time.has_value()) {
values->time_values.push_back(time.value());
return true;
}
}
if (!prop_config.thing_types.empty() || !prop_config.enum_types.empty() ||
prop_config.url) {
auto url = GURL(value);
if (url.is_valid()) {
values->url_values.push_back(url);
return true;
}
}
if (!prop_config.thing_types.empty() || !prop_config.enum_types.empty() ||
prop_config.text) {
values->string_values.push_back(value.as_string());
return true;
}
return false;
}
// Appends the elements of the JSON array |arr| to |values|, interpreting
// string elements according to the property named |property_type|.
//
// At most kMaxRepeatedSize elements of |arr| are read. Nested arrays are
// flattened into the same |values| as long as |nested_array_level| does not
// exceed kMaxNestedArrayDepth. Dictionaries are extracted as nested entities
// one recursion level below |recursion_level|.
//
// Returns false when |arr| is empty, when the array nesting limit is exceeded,
// or as soon as a string element cannot be parsed (elements appended before
// that point stay in |values|). Returns true otherwise.
bool ParseRepeatedValue(const base::Value::ConstListView& arr,
                        const std::string& property_type,
                        Values* values,
                        int recursion_level,
                        int nested_array_level) {
  DCHECK(values);
  if (arr.empty() || nested_array_level > kMaxNestedArrayDepth)
    return false;

  const size_t item_count = std::min(arr.size(), kMaxRepeatedSize);
  for (size_t index = 0; index < item_count; ++index) {
    const auto& element = arr[index];
    switch (element.type()) {
      case base::Value::Type::BOOLEAN:
        values->bool_values.push_back(element.GetBool());
        break;

      case base::Value::Type::INTEGER:
        values->long_values.push_back(element.GetInt());
        break;

      case base::Value::Type::DOUBLE:
        values->double_values.push_back(element.GetDouble());
        break;

      case base::Value::Type::STRING:
        // A single unparseable string invalidates the whole array.
        if (!ParseStringValue(property_type, element.GetString(), values))
          return false;
        break;

      case base::Value::Type::DICTIONARY: {
        const base::DictionaryValue* nested_dict = nullptr;
        if (!element.GetAsDictionary(&nested_dict))
          break;
        // NOTE(review): unlike the dictionary case in ExtractEntity(), the
        // nesting depth is not checked before appending, so an entity that
        // ExtractEntity() declines to fill is still added here - confirm
        // whether that is intended.
        auto nested_entity = Entity::New();
        ExtractEntity(*nested_dict, nested_entity.get(), recursion_level + 1);
        values->entity_values.push_back(std::move(nested_entity));
        break;
      }

      case base::Value::Type::LIST:
        // Flatten the inner array into |values|. A failure of the inner array
        // is deliberately ignored; processing moves on to the next element.
        ParseRepeatedValue(element.GetList(), property_type, values,
                           recursion_level, nested_array_level + 1);
        break;

      default:
        // Other value types are silently skipped.
        break;
    }
  }
  return true;
}
// Fills |entity| from the JSON object |val|.
//
// Does nothing if |recursion_level| has reached kMaxDictionaryDepth. Otherwise
// the entity type is taken from "@type" (defaulting to "Thing" when missing or
// empty) and the id from "@id" when that key holds a string. Every key other
// than "@type" is then converted into a property, up to kMaxNumFields
// properties; keys whose value cannot be converted are skipped.
void ExtractEntity(const base::DictionaryValue& val,
                   Entity* entity,
                   int recursion_level) {
  if (recursion_level >= kMaxDictionaryDepth)
    return;

  std::string declared_type;
  val.GetString(kJSONLDKeyType, &declared_type);
  entity->type = declared_type.empty() ? std::string("Thing") : declared_type;

  std::string declared_id;
  if (val.GetString(kJSONLDKeyId, &declared_id))
    entity->id = declared_id;

  for (const auto& entry : val.DictItems()) {
    if (entity->properties.size() >= kMaxNumFields)
      break;

    const std::string& key = entry.first;
    // The type has already been recorded on the entity itself.
    if (key == kJSONLDKeyType)
      continue;

    const auto& raw = entry.second;
    auto parsed = Values::New();
    // Cleared whenever the value turns out to be unusable.
    bool usable = true;

    switch (raw.type()) {
      case base::Value::Type::BOOLEAN:
        parsed->bool_values.push_back(raw.GetBool());
        break;

      case base::Value::Type::INTEGER:
        parsed->long_values.push_back(raw.GetInt());
        break;

      case base::Value::Type::DOUBLE:
        parsed->double_values.push_back(raw.GetDouble());
        break;

      case base::Value::Type::STRING:
        usable = ParseStringValue(key, raw.GetString(), parsed.get());
        break;

      case base::Value::Type::DICTIONARY: {
        const base::DictionaryValue* child_dict = nullptr;
        // Drop children that would sit at or beyond the nesting limit.
        if (recursion_level + 1 >= kMaxDictionaryDepth ||
            !raw.GetAsDictionary(&child_dict)) {
          usable = false;
          break;
        }
        auto child = Entity::New();
        ExtractEntity(*child_dict, child.get(), recursion_level + 1);
        parsed->entity_values.push_back(std::move(child));
        break;
      }

      case base::Value::Type::LIST:
        usable = ParseRepeatedValue(raw.GetList(), key, parsed.get(),
                                    recursion_level, 0);
        break;

      default:
        // Unsupported value type. Skip this property.
        usable = false;
        break;
    }

    if (!usable)
      continue;

    PropertyPtr property = Property::New();
    property->name = key;
    property->values = std::move(parsed);
    entity->properties.push_back(std::move(property));
  }
}
} // namespace
// Creates an extractor that only accepts top-level entities whose "@type" is
// contained in |supported_entity_types|.
// NOTE(review): the set holds StringPieces, which do not own their text -
// presumably callers pass string literals or otherwise long-lived strings;
// confirm at call sites.
Extractor::Extractor(base::flat_set<base::StringPiece> supported_entity_types)
    // The parameter is taken by value as a sink, so move it into the member
    // instead of copying the whole set a second time.
    : supported_types_(std::move(supported_entity_types)) {}
// Defaulted destructor, defined out of line in this file.
Extractor::~Extractor() = default;
// Returns true if |type| is one of the entity types this extractor was
// constructed with.
bool Extractor::IsSupportedType(const std::string& type) {
  return supported_types_.count(type) != 0;
}
// Converts the JSON object |val| into a single (possibly nested) entity.
// Returns nullptr when the object's "@type" is not one of the supported types;
// a missing "@type" is looked up as the empty string.
EntityPtr Extractor::ExtractTopLevelEntity(const base::DictionaryValue& val) {
  std::string top_level_type;
  val.GetString(kJSONLDKeyType, &top_level_type);

  if (IsSupportedType(top_level_type)) {
    EntityPtr extracted = Entity::New();
    ExtractEntity(val, extracted.get(), 0);
    return extracted;
  }
  return nullptr;
}
// Hands |content| to the isolated JSON parser and arranges for OnJsonParsed()
// to receive the result together with |callback|. The continuation is bound to
// a weak pointer to this extractor.
void Extractor::Extract(const std::string& content,
                        ExtractorCallback callback) {
  auto on_parsed =
      base::BindOnce(&Extractor::OnJsonParsed, weak_factory_.GetWeakPtr(),
                     std::move(callback));
  data_decoder::DataDecoder::ParseJsonIsolated(content, std::move(on_parsed));
}
// Receives the JSON parse result. Runs |callback| with the extracted top-level
// entity when parsing produced a dictionary, and with nullptr when parsing
// failed or the root value is not a dictionary.
void Extractor::OnJsonParsed(ExtractorCallback callback,
                             data_decoder::DataDecoder::ValueOrError result) {
  const base::DictionaryValue* root = nullptr;
  if (result.value && result.value->GetAsDictionary(&root)) {
    std::move(callback).Run(ExtractTopLevelEntity(*root));
    return;
  }
  std::move(callback).Run(nullptr);
}
} // namespace schema_org