blob: 09a16ab56477174376c1146e465f00a06149deba [file] [log] [blame]
// Copyright 2017 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "services/data_decoder/xml_parser.h"
#include <map>
#include <utility>
#include "base/strings/strcat.h"
#include "base/strings/string_util.h"
#include "base/strings/stringprintf.h"
#include "base/values.h"
#include "third_party/libxml/chromium/libxml_utils.h"
#include "third_party/libxml/chromium/xml_reader.h"
namespace data_decoder {
using AttributeMap = std::map<std::string, std::string>;
using NamespaceMap = std::map<std::string, std::string>;
namespace {
// Runs |callback| with no result and an error message. The message is
// |generic_error| alone when |libxml_error| is empty; otherwise it is
// "|generic_error|: |libxml_error|" with spaces, newlines and '^' characters
// trimmed from its ends.
void ReportError(XmlParser::ParseCallback callback,
                 const std::string& generic_error,
                 const std::string& libxml_error) {
  std::string message = generic_error;
  if (!libxml_error.empty()) {
    message = base::StrCat({generic_error, ": ", libxml_error});
    // libxml appends extra lines, padding and a caret that points at the
    // offending location, e.g.:
    //   Entity: line 1: parser error : Opening and ending tag mismatch: hello
    //   line 1 and goodbye
    //   <hello>bad tag</goodbye>
    //                           ^
    // That layout is only useful on a terminal, so strip the whitespace and
    // the '^' marker before handing the message back.
    base::TrimString(message, " \n^", &message);
  }
  std::move(callback).Run(/*result=*/absl::nullopt,
                          absl::make_optional(std::move(message)));
}
// Distinguishes the two kinds of character-data nodes the parser emits.
enum class TextNodeType {
  kText,   // Regular text node.
  kCData,  // CDATA section.
};
// Extracts the character data of the node |xml_reader| is positioned on.
// When that node is a text or CData node, stores its contents in |text|, sets
// |node_type| to kText or kCData accordingly, and returns true. For any other
// node returns false and leaves |node_type| untouched.
bool GetTextFromNode(XmlReader* xml_reader,
                     std::string* text,
                     TextNodeType* node_type) {
  // The text check runs first; the CData check is only attempted when the
  // node was not a text node.
  const bool is_text = xml_reader->GetTextIfTextElement(text);
  if (!is_text && !xml_reader->GetTextIfCDataElement(text))
    return false;

  *node_type = is_text ? TextNodeType::kText : TextNodeType::kCData;
  return true;
}
// Builds the dictionary that represents a text or CData node: the type key is
// set to the text or CData node type according to |node_type|, and the text
// key holds |text|.
base::Value CreateTextNode(const std::string& text, TextNodeType node_type) {
  const bool is_plain_text = node_type == TextNodeType::kText;

  base::Value node(base::Value::Type::DICTIONARY);
  node.SetKey(mojom::XmlParser::kTypeKey,
              base::Value(is_plain_text ? mojom::XmlParser::kTextNodeType
                                        : mojom::XmlParser::kCDataNodeType));
  node.SetKey(mojom::XmlParser::kTextKey, base::Value(text));
  return node;
}
// Builds the dictionary that represents an element node: the tag key holds
// |name| and the type key is set to the element type.
base::Value CreateNewElement(const std::string& name) {
  base::Value node(base::Value::Type::DICTIONARY);
  node.SetKey(mojom::XmlParser::kTagKey, base::Value(name));
  node.SetKey(mojom::XmlParser::kTypeKey,
              base::Value(mojom::XmlParser::kElementType));
  return node;
}
// Appends |child| to the children list of |element|, which must be a
// dictionary. The list is created under the children key the first time a
// child is added. Returns a pointer to the stored copy of |child|, i.e. the
// last entry of the children list.
base::Value* AddChildToElement(base::Value* element, base::Value child) {
  DCHECK(element->is_dict());

  base::Value* child_list = element->FindKey(mojom::XmlParser::kChildrenKey);
  if (child_list) {
    DCHECK(child_list->is_list());
  } else {
    child_list = element->SetKey(mojom::XmlParser::kChildrenKey,
                                 base::Value(base::Value::Type::LIST));
  }

  child_list->Append(std::move(child));
  return &child_list->GetListDeprecated().back();
}
// Copies the namespaces declared on the node |xml_reader| is positioned on
// into |node_value| (which must be a dictionary) under the namespaces key.
// Nothing is added when the reader fails to report namespaces or reports
// none.
void PopulateNamespaces(base::Value* node_value, XmlReader* xml_reader) {
  DCHECK(node_value->is_dict());
  NamespaceMap namespaces;
  if (!xml_reader->GetAllDeclaredNamespaces(&namespaces) || namespaces.empty())
    return;

  base::Value namespace_dict(base::Value::Type::DICTIONARY);
  // Iterate by const reference: iterating by value would copy both strings
  // of every map entry.
  for (const auto& ns : namespaces)
    namespace_dict.SetKey(ns.first, base::Value(ns.second));
  node_value->SetKey(mojom::XmlParser::kNamespacesKey,
                     std::move(namespace_dict));
}
// Copies the attributes of the node |xml_reader| is positioned on into
// |node_value| (which must be a dictionary) under the attributes key.
// Nothing is added when the reader fails to report attributes or reports
// none.
void PopulateAttributes(base::Value* node_value, XmlReader* xml_reader) {
  DCHECK(node_value->is_dict());
  AttributeMap attributes;
  if (!xml_reader->GetAllNodeAttributes(&attributes) || attributes.empty())
    return;

  base::Value attribute_dict(base::Value::Type::DICTIONARY);
  // Iterate by const reference: iterating by value would copy both strings
  // of every map entry.
  for (const auto& attribute : attributes)
    attribute_dict.SetKey(attribute.first, base::Value(attribute.second));
  node_value->SetKey(mojom::XmlParser::kAttributesKey,
                     std::move(attribute_dict));
}
// Error hook handed to libxml so that its diagnostics are collected instead
// of being printed to stderr, which is what happens by default. `context` is
// a pointer to the std::string that Parse() allocates on its stack; `message`
// is a printf-style format string and, like the variadic arguments after it,
// is supplied by libxml. The formatted text is appended to that string.
void CaptureXmlErrors(void* context, const char* message, ...) {
  auto* const captured = static_cast<std::string*>(context);

  va_list format_args;
  va_start(format_args, message);
  base::StringAppendV(captured, message, format_args);
  va_end(format_args);
}
} // namespace
// Construction and destruction use the compiler-generated defaults.
XmlParser::XmlParser() = default;
XmlParser::~XmlParser() = default;
void XmlParser::Parse(const std::string& xml,
WhitespaceBehavior whitespace_behavior,
ParseCallback callback) {
std::string errors;
ScopedXmlErrorFunc error_func(&errors, CaptureXmlErrors);
XmlReader xml_reader;
if (!xml_reader.Load(xml)) {
ReportError(std::move(callback), "Invalid XML: failed to load", errors);
return;
}
base::Value root_element;
std::vector<base::Value*> element_stack;
while (xml_reader.Read()) {
if (xml_reader.IsClosingElement()) {
if (element_stack.empty()) {
ReportError(std::move(callback), "Invalid XML: unbalanced elements",
errors);
return;
}
element_stack.pop_back();
continue;
}
std::string text;
TextNodeType text_node_type = TextNodeType::kText;
base::Value* current_element =
element_stack.empty() ? nullptr : element_stack.back();
bool push_new_node_to_stack = false;
base::Value new_element;
if (GetTextFromNode(&xml_reader, &text, &text_node_type)) {
if (!base::IsStringUTF8(text)) {
ReportError(std::move(callback), "Invalid XML: invalid UTF8 text.",
errors);
return;
}
new_element = CreateTextNode(text, text_node_type);
} else if (xml_reader.IsElement()) {
new_element = CreateNewElement(xml_reader.NodeFullName());
PopulateNamespaces(&new_element, &xml_reader);
PopulateAttributes(&new_element, &xml_reader);
// Self-closing (empty) element have no close tag (or children); don't
// push them on the element stack.
push_new_node_to_stack = !xml_reader.IsEmptyElement();
} else if (whitespace_behavior ==
WhitespaceBehavior::kPreserveSignificant &&
xml_reader.GetTextIfSignificantWhitespaceElement(&text)) {
new_element = CreateTextNode(text, TextNodeType::kText);
} else {
// Ignore all other node types (comments, processing instructions,
// DTDs...).
continue;
}
base::Value* new_element_ptr;
if (current_element) {
new_element_ptr =
AddChildToElement(current_element, std::move(new_element));
} else {
// First element we are parsing, it becomes the root element.
DCHECK(xml_reader.IsElement());
DCHECK(root_element.is_none());
root_element = std::move(new_element);
new_element_ptr = &root_element;
}
if (push_new_node_to_stack)
element_stack.push_back(new_element_ptr);
}
if (!element_stack.empty()) {
ReportError(std::move(callback), "Invalid XML: unbalanced elements",
errors);
return;
}
if (!root_element.is_dict() || root_element.DictEmpty()) {
ReportError(std::move(callback), "Invalid XML: bad content", errors);
return;
}
std::move(callback).Run(absl::make_optional(std::move(root_element)),
absl::optional<std::string>());
}
} // namespace data_decoder