blob: c4e90d350a22345fe304cc175d214f9804fdb85d [file] [log] [blame]
// Copyright 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/url_formatter/top_domains/top_domain_state_generator.h"
#include <cstdint>
#include <memory>
#include <string>
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_util.h"
#include "base/strings/stringprintf.h"
#include "net/tools/huffman_trie/huffman/huffman_builder.h"
#include "net/tools/huffman_trie/trie/trie_bit_buffer.h"
#include "net/tools/huffman_trie/trie/trie_writer.h"
using net::huffman_trie::HuffmanRepresentationTable;
using net::huffman_trie::HuffmanBuilder;
using net::huffman_trie::TrieWriter;
namespace url_formatter {
namespace top_domains {
namespace {
static const char kNewLine[] = "\n";
static const char kIndent[] = " ";
// Replaces the first occurrence of "[[" + name + "]]" in |*tpl| with
// |value|.
bool ReplaceTag(const std::string& name,
const std::string& value,
std::string* tpl) {
std::string tag = "[[" + name + "]]";
size_t start_pos = tpl->find(tag);
if (start_pos == std::string::npos) {
return false;
}
tpl->replace(start_pos, tag.length(), value);
return true;
}
// Formats the bytes in |bytes| as an C++ array initializer and returns the
// resulting string.
std::string FormatVectorAsArray(const std::vector<uint8_t>& bytes) {
std::string output = "{";
output.append(kNewLine);
output.append(kIndent);
output.append(kIndent);
size_t bytes_on_current_line = 0;
for (size_t i = 0; i < bytes.size(); ++i) {
base::StringAppendF(&output, "0x%02x,", bytes[i]);
bytes_on_current_line++;
if (bytes_on_current_line >= 12 && (i + 1) < bytes.size()) {
output.append(kNewLine);
output.append(kIndent);
output.append(kIndent);
bytes_on_current_line = 0;
} else if ((i + 1) < bytes.size()) {
output.append(" ");
}
}
output.append(kNewLine);
output.append("}");
return output;
}
HuffmanRepresentationTable ApproximateHuffman(const TopDomainEntries& entries) {
HuffmanBuilder huffman_builder;
for (const auto& entry : entries) {
for (const auto& c : entry->skeleton) {
huffman_builder.RecordUsage(c);
}
for (const auto& c : entry->top_domain) {
huffman_builder.RecordUsage(c);
}
huffman_builder.RecordUsage(net::huffman_trie::kTerminalValue);
huffman_builder.RecordUsage(net::huffman_trie::kEndOfTableValue);
}
return huffman_builder.ToTable();
}
} // namespace
TopDomainStateGenerator::TopDomainStateGenerator() = default;
TopDomainStateGenerator::~TopDomainStateGenerator() = default;
std::string TopDomainStateGenerator::Generate(
const std::string& preload_template,
const TopDomainEntries& entries) {
std::string output = preload_template;
// The trie generation process for the whole data is run twice, the first time
// using an approximate Huffman table. During this first run, the correct
// character frequencies are collected which are then used to calculate the
// most space efficient Huffman table for the given inputs. This table is used
// for the second run.
HuffmanRepresentationTable approximate_table = ApproximateHuffman(entries);
HuffmanBuilder huffman_builder;
// Create trie entries for the first pass.
std::vector<std::unique_ptr<TopDomainTrieEntry>> trie_entries;
std::vector<net::huffman_trie::TrieEntry*> raw_trie_entries;
for (const auto& entry : entries) {
auto trie_entry = std::make_unique<TopDomainTrieEntry>(
approximate_table, &huffman_builder, entry.get());
raw_trie_entries.push_back(trie_entry.get());
trie_entries.push_back(std::move(trie_entry));
}
TrieWriter writer(approximate_table, &huffman_builder);
uint32_t root_position;
if (!writer.WriteEntries(raw_trie_entries, &root_position))
return std::string();
HuffmanRepresentationTable optimal_table = huffman_builder.ToTable();
TrieWriter new_writer(optimal_table, &huffman_builder);
// Create trie entries using the optimal table for the second pass.
raw_trie_entries.clear();
trie_entries.clear();
for (const auto& entry : entries) {
auto trie_entry = std::make_unique<TopDomainTrieEntry>(
optimal_table, &huffman_builder, entry.get());
raw_trie_entries.push_back(trie_entry.get());
trie_entries.push_back(std::move(trie_entry));
}
if (!new_writer.WriteEntries(raw_trie_entries, &root_position))
return std::string();
uint32_t new_length = new_writer.position();
std::vector<uint8_t> huffman_tree = huffman_builder.ToVector();
new_writer.Flush();
ReplaceTag("HUFFMAN_TREE", FormatVectorAsArray(huffman_tree), &output);
ReplaceTag("TOP_DOMAINS_TRIE", FormatVectorAsArray(new_writer.bytes()),
&output);
ReplaceTag("TOP_DOMAINS_TRIE_BITS", base::NumberToString(new_length),
&output);
ReplaceTag("TOP_DOMAINS_TRIE_ROOT", base::NumberToString(root_position),
&output);
return output;
}
} // namespace top_domains
} // namespace url_formatter