// blob: ce15f72aba77fb52b4cd55be706adabeb20e2650 [file] [log] [blame]
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is the Mork Reader.
*
* The Initial Developer of the Original Code is
* Google Inc.
* Portions created by the Initial Developer are Copyright (C) 2006
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Brian Ryner <bryner@brianryner.com> (original author)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
// Source:
// http://mxr.mozilla.org/firefox/source/db/morkreader/nsMorkReader.cpp
// This file has been converted to google style.
#include "chrome/browser/importer/mork_reader.h"
#include <algorithm>
#include "base/file_path.h"
#include "base/i18n/icu_string_conversions.h"
#include "base/logging.h"
#include "base/message_loop.h"
#include "base/string_number_conversions.h"
#include "base/string_util.h"
#include "chrome/browser/history/history_types.h"
#include "chrome/browser/importer/firefox_importer_utils.h"
#include "chrome/browser/importer/importer_bridge.h"
namespace {

// Converts a single upper-case hexadecimal digit ('0'-'9', 'A'-'F') to its
// numeric value. Returns -1 for any other character; lower-case hex digits
// are not accepted.
inline int HexCharToInt(char c) {
  if ('0' <= c && c <= '9')
    return c - '0';
  if ('A' <= c && c <= 'F')
    return c - 'A' + 10;
  return -1;
}

// Unescapes a Mork value and returns the result.
//
// Mork uses $xx escaping (two upper-case hex digits) to encode non-ASCII
// bytes. Additionally, '$' and '\' are backslash-escaped.
//
// Malformed input is handled leniently:
//  - a trailing lone backslash is dropped;
//  - a '$' that is not followed by at least two more characters is dropped
//    and the remaining characters are copied as regular characters;
//  - a '$' followed by two characters that are not both valid hex digits
//    consumes those two characters and emits nothing.
std::string MorkUnescape(const std::string& input) {
  // We optimize for speed over space here -- size the result buffer to
  // the size of the source, which is an upper bound on the size of the
  // unescaped string.
  std::string result;
  const size_t input_length = input.size();
  result.reserve(input_length);

  for (size_t i = 0; i < input_length; i++) {
    const char c = input[i];
    if (c == '\\') {
      // Escaped literal, skip the backslash, append the next character.
      i++;
      if (i < input_length)
        result.push_back(input[i]);
    } else if (c == '$') {
      // Dollar sign denotes a hex-encoded byte. The check is written as
      // |i + 2 < input_length| rather than |i < input_length - 2| because the
      // latter wraps around for inputs shorter than two characters and would
      // let us read past the end of |input|.
      if (i + 2 < input_length) {
        const int first = HexCharToInt(input[++i]);
        const int second = HexCharToInt(input[++i]);
        if (first >= 0 && second >= 0)
          result.push_back(static_cast<char>((first << 4) | second));
      }
    } else {
      // Regular character, just append.
      result.push_back(c);
    }
  }
  return result;
}

}  // namespace
// Constructs an empty reader. Nothing is opened or parsed here; all members
// are default-constructed and the file is only touched by Read().
MorkReader::MorkReader() {
}
MorkReader::~MorkReader() {
  // |table_| stores raw pointers to the per-row value lists, so every mapped
  // value has to be released by hand before the map itself goes away.
  RowMap::iterator row = table_.begin();
  while (row != table_.end()) {
    delete row->second;
    ++row;
  }
}
bool MorkReader::Read(const FilePath& path) {
stream_.open(path.value().c_str());
if (!stream_.is_open())
return false;
std::string line;
if (!ReadLine(&line) ||
line.compare("// <!-- <mdb:mork:z v=\"1.4\"/> -->") != 0)
return false; // Unexpected file format.
IndexMap column_map;
while (ReadLine(&line)) {
// Trim off leading spaces
size_t idx = 0;
size_t len = line.size();
while (idx < len && line[idx] == ' ')
++idx;
if (idx >= len)
continue;
// Look at the line to figure out what section type this is
if (StartsWithASCII(&line[idx], "< <(a=c)>", true)) {
// Column map. We begin by creating a hash of column id to column name.
StringMap column_name_map;
ParseMap(line, idx, &column_name_map);
// Now that we have the list of columns, we put them into a flat array.
// Rows will have value arrays of the same size, with indexes that
// correspond to the columns array. As we insert each column into the
// array, we also make an entry in columnMap so that we can look up the
// index given the column id.
columns_.reserve(column_name_map.size());
for (StringMap::const_iterator i = column_name_map.begin();
i != column_name_map.end(); ++i) {
column_map[i->first] = static_cast<int>(columns_.size());
MorkColumn col(i->first, i->second);
columns_.push_back(col);
}
} else if (StartsWithASCII(&line[idx], "<(", true)) {
// Value map.
ParseMap(line, idx, &value_map_);
} else if (line[idx] == '{' || line[idx] == '[') {
// Table / table row.
ParseTable(line, idx, &column_map);
} else {
// Don't know, hopefully don't care.
}
}
return true;
}
// Parses a key/value map of the form
//   <(k1=v1)(k2=v2)...>
// into |map|. Parsing starts at |start_index| within |first_line| and keeps
// pulling further lines from the stream (each parsed from its beginning)
// until the closing '>' is found. Values are run through MorkUnescape();
// keys are stored verbatim.
//
// Returns true once the terminating '>' has been seen, false if the input
// ran out before the map was terminated.
bool MorkReader::ParseMap(const std::string& first_line,
                          size_t start_index,
                          StringMap* map) {
  std::string line(first_line);

  // If the first line is the a=c line (column map), just skip over it. The
  // check is made at |start_index| so that it agrees with the caller, which
  // classifies the line after trimming leading spaces. The line that follows
  // is parsed from its first character.
  if (start_index < line.size() &&
      StartsWithASCII(&line[start_index], "< <(a=c)>", true)) {
    if (!ReadLine(&line))
      return false;
    start_index = 0;
  }

  // Key of the pair currently being parsed; empty when between pairs.
  std::string key;
  do {
    size_t idx = start_index;
    const size_t len = line.size();
    while (idx < len) {
      switch (line[idx++]) {
        case '(': {
          // Beginning of a key/value pair.
          if (!key.empty()) {
            DLOG(WARNING) << "unterminated key/value pair?";
            key.clear();
          }
          const size_t token_start = idx;
          while (idx < len && line[idx] != '=')
            ++idx;
          key.assign(line, token_start, idx - token_start);
          break;
        }
        case '=': {
          // Beginning of the value.
          if (key.empty()) {
            DLOG(WARNING) << "stray value";
            break;
          }
          const size_t token_start = idx;
          while (idx < len && line[idx] != ')') {
            if (line[idx] == '\\')
              ++idx;  // Skip escaped ')' characters.
            ++idx;
          }
          // A trailing backslash can push |idx| one past the end of the line.
          const size_t token_end = std::min(idx, len);
          ++idx;  // Step over the closing ')'.
          (*map)[key] =
              MorkUnescape(line.substr(token_start, token_end - token_start));
          key.clear();
          break;
        }
        case '>':
          // End of the map. A key that is still pending here means the map
          // was closed in the middle of a pair.
          DLOG_IF(WARNING, !key.empty()) <<
              "map terminates inside of key/value pair";
          return true;
      }
    }
    // We should start reading the next line at the beginning.
    start_index = 0;
  } while (ReadLine(&line));

  // We ran out of lines and the map never terminated. This probably indicates
  // a parsing error.
  DLOG(WARNING) << "didn't find end of key/value map";
  return false;
}
// Parses a table row of the form [123(^45^67)..]
// (row id 123 has the value with id 67 for the column with id 45).
// A '^' prefix for a column or value references an entry in the column or
// value map. '=' is used as the separator when the value is a literal.
//
// Parsing starts at |start_index| within |first_line|. While a row is still
// open at the end of a line, further lines are pulled from the stream and
// parsed from their beginning. |column_map| translates a column id into the
// index of that column within a row's value list. Cell values are stored
// together with their leading '^' or '=' marker; NormalizeValue() resolves
// them later. Cells that are malformed or refer to unknown columns are
// logged and skipped.
void MorkReader::ParseTable(const std::string& first_line,
                            size_t start_index,
                            const IndexMap* column_map) {
  std::string line(first_line);

  // Column index of the cell we're parsing, minus one if invalid.
  int column_index = -1;

  // Points to the current row we're parsing inside of the |table_|, will be
  // NULL if we're not inside a row.
  ColumnDataList* current_row = NULL;

  bool in_meta_row = false;

  do {
    size_t idx = start_index;
    const size_t len = line.size();
    while (idx < len) {
      switch (line[idx++]) {
        case '{':
          // This marks the beginning of a table section. There's a lot of
          // junk before the first row that looks like cell values but isn't.
          // Skip to the first '['.
          while (idx < len && line[idx] != '[') {
            if (line[idx] == '{') {
              in_meta_row = true;  // The meta row is enclosed in { }
            } else if (line[idx] == '}') {
              in_meta_row = false;
            }
            ++idx;
          }
          break;

        case '[': {
          // Start of a new row. Consume the row id, up to the first '('.
          // Row edits also have a table namespace, separated from the row id
          // by a colon. We don't make use of the namespace, but we need to
          // make sure not to consider it part of the row id.
          if (current_row) {
            DLOG(WARNING) << "unterminated row?";
            current_row = NULL;
          }

          // Check for a '-' at the start of the id. This signifies that
          // if the row already exists, we should delete all columns from it
          // before adding the new values.
          bool cut_columns = false;
          if (idx < len && line[idx] == '-') {
            cut_columns = true;
            ++idx;
          }

          // Locate the range of the ID.
          const size_t token_start = idx;  // Index of the token's first char.
          while (idx < len &&
                 line[idx] != '(' &&
                 line[idx] != ']' &&
                 line[idx] != ':') {
            ++idx;
          }
          const size_t token_end = idx;  // Index of the char after the token.

          // Skip over the table namespace, if there is one.
          while (idx < len && line[idx] != '(' && line[idx] != ']')
            ++idx;

          if (in_meta_row) {
            // Need to create the meta row.
            meta_row_.resize(columns_.size());
            current_row = &meta_row_;
          } else {
            // Find or create the regular row for this.
            IDString row_id(line.data() + token_start,
                            token_end - token_start);
            RowMap::iterator found_row = table_.find(row_id);
            if (found_row == table_.end()) {
              // We don't already have this row, create a new one for it.
              current_row = new ColumnDataList(columns_.size());
              table_[row_id] = current_row;
            } else {
              // The row already exists and we're adding/replacing things.
              current_row = found_row->second;
              // The row may have been created when fewer columns were known;
              // grow it so every known column index is addressable.
              if (current_row->size() < columns_.size())
                current_row->resize(columns_.size());
            }
          }

          if (cut_columns) {
            for (size_t i = 0; i < current_row->size(); ++i)
              (*current_row)[i].clear();
          }
          break;
        }

        case ']':
          // We're done with the row. Forget any column that was still waiting
          // for a value so it cannot leak into whatever follows.
          current_row = NULL;
          in_meta_row = false;
          column_index = -1;
          break;

        case '(': {
          if (!current_row) {
            DLOG(WARNING) << "cell value outside of row";
            break;
          }

          bool column_is_atom = false;
          if (idx < len && line[idx] == '^') {
            column_is_atom = true;
            ++idx;  // This is not part of the column id, advance past it.
          }

          const size_t token_start = idx;
          while (idx < len && line[idx] != '^' && line[idx] != '=') {
            if (line[idx] == '\\')
              ++idx;  // Skip escaped characters.
            ++idx;
          }
          // A trailing backslash can push |idx| one past the end of the line.
          const size_t token_end = std::min(idx, len);

          IDString column;
          if (column_is_atom) {
            column.assign(line.data() + token_start, token_end - token_start);
          } else {
            column = MorkUnescape(line.substr(token_start,
                                              token_end - token_start));
          }

          IndexMap::const_iterator found_column = column_map->find(column);
          if (found_column == column_map->end()) {
            DLOG(WARNING) << "Column not in column map, discarding it";
            column_index = -1;
          } else {
            column_index = found_column->second;
          }
          break;
        }

        case '=':
        case '^': {
          // A value is only meaningful when we are inside a row and a column
          // has just been parsed for it.
          if (column_index == -1 || !current_row) {
            DLOG(WARNING) << "stray ^ or = marker";
            column_index = -1;
            break;
          }

          const bool value_is_atom = (line[idx - 1] == '^');
          const size_t token_start = idx - 1;  // Include the '=' or '^' marker.
          while (idx < len && line[idx] != ')') {
            if (line[idx] == '\\')
              ++idx;  // Skip escaped characters.
            ++idx;
          }
          const size_t token_end = std::min(idx, len);
          ++idx;  // Step over the closing ')'.

          if (column_index >= 0 &&
              static_cast<size_t>(column_index) < current_row->size()) {
            if (value_is_atom) {
              (*current_row)[column_index].assign(&line[token_start],
                                                  token_end - token_start);
            } else {
              (*current_row)[column_index] = MorkUnescape(
                  line.substr(token_start, token_end - token_start));
            }
          } else {
            DLOG(WARNING) << "column index out of range for row, discarding";
          }
          column_index = -1;
          break;
        }
      }
    }

    // Start parsing the next line at the beginning.
    start_index = 0;
  } while (current_row && ReadLine(&line));
}
// Reads the next logical line from |stream_| into |line|, replacing its
// previous contents. A physical line that ends in a backslash is joined with
// the physical line that follows it (the backslash itself is removed), and
// this repeats for as long as the joined line still ends in a backslash.
//
// Returns true if a line was read. Returns false when nothing more can be
// read from the stream, or when a continuation was announced by a trailing
// backslash but the stream had no further line. A final line that is not
// terminated by a newline is still returned.
bool MorkReader::ReadLine(std::string* line) {
  // Capacity hint only; std::getline() discards the old contents anyway.
  line->reserve(256);

  // Testing the stream itself covers eof, fail and bad states alike, so a
  // stream that can no longer deliver data never reports success.
  if (!std::getline(stream_, *line))
    return false;

  while (!line->empty() && (*line)[line->size() - 1] == '\\') {
    // There is a continuation for this line. Read it and append.
    std::string continuation;
    if (!std::getline(stream_, continuation))
      return false;
    line->erase(line->size() - 1);
    line->append(continuation);
  }
  return true;
}
void MorkReader::NormalizeValue(std::string* value) const {
if (value->empty())
return;
MorkReader::StringMap::const_iterator i;
switch (value->at(0)) {
case '^':
// Hex ID, lookup the name for it in the |value_map_|.
i = value_map_.find(value->substr(1));
if (i == value_map_.end())
value->clear();
else
*value = i->second;
break;
case '=':
// Just use the literal after the equals sign.
value->erase(value->begin());
break;
default:
// Anything else is invalid.
value->clear();
break;
}
}
// Source:
// http://mxr.mozilla.org/firefox/source/toolkit/components/places/src/nsMorkHistoryImporter.cpp
// Columns for entry (non-meta) history rows.
// The enumerators are used as indexes into |gColumnNames| and into
// TableReadClosure::column_indexes, so their order must match the order of
// the names listed in |gColumnNames|.
enum {
kURLColumn,
kNameColumn,
kVisitCountColumn,
kHiddenColumn,
kTypedColumn,
kLastVisitColumn,
kColumnCount // Keep me last.
};
// Names of the Mork columns we import, indexed by the column enum
// (kURLColumn .. kLastVisitColumn). Must contain exactly kColumnCount
// entries, in enum order, because lookups iterate up to kColumnCount.
static const char * const gColumnNames[] = {
"URL", "Name", "VisitCount", "Hidden", "Typed", "LastVisitDate"
};
// State that is worked out once per import and then consulted for every row
// while history entries are being converted.
struct TableReadClosure {
  explicit TableReadClosure(const MorkReader& r)
      : reader(r),
        swap_bytes(false),
        byte_order_column(-1) {
    // Start out with none of the interesting columns located.
    std::fill(column_indexes, column_indexes + kColumnCount, -1);
  }

  // The reader whose table is being imported.
  const MorkReader& reader;

  // Whether we need to swap bytes (file format is other-endian).
  bool swap_bytes;

  // For each column we care about, its index within a row's value list, or
  // -1 while the column has not been found.
  int column_indexes[kColumnCount];

  // Index of the byte order column, or -1 if it has not been found.
  int byte_order_column;
};
// Converts one parsed Mork row into a history::URLRow and appends it to
// |rows|.
//
// |column_values| holds the raw cell values of the row, |data| supplies the
// reader used to normalize them, the indexes of the columns of interest and
// the byte order of the title. Nothing is appended when the row is marked
// hidden or when its URL is rejected by CanImportURL(). Columns that are
// unknown, or that lie beyond the end of this row, are treated as empty.
void AddToHistory(MorkReader::ColumnDataList* column_values,
                  const TableReadClosure& data,
                  history::URLRows* rows) {
  std::string values[kColumnCount];

  for (size_t i = 0; i < kColumnCount; ++i) {
    const int index = data.column_indexes[i];
    // Skip columns that were never located, and guard against rows that are
    // shorter than the column list instead of indexing out of range.
    if (index < 0 || static_cast<size_t>(index) >= column_values->size())
      continue;

    values[i] = (*column_values)[index];
    data.reader.NormalizeValue(&values[i]);

    // Do not import hidden records.
    if (i == kHiddenColumn && values[i] == "1")
      return;
  }

  GURL url(values[kURLColumn]);
  if (!CanImportURL(url))
    return;

  history::URLRow row(url);

  // The title is stored as raw UTF-16 bytes; |swap_bytes| selects which byte
  // order to decode it with.
  string16 title;
  if (data.swap_bytes) {
    base::CodepageToUTF16(values[kNameColumn], base::kCodepageUTF16BE,
                          base::OnStringConversionError::SKIP, &title);
  } else {
    base::CodepageToUTF16(values[kNameColumn], base::kCodepageUTF16LE,
                          base::OnStringConversionError::SKIP, &title);
  }
  row.set_title(title);

  // A missing, unparsable or non-positive count is recorded as one visit.
  int count = atoi(values[kVisitCountColumn].c_str());
  if (count < 1)
    count = 1;
  row.set_visit_count(count);

  // The conversion result is deliberately not checked so that a best-effort
  // value is still used; |date| starts at zero so that it is well defined
  // even if the conversion leaves it untouched.
  int64 date = 0;
  base::StringToInt64(values[kLastVisitColumn], &date);
  if (date != 0) {
    // NOTE(review): the division assumes the stored value is in
    // microseconds -- confirm against the Mork history format.
    row.set_last_visit(base::Time::FromTimeT(date / 1000000));
  }

  const bool is_typed = (values[kTypedColumn] == "1");
  if (is_typed)
    row.set_typed_count(1);

  rows->push_back(row);
}
// Imports browsing history from the Mork file |file|.
//
// The file is parsed with a MorkReader, the columns of interest are located,
// the byte order is taken from the table's meta row, and every table row is
// converted with AddToHistory(). If at least one row was produced, the
// result is handed to |bridge| tagged as history::SOURCE_FIREFOX_IMPORTED.
// Nothing is reported when the file cannot be read.
void ImportHistoryFromFirefox2(const FilePath& file, ImporterBridge* bridge) {
  MorkReader reader;
  if (!reader.Read(file))
    return;  // Unreadable or not a Mork file: there is nothing to import.

  // Gather up the column ids so we don't need to find them on each row.
  TableReadClosure data(reader);
  const MorkReader::MorkColumnList& columns = reader.columns();
  for (size_t i = 0; i < columns.size(); ++i) {
    for (int j = 0; j < kColumnCount; ++j) {
      if (columns[i].name == gColumnNames[j]) {
        data.column_indexes[j] = static_cast<int>(i);
        break;
      }
    }
    if (columns[i].name == "ByteOrder")
      data.byte_order_column = static_cast<int>(i);
  }

  // Determine the byte order from the table's meta-row. The index is checked
  // against the meta row's actual size, which also covers an empty meta row.
  const MorkReader::ColumnDataList& meta_row = reader.meta_row();
  if (data.byte_order_column >= 0 &&
      static_cast<size_t>(data.byte_order_column) < meta_row.size()) {
    std::string byte_order_value(meta_row[data.byte_order_column]);
    if (!byte_order_value.empty()) {
      // "BE" selects big-endian decoding of UTF-16 values. Any other value,
      // including "LE" and garbage, leaves |swap_bytes| false, which decodes
      // them as little-endian.
      reader.NormalizeValue(&byte_order_value);
      data.swap_bytes = (byte_order_value == "BE");
    }
  }

  history::URLRows rows;
  for (MorkReader::iterator i = reader.begin(); i != reader.end(); ++i)
    AddToHistory(i->second, data, &rows);

  if (!rows.empty())
    bridge->SetHistoryItems(rows, history::SOURCE_FIREFOX_IMPORTED);
}