| // Copyright 2014 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "components/password_manager/core/browser/import/csv_reader.h" |
| |
| #include <stddef.h> |
| |
| #include <algorithm> |
| #include <utility> |
| |
| #include "base/logging.h" |
| #include "base/macros.h" |
| #include "base/strings/string_util.h" |
| |
| namespace { |
| |
| // Returns all the characters from the start of |input| until the first '\n', |
| // '\r' (exclusive) or the end of |input|. Cuts the returned part (inclusive the |
| // line breaks) from |input|. Skips blocks of matching quotes. Examples: |
| // old input -> returned value, new input |
| // "ab\ncd" -> "ab", "cd" |
| // "\r\n" -> "", "\n" |
| // "abcd" -> "abcd", "" |
| // "a\"\n\"b" -> "a\"\n\"b", "" |
| base::StringPiece ConsumeLine(base::StringPiece* input) { |
| DCHECK(input); |
| DCHECK(!input->empty()); |
| |
| bool inside_quotes = false; |
| for (size_t current = 0; current < input->size(); ++current) { |
| switch ((*input)[current]) { |
| case '\n': |
| case '\r': |
| if (!inside_quotes) { |
| base::StringPiece ret(input->data(), current); |
| *input = input->substr(current + 1); |
| return ret; |
| } |
| break; |
| case '"': |
| inside_quotes = !inside_quotes; |
| break; |
| default: |
| break; |
| } |
| } |
| |
| // The whole |*input| is one line. |
| base::StringPiece ret = *input; |
| *input = base::StringPiece(); |
| return ret; |
| } |
| |
| // Created for a row (line) of comma-separated-values, iteratively returns |
| // individual fields. |
| class FieldParser { |
| public: |
| explicit FieldParser(base::StringPiece row); |
| ~FieldParser(); |
| |
| // Advances the parser over the next comma-separated field and writes its |
| // contents into |field_contents| (comma separator excluded, enclosing |
| // quotation marks excluded, if present). Returns true if there were no |
| // errors. The input must not be empty (check with HasMoreFields() before |
| // calling). |
| // TODO(crbug.com/918530): Also unescape the field contents. |
| bool NextField(base::StringPiece* field_contents); |
| |
| bool HasMoreFields() const { |
| return state_ != State::kError && position_ <= row_.size(); |
| } |
| |
| private: |
| enum class State { |
| // The state just before a new field begins. |
| kInit, |
| // The state after parsing a syntax error. |
| kError, |
| // When inside a non-escaped block. |
| kPlain, |
| // When inside a quotation-mark-escaped block. |
| kQuoted, |
| // When after reading a block starting and ending with quotation maks. For |
| // the following input, the state would be visited after reading characters |
| // 4 and 7: |
| // a,"b""c",d |
| // 0123456789 |
| kAfter, |
| }; |
| |
| // Returns the next character to be read and updates |position_|. |
| char ConsumeChar(); |
| |
| // Updates |state_| based on the next character to be read, according to this |
| // diagram (made with help of asciiflow.com): |
| // |
| // , |
| // +--+ +--------------------------+ |
| // | | | | |
| // +V--+--V+all but " or , +--------+| |
| // | +---------------> || |
| // | kInit | | kPlain || |
| // | <---------------+ || |
| // ++------+ , +^------++| |
| // | | | | |
| // "| +------+ | |
| // | all but , |, |
| // | | |
| // | | |
| // | +---------+ " +-------++ |
| // | | +----------> | |
| // +---> kQuoted | | kAfter | |
| // | <----------+ | |
| // +---------+ " +-----+--+ |
| // | |
| // +--------+ | |
| // | | | |
| // | kError <----------------+ |
| // | | all but " or , |
| // +--------+ |
| // |
| // The state kError has no outgoing transitions and so UpdateState should not |
| // be called when this state has been entered. |
| void UpdateState(); |
| |
| // State of the parser. |
| State state_ = State::kInit; |
| // The input. |
| const base::StringPiece row_; |
| // If |position_| is >=0 and < |row_.size()|, then it points at the character |
| // to be read next from |row_|. If it is equal to |row_.size()|, then it means |
| // a fake trailing "," will be read next. If it is |row_.size() + 1|, then |
| // reading is done. |
| size_t position_ = 0; |
| |
| DISALLOW_COPY_AND_ASSIGN(FieldParser); |
| }; |
| |
| FieldParser::FieldParser(base::StringPiece row) : row_(row) {} |
| |
| FieldParser::~FieldParser() = default; |
| |
| bool FieldParser::NextField(base::StringPiece* field_contents) { |
| DCHECK(HasMoreFields()); |
| |
| if (state_ != State::kInit) { |
| state_ = State::kError; |
| return false; |
| } |
| |
| const size_t start = position_; |
| do { |
| UpdateState(); |
| } while (state_ != State::kInit && state_ != State::kError); |
| |
| if (state_ != State::kError) { |
| DCHECK_GT(position_, start); // There must have been at least the ','. |
| *field_contents = |
| base::StringPiece(row_.data() + start, position_ - start - 1); |
| |
| if (field_contents->starts_with("\"")) { |
| DCHECK(field_contents->ends_with("\"")) << *field_contents; |
| DCHECK_GE(field_contents->size(), 2u); |
| field_contents->remove_prefix(1); |
| field_contents->remove_suffix(1); |
| } |
| return true; |
| } |
| return false; |
| } |
| |
| char FieldParser::ConsumeChar() { |
| DCHECK_LE(position_, row_.size()); |
| // The default character to return once all from |row_| are consumed and |
| // |position_| == |row_.size()|. |
| char ret = ','; |
| if (position_ < row_.size()) |
| ret = row_[position_]; |
| ++position_; |
| return ret; |
| } |
| |
| void FieldParser::UpdateState() { |
| if (position_ > row_.size()) { |
| // If in state |kInit| then the program attempts to read one field too many. |
| DCHECK_NE(state_, State::kInit); |
| // Otherwise a quotation mark was not matched before the end of input. |
| state_ = State::kError; |
| return; |
| } |
| |
| char read = ConsumeChar(); |
| switch (state_) { |
| case State::kInit: |
| switch (read) { |
| case ',': |
| break; |
| case '"': |
| state_ = State::kQuoted; |
| break; |
| default: |
| state_ = State::kPlain; |
| break; |
| } |
| break; |
| case State::kPlain: |
| switch (read) { |
| case ',': |
| state_ = State::kInit; |
| break; |
| default: |
| break; |
| } |
| break; |
| case State::kQuoted: |
| switch (read) { |
| case '"': |
| state_ = State::kAfter; |
| break; |
| default: |
| break; |
| } |
| break; |
| case State::kAfter: |
| switch (read) { |
| case ',': |
| state_ = State::kInit; |
| break; |
| case '"': |
| state_ = State::kQuoted; |
| break; |
| default: |
| state_ = State::kError; |
| break; |
| } |
| break; |
| case State::kError: |
| NOTREACHED(); |
| break; |
| } |
| } |
| |
| // Created for a string with potentially multiple rows of |
| // comma-separated-values, iteratively returns individual fields from row after |
| // row. |
| class CSVParser { |
| public: |
| explicit CSVParser(base::StringPiece csv); |
| ~CSVParser(); |
| |
| // Reads and unescapes values from the next row, and writes them to |fields|. |
| // Consumes the end-of-line terminator. Returns false on syntax error. The |
| // input must not be empty (check with HasMoreRows() before calling). |
| bool ParseNextCSVRow(std::vector<std::string>* fields); |
| |
| bool HasMoreRows() const { return !remaining_csv_piece_.empty(); } |
| |
| private: |
| base::StringPiece remaining_csv_piece_; |
| |
| DISALLOW_COPY_AND_ASSIGN(CSVParser); |
| }; |
| |
| CSVParser::CSVParser(base::StringPiece csv) : remaining_csv_piece_(csv) {} |
| |
| CSVParser::~CSVParser() = default; |
| |
| bool CSVParser::ParseNextCSVRow(std::vector<std::string>* fields) { |
| fields->clear(); |
| |
| DCHECK(HasMoreRows()); |
| FieldParser parser(ConsumeLine(&remaining_csv_piece_)); |
| base::StringPiece current_field; |
| while (parser.HasMoreFields()) { |
| if (!parser.NextField(¤t_field)) |
| return false; |
| // TODO(crbug.com/918530): Unescape the field contents in-place, as part of |
| // NextField(). |
| std::string field_copy(current_field); |
| base::ReplaceSubstringsAfterOffset(&field_copy, 0, "\"\"", "\""); |
| fields->push_back(std::move(field_copy)); |
| } |
| return true; |
| } |
| |
| } // namespace |
| |
| namespace password_manager { |
| |
| CSVTable::CSVTable() = default; |
| |
| CSVTable::~CSVTable() = default; |
| |
| bool CSVTable::ReadCSV(base::StringPiece csv) { |
| records_.clear(); |
| column_names_.clear(); |
| |
| // Normalize EOL sequences so that we uniformly use a single LF character. |
| std::string normalized_csv(csv); |
| base::ReplaceSubstringsAfterOffset(&normalized_csv, 0, "\r\n", "\n"); |
| |
| // Read header row. |
| CSVParser parser(normalized_csv); |
| if (!parser.HasMoreRows()) { |
| // The empty CSV is a special case. It can be seen as having one row, with a |
| // single field, which is an empty string. |
| column_names_.emplace_back(); |
| return true; |
| } |
| if (!parser.ParseNextCSVRow(&column_names_)) |
| return false; |
| |
| // Reader data records rows. |
| std::vector<std::string> fields; |
| while (parser.HasMoreRows()) { |
| if (!parser.ParseNextCSVRow(&fields)) |
| return false; |
| |
| std::map<base::StringPiece, std::string> row_map; |
| const size_t available_columns = |
| std::min(column_names_.size(), fields.size()); |
| for (size_t i = 0; i < available_columns; ++i) { |
| row_map[column_names_[i]] = std::move(fields[i]); |
| } |
| records_.push_back(std::move(row_map)); |
| } |
| |
| return true; |
| } |
| |
| } // namespace password_manager |