extensions/browser/api/web_request/form_data_parser.cc - chromium/src.git - Git at Google

 // Copyright 2012 The Chromium Authors
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "extensions/browser/api/web_request/form_data_parser.h"

 #include <stddef.h>

 #include <array>
 #include <memory>
 #include <string>
 #include <string_view>
 #include <utility>
 #include <vector>

 #include "base/check.h"
 #include "base/containers/to_vector.h"
 #include "base/memory/raw_ptr.h"
 #include "base/no_destructor.h"
 #include "base/notreached.h"
 #include "base/strings/escape.h"
 #include "base/strings/string_util.h"
 #include "base/types/optional_util.h"
 #include "base/values.h"
 #include "extensions/buildflags/buildflags.h"
 #include "net/http/http_request_headers.h"
 #include "third_party/re2/src/re2/re2.h"

 static_assert(BUILDFLAG(ENABLE_EXTENSIONS_CORE));

 using re2::RE2;

 namespace extensions {

 namespace {

 // Lower-case name of the Content-Disposition header, including the trailing
 // colon. It is embedded in a case-insensitive regular expression (see
 // Patterns::content_disposition_pattern).
 const char kContentDisposition[] = "content-disposition:";
 // Length of |kContentDisposition| without the terminating NUL.
 const size_t kContentDispositionLength = std::size(kContentDisposition) - 1;
 // kCharacterPattern is an allowed character in a URL encoding. Definition is
 // from RFC 1738, end of section 2.2.
 const char kCharacterPattern[] =
     "(?:[a-zA-Z0-9$_.+!*'(),]|-|(?:%[a-fA-F0-9]{2}))";
 // Line terminator used throughout the multipart grammar.
 const char kCRLF[] = "\r\n";
 // Exact header line which marks the data of a multipart body part as binary.
 // It is compared as a literal prefix, i.e. case-sensitively.
 const char kContentTypeOctetString[] =
     "Content-Type: application/octet-stream\r\n";

 // A wrapper struct for static RE2 objects to be held as a singleton.
 // All members are compiled once by the constructor and are immutable
 // afterwards. Obtain the shared instance through GetPatterns().
 struct Patterns {
   Patterns();
   // Patterns is only instantiated as a NoDestructor static local instance, so
   // the destructor is never called.
   ~Patterns() = delete;
   // Optional linear whitespace followed by CRLF.
   const RE2 transfer_padding_pattern;
   // The "--" which turns a dash-boundary into a close-delimiter, plus padding.
   const RE2 closing_pattern;
   // Either nothing, or CRLF followed by arbitrary bytes.
   const RE2 epilogue_pattern;
   // A run of bytes which contains no CRLF sequence.
   const RE2 crlf_free_pattern;
   // A non-greedy, non-empty run of characters of the preamble.
   const RE2 preamble_pattern;
   // One complete (possibly folded) MIME header, including its final CRLF.
   const RE2 header_pattern;
   // Case-insensitive "content-disposition:".
   const RE2 content_disposition_pattern;
   // The name="..." field; the quoted text is captured.
   const RE2 name_pattern;
   // The filename="..." field; the quoted text is captured.
   const RE2 value_pattern;
   // One "name=value" pair of a URL-encoded form; both sides are captured.
   const RE2 url_encoded_pattern;
 };

 // Compiles every regular expression used by the parsers in this file.
 Patterns::Patterns()
     // Any number of spaces/tabs, then CRLF.
     : transfer_padding_pattern("[ \\t]*\\r\\n"),
       // "--", then any number of spaces/tabs.
       closing_pattern("--[ \\t]*"),
       // The empty string, or CRLF followed by anything (newlines included).
       epilogue_pattern("|\\r\\n(?s:.)*"),
       // Non-CR bytes, or runs of CR followed by a byte that is neither CR nor
       // LF; i.e. text without a CRLF in it.
       crlf_free_pattern("(?:[^\\r]|\\r+[^\\r\\n])*"),
       // One or more characters, matched non-greedily.
       preamble_pattern(".+?"),
       // Field name made of printable ASCII except ':', a colon, then a value
       // which may be folded with CRLF + tab/space, terminated by CRLF.
       header_pattern("[!-9;-~]+:(.|\\r\\n[\\t ])*\\r\\n"),
       // |kContentDisposition| wrapped into a case-insensitive group.
       content_disposition_pattern(std::string("(?i:") + kContentDisposition +
                                   ")"),
       // name="<captured>" starting at a word boundary.
       name_pattern("\\bname=\"([^\"]*)\""),
       // filename="<captured>" starting at a word boundary.
       value_pattern("\\bfilename=\"([^\"]*)\""),
       // (<name chars>)=(<value chars>), either side may be empty.
       url_encoded_pattern(std::string("(") + kCharacterPattern + "*)=(" +
                           kCharacterPattern + "*)") {}

 // Returns the shared Patterns instance. It is a function-local static held in
 // a base::NoDestructor, so it is constructed on first use and never destroyed.
 const Patterns* GetPatterns() {
   static base::NoDestructor<Patterns> shared_patterns;
   return shared_patterns.get();
 }

 // If |*str| begins with |prefix|, strips that prefix from |*str| and returns
 // true. Otherwise leaves |*str| untouched and returns false.
 bool ConsumePrefix(std::string_view* str, std::string_view prefix) {
   const bool has_prefix = str->substr(0, prefix.size()) == prefix;
   if (has_prefix) {
     str->remove_prefix(prefix.size());
   }
   return has_prefix;
 }

 }  // namespace

 // Parses URLencoded forms, see
 // http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 .
 class FormDataParserUrlEncoded : public FormDataParser {
  public:
   FormDataParserUrlEncoded();

   FormDataParserUrlEncoded(const FormDataParserUrlEncoded&) = delete;
   FormDataParserUrlEncoded& operator=(const FormDataParserUrlEncoded&) = delete;

   ~FormDataParserUrlEncoded() override;

   // Implementation of FormDataParser.
   // True once a source was set, it has been consumed completely and no
   // malformed input was seen.
   bool AllDataReadOK() override;
   // Reads the next "name=value" pair into |result|. Returns false if there is
   // no further pair or the input is malformed.
   bool GetNextNameValue(Result* result) override;
   // Sets the only source of this parser; a second call returns false.
   bool SetSource(std::string_view source) override;

  private:
   // Returns the pattern to match a single name-value pair. This could be even
   // static, but then we would have to spend more code on initializing the
   // cached pointer to `GetPatterns()`.
   const RE2& pattern() const { return patterns_->url_encoded_pattern; }

   // Auxiliary constant for using RE2. Number of arguments for parsing
   // name-value pairs (one for name, one for value).
   static const size_t args_size_ = 2u;

   // The not yet consumed rest of the input. This is a view: the bytes are
   // owned by the caller of SetSource().
   std::string_view source_;
   // Whether SetSource() has been called.
   bool source_set_;
   // Set when a '&' is missing between two name-value pairs.
   bool source_malformed_;

   // Auxiliary store for using RE2.
   // |name_| and |value_| receive the two captured groups; |arg_name_| and
   // |arg_value_| wrap pointers to them and must therefore be declared after
   // them. |args_| is the array handed to RE2::ConsumeN().
   std::string name_;
   std::string value_;
   const RE2::Arg arg_name_;
   const RE2::Arg arg_value_;
   std::array<const RE2::Arg*, args_size_> args_;

   // Caching the pointer to `GetPatterns()`.
   raw_ptr<const Patterns> patterns_;
 };

 // The following class, FormDataParserMultipart, parses forms encoded as
 // multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart
 // encoding) and 5322 (MIME-headers).
 //
 // Implementation details
 //
 // The original grammar from RFC 2046 is this, "multipart-body" being the root
 // non-terminal:
 //
 // boundary := 0*69<bchars> bcharsnospace
 // bchars := bcharsnospace / " "
 // bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / ","
 //                  / "-" / "." / "/" / ":" / "=" / "?"
 // dash-boundary := "--" boundary
 // multipart-body := [preamble CRLF]
 //                        dash-boundary transport-padding CRLF
 //                        body-part *encapsulation
 //                        close-delimiter transport-padding
 //                        [CRLF epilogue]
 // transport-padding := *LWSP-char
 // encapsulation := delimiter transport-padding CRLF body-part
 // delimiter := CRLF dash-boundary
 // close-delimiter := delimiter "--"
 // preamble := discard-text
 // epilogue := discard-text
 // discard-text := *(*text CRLF) *text
 // body-part := MIME-part-headers [CRLF *OCTET]
 // OCTET := <any 0-255 octet value>
 //
 // Uppercase non-terminals are defined in RFC 5234, Appendix B.1; i.e. CRLF,
 // DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters of the
 // English alphabet, respectively.
 // The non-terminal "text" is presumably just any text, excluding line breaks.
 // The non-terminal "LWSP-char" is not directly defined in the original grammar
 // but it means "linear whitespace", which is a space or a horizontal tab.
 // The non-terminal "MIME-part-headers" is not discussed in RFC 2046, so we use
 // the syntax for "optional fields" from Section 3.6.8 of RFC 5322:
 //
 // MIME-part-headers := field-name ":" unstructured CRLF
 // field-name := 1*ftext
 // ftext := %d33-57 /          ; Printable US-ASCII
 //          %d59-126           ;  characters not including ":".
 // Based on Section 2.2.1 of RFC 5322, "unstructured" matches any string which
 // does not contain a CRLF sub-string, except for substrings "CRLF<space>" and
 // "CRLF<horizontal tab>", which serve for "folding".
 //
 // The FormDataParserMultipart class reads the input source and tries to parse
 // it according to the grammar above, rooted at the "multipart-body"
 // non-terminal. This happens in stages:
 //
 // 1. The optional preamble and the initial dash-boundary with transport padding
 // and a CRLF are read and ignored.
 //
 // 2. Repeatedly each body part is read. The body parts can either serve to
 //    upload a file, or just a string of bytes.
 // 2.a. The headers of that part are searched for the "content-disposition"
 //      header, which contains the name of the value represented by that body
 //      part. If the body-part is for file upload, that header also contains a
 //      filename.
 // 2.b. The "*OCTET" part of the body part is then read and passed as the value
 //      of the name-value pair for body parts representing a string of bytes.
 //      For body parts for uploading a file the "*OCTET" part is just ignored
 //      and the filename is used for value instead.
 //
 // 3. The final close-delimiter and epilogue are read and ignored.
 //
 // IMPORTANT NOTE
 // This parser supports sources split into multiple chunks. Therefore SetSource
 // can be called multiple times if the source is spread over several chunks.
 // However, the split may only occur inside a body part, right after the
 // trailing CRLF of headers.
 class FormDataParserMultipart : public FormDataParser {
  public:
   // |boundary_separator| is the boundary without the leading "--".
   explicit FormDataParserMultipart(const std::string& boundary_separator);

   FormDataParserMultipart(const FormDataParserMultipart&) = delete;
   FormDataParserMultipart& operator=(const FormDataParserMultipart&) = delete;

   ~FormDataParserMultipart() override;

   // Implementation of FormDataParser.
   // True iff the close-delimiter has been read, i.e. the state is
   // STATE_FINISHED.
   bool AllDataReadOK() override;
   // Reads one body part and stores its name and value in |result|.
   bool GetNextNameValue(Result* result) override;
   // Supplies the next chunk of input. Fails if the previous chunk has not been
   // consumed yet or if the parser cannot accept more input.
   bool SetSource(std::string_view source) override;

  private:
   enum State {
     STATE_INIT,      // No input read yet.
     STATE_READY,     // Ready to call GetNextNameValue.
     STATE_FINISHED,  // Read the input until the end.
     STATE_SUSPEND,   // Waiting until a new |source_| is set.
     STATE_ERROR      // The input did not match the grammar.
   };

   // Tests whether |input| has a prefix matching |pattern|.
   static bool StartsWithPattern(std::string_view input, const RE2& pattern);

   // If |source_| starts with a header, seeks |source_| beyond the header. If
   // the header is Content-Disposition, extracts |name| from "name=" and
   // possibly |value| from "filename=" fields of that header. Only if the
   // "name" or "filename" fields are found, then |name| or |value| are touched.
   // Returns true iff |source_| is seeked forward. Sets |value_assigned|
   // to true iff |value| has been assigned to. Sets |value_is_binary| to true if
   // header has content-type: application/octet-stream.
   bool TryReadHeader(std::string_view* name,
                      std::string_view* value,
                      bool* value_assigned,
                      bool* value_is_binary);

   // Helper to GetNextNameValue. Expects that the input starts with a data
   // portion of a body part. An attempt is made to read the input until the end
   // of that body part. If |data| is not NULL, it is set to contain the data
   // portion. Returns true iff the reading was successful.
   bool FinishReadingPart(std::string_view* data);

   // These methods could be even static, but then we would have to spend more
   // code on initializing the cached pointer to `GetPatterns()`.
   const RE2& transfer_padding_pattern() const {
     return patterns_->transfer_padding_pattern;
   }
   const RE2& closing_pattern() const {
     return patterns_->closing_pattern;
   }
   const RE2& epilogue_pattern() const {
     return patterns_->epilogue_pattern;
   }
   const RE2& crlf_free_pattern() const {
     return patterns_->crlf_free_pattern;
   }
   const RE2& preamble_pattern() const {
     return patterns_->preamble_pattern;
   }
   const RE2& header_pattern() const {
     return patterns_->header_pattern;
   }
   const RE2& content_disposition_pattern() const {
     return patterns_->content_disposition_pattern;
   }
   const RE2& name_pattern() const {
     return patterns_->name_pattern;
   }
   const RE2& value_pattern() const {
     return patterns_->value_pattern;
   }

   // The dash-boundary of the grammar: "--" followed by the boundary.
   std::string dash_boundary_separator_;

   // Current position in the state machine described by |State|. The
   // constructor sets it to STATE_INIT.
   // NOTE(review): this comment used to claim an initialisation dependency on
   // |dash_boundary_pattern_|; no member of that name exists in this class.
   State state_;

   // The parsed message can be split into multiple sources which we read
   // sequentially.
   std::string_view source_;

   // Caching the pointer to `GetPatterns()`.
   raw_ptr<const Patterns> patterns_;
 };

 // Result needs no custom construction or destruction; the special members
 // are defaulted here, out of line.
 FormDataParser::Result::Result() = default;
 FormDataParser::Result::~Result() = default;

 // Copies the bytes of |str| into a vector and stores them as the value.
 void FormDataParser::Result::SetBinaryValue(std::string_view str) {
   auto bytes = base::ToVector(str);
   value_ = base::Value(std::move(bytes));
 }

 void FormDataParser::Result::SetStringValue(std::string str) {
   value_ = base::Value(std::move(str));
 }

 // Defaulted out of line; the concrete parsers in this file override it.
 FormDataParser::~FormDataParser() = default;

 // Creates a parser matching the Content-Type header of |request_headers|.
 // A missing header is forwarded as nullptr to CreateFromContentTypeHeader().
 // static
 std::unique_ptr<FormDataParser> FormDataParser::Create(
     const net::HttpRequestHeaders& request_headers) {
   const auto content_type =
       request_headers.GetHeader(net::HttpRequestHeaders::kContentType);
   return CreateFromContentTypeHeader(base::OptionalToPtr(content_type));
 }

 // static
 std::unique_ptr<FormDataParser> FormDataParser::CreateFromContentTypeHeader(
     const std::string* content_type_header) {
   enum ParserChoice {URL_ENCODED, MULTIPART, ERROR_CHOICE};
   ParserChoice choice = ERROR_CHOICE;
   std::string boundary;

   if (content_type_header == nullptr) {
     choice = URL_ENCODED;
   } else {
     const std::string content_type(
         content_type_header->substr(0, content_type_header->find(';')));

     if (base::EqualsCaseInsensitiveASCII(content_type,
                                          "application/x-www-form-urlencoded")) {
       choice = URL_ENCODED;
     } else if (base::EqualsCaseInsensitiveASCII(content_type,
                                                 "multipart/form-data")) {
       static const char kBoundaryString[] = "boundary=";
       size_t offset = content_type_header->find(kBoundaryString);
       if (offset == std::string::npos) {
         // Malformed header.
         return nullptr;
       }
       offset += sizeof(kBoundaryString) - 1;
       boundary = content_type_header->substr(
           offset, content_type_header->find(';', offset));
       if (!boundary.empty()) {
         choice = MULTIPART;
       }
     }
   }
   // Other cases are unparseable, including when |content_type| is "text/plain".

   switch (choice) {
     case URL_ENCODED:
       return std::unique_ptr<FormDataParser>(new FormDataParserUrlEncoded());
     case MULTIPART:
       return std::unique_ptr<FormDataParser>(
           new FormDataParserMultipart(boundary));
     case ERROR_CHOICE:
       return nullptr;
   }
   NOTREACHED();  // Some compilers do not believe this is unreachable.
 }

 // Defaulted out of line; the base class needs no custom construction.
 FormDataParser::FormDataParser() = default;

 FormDataParserUrlEncoded::FormDataParserUrlEncoded()
     : source_set_(false),
       source_malformed_(false),
       arg_name_(&name_),
       arg_value_(&value_),
       patterns_(GetPatterns()) {
   args_[0] = &arg_name_;
   args_[1] = &arg_value_;
 }

 // Nothing to release explicitly: |source_| is a non-owning view.
 FormDataParserUrlEncoded::~FormDataParserUrlEncoded() = default;

 // Returns true iff a source was set, no malformed input was encountered and
 // the whole source has been consumed.
 bool FormDataParserUrlEncoded::AllDataReadOK() {
   if (!source_set_ || source_malformed_) {
     return false;
   }
   return source_.empty();
 }

 // Consumes one "name=value" pair from the front of |source_| and stores the
 // unescaped name and value in |result|. The value is stored as a string if it
 // is valid UTF-8 after unescaping, and as binary data otherwise. A '&'
 // following the pair is consumed as well; any other trailing character marks
 // the source as malformed. Returns true iff a pair was read and the source is
 // not malformed.
 bool FormDataParserUrlEncoded::GetNextNameValue(Result* result) {
   if (source_malformed_ || !source_set_) {
     return false;
   }

   const bool pair_read =
       RE2::ConsumeN(&source_, pattern(), args_.data(), args_size_);
   if (pair_read) {
     const base::UnescapeRule::Type kRules =
         base::UnescapeRule::REPLACE_PLUS_WITH_SPACE;

     std::string decoded_name =
         base::UnescapeBinaryURLComponent(name_, kRules);
     result->set_name(decoded_name);

     std::string decoded_value =
         base::UnescapeBinaryURLComponent(value_, kRules);
     if (!base::IsStringUTF8(decoded_value)) {
       result->SetBinaryValue(decoded_value);
     } else {
       result->SetStringValue(std::move(decoded_value));
     }
   }

   // Whatever follows must be the '&' separating this pair from the next one.
   if (!source_.empty()) {
     if (source_.front() != '&') {
       source_malformed_ = true;  // '&' missing between two name-value pairs.
       return false;
     }
     source_.remove_prefix(1);  // Remove the leading '&'.
   }
   return pair_read;
 }

 // Installs |source| as the input of this parser. Only the first call is
 // accepted; later calls return false and change nothing.
 bool FormDataParserUrlEncoded::SetSource(std::string_view source) {
   const bool is_first_source = !source_set_;
   if (is_first_source) {
     source_ = source;
     source_set_ = true;
     source_malformed_ = false;
   }
   // We do not allow multiple sources for this parser.
   return is_first_source;
 }

 // Returns true iff a match of |pattern| begins at the first byte of |input|.
 // Nothing is consumed and no groups are captured.
 // static
 bool FormDataParserMultipart::StartsWithPattern(std::string_view input,
                                                 const RE2& pattern) {
   const size_t search_begin = 0;
   const size_t search_end = input.size();
   return pattern.Match(input, search_begin, search_end, RE2::ANCHOR_START,
                        /*submatch=*/nullptr, /*nsubmatch=*/0);
 }

 // Stores the dash-boundary ("--" + |boundary_separator|) and starts in
 // STATE_INIT, i.e. before any input has been seen.
 FormDataParserMultipart::FormDataParserMultipart(
     const std::string& boundary_separator)
     : dash_boundary_separator_(std::string("--") + boundary_separator),
       state_(STATE_INIT),
       patterns_(GetPatterns()) {}

 // Nothing to release explicitly: |source_| is a non-owning view.
 FormDataParserMultipart::~FormDataParserMultipart() = default;

 // The input was read successfully iff the parser reached STATE_FINISHED.
 bool FormDataParserMultipart::AllDataReadOK() {
   const bool finished = (STATE_FINISHED == state_);
   return finished;
 }

 // Reads the data of the current body part up to and including the following
 // dash-boundary. If |data| is non-null it receives the data without the CRLF
 // that precedes the boundary; an empty data portion is then an error. After
 // the boundary either a close-delimiter (-> STATE_FINISHED) or the start of
 // the next body part is expected. Returns false and sets STATE_ERROR if the
 // input does not have this shape.
 bool FormDataParserMultipart::FinishReadingPart(std::string_view* data) {
   const std::string_view part_start = source_;

   // Skip CRLF-terminated lines until |source_| is positioned at a boundary.
   for (;;) {
     if (source_.starts_with(dash_boundary_separator_)) {
       break;
     }
     const bool line_consumed = RE2::Consume(&source_, crlf_free_pattern()) &&
                                ConsumePrefix(&source_, kCRLF);
     if (!line_consumed) {
       state_ = STATE_ERROR;
       return false;
     }
   }

   if (data != nullptr) {
     const size_t consumed = part_start.size() - source_.size();
     if (consumed == 0) {
       // No data in this body part.
       state_ = STATE_ERROR;
       return false;
     }
     // Every loop iteration above ends by consuming a CRLF, so the last two
     // consumed bytes are the "\r\n" in front of the boundary; leave them out.
     *data = part_start.substr(0, consumed - 2);
   }

   // Finally, read the dash-boundary and either skip to the next body part, or
   // finish reading the source.
   CHECK(ConsumePrefix(&source_, dash_boundary_separator_));
   if (!StartsWithPattern(source_, closing_pattern())) {
     // Next body part ahead.
     if (!RE2::Consume(&source_, transfer_padding_pattern())) {
       state_ = STATE_ERROR;
     }
     return state_ != STATE_ERROR;
   }

   CHECK(RE2::Consume(&source_, closing_pattern()));
   state_ = RE2::Consume(&source_, epilogue_pattern()) ? STATE_FINISHED
                                                       : STATE_ERROR;
   return state_ != STATE_ERROR;
 }

 // Reads one body part: its headers, the blank line, and its data. The name
 // comes from the Content-Disposition header. The value is the filename if the
 // header carries one, otherwise the data of the part (stored as binary if the
 // part was declared application/octet-stream). If a file part's data is not in
 // the current source, the parser switches to STATE_SUSPEND and waits for the
 // next SetSource() call.
 bool FormDataParserMultipart::GetNextNameValue(Result* result) {
   if (state_ != STATE_READY || source_.empty()) {
     return false;
   }

   // 1. Read body-part headers, accumulating what they say about the value.
   std::string_view name;
   std::string_view value;
   bool has_filename = false;
   bool is_binary = false;
   for (;;) {
     bool header_has_filename;
     bool header_is_binary;
     if (!TryReadHeader(&name, &value, &header_has_filename,
                        &header_is_binary)) {
       break;
     }
     has_filename = has_filename || header_has_filename;
     is_binary = is_binary || header_is_binary;
   }
   if (state_ == STATE_ERROR || name.empty()) {
     state_ = STATE_ERROR;
     return false;
   }

   // 2. Read the trailing CRLF after headers.
   if (!ConsumePrefix(&source_, kCRLF)) {
     state_ = STATE_ERROR;
     return false;
   }

   // 3. Read the data of this body part, i.e., everything until the first
   // dash-boundary.
   bool part_ok = true;
   if (has_filename && source_.empty()) {
     // Wait for a new source.
     state_ = STATE_SUSPEND;
   } else {
     part_ok = FinishReadingPart(has_filename ? nullptr : &value);
   }

   result->set_name(base::UnescapeBinaryURLComponent(name));
   if (is_binary && !has_filename) {
     result->SetBinaryValue(value);
   } else {
     // Either the filename or textual data of the part.
     result->SetStringValue(std::string(value));
   }

   return part_ok;
 }

 bool FormDataParserMultipart::SetSource(std::string_view source) {
   if (source.data() == nullptr || !source_.empty()) {
     return false;
   }
   source_ = source;

   switch (state_) {
     case STATE_INIT:
       // Seek behind the preamble.
       while (!source_.starts_with(dash_boundary_separator_)) {
         if (!RE2::Consume(&source_, preamble_pattern())) {
           state_ = STATE_ERROR;
           break;
         }
       }
       // Read dash-boundary, transfer padding, and CRLF.
       if (state_ != STATE_ERROR) {
         if (!ConsumePrefix(&source_, dash_boundary_separator_) ||
             !RE2::Consume(&source_, transfer_padding_pattern())) {
           state_ = STATE_ERROR;
         } else {
           state_ = STATE_READY;
         }
       }
       break;
     case STATE_READY:  // Nothing to do.
       break;
     case STATE_SUSPEND:
       state_ = FinishReadingPart(nullptr) ? STATE_READY : STATE_ERROR;
       break;
     default:
       state_ = STATE_ERROR;
   }
   return state_ != STATE_ERROR;
 }

 bool FormDataParserMultipart::TryReadHeader(std::string_view* name,
                                             std::string_view* value,
                                             bool* value_assigned,
                                             bool* value_is_binary) {
   *value_assigned = false;
   *value_is_binary = false;
   // Support Content-Type: application/octet-stream.
   // Form data with this content type is represented as string of bytes.
   if (ConsumePrefix(&source_, kContentTypeOctetString)) {
     *value_is_binary = true;
     return true;
   }
   const char* header_start = source_.data();
   if (!RE2::Consume(&source_, header_pattern())) {
     return false;
   }
   // (*) After this point we must return true, because we consumed one header.

   // Subtract 2 for the trailing "\r\n".
   std::string_view header(header_start, source_.data() - header_start - 2);

   if (!StartsWithPattern(header, content_disposition_pattern())) {
     return true;  // Skip headers that don't describe the content-disposition.
   }

   std::string_view groups[2];

   if (!name_pattern().Match(header,
                             kContentDispositionLength, header.size(),
                             RE2::UNANCHORED, groups, 2)) {
     state_ = STATE_ERROR;
     return true;  // See (*) for why true.
   }
   *name = groups[1];

   if (value_pattern().Match(header,
                             kContentDispositionLength, header.size(),
                             RE2::UNANCHORED, groups, 2)) {
     *value = groups[1];
     *value_assigned = true;
   }
   return true;
 }

 }  // namespace extensions
	// Copyright 2012 The Chromium Authors
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "extensions/browser/api/web_request/form_data_parser.h"

	#include <stddef.h>

	#include <string_view>
	#include <vector>

	#include "base/check.h"
	#include "base/containers/to_vector.h"
	#include "base/memory/raw_ptr.h"
	#include "base/no_destructor.h"
	#include "base/notreached.h"
	#include "base/strings/escape.h"
	#include "base/strings/string_util.h"
	#include "base/types/optional_util.h"
	#include "base/values.h"
	#include "extensions/buildflags/buildflags.h"
	#include "net/http/http_request_headers.h"
	#include "third_party/re2/src/re2/re2.h"

	static_assert(BUILDFLAG(ENABLE_EXTENSIONS_CORE));

	using re2::RE2;

	namespace extensions {

	namespace {

	// Lower-case name of the Content-Disposition header, including the trailing
	// colon.
	const char kContentDisposition[] = "content-disposition:";
	// Length of |kContentDisposition| without the terminating NUL.
	const size_t kContentDispositionLength = std::size(kContentDisposition) - 1;
	// kCharacterPattern is an allowed character in a URL encoding. Definition is
	// from RFC 1738, end of section 2.2.
	// The alternation uses a plain '|': "\|" is not a valid C++ escape sequence.
	const char kCharacterPattern[] =
	    "(?:[a-zA-Z0-9$_.+!*'(),]|-|(?:%[a-fA-F0-9]{2}))";
	// Line terminator used throughout the multipart grammar.
	const char kCRLF[] = "\r\n";
	// Exact header line which marks the data of a multipart body part as binary.
	const char kContentTypeOctetString[] =
	    "Content-Type: application/octet-stream\r\n";

	// A wrapper struct for static RE2 objects to be held as a singleton.
	// All members are compiled once by the constructor and are immutable
	// afterwards. Obtain the shared instance through GetPatterns().
	struct Patterns {
	Patterns();
	// Patterns is only instantiated as a NoDestructor static local instance, so
	// the destructor is never called.
	~Patterns() = delete;
	// Optional linear whitespace followed by CRLF.
	const RE2 transfer_padding_pattern;
	// The "--" which turns a dash-boundary into a close-delimiter, plus padding.
	const RE2 closing_pattern;
	// Either nothing, or CRLF followed by arbitrary bytes.
	const RE2 epilogue_pattern;
	// A run of bytes which contains no CRLF sequence.
	const RE2 crlf_free_pattern;
	// A non-greedy, non-empty run of characters of the preamble.
	const RE2 preamble_pattern;
	// One complete (possibly folded) MIME header, including its final CRLF.
	const RE2 header_pattern;
	// Case-insensitive "content-disposition:".
	const RE2 content_disposition_pattern;
	// The name="..." field; the quoted text is captured.
	const RE2 name_pattern;
	// The filename="..." field; the quoted text is captured.
	const RE2 value_pattern;
	// One "name=value" pair of a URL-encoded form; both sides are captured.
	const RE2 url_encoded_pattern;
	};

	// Compiles every regular expression used by the parsers in this file.
	// Alternations are written with a plain '|': "\|" is not a valid C++ escape
	// sequence.
	Patterns::Patterns()
	    // Any number of spaces/tabs, then CRLF.
	    : transfer_padding_pattern("[ \\t]*\\r\\n"),
	      // "--", then any number of spaces/tabs.
	      closing_pattern("--[ \\t]*"),
	      // The empty string, or CRLF followed by anything (newlines included).
	      epilogue_pattern("|\\r\\n(?s:.)*"),
	      // Non-CR bytes, or runs of CR followed by a byte that is neither CR
	      // nor LF; i.e. text without a CRLF in it.
	      crlf_free_pattern("(?:[^\\r]|\\r+[^\\r\\n])*"),
	      // One or more characters, matched non-greedily.
	      preamble_pattern(".+?"),
	      // Field name made of printable ASCII except ':', a colon, then a value
	      // which may be folded with CRLF + tab/space, terminated by CRLF.
	      header_pattern("[!-9;-~]+:(.|\\r\\n[\\t ])*\\r\\n"),
	      // |kContentDisposition| wrapped into a case-insensitive group.
	      content_disposition_pattern(std::string("(?i:") + kContentDisposition +
	                                  ")"),
	      // name="<captured>" starting at a word boundary.
	      name_pattern("\\bname=\"([^\"]*)\""),
	      // filename="<captured>" starting at a word boundary.
	      value_pattern("\\bfilename=\"([^\"]*)\""),
	      // (<name chars>)=(<value chars>), either side may be empty.
	      url_encoded_pattern(std::string("(") + kCharacterPattern + "*)=(" +
	                          kCharacterPattern + "*)") {}

	// Returns the shared Patterns instance. It is a function-local static held in
	// a base::NoDestructor, so it is constructed on first use and never destroyed.
	const Patterns* GetPatterns() {
	  static base::NoDestructor<Patterns> shared_patterns;
	  return shared_patterns.get();
	}

	// If |*str| begins with |prefix|, strips that prefix from |*str| and returns
	// true. Otherwise leaves |*str| untouched and returns false.
	bool ConsumePrefix(std::string_view* str, std::string_view prefix) {
	  const bool has_prefix = str->substr(0, prefix.size()) == prefix;
	  if (has_prefix) {
	    str->remove_prefix(prefix.size());
	  }
	  return has_prefix;
	}

	} // namespace

	// Parses URLencoded forms, see
	// http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 .
	class FormDataParserUrlEncoded : public FormDataParser {
	public:
	FormDataParserUrlEncoded();

	FormDataParserUrlEncoded(const FormDataParserUrlEncoded&) = delete;
	FormDataParserUrlEncoded& operator=(const FormDataParserUrlEncoded&) = delete;

	~FormDataParserUrlEncoded() override;

	// Implementation of FormDataParser.
	bool AllDataReadOK() override;
	bool GetNextNameValue(Result* result) override;
	bool SetSource(std::string_view source) override;

	private:
	// Returns the pattern to match a single name-value pair. This could be even
	// static, but then we would have to spend more code on initializing the
	// cached pointer to `GetPatterns()`.
	const RE2& pattern() const { return patterns_->url_encoded_pattern; }

	// Auxiliary constant for using RE2. Number of arguments for parsing
	// name-value pairs (one for name, one for value).
	static const size_t args_size_ = 2u;

	// The not yet consumed rest of the input; a non-owning view.
	std::string_view source_;
	// Whether a source has been set.
	bool source_set_;
	// Whether malformed input has been encountered.
	bool source_malformed_;

	// Auxiliary store for using RE2.
	// |name_| and |value_| receive the captured groups; |arg_name_| and
	// |arg_value_| wrap pointers to them and are declared after them.
	std::string name_;
	std::string value_;
	const RE2::Arg arg_name_;
	const RE2::Arg arg_value_;
	std::array<const RE2::Arg*, args_size_> args_;

	// Caching the pointer to `GetPatterns()`.
	raw_ptr<const Patterns> patterns_;
	};

	// The following class, FormDataParserMultipart, parses forms encoded as
	// multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart
	// encoding) and 5322 (MIME-headers).
	//
	// Implementation details
	//
	// The original grammar from RFC 2046 is this, "multipart-body" being the root
	// non-terminal:
	//
	// boundary := 0*69<bchars> bcharsnospace
	// bchars := bcharsnospace / " "
	// bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / ","
	// / "-" / "." / "/" / ":" / "=" / "?"
	// dash-boundary := "--" boundary
	// multipart-body := [preamble CRLF]
	// dash-boundary transport-padding CRLF
	// body-part *encapsulation
	// close-delimiter transport-padding
	// [CRLF epilogue]
	// transport-padding := *LWSP-char
	// encapsulation := delimiter transport-padding CRLF body-part
	// delimiter := CRLF dash-boundary
	// close-delimiter := delimiter "--"
	// preamble := discard-text
	// epilogue := discard-text
	// discard-text := *(*text CRLF) *text
	// body-part := MIME-part-headers [CRLF *OCTET]
	// OCTET := <any 0-255 octet value>
	//
	// Uppercase non-terminals are defined in RFC 5234, Appendix B.1; i.e. CRLF,
	// DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters of the
	// English alphabet, respectively.
	// The non-terminal "text" is presumably just any text, excluding line breaks.
	// The non-terminal "LWSP-char" is not directly defined in the original grammar
	// but it means "linear whitespace", which is a space or a horizontal tab.
	// The non-terminal "MIME-part-headers" is not discussed in RFC 2046, so we use
	// the syntax for "optional fields" from Section 3.6.8 of RFC 5322:
	//
	// MIME-part-headers := field-name ":" unstructured CRLF
	// field-name := 1*ftext
	// ftext := %d33-57 / ; Printable US-ASCII
	// %d59-126 ; characters not including ":".
	// Based on Section 2.2.1 of RFC 5322, "unstructured" matches any string which
	// does not contain a CRLF sub-string, except for substrings "CRLF<space>" and
	// "CRLF<horizontal tab>", which serve for "folding".
	//
	// The FormDataParserMultipart class reads the input source and tries to parse
	// it according to the grammar above, rooted at the "multipart-body"
	// non-terminal. This happens in stages:
	//
	// 1. The optional preamble and the initial dash-boundary with transport padding
	// and a CRLF are read and ignored.
	//
	// 2. Repeatedly each body part is read. The body parts can either serve to
	// upload a file, or just a string of bytes.
	// 2.a. The headers of that part are searched for the "content-disposition"
	// header, which contains the name of the value represented by that body
	// part. If the body-part is for file upload, that header also contains a
	// filename.
	// 2.b. The "*OCTET" part of the body part is then read and passed as the value
	// of the name-value pair for body parts representing a string of bytes.
	// For body parts for uploading a file the "*OCTET" part is just ignored
	// and the filename is used for value instead.
	//
	// 3. The final close-delimiter and epilogue are read and ignored.
	//
	// IMPORTANT NOTE
	// This parser supports sources split into multiple chunks. Therefore SetSource
	// can be called multiple times if the source is spread over several chunks.
	// However, the split may only occur inside a body part, right after the
	// trailing CRLF of headers.
	class FormDataParserMultipart : public FormDataParser {
	public:
	// |boundary_separator| is the boundary without the leading "--".
	explicit FormDataParserMultipart(const std::string& boundary_separator);

	FormDataParserMultipart(const FormDataParserMultipart&) = delete;
	FormDataParserMultipart& operator=(const FormDataParserMultipart&) = delete;

	~FormDataParserMultipart() override;

	// Implementation of FormDataParser.
	bool AllDataReadOK() override;
	bool GetNextNameValue(Result* result) override;
	bool SetSource(std::string_view source) override;

	private:
	enum State {
	STATE_INIT, // No input read yet.
	STATE_READY, // Ready to call GetNextNameValue.
	STATE_FINISHED, // Read the input until the end.
	STATE_SUSPEND, // Waiting until a new |source_| is set.
	STATE_ERROR
	};

	// Tests whether |input| has a prefix matching |pattern|.
	static bool StartsWithPattern(std::string_view input, const RE2& pattern);

	// If |source_| starts with a header, seeks |source_| beyond the header. If
	// the header is Content-Disposition, extracts |name| from "name=" and
	// possibly |value| from "filename=" fields of that header. Only if the
	// "name" or "filename" fields are found, then |name| or |value| are touched.
	// Returns true iff |source_| is seeked forward. Sets |value_assigned|
	// to true iff |value| has been assigned to. Sets |value_is_binary| to true if
	// header has content-type: application/octet-stream.
	bool TryReadHeader(std::string_view* name,
	std::string_view* value,
	bool* value_assigned,
	bool* value_is_binary);

	// Helper to GetNextNameValue. Expects that the input starts with a data
	// portion of a body part. An attempt is made to read the input until the end
	// of that body part. If |data| is not NULL, it is set to contain the data
	// portion. Returns true iff the reading was successful.
	bool FinishReadingPart(std::string_view* data);

	// These methods could be even static, but then we would have to spend more
	// code on initializing the cached pointer to `GetPatterns()`.
	const RE2& transfer_padding_pattern() const {
	return patterns_->transfer_padding_pattern;
	}
	const RE2& closing_pattern() const {
	return patterns_->closing_pattern;
	}
	const RE2& epilogue_pattern() const {
	return patterns_->epilogue_pattern;
	}
	const RE2& crlf_free_pattern() const {
	return patterns_->crlf_free_pattern;
	}
	const RE2& preamble_pattern() const {
	return patterns_->preamble_pattern;
	}
	const RE2& header_pattern() const {
	return patterns_->header_pattern;
	}
	const RE2& content_disposition_pattern() const {
	return patterns_->content_disposition_pattern;
	}
	const RE2& name_pattern() const {
	return patterns_->name_pattern;
	}
	const RE2& value_pattern() const {
	return patterns_->value_pattern;
	}

	// The dash-boundary of the grammar; presumably "--" followed by the boundary
	// passed to the constructor (the constructor body is not visible here).
	std::string dash_boundary_separator_;

	// Current position in the state machine described by |State|.
	// NOTE(review): this comment used to claim an initialisation dependency on
	// |dash_boundary_pattern_|; no member of that name exists in this class.
	State state_;

	// The parsed message can be split into multiple sources which we read
	// sequentially.
	std::string_view source_;

	// Caching the pointer to `GetPatterns()`.
	raw_ptr<const Patterns> patterns_;
	};

	// Result needs no custom construction or destruction; the special members
	// are defaulted here, out of line.
	FormDataParser::Result::Result() = default;
	FormDataParser::Result::~Result() = default;

	// Copies the bytes of |str| into a vector and stores them as the value.
	void FormDataParser::Result::SetBinaryValue(std::string_view str) {
	  auto bytes = base::ToVector(str);
	  value_ = base::Value(std::move(bytes));
	}

	void FormDataParser::Result::SetStringValue(std::string str) {
	value_ = base::Value(std::move(str));
	}

	// Out-of-line defaulted destructor.
	FormDataParser::~FormDataParser() = default;

	// Builds a parser for a request body, choosing the parser from the
	// Content-Type entry of |request_headers| (which may be absent). See
	// CreateFromContentTypeHeader() for the possible outcomes.
	// static
	std::unique_ptr<FormDataParser> FormDataParser::Create(
	    const net::HttpRequestHeaders& request_headers) {
	  // Keep the looked-up header alive in a local while a pointer to it is
	  // handed on.
	  const auto content_type =
	      request_headers.GetHeader(net::HttpRequestHeaders::kContentType);
	  return CreateFromContentTypeHeader(base::OptionalToPtr(content_type));
	}

	// Picks and constructs the parser matching |content_type_header|:
	//   * nullptr, or "application/x-www-form-urlencoded" -> URL-encoded parser;
	//   * "multipart/form-data" with a non-empty boundary  -> multipart parser;
	//   * "multipart/form-data" without a "boundary=" parameter, or with an
	//     empty boundary, and every other media type        -> nullptr.
	// The media type is everything before the first ';' and is compared
	// case-insensitively. The boundary is the text after "boundary=" up to the
	// next ';' (or the end of the header).
	// static
	std::unique_ptr<FormDataParser> FormDataParser::CreateFromContentTypeHeader(
	    const std::string* content_type_header) {
	  enum ParserChoice { URL_ENCODED, MULTIPART, ERROR_CHOICE };
	  ParserChoice choice = ERROR_CHOICE;
	  std::string boundary;

	  if (content_type_header == nullptr) {
	    choice = URL_ENCODED;
	  } else {
	    const std::string content_type(
	        content_type_header->substr(0, content_type_header->find(';')));

	    if (base::EqualsCaseInsensitiveASCII(
	            content_type, "application/x-www-form-urlencoded")) {
	      choice = URL_ENCODED;
	    } else if (base::EqualsCaseInsensitiveASCII(content_type,
	                                                "multipart/form-data")) {
	      static const char kBoundaryString[] = "boundary=";
	      size_t offset = content_type_header->find(kBoundaryString);
	      if (offset == std::string::npos) {
	        // Malformed header.
	        return nullptr;
	      }
	      offset += sizeof(kBoundaryString) - 1;
	      // std::string::substr() takes a length, not an end position. Convert
	      // the position of the terminating ';' (if any) into a length so that
	      // parameters following the boundary do not end up inside it.
	      const size_t boundary_end = content_type_header->find(';', offset);
	      const size_t boundary_length = boundary_end == std::string::npos
	                                         ? std::string::npos
	                                         : boundary_end - offset;
	      boundary = content_type_header->substr(offset, boundary_length);
	      if (!boundary.empty()) {
	        choice = MULTIPART;
	      }
	    }
	  }
	  // Other cases are unparseable, including when |content_type| is "text/plain".

	  switch (choice) {
	    case URL_ENCODED:
	      return std::unique_ptr<FormDataParser>(new FormDataParserUrlEncoded());
	    case MULTIPART:
	      return std::unique_ptr<FormDataParser>(
	          new FormDataParserMultipart(boundary));
	    case ERROR_CHOICE:
	      return nullptr;
	  }
	  NOTREACHED();  // Some compilers do not believe this is unreachable.
	}

	// Out-of-line defaulted constructor.
	FormDataParser::FormDataParser() = default;

	// Starts with no source set and no error recorded. |arg_name_| and
	// |arg_value_| are bound to |name_| and |value_| respectively, and their
	// addresses are stored in |args_|, which GetNextNameValue() passes to
	// RE2::ConsumeN().
	FormDataParserUrlEncoded::FormDataParserUrlEncoded()
	: source_set_(false),
	source_malformed_(false),
	arg_name_(&name_),
	arg_value_(&value_),
	patterns_(GetPatterns()) {
	args_[0] = &arg_name_;
	args_[1] = &arg_value_;
	}

	// Out-of-line defaulted destructor.
	FormDataParserUrlEncoded::~FormDataParserUrlEncoded() = default;

	// Reports whether a source was set and then consumed in full without being
	// flagged as malformed.
	bool FormDataParserUrlEncoded::AllDataReadOK() {
	  if (!source_set_ || source_malformed_) {
	    return false;
	  }
	  // All OK means we read the whole source.
	  return source_.empty();
	}

	// Consumes the next name-value pair from |source_| and stores the unescaped
	// name and value in |result| ('+' is turned into a space while unescaping).
	// The value is stored as a string if it is valid UTF-8 after unescaping and
	// as binary data otherwise. A '&' following the pair is consumed as well;
	// any other remaining character marks the source as malformed. Returns true
	// iff a pair was read and the source is not malformed.
	bool FormDataParserUrlEncoded::GetNextNameValue(Result* result) {
	  if (source_malformed_ || !source_set_) {
	    return false;
	  }

	  const bool matched =
	      RE2::ConsumeN(&source_, pattern(), args_.data(), args_size_);
	  if (matched) {
	    const base::UnescapeRule::Type kRules =
	        base::UnescapeRule::REPLACE_PLUS_WITH_SPACE;

	    std::string decoded_name =
	        base::UnescapeBinaryURLComponent(name_, kRules);
	    result->set_name(decoded_name);

	    std::string decoded_value =
	        base::UnescapeBinaryURLComponent(value_, kRules);
	    if (!base::IsStringUTF8(decoded_value)) {
	      result->SetBinaryValue(decoded_value);
	    } else {
	      result->SetStringValue(std::move(decoded_value));
	    }
	  }

	  // Whatever is left must begin with the '&' that separates two pairs.
	  if (!source_.empty()) {
	    if (source_.front() == '&') {
	      source_.remove_prefix(1);  // Remove the leading '&'.
	    } else {
	      source_malformed_ = true;  // '&' missing between two name-value pairs.
	    }
	  }
	  return matched && !source_malformed_;
	}

	// Installs |source| as the single input of this parser and clears the
	// malformed flag. Returns false, leaving the parser untouched, if a source
	// has been set before.
	bool FormDataParserUrlEncoded::SetSource(std::string_view source) {
	  // We do not allow multiple sources for this parser.
	  const bool is_first_source = !source_set_;
	  if (is_first_source) {
	    source_ = source;
	    source_set_ = true;
	    source_malformed_ = false;
	  }
	  return is_first_source;
	}

	// Returns true iff |pattern| matches at the very beginning of |input|. The
	// match is anchored at the start only and no submatches are extracted.
	// static
	bool FormDataParserMultipart::StartsWithPattern(std::string_view input,
	                                                const RE2& pattern) {
	  constexpr size_t kStartPos = 0;
	  constexpr int kNoSubmatches = 0;
	  return pattern.Match(input, kStartPos, input.size(), RE2::ANCHOR_START,
	                       /*submatch=*/nullptr, kNoSubmatches);
	}

	// Prepares a parser for bodies delimited by |boundary_separator|. The stored
	// delimiter is the boundary prefixed with "--", and parsing starts in
	// STATE_INIT.
	FormDataParserMultipart::FormDataParserMultipart(
	const std::string& boundary_separator)
	: dash_boundary_separator_("--" + boundary_separator),
	state_(STATE_INIT),
	patterns_(GetPatterns()) {}

	// Out-of-line defaulted destructor.
	FormDataParserMultipart::~FormDataParserMultipart() = default;

	// Reading succeeded iff the parser ended up in STATE_FINISHED, the state
	// FinishReadingPart() sets after consuming the closing delimiter and the
	// epilogue.
	bool FormDataParserMultipart::AllDataReadOK() {
	  const bool finished = (state_ == STATE_FINISHED);
	  return finished;
	}

	// Reads the rest of the current body part, through the dash-boundary that
	// terminates it.
	//   * If |data| is not nullptr it receives the data portion, without the
	//     CRLF preceding the dash-boundary; an empty data portion is then an
	//     error.
	//   * If the closing delimiter follows the dash-boundary, the epilogue is
	//     consumed and the state becomes STATE_FINISHED; otherwise only the
	//     transfer padding after the dash-boundary is consumed.
	// Every failure sets STATE_ERROR. Returns true iff the state is not
	// STATE_ERROR afterwards.
	bool FormDataParserMultipart::FinishReadingPart(std::string_view* data) {
	  const std::string_view part_start = source_;

	  // Consume one CRLF-terminated chunk at a time until the input starts with
	  // the dash-boundary.
	  for (;;) {
	    if (source_.starts_with(dash_boundary_separator_)) {
	      break;
	    }
	    const bool chunk_consumed =
	        RE2::Consume(&source_, crlf_free_pattern()) &&
	        ConsumePrefix(&source_, kCRLF);
	    if (!chunk_consumed) {
	      state_ = STATE_ERROR;
	      return false;
	    }
	  }

	  if (data != nullptr) {
	    const bool nothing_consumed = part_start.size() == source_.size();
	    if (nothing_consumed) {
	      // No data in this body part.
	      state_ = STATE_ERROR;
	      return false;
	    }
	    // Return the data consumed, minus two bytes for the trailing "\r\n".
	    std::string_view consumed = part_start;
	    consumed.remove_suffix(source_.size() + 2);
	    *data = consumed;
	  }

	  // Finally, read the dash-boundary and either skip to the next body part, or
	  // finish reading the source.
	  CHECK(ConsumePrefix(&source_, dash_boundary_separator_));
	  if (!StartsWithPattern(source_, closing_pattern())) {
	    // Next body part ahead.
	    if (!RE2::Consume(&source_, transfer_padding_pattern())) {
	      state_ = STATE_ERROR;
	    }
	  } else {
	    CHECK(RE2::Consume(&source_, closing_pattern()));
	    state_ = RE2::Consume(&source_, epilogue_pattern()) ? STATE_FINISHED
	                                                        : STATE_ERROR;
	  }
	  return state_ != STATE_ERROR;
	}

	// Reads one body part from |source_| and stores its name and value in
	// |result|. Only works in STATE_READY with a non-empty source.
	//   * The name comes from the content-disposition header and is URL-unescaped;
	//     a part without a name is an error.
	//   * If a header supplied a value (see TryReadHeader()), that value is
	//     stored as a string and the data of the part is skipped. If the source
	//     ends right after the headers, the parser goes to STATE_SUSPEND and the
	//     data is skipped once the next source arrives.
	//   * Otherwise the data of the part is the value, stored as binary if an
	//     application/octet-stream content type was seen, else as a string.
	// Returns true iff the part was read successfully. Note that |result| is
	// filled in even when reading the data portion fails.
	bool FormDataParserMultipart::GetNextNameValue(Result* result) {
	  if (state_ != STATE_READY || source_.empty()) {
	    return false;
	  }

	  // 1. Read body-part headers, accumulating the flags over all of them.
	  std::string_view name;
	  std::string_view value;
	  bool value_assigned = false;
	  bool value_is_binary = false;
	  for (;;) {
	    bool header_assigned_value;
	    bool header_is_binary;
	    if (!TryReadHeader(&name, &value, &header_assigned_value,
	                       &header_is_binary)) {
	      break;
	    }
	    value_is_binary = value_is_binary || header_is_binary;
	    value_assigned = value_assigned || header_assigned_value;
	  }
	  if (state_ == STATE_ERROR || name.empty()) {
	    state_ = STATE_ERROR;
	    return false;
	  }

	  // 2. Read the trailing CRLF after headers.
	  if (!ConsumePrefix(&source_, kCRLF)) {
	    state_ = STATE_ERROR;
	    return false;
	  }

	  // 3. Read the data of this body part, i.e., everything until the first
	  // dash-boundary.
	  bool part_ok = true;
	  if (value_assigned && source_.empty()) {
	    // Wait for a new source; SetSource() finishes the part.
	    state_ = STATE_SUSPEND;
	  } else {
	    std::string_view* const data_out = value_assigned ? nullptr : &value;
	    part_ok = FinishReadingPart(data_out);
	  }

	  result->set_name(base::UnescapeBinaryURLComponent(name));
	  const bool store_as_binary = !value_assigned && value_is_binary;
	  if (store_as_binary) {
	    result->SetBinaryValue(value);
	  } else {
	    // Either the filename held as value, or the textual data of the part.
	    result->SetStringValue(std::string(value));
	  }

	  return part_ok;
	}

	bool FormDataParserMultipart::SetSource(std::string_view source) {
	if (source.data() == nullptr \|\| !source_.empty()) {
	return false;
	}
	source_ = source;

	switch (state_) {
	case STATE_INIT:
	// Seek behind the preamble.
	while (!source_.starts_with(dash_boundary_separator_)) {
	if (!RE2::Consume(&source_, preamble_pattern())) {
	state_ = STATE_ERROR;
	break;
	}
	}
	// Read dash-boundary, transfer padding, and CRLF.
	if (state_ != STATE_ERROR) {
	if (!ConsumePrefix(&source_, dash_boundary_separator_) \|\|
	!RE2::Consume(&source_, transfer_padding_pattern())) {
	state_ = STATE_ERROR;
	} else {
	state_ = STATE_READY;
	}
	}
	break;
	case STATE_READY: // Nothing to do.
	break;
	case STATE_SUSPEND:
	state_ = FinishReadingPart(nullptr) ? STATE_READY : STATE_ERROR;
	break;
	default:
	state_ = STATE_ERROR;
	}
	return state_ != STATE_ERROR;
	}

	bool FormDataParserMultipart::TryReadHeader(std::string_view* name,
	std::string_view* value,
	bool* value_assigned,
	bool* value_is_binary) {
	*value_assigned = false;
	*value_is_binary = false;
	// Support Content-Type: application/octet-stream.
	// Form data with this content type is represented as string of bytes.
	if (ConsumePrefix(&source_, kContentTypeOctetString)) {
	*value_is_binary = true;
	return true;
	}
	const char* header_start = source_.data();
	if (!RE2::Consume(&source_, header_pattern())) {
	return false;
	}
	// (*) After this point we must return true, because we consumed one header.

	// Subtract 2 for the trailing "\r\n".
	std::string_view header(header_start, source_.data() - header_start - 2);

	if (!StartsWithPattern(header, content_disposition_pattern())) {
	return true; // Skip headers that don't describe the content-disposition.
	}

	std::string_view groups[2];

	if (!name_pattern().Match(header,
	kContentDispositionLength, header.size(),
	RE2::UNANCHORED, groups, 2)) {
	state_ = STATE_ERROR;
	return true; // See (*) for why true.
	}
	*name = groups[1];

	if (value_pattern().Match(header,
	kContentDispositionLength, header.size(),
	RE2::UNANCHORED, groups, 2)) {
	*value = groups[1];
	*value_assigned = true;
	}
	return true;
	}

	} // namespace extensions