blob: 3c7625572143dcf3d6816aebf04c60eb83f20f33 [file] [log] [blame]
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef CHROMEOS_PRINTING_URI_H_
#define CHROMEOS_PRINTING_URI_H_
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "chromeos/chromeos_export.h"
namespace chromeos {
// This is a simple URI builder/parser.
// This class has functionality similar to GURL (Google's URL parsing library).
// However, we were not able to use GURL for the following reasons:
// - GURL has no support for the ipp/ipps schemes
// - we need a general parser and builder for http-like URIs with schemes
// other than http/https (expressions like scheme://host/path?query)
// - we need simple methods for replacing particular components (by SetX/GetX)
// - we do not care too much about edge cases, like empty Host or Path
//
// This class is a container for a general http-like URI. It can parse any
// reasonably formatted http-like URI (see the grammar below) and return
// the normalized form. Valid UTF-8 characters and the escape character % are
// supported and normalized according to the rules specified in the standard
// https://tools.ietf.org/html/std66. While the general syntax of the URI is
// enforced, this class does not validate the semantics of the URI. It means
// that you can freely set/modify every component with Set*(...) methods.
//
//
// General Rules
//===============
//
// The URI consists of the following components:
// * Scheme
// * Userinfo
// * Host
// * Port
// * Path
// * Query
// * Fragment
// Objects of this class do not store original (input) strings. All parsed data
// is stored and returned in a normalized form. The syntax and the normalization
// algorithm is based on https://tools.ietf.org/html/std66 with the following
// modifications:
//
// 1. The grammar is simplified:
//
// uri = [ Scheme ":" ] [ authority ] [ Path ] [ "?" Query ] [ "#" Fragment ]
//
// authority = "//" [ Userinfo "@" ] Host [ ":" Port ]
//
// The grammar is written in ABNF notation (RFC2234). Square brackets ([...])
// mean that the enclosed element is optional.
//
// 2. The empty Scheme/Userinfo/Host/Path/Query/Fragment is treated the same way
// as "not specified", e.g.:
//
// http:///a = http:///a? = http:/a? = http:/a# = http:/a
//
// 3. Relative paths are not supported. Path must be empty or start with '/'.
//
// 4. Non-printable ASCII characters (0x00-0x1F and 0x7F-0xFF) are not
// supported, even when coded as %-escaped characters. The only exceptions
// are bytes coding UTF-8 characters.
//
//
// Example
//=========
//
// Let's say that we want to parse a URI:
// Uri uri("ipp://home.net:1234/my/printer/");
// if (uri.GetLastParsingError().status != Uri::ParserStatus::kNoErrors) {
// std::cout << "Invalid URI" << std::endl;
// } else {
// std::cout << "Normalized form: " << uri.GetNormalized() << std::endl;
// std::cout << "Scheme: " << uri.GetScheme() << std::endl;
// std::cout << "Host: " << uri.GetHost() << std::endl;
// std::cout << "Port: " << uri.GetPort() << std::endl;
// std::cout << "Path: " << uri.GetPath() << std::endl;
// }
// // Change Port to the default one:
// uri.SetPort(-1);
// // Change Path to "/ipp/printer":
// uri.SetPath({"ipp", "printer"});
//
//
// Default Port Numbers
//======================
//
// Some schemes have a default port number. This Port number is set
// automatically when both of the following conditions are met:
// * the current Port number is unspecified (equals -1)
// * the Scheme equals one of the schemes from the list below
// The following schemes are recognized and have a default port number:
// * http : 80
// * https : 443
// * ipp : 631
// * ipps : 443
// * lpd : 515
// * socket : 9100
//
//
// Encoding
//==========
//
// The parser accepts valid UTF-8 characters and %-escaped characters. In the
// normalized form, all bytes coding UTF-8 characters are coded as %-escaped
// characters. The components Scheme and Port do not allow %-escaped or
// UTF-8 characters. See the section Components below for details.
//
// By %-escaped character we understand here a single byte coded as three ASCII
// characters: the percent sign ('%') and two hex digits coding the value. Both
// lowercase and uppercase letters may be used as a hex digit on the input, but
// they are always normalized to uppercase letters.
//
// In general, non-printable ASCII characters are not allowed, even as %-escaped
// characters. After decoding %-escaped characters, the parser applies the
// following criteria:
// * 0x00-0x1F - a disallowed ASCII character
// * 0x20-0x7E - a valid ASCII character
// * 0x7F-0xBF - a disallowed ASCII character
// * 0xC0-0xF7 - the beginning of UTF-8 character (try to parse UTF-8 sequence)
// * 0xF8-0xFF - a disallowed ASCII character
//
//
// Components
//============
//
// These three sets of ASCII characters are used in components definitions:
// * ALPHA - any letter (A-Z or a-z)
// * DIGIT - any digit (0-9)
// * STD_CHARS = ALPHA | DIGIT | "-" | "." | "_" | "~" | "!" | "$" | "'"
// | "(" | ")" | "*" | "," | ";"
//
// These three properties are used in components' descriptions:
// * Allowed characters - a set of characters that is allowed in
// the normalized form of the component
// * %-escaped characters - if NO, then %-escaped characters are not allowed,
// neither on the input nor in the normalized form; it also means that
// only characters from the "Allowed characters" property are allowed
// on the input
// * Case-sensitive - if NO, then lowercase and uppercase letters have the
// same meaning and they are adjusted by the normalization algorithm
//
// Scheme
//--------
// The first character must be ALPHA.
// Allowed characters : ALPHA | DIGIT | "+" | "-" | "."
// %-escaped characters: NO
// Case-sensitive : NO - normalized to lowercase
//
// Userinfo
//----------
// Allowed characters : STD_CHARS | "+" | "&" | "=" | ":"
// %-escaped characters: YES
// Case-sensitive : YES
//
// Host
//------
// Allowed characters : STD_CHARS | "+" | "&" | "="
// %-escaped characters: YES
// Case-sensitive : NO - normalized to lowercase
//
// Port
//------
// It is a non-negative number; it cannot be larger than 65535.
// If not specified and the Scheme has a default Port number, then the default
// number is set. In the normalized URI, the Port is omitted if it equals the
// default port of the Scheme.
// Allowed characters : DIGIT
// %-escaped characters: NO
//
// Path
//------
// It must match the following grammar:
// Path = "/" segment [ Path ] | "/"
// A Path equal to "/" is normalized to an empty Path.
// Segments "." and ".." are special and are reduced during normalization, e.g:
// /abac/./123/def/../x -> /abac/123/x
// /xzy/../../sss/ -> /../sss
// Segment is a non-empty string with the following properties:
// Allowed characters : STD_CHARS | "+" | "&" | "=" | ":" | "@"
// %-escaped characters: YES
// Case-sensitive : YES
//
// Query
//-------
// It must match the following grammar:
// Query = [ pairs [ "&" ] ]
// pairs = pair [ "&" pairs ]
// pair = name [ "=" value ]
// The parser replaces all occurrences of "+" in Name and Value by " " (space).
// Name cannot be empty. When Value is empty, the separator "=" is omitted in
// the normalized form.
// Name and Value are strings with the following properties:
// Allowed characters : STD_CHARS | ":" | "@" | "/" | "?"
// %-escaped characters: YES
// Case-sensitive : YES
//
// Fragment
//----------
// Allowed characters : STD_CHARS | "+" | "&" | "=" | ":" | "@" | "/" | "?"
// %-escaped characters: YES
// Case-sensitive : YES
//
// Uri is a container, parser and builder for a single http-like URI. Parsed
// data is stored and returned in the normalized form; the original input
// string is not kept.
class CHROMEOS_EXPORT Uri {
public:
// Status codes reported by the parser (see ParserError below).
enum class ParserStatus {
kNoErrors,
kInvalidPercentEncoding, // cannot parse hex number after % sign
kDisallowedASCIICharacter, // non-printable ASCII character
kInvalidUTF8Character, // error when tried to parse UTF-8 character
kInvalidScheme, // invalid Scheme format
kInvalidPortNumber, // invalid Port number (see the section Port above)
kRelativePathsNotAllowed, // non-empty Path that does not start with '/'
kEmptySegmentInPath, // a segment in the Path is an empty string
kEmptyParameterNameInQuery // a pair in the Query has an empty name
};
// This struct contains the last parser error. The parser error is always
// set/reset by the following methods:
// - the constructor with a parameter
// - Set*(...) methods
// - Set*Encoded(...) methods
// The parser stops on the first error and reports its position in
// |parsed_chars| as a number of successfully parsed characters from the
// string given on the input. Methods SetQuery(...), SetQueryEncoded(...),
// SetPath(...) and SetPathEncoded(...) may take as a parameter more than
// one string. For them, the parser reports the number of successfully
// parsed strings in |parsed_strings| and the position of the error in the
// invalid string as |parsed_chars|.
// If |status| == kNoErrors, values of the fields |parsed_chars| and
// |parsed_strings| are undefined.
struct ParserError {
// The kind of the error; kNoErrors means that the last parsing succeeded.
ParserStatus status = ParserStatus::kNoErrors;
// The position in the input string where the parser error occurred.
// When an error occurred for a %-escaped character, it is the position of
// the corresponding '%' sign.
// If |status| == kNoErrors, then this value is undefined.
size_t parsed_chars = 0;
// This field is relevant only for the methods SetQuery(...),
// SetPath(...), SetQueryEncoded(...) and SetPathEncoded(...).
// In case of a parser error, it holds the number of successfully
// parsed strings. For SetPath*(...) methods, it is the index of the
// invalid string in the input vector. For SetQuery*(...) methods, the
// index of the invalid pair is (|parsed_strings|/2) and the value of
// (|parsed_strings|%2) indicates the invalid string in the pair.
// If |status| == kNoErrors, then this value is undefined.
size_t parsed_strings = 0;
};
// Returns the default port number for given |scheme|. If |scheme| is not
// known or it does not have a default port number, this method returns -1.
static int GetDefaultPort(const std::string& scheme);
// Constructor, creates an empty URI.
Uri();
// Constructor, it tries to parse |uri|. The outcome of the parsing can be
// checked with GetLastParsingError().
// Leading and trailing whitespaces (space, \t, \n, \r, \f, \v) are ignored.
explicit Uri(const std::string& uri);
// Objects of this class are copyable and movable. The special members are
// only declared here; they are defined outside this header.
Uri(const Uri&);
Uri(Uri&&);
~Uri();
Uri& operator=(const Uri&);
Uri& operator=(Uri&&);
// Returns the last parser error. The parser error is set/reset by the
// following methods:
// - the constructor with parameter
// - Set*(...) methods
// - Set*Encoded(...) methods
const ParserError& GetLastParsingError() const;
// Returns the URL in the normalized form. It returns an empty string if and
// only if all components are empty (see the grammar).
// If the Port is specified (GetPort() != -1) and |always_print_port| is set
// to true, a Port number is always included in the returned URI (even when
// it equals the Scheme's default port number).
std::string GetNormalized(bool always_print_port = true) const;
// Returns true <=> the whole URL has no UTF-8 characters.
bool IsASCII() const;
// Returns the Scheme. Scheme cannot have %-escaped or UTF-8 characters.
std::string GetScheme() const;
// Sets Scheme. When the new Scheme has a default port value and the current
// Port value is non-specified (=-1), the Port is set to the default value.
// Scheme cannot have %-escaped or UTF-8 characters.
// Returns false when |scheme| is invalid. In this case, the current Scheme
// is not modified.
bool SetScheme(const std::string& scheme);
// Returns the Port number or -1 if the Port number is not specified.
int GetPort() const;
// Sets Port. |port| must be from the interval [-1,65535]. -1 means
// "not-specified". If the current Scheme has a default port value, setting
// -1 results in setting the default port value from the Scheme.
// Returns false when |port| is invalid. In this case, the current port is
// not modified.
bool SetPort(int port);
// A version of the method above for a string parameter. Empty string means
// "not-specified" and has the same effect as passing -1 to the method above.
bool SetPort(const std::string& port);
// These methods return values of components. The returned strings contain
// no %-escaped sequences and may contain UTF-8 characters.
std::string GetUserinfo() const;
std::string GetHost() const;
std::vector<std::string> GetPath() const;
std::vector<std::pair<std::string, std::string>> GetQuery() const;
std::string GetFragment() const;
// These methods are similar to the aforementioned Get* methods. The only
// difference is that all strings are %-escaped according to the
// normalization rules. In other words, returned values are the same as
// in the normalized URI form returned by GetNormalized().
// NOTE(review): the *AsString variants presumably return the whole Path or
// Query as a single string instead of a vector - confirm in the .cc file.
std::string GetUserinfoEncoded() const;
std::string GetHostEncoded() const;
std::vector<std::string> GetPathEncoded() const;
std::string GetPathEncodedAsString() const;
std::vector<std::pair<std::string, std::string>> GetQueryEncoded() const;
std::string GetQueryEncodedAsString() const;
std::string GetFragmentEncoded() const;
// These methods set the value of a component. They DO NOT interpret % as an
// escape character. Input strings may contain UTF-8 characters.
// Returned value has the following meaning:
// - true - no parser errors => the component was set to a new value
// - false - a parser error occurred => no changes were made to the component
// Every call to one of these methods resets the state returned by the method
// GetLastParsingError(...).
bool SetUserinfo(const std::string&);
bool SetHost(const std::string&);
bool SetPath(const std::vector<std::string>&);
bool SetQuery(const std::vector<std::pair<std::string, std::string>>&);
bool SetFragment(const std::string&);
// These methods are similar to the aforementioned Set* methods. The only
// difference is that they DO interpret % as an escape character. UTF-8
// characters are still allowed.
// NOTE(review): the SetPathEncoded/SetQueryEncoded overloads taking a single
// string presumably parse the whole encoded Path/Query component - confirm
// in the .cc file.
bool SetUserinfoEncoded(const std::string&);
bool SetHostEncoded(const std::string&);
bool SetPathEncoded(const std::vector<std::string>&);
bool SetPathEncoded(const std::string&);
bool SetQueryEncoded(const std::vector<std::pair<std::string, std::string>>&);
bool SetQueryEncoded(const std::string&);
bool SetFragmentEncoded(const std::string&);
// Comparison operators. The order is determined by ASCII-wise comparison of
// the vector of components (GetScheme(),GetUserinfo(),GetHost(),GetPort(),
// GetPath(),GetQuery(),GetFragment()). The value of GetLastParsingError()
// is not taken into account during comparison (URIs with the same components
// but different ParserError are ==).
// Only operator< and operator== are defined outside this header; the
// remaining operators are expressed in terms of these two.
bool operator<(const Uri& uri) const;
bool operator<=(const Uri& uri) const { return !(uri < *this); }
bool operator>(const Uri& uri) const { return (uri < *this); }
bool operator>=(const Uri& uri) const { return !(*this < uri); }
bool operator==(const Uri& uri) const;
bool operator!=(const Uri& uri) const { return !(*this == uri); }
private:
// Private implementation class; it is only forward-declared in this header.
class Pim;
// NOTE(review): presumably decides whether GetNormalized() includes the Port
// for the given |always_print_port| - confirm in the .cc file.
bool ShouldPrintPort(bool always_print_port) const;
// Owning pointer to the private implementation object.
std::unique_ptr<Pim> pim_;
};
} // namespace chromeos
#endif // CHROMEOS_PRINTING_URI_H_