| // Copyright 2015 The Chromium Authors |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "components/feedback/redaction_tool/redaction_tool.h" |
| |
| #include <algorithm> |
| #include <set> |
| #include <string_view> |
| #include <utility> |
| #include <vector> |
| |
| #include "base/compiler_specific.h" |
| #include "base/containers/span.h" |
| #include "base/files/file_path.h" |
| #include "base/no_destructor.h" |
| #include "base/strings/strcat.h" |
| #include "base/strings/string_number_conversions.h" |
| #include "base/strings/string_util.h" |
| #include "base/strings/stringprintf.h" |
| #include "base/strings/utf_string_conversions.h" |
| #include "base/task/sequenced_task_runner.h" |
| #include "base/threading/thread_restrictions.h" |
| #include "components/autofill/core/common/credit_card_number_validation.h" |
| #include "components/feedback/redaction_tool/ip_address.h" |
| #include "components/feedback/redaction_tool/pii_types.h" |
| #ifdef USE_SYSTEM_RE2 |
| #include <re2/re2.h> |
| #else |
| #include "third_party/re2/src/re2/re2.h" |
| #endif // USE_SYSTEM_RE2 |
| |
| using re2::RE2; |
| using redaction_internal::IPAddress; |
| |
| namespace redaction { |
| |
| namespace features { |
| BASE_FEATURE(kEnableCreditCardRedaction, base::FEATURE_ENABLED_BY_DEFAULT); |
| |
| BASE_FEATURE(kEnableIbanRedaction, base::FEATURE_ENABLED_BY_DEFAULT); |
| } // namespace features |
| |
| namespace { |
| |
// Regex building blocks. Each macro below expands to one or more string
// literals and relies on adjacent-literal concatenation, so the pieces can be
// nested freely. Macros are expanded at the point of use, which is why a
// definition may mention a macro that is only defined further down.

// Helper macro: wraps |x| in a non-capturing group, i.e. (?:x)
#define NCG(x) "(?:" x ")"
// Helper macro: wraps |x| in an optional non-capturing group, i.e. (?:x)?
#define OPT_NCG(x) NCG(x) "?"

//////////////////////////////////////////////////////////////////////////
// Patterns for URLs, or better IRIs, based on RFC 3987 with an artificial
// limitation on the scheme to increase precision. Otherwise anything
// like "ID:" would be considered an IRI.

#define UNRESERVED "[-a-z0-9._~]"
// Either a general delimiter or a sub-delimiter. This must be spelled NCG:
// there is no NGC macro, so any other spelling breaks the first expansion.
#define RESERVED NCG(GEN_DELIMS "|" SUB_DELIMS)
#define SUB_DELIMS "[!$&'()*+,;=]"
#define GEN_DELIMS "[:/?#[\\]@]"
| |
// Basic character classes. Letters are spelled in lower case only; the
// patterns further down that embed IRI and IPV6ADDRESS prepend (?i).
#define DIGIT "[0-9]"
#define HEXDIG "[0-9a-f]"

// pct-encoded: a '%' followed by exactly two hex digits.
#define PCT_ENCODED "%" HEXDIG HEXDIG

// dec-octet: a decimal number from 0 to 255 without leading zeros. The
// three-digit alternatives are listed ahead of the shorter ones.
#define DEC_OCTET NCG("1[0-9][0-9]|2[0-4][0-9]|25[0-5]|[1-9][0-9]|[0-9]")

// Dotted quad: four dec-octets separated by literal dots.
#define IPV4ADDRESS \
  DEC_OCTET "\\." DEC_OCTET "\\." DEC_OCTET "\\." DEC_OCTET

// h16: one to four hex digits.
#define H16 NCG(HEXDIG) "{1,4}"
// ls32: the last 32 bits of an IPv6 address, written either as two h16
// groups or as an embedded IPv4 address.
#define LS32 NCG(H16 ":" H16 "|" IPV4ADDRESS)

// Word boundary assertion.
#define WB "\\b"
| |
// IPv6address: one alternative per position the "::" abbreviation can take,
// starting with the fully written out form. Alternatives that begin with a
// hex group open with a word boundary; those that begin with "::" do not.
// All but the last alternative (which ends in "::") close with a word
// boundary.
// clang-format off
#define IPV6_GROUP NCG(H16 ":")
#define IPV6ADDRESS NCG( \
  WB IPV6_GROUP "{6}" LS32 WB "|" \
  "::" IPV6_GROUP "{5}" LS32 WB "|" \
  OPT_NCG(WB H16) "::" IPV6_GROUP "{4}" LS32 WB "|" \
  OPT_NCG(WB IPV6_GROUP "{0,1}" H16) "::" IPV6_GROUP "{3}" LS32 WB "|" \
  OPT_NCG(WB IPV6_GROUP "{0,2}" H16) "::" IPV6_GROUP "{2}" LS32 WB "|" \
  OPT_NCG(WB IPV6_GROUP "{0,3}" H16) "::" IPV6_GROUP LS32 WB "|" \
  OPT_NCG(WB IPV6_GROUP "{0,4}" H16) "::" LS32 WB "|" \
  OPT_NCG(WB IPV6_GROUP "{0,5}" H16) "::" H16 WB "|" \
  OPT_NCG(WB IPV6_GROUP "{0,6}" H16) "::")
// clang-format on
| |
// IPvFuture: "v", one or more hex digits, a literal dot, then one or more
// characters taken from UNRESERVED, SUB_DELIMS or ':'.
#define IPVFUTURE "v" HEXDIG "+\\." NCG(UNRESERVED "|" SUB_DELIMS "|:") "+"

// An IP literal is an IPv6 or IPvFuture address enclosed in square brackets.
#define IP_LITERAL "\\[" NCG(IPV6ADDRESS "|" IPVFUTURE) "\\]"

// A possibly empty run of digits.
#define PORT DIGIT "*"

// This deviates from RFC 3987: only the schemes listed here are accepted.
#define SCHEME \
  NCG("http|https|ftp|chrome|chrome-extension|android|rtsp|file|isolated-app")
| |
// iprivate: a character class covering the three code point ranges
// U+E000-U+F8FF, U+F0000-U+FFFFD and U+100000-U+10FFFD.
#define IPRIVATE \
  "[\\x{E000}-\\x{F8FF}" \
  "\\x{F0000}-\\x{FFFFD}" \
  "\\x{100000}-\\x{10FFFD}]"

// One range of the form p0000-pFFFD, where |p| is the leading hex digit
// given as a string literal.
#define UCSCHAR_PLANE(p) "\\x{" p "0000}-\\x{" p "FFFD}"

// ucschar: a character class made of three ranges below U+10000, thirteen
// ranges p0000-pFFFD for p = 1..D, and finally U+E1000-U+EFFFD.
#define UCSCHAR \
  "[" \
  "\\x{A0}-\\x{D7FF}" \
  "\\x{F900}-\\x{FDCF}" \
  "\\x{FDF0}-\\x{FFEF}" \
  UCSCHAR_PLANE("1") UCSCHAR_PLANE("2") UCSCHAR_PLANE("3") \
  UCSCHAR_PLANE("4") UCSCHAR_PLANE("5") UCSCHAR_PLANE("6") \
  UCSCHAR_PLANE("7") UCSCHAR_PLANE("8") UCSCHAR_PLANE("9") \
  UCSCHAR_PLANE("A") UCSCHAR_PLANE("B") UCSCHAR_PLANE("C") \
  UCSCHAR_PLANE("D") \
  "\\x{E1000}-\\x{EFFFD}" \
  "]"
| |
// iunreserved: the ASCII unreserved characters or any UCSCHAR.
#define IUNRESERVED NCG("[-a-z0-9._~]|" UCSCHAR)

// ipchar: a single path character.
#define IPCHAR NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|[:@]")

// ifragment: any number of ipchar, '/' or '?'.
#define IFRAGMENT NCG(IPCHAR "|[/?]") "*"

// iquery: like ifragment, but private-use characters are accepted as well.
#define IQUERY NCG(IPCHAR "|" IPRIVATE "|[/?]") "*"

// Path segments: possibly empty, non-empty, and non-empty without ':'.
#define ISEGMENT IPCHAR "*"
#define ISEGMENT_NZ IPCHAR "+"
#define ISEGMENT_NZ_NC \
  NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|@") "+"
| |
// Path forms. IPATH_ABEMPTY (zero or more "/segment" pieces) is the common
// tail of the rootless and scheme-less forms, so those are written in terms
// of it; the absolute form is a '/' followed by an optional rootless path.
#define IPATH_ABEMPTY NCG("/" ISEGMENT) "*"
#define IPATH_ROOTLESS ISEGMENT_NZ IPATH_ABEMPTY
#define IPATH_NOSCHEME ISEGMENT_NZ_NC IPATH_ABEMPTY
#define IPATH_ABSOLUTE "/" OPT_NCG(IPATH_ROOTLESS)
#define IPATH_EMPTY ""

// Any of the path forms above; the empty path is tried last.
#define IPATH \
  NCG(IPATH_ABEMPTY "|" IPATH_ABSOLUTE "|" IPATH_NOSCHEME "|" IPATH_ROOTLESS \
      "|" IPATH_EMPTY)
| |
// ireg-name: a possibly empty registered host name.
#define IREG_NAME NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS) "*"

// iuserinfo: same alphabet as ireg-name plus ':'.
#define IUSERINFO NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|:") "*"

// ihost: a bracketed IP literal, a dotted quad, or a registered name, tried
// in that order.
#define IHOST NCG(IP_LITERAL "|" IPV4ADDRESS "|" IREG_NAME)

// iauthority: [ userinfo "@" ] host [ ":" port ]
#define IAUTHORITY OPT_NCG(IUSERINFO "@") IHOST OPT_NCG(":" PORT)
| |
// irelative-part. The leading "//" sits outside the group, so every
// alternative requires it, not only the authority form.
#define IRELATIVE_PART \
  "//" NCG(IAUTHORITY IPATH_ABEMPTY "|" IPATH_ABSOLUTE "|" IPATH_NOSCHEME \
           "|" IPATH_EMPTY)

// The '?' that introduces the query has to be escaped: placed directly after
// the "(?:" opened by OPT_NCG, a bare '?' is a repetition operator with
// nothing to repeat and the resulting regex does not parse.
#define IRELATIVE_REF \
  IRELATIVE_PART OPT_NCG("\\?" IQUERY) OPT_NCG("#" IFRAGMENT)

// RFC 3987 requires IPATH_EMPTY here but it is omitted so that statements
// that end with "Android:" for example are not considered a URL.
#define IHIER_PART \
  "//" NCG(IAUTHORITY IPATH_ABEMPTY "|" IPATH_ABSOLUTE "|" IPATH_ROOTLESS)

// Same as IRI but without the fragment part.
#define ABSOLUTE_IRI SCHEME ":" IHIER_PART OPT_NCG("\\?" IQUERY)

#define IRI SCHEME ":" IHIER_PART OPT_NCG("\\?" IQUERY) OPT_NCG("#" IFRAGMENT)

#define IRI_REFERENCE NCG(IRI "|" IRELATIVE_REF)
| |
| // The |kCustomPatternsWithContext| array defines patterns to match and |
| // redact. Each pattern needs to define three capturing parentheses groups: |
| // |
| // - a group for the pattern before the identifier to be redacted; |
| // - a group for the identifier to be redacted; |
| // - a group for the pattern after the identifier to be redacted. |
| // |
| // The first and the last capture group are the origin of the "WithContext" |
| // suffix in the name of this constant. |
| // |
| // Every matched identifier (in the context of the whole pattern) is redacted |
| // by replacing it with an incremental instance identifier. Every different |
| // pattern defines a separate instance identifier space. See the unit test for |
| // RedactionToolTest::RedactCustomPatterns for pattern redaction examples. |
| // |
| // Useful regular expression syntax: |
| // |
| // +? is a non-greedy (lazy) +. |
| // \b matches a word boundary. |
| // (?i) turns on case insensitivity for the remainder of the regex. |
| // (?-s) turns off "dot matches newline" for the remainder of the regex. |
| // (?:regex) denotes non-capturing parentheses group. |
| CustomPatternWithAlias kCustomPatternsWithContext[] = { |
| // ModemManager |
| {"CellID", "(\\bCell ID: ')([0-9a-fA-F]+)(')", |
| PIIType::kCellularLocationInfo}, |
| {"LocAC", "(\\bLocation area code: ')([0-9a-fA-F]+)(')", |
| PIIType::kCellularLocationInfo}, |
| |
| // Android. Must run first since this expression matches the replacement. |
| // |
| // If we don't get helpful delimiters like a single/double quote, then we |
| // can only try our best and take out the next 32 characters, the max length |
| // of a SSID. Require at least one non-quote character though so we skip |
| // over the quoted SSIDs (which the following patterns will catch and |
| // redact). |
| {"SSID", "(?i-s)(\\bSSID: )([^'\"]{1,32})(.*)", PIIType::kSSID}, |
| // Replace any SSID inside quotes. |
| {"SSID", "(?i-s)(\\bSSID: ['\"])(.+)(['\"])", PIIType::kSSID}, |
| // Special WifiNetworkSpecifier#toString. |
| {"SSID", "(?i-s)(\\bSSID Match pattern=[^ ]*\\s?)(.+)(\\})", |
| PIIType::kSSID}, |
| |
| // wpa_supplicant |
| {"SSID", "(?i-s)(\\bssid[= ]')(.+)(')", PIIType::kSSID}, |
| {"SSID", "(?i-s)(\\bssid[= ]\")(.+)(\")", PIIType::kSSID}, |
| {"SSID", "(\\* SSID=)([^\n]+)(.*)", PIIType::kSSID}, |
| {"SSIDHex", "(?-s)(\\bSSID - hexdump\\(len=[0-9]+\\): )(.+)()", |
| PIIType::kSSID}, |
| |
| // shill |
| {"SSID", "(?-s)(\\[SSID=)(.+?)(\\])", PIIType::kSSID}, |
| |
| // Serial numbers. The actual serial number itself can include any alphanum |
| // char as well as dashes, periods, colons, slashes and unprintable ASCII |
| // chars (except newline). The second one is for a special case in |
| // edid-decode, where if we genericized it further then we would catch too |
| // many other cases that we don't want to redact. |
| {"Serial", |
| "(?i-s)(\\bserial\\s*_?(?:number)?['\"]?\\s*[:=|]\\s*['\"]?)" |
| "([0-9a-zA-Z\\-.:\\/\\\\\\x00-\\x09\\x0B-\\x1F]+)(\\b)", |
| PIIType::kSerial}, |
| {"Serial", "( Serial Number )(\\d+)(\\b)", PIIType::kSerial}, |
| // USB Serial numbers, as outputted from the lsusb --verbose tool. |
| // "iSerial" followed by some spaces, then up to 5 digits of the iSerial |
| // index which is not part of the serial number itself, followed by the |
| // serial number string. |
| // The iSerial index must be nonzero, as an index of zero indicates no |
| // string descriptor is present. |
| // The serial number string itself is up to the manufacturer, but is |
| // observed to be alphanumetric (numbers, and both upper and lower case |
| // letters). |
| {"Serial", "(iSerial\\s*[1-9]\\d{0,4}\\s)([0-9a-zA-Z-]+)(\\b)", |
| PIIType::kSerial}, |
| // USB Serial number as generated by usbguard. |
| {"Serial", |
| "(?i-s)(\\bserial\\s\")" |
| "([0-9a-z\\-.:\\/\\\\\\x00-\\x09\\x0B-\\x1F]+)(\")", |
| PIIType::kSerial}, |
| // The attested device id, a serial number, that comes from vpd_2.0.txt. |
| // The pattern was recently clarified as being a case insensitive string of |
| // ASCII letters and digits, plus the dash/hyphen character. The dash cannot |
| // appear first or last |
| {"Serial", "(\"attested_device_id\"=\")([^-][0-9a-zA-Z-]+[^-])(\")", |
| PIIType::kSerial}, |
| // PSM identifier is a 4-character brand code, which can be encoded as 8 hex |
| // digits, followed by a slash ('/') and a serial number. |
| {"PSM ID", |
| "(?i)(PSM.*[\t ]+.*\\b)((?:[a-z]{4}|[0-9a-f]{8})\\/" |
| "[0-9a-z\\-.:\\/\\\\\\x00-\\x09\\x0B-\\x1F]+)(\\b)", |
| PIIType::kSerial}, |
| |
| // GAIA IDs |
| {"GAIA", R"xxx((\"?\bgaia_id\"?[=:]['\"])(\d+)(\b['\"]))xxx", |
| PIIType::kGaiaID}, |
| {"GAIA", R"xxx((\{id: )(\d+)(, email:))xxx", PIIType::kGaiaID}, |
| // The next two patterns are used by support tool when exporting PII. |
| {"GAIA", R"xxx(("accountId":\s*")([^"]+)("))xxx", PIIType::kGaiaID}, |
| {"GAIA", |
| R"xxx(("label":\s*"(?:Account|Gaia) Id",\s*"status":\s*")([^"]+)("))xxx", |
| PIIType::kGaiaID}, |
| |
| // UUIDs given by the 'blkid' tool. These don't necessarily look like |
| // standard UUIDs, so treat them specially. |
| {"UUID", R"xxx((UUID=")([0-9a-zA-Z-]+)("))xxx", PIIType::kStableIdentifier}, |
| // Also cover UUIDs given by the 'lvs' and 'pvs' tools, which similarly |
| // don't necessarily look like standard UUIDs. |
| {"UUID", R"xxx(("[lp]v_uuid":")([0-9a-zA-Z-]+)("))xxx", |
| PIIType::kStableIdentifier}, |
| // Cover UUIDs generated by vgcfgbackup, which also don't look like standard |
| // UUIDs. |
| {"UUID", R"xxx((id = ")([0-9a-zA-Z-]+)("))xxx", PIIType::kStableIdentifier}, |
| |
| // Volume labels presented in the 'blkid' tool, and as part of removable |
| // media paths shown in various logs such as cros-disks (in syslog). |
| // There isn't a well-defined format for these. For labels in blkid, |
| // capture everything between the open and closing quote. |
| {"Volume Label", R"xxx((LABEL=")([^"]+)("))xxx", PIIType::kVolumeLabel}, |
| // For paths, this is harder. The only restricted characters are '/' and |
| // NUL, so use a simple heuristic. cros-disks generally quotes paths using |
| // single-quotes, so capture everything until a quote character. For lsblk, |
| // capture everything until the end of the line, since the mount path is the |
| // last field. |
| {"Volume Label", R"xxx((/media/removable/)(.+?)(['"/\n]|$))xxx", |
| PIIType::kVolumeLabel}, |
| |
| // IPP (Internet Printing Protocol) Addresses |
| {"IPP Address", R"xxx((ipp:\/\/)(.+?)(\/ipp))xxx", PIIType::kIPPAddress}, |
| // Crash ID. This pattern only applies to ChromeOS and it matches the |
| // log entries from ChromeOS's crash_sender program. |
| {"Crash ID", R"xxx((Crash report receipt ID )([0-9a-fA-F]+)(.+?))xxx", |
| PIIType::kCrashId}, |
| |
| // Names of ChromeOS cryptohome logical volumes and device mapper devices, |
| // which include a partial hash of the user id. |
| {"UID", R"xxx(((?:cryptohome|dmcrypt)-+)([0-9a-fA-F]+)(-+))xxx", |
| PIIType::kStableIdentifier}, |
| |
| // GSC device id unique to each chip. |
| {"Serial", |
| R"xxx((DEV_ID:\s+)(0x[0-9a-zA-Z-]{8}\s+0x[0-9a-zA-Z-]{8})(.*?))xxx", |
| PIIType::kSerial}, |
| |
| // Chromebook serial hash stored in GSC. |
| {"Serial", |
| R"xxx((SN:\s+)([0-9a-zA-Z-]{8}\s+[0-9a-zA-Z-]{8}\s+[0-9a-zA-Z-]{8}))xxx" |
| R"xxx((.*?))xxx", |
| PIIType::kSerial}, |
| |
| // Memory dump from GSC log. |
| {"Memory Dump", |
| R"xxx((\[\s*[0-9]+\.[0-9]+\]\s+)(0x[0-9a-zA-Z-]{8}:\s+[0-9a-zA-Z-]{8})xxx" |
| R"xxx(\s+[0-9a-zA-Z-]{8}\s+[0-9a-zA-Z-]{8}\s+[0-9a-zA-Z-]{8})(.*?))xxx", |
| PIIType::kMemory}, |
| |
| // IPv4 addresses should not be prefixed or postfixed by a '.' or a '-' |
| // which indicates a version number or other identifier. |
| {"IPv4", |
| "([^-\\.0-9]|^)" |
| "(" IPV4ADDRESS ")" |
| "([^-\\.0-9]|$)", |
| PIIType::kIPAddress}, |
| |
| // Redacts PII from kernel logs for virtual input devices (e.g., Bluetooth). |
| // Matches lines like: |
| // input: Edman Paes dos Anjos’s Keyboard as |
| // /devices/virtual/misc/uhid/0005:... |
| // Redacts the name part only. |
| {"Bluetooth HID Device", |
| "(input: )([^\\r\\n]+?)(\\s+as\\s+/devices/virtual/misc/uhid/0005:.*?)", |
| PIIType::kBluetoothHidDevice}, |
| |
| // Redacts PII from kernel logs for explicit Bluetooth HID devices. |
| // Matches lines like: |
| // ... [Edman Paes dos Anjos’s Keyboard] on ... |
| // Redacts the name part found inside the brackets. |
| {"Bluetooth HID Device", R"((BLUETOOTH HID.+?\[)([^\]]+)(\]))", |
| PIIType::kBluetoothHidDevice}, |
| }; |
| |
| bool MaybeUnmapAddress(IPAddress* addr) { |
| if (!addr->IsIPv4MappedIPv6()) { |
| return false; |
| } |
| |
| *addr = ConvertIPv4MappedIPv6ToIPv4(*addr); |
| return true; |
| } |
| |
| bool MaybeUntranslateAddress(IPAddress* addr) { |
| if (!addr->IsIPv6()) { |
| return false; |
| } |
| |
| static const base::NoDestructor<IPAddress> kTranslated6To4( |
| 0, 0x64, 0xff, 0x9b, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); |
| if (!IPAddressMatchesPrefix(*addr, *kTranslated6To4, 96)) { |
| return false; |
| } |
| |
| const auto bytes = addr->bytes(); |
| *addr = IPAddress(bytes[12], bytes[13], bytes[14], bytes[15]); |
| return true; |
| } |
| |
| // If |addr| points to a valid IPv6 address, this function truncates it at /32. |
| bool MaybeTruncateIPv6(IPAddress* addr) { |
| if (!addr->IsIPv6()) { |
| return false; |
| } |
| |
| const auto bytes = addr->bytes(); |
| *addr = IPAddress(bytes[0], bytes[1], bytes[2], bytes[3], 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0); |
| return true; |
| } |
| |
| // Returns an appropriately scrubbed version of |addr| if applicable. |
| std::string MaybeScrubIPAddress(const std::string& addr) { |
| struct IPAddresScrub { |
| IPAddress ip_addr; |
| int prefix_length; |
| bool scrub; |
| }; |
| static const base::NoDestructor<std::vector<IPAddresScrub>> |
| kNonIdentifyingIPRanges({ |
| // Private. |
| {IPAddress(10, 0, 0, 0), 8, true}, |
| {IPAddress(172, 16, 0, 0), 12, true}, |
| {IPAddress(192, 168, 0, 0), 16, true}, |
| // Chrome OS containers and VMs. |
| {IPAddress(100, 115, 92, 0), 24, false}, |
| // Loopback. |
| {IPAddress(127, 0, 0, 0), 8, true}, |
| // Any. |
| {IPAddress(0, 0, 0, 0), 8, true}, |
| // DNS. |
| {IPAddress(8, 8, 8, 8), 32, false}, |
| {IPAddress(8, 8, 4, 4), 32, false}, |
| {IPAddress(1, 1, 1, 1), 32, false}, |
| // Multicast. |
| {IPAddress(224, 0, 0, 0), 4, true}, |
| // Link local. |
| {IPAddress(169, 254, 0, 0), 16, true}, |
| {IPAddress(0xfe, 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 10, |
| true}, |
| // Broadcast. |
| {IPAddress(255, 255, 255, 255), 32, false}, |
| // IPv6 loopback, unspecified and non-address strings. |
| {IPAddress::IPv6AllZeros(), 112, false}, |
| // IPv6 multicast all nodes and routers. |
| {IPAddress(0xff, 0x01, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1), 128, |
| false}, |
| {IPAddress(0xff, 0x01, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2), 128, |
| false}, |
| {IPAddress(0xff, 0x02, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1), 128, |
| false}, |
| {IPAddress(0xff, 0x02, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2), 128, |
| false}, |
| // IPv6 other multicast (link and interface local). |
| {IPAddress(0xff, 0x01, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 16, |
| true}, |
| {IPAddress(0xff, 0x02, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 16, |
| true}, |
| }); |
| IPAddress input_addr; |
| if (input_addr.AssignFromIPLiteral(addr) && input_addr.IsValid()) { |
| bool mapped = MaybeUnmapAddress(&input_addr); |
| bool translated = !mapped && MaybeUntranslateAddress(&input_addr); |
| for (const auto& range : *kNonIdentifyingIPRanges) { |
| if (IPAddressMatchesPrefix(input_addr, range.ip_addr, |
| range.prefix_length)) { |
| std::string prefix; |
| std::string out_addr = addr; |
| if (mapped) { |
| prefix = "M "; |
| out_addr = input_addr.ToString(); |
| } else if (translated) { |
| prefix = "T "; |
| out_addr = input_addr.ToString(); |
| } |
| if (range.scrub) { |
| out_addr = base::StringPrintf( |
| "%s/%d", range.ip_addr.ToString().c_str(), range.prefix_length); |
| } |
| return base::StrCat({prefix, out_addr}); |
| } |
| } |
| // |addr| may have been over-aggressively matched as an IPv6 address when |
| // it's really just an arbitrary part of a sentence. If the string is the |
| // same as the coarsely truncated address then keep it because even if |
| // it happens to be a real address, there is no leak. |
| if (MaybeTruncateIPv6(&input_addr) && input_addr.ToString() == addr) { |
| return addr; |
| } |
| } |
| return ""; |
| } |
| |
| // TODO(battre): Use http://tools.ietf.org/html/rfc5322 to represent email |
| // addresses. Capture names as well ("First Lastname" <foo@bar.com>). |
| |
| // The |kCustomPatternWithoutContext| array defines further patterns to match |
| // and redact. Each pattern consists of a single capturing group. |
| CustomPatternWithAlias kCustomPatternsWithoutContext[] = { |
| {"URL", "(?i)(" IRI ")", PIIType::kURL}, |
| // Email Addresses need to come after URLs because they can be part |
| // of a query parameter. |
| {"email", "(?i)([0-9a-z._%+-]+@[a-z0-9.-]+\\.[a-z]{2,6})", PIIType::kEmail}, |
| // IPv4 uses context to avoid false positives in version numbers, etc. |
| {"IPv6", "(?i)(" IPV6ADDRESS ")", PIIType::kIPAddress}, |
| // Universal Unique Identifiers (UUIDs). |
| {"UUID", |
| "(?i)([0-9a-zA-Z]{8}-[0-9a-zA-Z]{4}-[0-9a-zA-Z]{4}-[0-9a-zA-Z]{4}-" |
| "[0-9a-zA-Z]{12})", |
| PIIType::kStableIdentifier}, |
| // Eche UID which is a base64 conversion of a 32 bytes public key. |
| {"UID", |
| "(?:[^A-Za-z0-9+/])" |
| "((?:[A-Za-z0-9+/]{4}){10}(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=))", |
| PIIType::kStableIdentifier}, |
| }; |
| |
| // Like RE2's FindAndConsume, searches for the first occurrence of |pattern| in |
| // |input| and consumes the bytes until the end of the pattern matching. Unlike |
| // FindAndConsume, the bytes skipped before the match of |pattern| are stored |
| // in |skipped_input|. |args| needs to contain at least one element. |
| // Returns whether a match was found. |
| // |
| // Example: input = "aaabbbc", pattern = "(b+)" leads to skipped_input = "aaa", |
| // args[0] = "bbb", and the beginning input is moved to the right so that it |
| // only contains "c". |
| // Example: input = "aaabbbc", pattern = "(z+)" leads to input = "aaabbbc", |
| // the args values are not modified and skipped_input is not modified. |
| bool FindAndConsumeAndGetSkippedN(std::string_view* input, |
| const re2::RE2& pattern, |
| std::string_view* skipped_input, |
| base::span<std::string_view*> args, |
| int spanification_suspected_redundant_argc) { |
| // TODO(crbug.com/431824301): Remove unneeded parameter once validated to be |
| // redundant in M143. |
| CHECK(spanification_suspected_redundant_argc == static_cast<int>(args.size()), |
| base::NotFatalUntil::M143); |
| std::string_view old_input = *input; |
| |
| CHECK_GE(spanification_suspected_redundant_argc, 1); |
| re2::RE2::Arg a0(spanification_suspected_redundant_argc > 0 ? args[0] |
| : nullptr); |
| re2::RE2::Arg a1(spanification_suspected_redundant_argc > 1 ? args[1] |
| : nullptr); |
| re2::RE2::Arg a2(spanification_suspected_redundant_argc > 2 ? args[2] |
| : nullptr); |
| re2::RE2::Arg a3(spanification_suspected_redundant_argc > 3 ? args[3] |
| : nullptr); |
| const re2::RE2::Arg* const wrapped_args[] = {&a0, &a1, &a2, &a3}; |
| CHECK_LE(spanification_suspected_redundant_argc, 4); |
| |
| bool result = re2::RE2::FindAndConsumeN( |
| input, pattern, wrapped_args, spanification_suspected_redundant_argc); |
| |
| if (skipped_input && result) { |
| size_t bytes_skipped = args[0]->data() - old_input.data(); |
| *skipped_input = old_input.substr(0, bytes_skipped); |
| } |
| return result; |
| } |
| |
| // All |match_groups| need to be of type std::string_view*. |
| template <typename... Arg> |
| bool FindAndConsumeAndGetSkipped(std::string_view* input, |
| const re2::RE2& pattern, |
| std::string_view* skipped_input, |
| Arg*... match_groups) { |
| std::string_view* args[] = {match_groups...}; |
| return FindAndConsumeAndGetSkippedN(input, pattern, skipped_input, args, |
| std::size(args)); |
| } |
| |
// Returns true if |text| contains |c| at two consecutive positions.
bool HasRepeatedChar(std::string_view text, char c) {
  const char doubled[] = {c, c};
  return text.find(std::string_view(doubled, 2)) != std::string_view::npos;
}
| |
// MAC addresses that are passed through unchanged: they carry a general
// meaning rather than identifying one particular device.
constexpr const char* kUnredactedMacAddresses[] = {
    "00:00:00:00:00:00",  // ARP failure result MAC.
    "ff:ff:ff:ff:ff:ff",  // Broadcast MAC.
};
// Number of entries in |kUnredactedMacAddresses|.
constexpr size_t kNumUnredactedMacs = std::size(kUnredactedMacAddresses);
| |
| bool IsFeatureEnabled(const base::Feature& feature) { |
| return base::FeatureList::GetInstance() |
| ? base::FeatureList::IsEnabled(feature) |
| : feature.default_state == base::FEATURE_ENABLED_BY_DEFAULT; |
| } |
| } // namespace |
| |
| RedactionTool::RedactionTool(const char* const* first_party_extension_ids) |
| : RedactionTool(first_party_extension_ids, |
| RedactionToolMetricsRecorder::Create()) {} |
| |
| RedactionTool::RedactionTool( |
| const char* const* first_party_extension_ids, |
| std::unique_ptr<RedactionToolMetricsRecorder> metrics_recorder) |
| : first_party_extension_ids_(first_party_extension_ids), |
| metrics_recorder_(std::move(metrics_recorder)) { |
| CHECK(metrics_recorder_); |
| DETACH_FROM_SEQUENCE(sequence_checker_); |
| // Identity-map these, so we don't mangle them. |
| for (const char* mac : kUnredactedMacAddresses) { |
| mac_addresses_[mac] = mac; |
| } |
| } |
| |
| RedactionTool::~RedactionTool() { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); |
| } |
| |
| std::map<PIIType, std::set<std::string>> RedactionTool::Detect( |
| const std::string& input) { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); |
| base::AssertLongCPUWorkAllowed(); |
| |
| std::map<PIIType, std::set<std::string>> detected; |
| |
| if (IsFeatureEnabled(features::kEnableCreditCardRedaction)) { |
| RedactCreditCardNumbers(input, &detected); |
| } |
| RedactMACAddresses(input, &detected); |
| // This function will add to |detected| only on Chrome OS as Android app |
| // storage paths are only detected for Chrome OS. |
| RedactAndroidAppStoragePaths(input, &detected); |
| DetectWithCustomPatterns(input, &detected); |
| // Do hashes last since they may appear in URLs and they also prevent us from |
| // properly recognizing the Android storage paths. |
| RedactHashes(input, &detected); |
| if (IsFeatureEnabled(features::kEnableIbanRedaction)) { |
| RedactIbans(input, &detected); |
| } |
| return detected; |
| } |
| |
| std::string RedactionTool::Redact(const std::string& input, |
| const base::Location& location) { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); |
| return RedactAndKeepSelected(input, /*pii_types_to_keep=*/{}, location); |
| } |
| |
| std::string RedactionTool::RedactAndKeepSelected( |
| const std::string& input, |
| const std::set<PIIType>& pii_types_to_keep, |
| const base::Location& location) { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); |
| base::AssertLongCPUWorkAllowed(); |
| |
| RedactionToolCaller caller = GetCaller(location); |
| metrics_recorder_->RecordRedactionToolCallerHistogram(caller); |
| const base::TimeTicks redaction_start = base::TimeTicks::Now(); |
| |
| // Copy |input| so we can modify it. |
| std::string redacted = input; |
| |
| // Do this before MAC addresses as credit cards can use the - as identifier as |
| // well and the length could also match a MAC address. Since the credit card |
| // check does additional validation against issuer length and Luhns checksum |
| // the number of false positives should be lower when ordered like this. |
| if (IsFeatureEnabled(features::kEnableCreditCardRedaction) && |
| pii_types_to_keep.find(PIIType::kCreditCard) == pii_types_to_keep.end()) { |
| redacted = RedactCreditCardNumbers(std::move(redacted), nullptr); |
| } |
| if (pii_types_to_keep.find(PIIType::kMACAddress) == pii_types_to_keep.end()) { |
| redacted = RedactMACAddresses(std::move(redacted), nullptr); |
| } |
| if (pii_types_to_keep.find(PIIType::kAndroidAppStoragePath) == |
| pii_types_to_keep.end()) { |
| redacted = RedactAndroidAppStoragePaths(std::move(redacted), nullptr); |
| } |
| |
| redacted = RedactAndKeepSelectedCustomPatterns(std::move(redacted), |
| pii_types_to_keep); |
| |
| // Do hashes last since they may appear in URLs and they also prevent us |
| // from properly recognizing the Android storage paths. |
| if (pii_types_to_keep.find(PIIType::kStableIdentifier) == |
| pii_types_to_keep.end()) { |
| // URLs and Android storage paths will be partially redacted (only hashes) |
| // if |pii_types_to_keep| contains PIIType::kURL or |
| // PIIType::kAndroidAppStoragePath and not PIIType::kStableIdentifier. |
| redacted = RedactHashes(std::move(redacted), nullptr); |
| } |
| if (IsFeatureEnabled(features::kEnableIbanRedaction) && |
| pii_types_to_keep.find(PIIType::kIBAN) == pii_types_to_keep.end()) { |
| redacted = RedactIbans(std::move(redacted), nullptr); |
| } |
| |
| metrics_recorder_->RecordTimeSpentRedactingHistogram(base::TimeTicks::Now() - |
| redaction_start); |
| |
| return redacted; |
| } |
| |
| void RedactionTool::EnableCreditCardRedaction(const bool enabled) { |
| redact_credit_cards_ = enabled; |
| } |
| |
| RE2* RedactionTool::GetRegExp(const std::string& pattern) { |
| if (regexp_cache_.find(pattern) == regexp_cache_.end()) { |
| RE2::Options options; |
| // set_multiline of pcre is not supported by RE2, yet. |
| options.set_dot_nl(true); // Dot matches a new line. |
| std::unique_ptr<RE2> re = std::make_unique<RE2>(pattern, options); |
| DCHECK_EQ(re2::RE2::NoError, re->error_code()) << "Failed to parse:\n" |
| << pattern << "\n" |
| << re->error(); |
| regexp_cache_[pattern] = std::move(re); |
| } |
| return regexp_cache_[pattern].get(); |
| } |
| |
| std::string RedactionTool::RedactMACAddresses( |
| const std::string& input, |
| std::map<PIIType, std::set<std::string>>* detected) { |
| // This regular expression finds the next MAC address. It splits the data into |
| // an OUI (Organizationally Unique Identifier) part and a NIC (Network |
| // Interface Controller) specific part. We also match on dash and underscore |
| // because we have seen instances of both of those occurring. |
| |
| RE2* mac_re = GetRegExp( |
| "([0-9a-fA-F][0-9a-fA-F][:\\-_]" |
| "[0-9a-fA-F][0-9a-fA-F][:\\-_]" |
| "[0-9a-fA-F][0-9a-fA-F])[:\\-_](" |
| "[0-9a-fA-F][0-9a-fA-F][:\\-_]" |
| "[0-9a-fA-F][0-9a-fA-F][:\\-_]" |
| "[0-9a-fA-F][0-9a-fA-F])"); |
| |
| std::string result; |
| result.reserve(input.size()); |
| |
| // Keep consuming, building up a result string as we go. |
| std::string_view text(input); |
| std::string_view skipped, oui, nic; |
| static const char kMacSeparatorChars[] = "-_"; |
| while (FindAndConsumeAndGetSkipped(&text, *mac_re, &skipped, &oui, &nic)) { |
| // Look up the MAC address in the hash. Force the separator to be a colon |
| // so that the same MAC with a different format will match in all cases. |
| std::string oui_string = base::ToLowerASCII(oui); |
| base::ReplaceChars(oui_string, kMacSeparatorChars, ":", &oui_string); |
| std::string nic_string = base::ToLowerASCII(nic); |
| base::ReplaceChars(nic_string, kMacSeparatorChars, ":", &nic_string); |
| std::string mac = oui_string + ":" + nic_string; |
| std::string replacement_mac = mac_addresses_[mac]; |
| if (replacement_mac.empty()) { |
| // If not found, build up a replacement MAC address by generating a new |
| // NIC part. |
| int mac_id = mac_addresses_.size() - kNumUnredactedMacs; |
| replacement_mac = base::StringPrintf("(MAC OUI=%s IFACE=%d)", |
| oui_string.c_str(), mac_id); |
| mac_addresses_[mac] = replacement_mac; |
| } |
| if (detected != nullptr) { |
| (*detected)[PIIType::kMACAddress].insert(mac); |
| } |
| result.append(skipped); |
| result += replacement_mac; |
| metrics_recorder_->RecordPIIRedactedHistogram(PIIType::kMACAddress); |
| } |
| |
| result.append(text); |
| |
| return result; |
| } |
| |
| std::string RedactionTool::RedactHashes( |
| const std::string& input, |
| std::map<PIIType, std::set<std::string>>* detected) { |
| // This will match hexadecimal strings from length 32 to 64 that have a word |
| // boundary at each end. We then check to make sure they are one of our valid |
| // hash lengths before replacing. |
| // NOTE: There are some occurrences in the dump data (specifically modetest) |
| // where relevant data is formatted with 32 hex chars on a line. In this case, |
| // it is preceded by at least 3 whitespace chars, so check for that and in |
| // that case do not redact. |
| RE2* hash_re = GetRegExp(R"((\s*)\b([0-9a-fA-F]{4})([0-9a-fA-F]{28,60})\b)"); |
| |
| std::string result; |
| result.reserve(input.size()); |
| |
| // Keep consuming, building up a result string as we go. |
| std::string_view text(input); |
| std::string_view skipped, pre_whitespace, hash_prefix, hash_suffix; |
| while (FindAndConsumeAndGetSkipped(&text, *hash_re, &skipped, &pre_whitespace, |
| &hash_prefix, &hash_suffix)) { |
| result.append(skipped); |
| result.append(pre_whitespace); |
| |
| // Check if it's a valid length for our hashes or if we need to skip due to |
| // the whitespace check. |
| size_t hash_length = 4 + hash_suffix.length(); |
| if ((hash_length != 32 && hash_length != 40 && hash_length != 64) || |
| (hash_length == 32 && pre_whitespace.length() >= 3)) { |
| // This is not a hash string, skip it. |
| result.append(hash_prefix); |
| result.append(hash_suffix); |
| continue; |
| } |
| |
| // Look up the hash value address in the map of replacements. |
| std::string hash_prefix_string = base::ToLowerASCII(hash_prefix); |
| std::string hash = hash_prefix_string + base::ToLowerASCII(hash_suffix); |
| std::string replacement_hash = hashes_[hash]; |
| if (replacement_hash.empty()) { |
| // If not found, build up a replacement value. |
| replacement_hash = base::StringPrintf( |
| "(HASH:%s %zd)", hash_prefix_string.c_str(), hashes_.size()); |
| hashes_[hash] = replacement_hash; |
| } |
| if (detected != nullptr) { |
| (*detected)[PIIType::kStableIdentifier].insert(hash); |
| } |
| |
| result += replacement_hash; |
| |
| metrics_recorder_->RecordPIIRedactedHistogram(PIIType::kStableIdentifier); |
| } |
| |
| result.append(text); |
| |
| return result; |
| } |
| |
| std::string RedactionTool::RedactAndroidAppStoragePaths( |
| const std::string& input, |
| std::map<PIIType, std::set<std::string>>* detected) { |
| // We only use this on Chrome OS and there's differences in the API for |
| // FilePath on Windows which prevents this from compiling, so only enable this |
| // code for Chrome OS. |
| #if BUILDFLAG(IS_CHROMEOS) |
| std::string result; |
| result.reserve(input.size()); |
| |
| // This is for redacting Android data paths included in 'android_app_storage' |
| // and 'audit_log' output. <app_specific_path> in the following data paths |
| // will be redacted. |
| // - /data/data/<package_name>/<app_specific_path> |
| // - /data/app/<package_name>/<app_specific_path> |
| // - /data/user_de/<number>/<package_name>/<app_specific_path> |
| // These data paths are preceded by "/home/root/<user_hash>/android-data" in |
| // 'android_app_storage' output, and preceded by "path=" or "exe=" in |
| // 'audit_log' output. |
| RE2* path_re = |
| GetRegExp(R"((?m)((path=|exe=|/home/root/[\da-f]+/android-data))" |
| R"(/data/(data|app|user_de/\d+)/[^/\n]+)(/[^\n\s]+))"); |
| |
| // Keep consuming, building up a result string as we go. |
| std::string_view text(input); |
| std::string_view skipped; |
| std::string_view path_prefix; // path before app_specific; |
| std::string_view pre_data; // (path=|exe=|/home/root/<hash>/android-data) |
| std::string_view post_data; // (data|app|user_de/\d+) |
| std::string_view app_specific; // (/[^\n\s]+) |
| while (FindAndConsumeAndGetSkipped(&text, *path_re, &skipped, &path_prefix, |
| &pre_data, &post_data, &app_specific)) { |
| // We can record these parts as-is. |
| result.append(skipped); |
| result.append(path_prefix); |
| |
| // |app_specific| has to be redacted. First, convert it into components, |
| // and then redact each component as follows: |
| // - If the component has a non-ASCII character, change it to '*'. |
| // - Otherwise, remove all the characters in the component but the first |
| // one. |
| // - If the original component has 2 or more bytes, add '_'. |
| const base::FilePath path(app_specific); |
| std::vector<std::string> components = path.GetComponents(); |
| DCHECK(!components.empty()); |
| |
| auto it = components.begin() + 1; // ignore the leading slash |
| for (; it != components.end(); ++it) { |
| const auto& component = *it; |
| DCHECK(!component.empty()); |
| result += '/'; |
| result += (base::IsStringASCII(component) ? component[0] : '*'); |
| if (component.length() > 1) { |
| result += '_'; |
| } |
| } |
| if (detected != nullptr) { |
| (*detected)[PIIType::kAndroidAppStoragePath].emplace(app_specific); |
| } |
| metrics_recorder_->RecordPIIRedactedHistogram( |
| PIIType::kAndroidAppStoragePath); |
| } |
| |
| result.append(text); |
| |
| return result; |
| #else |
| return input; |
| #endif // BUILDFLAG(IS_CHROMEOS) |
| } |
| |
| std::string RedactionTool::RedactCreditCardNumbers( |
| const std::string& input, |
| std::map<PIIType, std::set<std::string>>* detected) { |
| std::string result; |
| result.reserve(input.size()); |
| |
| RE2* cc_re = GetRegExp( |
| "[^\\d\\n]{1,5}[ :='\"]" // pre sequence: Make sure we're not |
| // matching a memory dump or in some |
| // continuous string of numbers. |
| "((?:[\\d -]){12,37})" // sequence: Creditcard length is 12-19 and we |
| // allow up to one separation character (space |
| // or hyphen) between each of them. |
| "(\n|\\D{2,3})"); // post sequence: Not trying to match inside a |
| // continuous number block, so the characters |
| // after the potential match should either be a |
| // newline or 2-3 non digits. |
| |
| std::string_view text(input); |
| std::string_view skipped; |
| std::string_view sequence; |
| std::string_view post_sequence; |
| |
| while (FindAndConsumeAndGetSkipped(&text, *cc_re, &skipped, &sequence, |
| &post_sequence)) { |
| result.append(skipped); |
| metrics_recorder_->RecordCreditCardRedactionHistogram( |
| CreditCardDetection::kRegexMatch); |
| |
| // Timestamps in ms have a surprisingly high number of false positives. |
| // Also log entries but those usually only match if there are several spaces |
| // tying unrelated numbers together. |
| if (post_sequence.find("ms") != std::string_view::npos) { |
| metrics_recorder_->RecordCreditCardRedactionHistogram( |
| CreditCardDetection::kTimestamp); |
| result.append(sequence); |
| result.append(post_sequence); |
| continue; |
| } |
| |
| if (HasRepeatedChar(sequence, ' ') || HasRepeatedChar(sequence, '-')) { |
| metrics_recorder_->RecordCreditCardRedactionHistogram( |
| CreditCardDetection::kRepeatedChars); |
| result.append(sequence); |
| result.append(post_sequence); |
| continue; |
| } |
| |
| const std::u16string stripped_number = |
| autofill::StripCardNumberSeparators(base::UTF8ToUTF16(sequence)); |
| const std::string u8number = base::UTF16ToUTF8(stripped_number); |
| |
| const auto cc_it = credit_cards_.find(u8number); |
| if (cc_it != credit_cards_.cend()) { |
| result += cc_it->second; |
| result.append(post_sequence); |
| metrics_recorder_->RecordCreditCardRedactionHistogram( |
| CreditCardDetection::kValidated); |
| metrics_recorder_->RecordPIIRedactedHistogram(PIIType::kCreditCard); |
| continue; |
| } |
| |
| const bool only_zeros = |
| stripped_number.find_first_not_of(u'0', 0) == std::u16string::npos; |
| if (!only_zeros && autofill::IsValidCreditCardNumber(stripped_number)) { |
| metrics_recorder_->RecordCreditCardRedactionHistogram( |
| CreditCardDetection::kValidated); |
| const auto& [it, success] = credit_cards_.emplace( |
| u8number, |
| base::StrCat({"(CREDITCARD: ", |
| base::NumberToString(credit_cards_.size() + 1), ")"})); |
| if (redact_credit_cards_) { |
| metrics_recorder_->RecordPIIRedactedHistogram(PIIType::kCreditCard); |
| result += it->second; |
| } else { |
| result.append(sequence); |
| } |
| if (detected) { |
| (*detected)[PIIType::kCreditCard].insert(it->first); |
| } |
| } else { |
| metrics_recorder_->RecordCreditCardRedactionHistogram( |
| CreditCardDetection::kDoesntValidate); |
| result.append(sequence); |
| } |
| result.append(post_sequence); |
| } |
| |
| result.append(text); |
| |
| return result; |
| } |
| |
| std::string RedactionTool::RedactIbans( |
| const std::string& input, |
| std::map<PIIType, std::set<std::string>>* detected) { |
| std::string result; |
| result.reserve(input.size()); |
| |
| RE2* iban_re = GetRegExp( |
| "(:| )" |
| "((?:A[DELAOTZ]|B[AEFGHIJR]|C[HIMRVYZ]|D[EKOZ]|E[ES]|F[IOR]|G[BEILRT]|" |
| "H[RU]|I[ELRST]|JO|K[WZ]|L[BITUV]|M[CDEGKLRTUZ]|N[LO]|P[KLST]|QA|R[OS]|" |
| "S[AEIKMN]|T[NR]|UA|VG|XK)(?:\\d{2})[ -]?(?:[ \\-A-Z0-9]){11,30})" |
| "([^a-zA-Z0-9_\\-\\+=/])"); |
| |
| std::string_view text(input); |
| std::string_view skipped; |
| std::string_view pre_separating_char; |
| std::string_view iban; |
| std::string_view post_separating_char; |
| while (FindAndConsumeAndGetSkipped(&text, *iban_re, &skipped, |
| &pre_separating_char, &iban, |
| &post_separating_char)) { |
| result.append(skipped); |
| result.append(pre_separating_char); |
| |
| // Validation sequence as per [1]. |
| // |
| // [1] |
| // https://en.wikipedia.org/wiki/International_Bank_Account_Number#Validating_the_IBAN |
| |
| // Remove the separating characters. |
| std::string stripped; |
| base::RemoveChars(iban, " -", &stripped); |
| |
| if (const auto previous_iban = ibans_.find(stripped); |
| previous_iban != ibans_.end()) { |
| result += previous_iban->second; |
| result.append(post_separating_char); |
| metrics_recorder_->RecordPIIRedactedHistogram(PIIType::kIBAN); |
| continue; |
| } |
| |
| // Since the logic later relies on the size of this string not changing use |
| // a lambda to initialize the constant. |
| const std::string numbers_only = [](std::string_view stripped) { |
| // Move the first 2 chars+digits to the back of the string. |
| constexpr size_t prefix_offset = 4; |
| std::string rearranged = std::string(stripped.substr(prefix_offset)); |
| rearranged.append(stripped.substr(0, prefix_offset)); |
| |
| // Replace letters with two digits, where A = 10, B = 11, ..., Z = 35. |
| std::string tmp; |
| for (const char c : rearranged) { |
| if (base::IsAsciiDigit(c)) { |
| tmp.push_back(c); |
| } else { |
| const char based_char = c - 'A'; |
| constexpr size_t iban_char_conversion_offset = 10; |
| tmp.append(base::NumberToString(static_cast<int>(based_char) + |
| iban_char_conversion_offset)); |
| } |
| } |
| return tmp; |
| }(stripped); |
| |
| // Calculate the remainder using chunks. |
| constexpr size_t chunk_size = 9; |
| |
| std::string chunk; |
| chunk.reserve(chunk_size); |
| |
| unsigned remainder = 0; |
| |
| for (size_t remaining = numbers_only.size(); remaining > 0;) { |
| const size_t pos = numbers_only.size() - remaining; |
| const size_t next_chunk_size = |
| std::min(chunk_size - chunk.size(), remaining); |
| |
| chunk.append(numbers_only.substr(pos, next_chunk_size)); |
| |
| const unsigned long chunk_number = |
| UNSAFE_TODO(std::strtoul(chunk.c_str(), nullptr, 10)); |
| |
| remainder = chunk_number % 97; |
| chunk = base::NumberToString(remainder); |
| |
| remaining -= next_chunk_size; |
| } |
| |
| if (remainder != 1) { |
| result.append(iban); |
| result.append(post_separating_char); |
| continue; |
| } |
| |
| const auto& [it, success] = ibans_.emplace( |
| stripped, base::StrCat({"(IBAN: ", |
| base::NumberToString(ibans_.size() + 1), ")"})); |
| result += it->second; |
| result.append(post_separating_char); |
| |
| if (detected != nullptr) { |
| (*detected)[PIIType::kIBAN].insert(it->first); |
| } |
| |
| metrics_recorder_->RecordPIIRedactedHistogram(PIIType::kIBAN); |
| } |
| |
| result.append(text); |
| |
| return result; |
| } |
| |
| std::string RedactionTool::RedactAndKeepSelectedCustomPatterns( |
| std::string input, |
| const std::set<PIIType>& pii_types_to_keep) { |
| for (const auto& pattern : kCustomPatternsWithContext) { |
| if (pii_types_to_keep.find(pattern.pii_type) == pii_types_to_keep.end()) { |
| input = RedactCustomPatternWithContext(input, pattern, nullptr); |
| } |
| } |
| for (const auto& pattern : kCustomPatternsWithoutContext) { |
| if (pii_types_to_keep.find(pattern.pii_type) == pii_types_to_keep.end()) { |
| input = RedactCustomPatternWithoutContext(input, pattern, nullptr); |
| } |
| } |
| return input; |
| } |
| |
| void RedactionTool::DetectWithCustomPatterns( |
| std::string input, |
| std::map<PIIType, std::set<std::string>>* detected) { |
| for (const auto& pattern : kCustomPatternsWithContext) { |
| RedactCustomPatternWithContext(input, pattern, detected); |
| } |
| for (const auto& pattern : kCustomPatternsWithoutContext) { |
| RedactCustomPatternWithoutContext(input, pattern, detected); |
| } |
| } |
| |
| RedactionToolCaller RedactionTool::GetCaller(const base::Location& location) { |
| std::string filePath = location.file_name(); |
| if (filePath.empty() || filePath.c_str() == nullptr) { |
| return RedactionToolCaller::kUndetermined; |
| } |
| |
| std::string fileName = filePath.substr(filePath.find_last_of("/\\") + 1); |
| |
| if (filePath.find("support_tool") != std::string::npos) { |
| return RedactionToolCaller::kSupportTool; |
| } else if (filePath.find("error_reporting") != std::string::npos) { |
| return RedactionToolCaller::kErrorReporting; |
| } else if (fileName == "redaction_tool_unittest.cc") { |
| return RedactionToolCaller::kUnitTest; |
| } else if (fileName == "system_log_uploader.cc") { |
| return RedactionToolCaller::kSysLogUploader; |
| } else if (fileName == "system_logs_fetcher.cc") { |
| return RedactionToolCaller::kSysLogFetcher; |
| } else if (fileName == "chrome_js_error_report_processor.cc") { |
| return RedactionToolCaller::kCrashToolJSErrors; |
| } else if (fileName == "crash_collector.cc") { |
| return RedactionToolCaller::kCrashTool; |
| } else if (fileName == "feedback_common.cc") { |
| return RedactionToolCaller::kFeedbackToolUserDescriptions; |
| } else if (fileName == "log_source_access_manager.cc") { |
| return RedactionToolCaller::kFeedbackToolHotRod; |
| } else if (fileName == "system_logs_fetcher.cc") { |
| return RedactionToolCaller::kFeedbackToolLogs; |
| } |
| return RedactionToolCaller::kUnknown; |
| } |
| |
| bool RedactionTool::ShouldSkipIPv4Address(std::string_view skipped) { |
| // Only look for patterns on the same line as the IPv4 address. |
| const auto nlpos = skipped.rfind("\n"); |
| if (nlpos != std::string_view::npos) { |
| skipped = skipped.substr(nlpos); |
| } |
| // MomdemManager can dump out firmware revision fields that can also |
| // confuse the IPv4 matcher e.g. "Revision: 81600.0000.00.29.19.16_DO" |
| // so ignore the replacement if the skipped piece looks like |
| // "Revision: .*<ipv4>". Note however that if this field contains |
| // values delimited by multiple spaces, any matches after the first |
| // will lose the context and be redacted. |
| static const std::string_view rev("Revision: "); |
| static const std::string_view space(" "); |
| const auto pos = skipped.rfind(rev); |
| if (pos != std::string_view::npos && |
| skipped.find(space, pos + rev.length()) == std::string_view::npos) { |
| return true; |
| } |
| // URLs with an IP Address should be handled by the "URL" entry in |
| // kCustomPatternsWithoutContext instead. If the skipped piece ends with an |
| // IRI, skip it. |
| RE2* re_iri = GetRegExp(".*" IRI); |
| if (RE2::FullMatch(skipped, *re_iri)) { |
| return true; |
| } |
| return false; |
| } |
| |
| std::string RedactionTool::RedactCustomPatternWithContext( |
| const std::string& input, |
| const CustomPatternWithAlias& pattern, |
| std::map<PIIType, std::set<std::string>>* detected) { |
| RE2* re = GetRegExp(pattern.pattern); |
| DCHECK_EQ(3, re->NumberOfCapturingGroups()); |
| std::map<std::string, std::string>* identifier_space = |
| &custom_patterns_with_context_[pattern.alias]; |
| |
| std::string result; |
| result.reserve(input.size()); |
| |
| // Keep consuming, building up a result string as we go. |
| std::string_view text(input); |
| std::string_view skipped; |
| std::string_view pre_matched_id, matched_id, post_matched_id; |
| while (FindAndConsumeAndGetSkipped(&text, *re, &skipped, &pre_matched_id, |
| &matched_id, &post_matched_id)) { |
| std::string matched_id_as_string(matched_id); |
| std::string replacement_id; |
| |
| std::string scrubbed_match; |
| if (pattern.pii_type == PIIType::kIPAddress) { |
| std::string prematch(skipped); |
| prematch.append(pre_matched_id); |
| scrubbed_match = MaybeScrubIPAddress(matched_id_as_string); |
| if (scrubbed_match == matched_id_as_string || |
| ((UNSAFE_TODO(strcmp("IPv4", pattern.alias)) == 0) && |
| ShouldSkipIPv4Address(prematch))) { |
| result.append(skipped); |
| result.append(pre_matched_id); |
| result.append(matched_id); |
| result.append(post_matched_id); |
| continue; |
| } |
| } |
| |
| if (identifier_space->count(matched_id_as_string) == 0) { |
| // The weird NumberToString trick is because Windows does not like |
| // to deal with %zu and a size_t in printf, nor does it support %llu. |
| replacement_id = base::StringPrintf( |
| "(%s: %s)", |
| scrubbed_match.empty() ? pattern.alias : scrubbed_match.c_str(), |
| base::NumberToString(identifier_space->size() + 1).c_str()); |
| (*identifier_space)[matched_id_as_string] = replacement_id; |
| } else { |
| replacement_id = (*identifier_space)[matched_id_as_string]; |
| } |
| if (detected != nullptr) { |
| (*detected)[pattern.pii_type].insert(matched_id_as_string); |
| } |
| result.append(skipped); |
| result.append(pre_matched_id); |
| result += replacement_id; |
| result.append(post_matched_id); |
| metrics_recorder_->RecordPIIRedactedHistogram(pattern.pii_type); |
| } |
| result.append(text); |
| |
| return result; |
| } |
| |
| // This takes a |url| argument and returns true if the URL is exempt from |
| // redaction, returns false otherwise. |
| bool IsUrlExempt(std::string_view url, |
| const char* const* first_party_extension_ids) { |
| // We do not exempt anything with a query parameter. |
| if (url.find("?") != std::string_view::npos) { |
| return false; |
| } |
| |
| // Last part of an SELinux context is misdetected as a URL. |
| // e.g. "u:object_r:system_data_file:s0:c512,c768" |
| if (url.starts_with("file:s0")) { |
| return true; |
| } |
| |
| // Check for chrome:// URLs that are exempt. |
| if (url.starts_with("chrome://")) { |
| // We allow everything in chrome://resources/. |
| if (url.starts_with("chrome://resources/")) { |
| return true; |
| } |
| |
| // We allow chrome://*/crisper.js. |
| if (url.ends_with("/crisper.js")) { |
| return true; |
| } |
| |
| return false; |
| } |
| |
| if (!first_party_extension_ids) { |
| return false; |
| } |
| |
| // Exempt URLs of the format chrome-extension://<first-party-id>/*.js |
| if (!url.starts_with("chrome-extension://")) { |
| return false; |
| } |
| |
| // These must end with a .js extension. |
| if (!url.ends_with(".js")) { |
| return false; |
| } |
| |
| int i = 0; |
| const char* test_id = UNSAFE_TODO(first_party_extension_ids[i]); |
| const std::string_view url_sub = |
| url.substr(sizeof("chrome-extension://") - 1); |
| while (test_id) { |
| if (url_sub.starts_with(test_id)) { |
| return true; |
| } |
| test_id = UNSAFE_TODO(first_party_extension_ids[++i]); |
| } |
| return false; |
| } |
| |
| std::string RedactionTool::RedactCustomPatternWithoutContext( |
| const std::string& input, |
| const CustomPatternWithAlias& pattern, |
| std::map<PIIType, std::set<std::string>>* detected) { |
| RE2* re = GetRegExp(pattern.pattern); |
| DCHECK_EQ(1, re->NumberOfCapturingGroups()); |
| |
| std::map<std::string, std::string>* identifier_space = |
| &custom_patterns_without_context_[pattern.alias]; |
| |
| std::string result; |
| result.reserve(input.size()); |
| |
| // Keep consuming, building up a result string as we go. |
| std::string_view text(input); |
| std::string_view skipped; |
| std::string_view matched_id; |
| while (FindAndConsumeAndGetSkipped(&text, *re, &skipped, &matched_id)) { |
| result.append(skipped); |
| |
| if (IsUrlExempt(matched_id, first_party_extension_ids_)) { |
| result.append(matched_id); |
| continue; |
| } |
| |
| const std::string matched_id_as_string(matched_id); |
| if (const auto previous_replacement = |
| identifier_space->find(matched_id_as_string); |
| previous_replacement != identifier_space->end()) { |
| metrics_recorder_->RecordPIIRedactedHistogram(pattern.pii_type); |
| result.append(previous_replacement->second); |
| continue; |
| } |
| |
| const std::string scrubbed_match = |
| MaybeScrubIPAddress(matched_id_as_string); |
| if (scrubbed_match == matched_id_as_string) { |
| result.append(matched_id); |
| continue; |
| } |
| |
| // The weird NumberToString trick is because Windows does not like |
| // to deal with %zu and a size_t in printf, nor does it support %llu. |
| const auto [redacted_pair, success] = identifier_space->insert_or_assign( |
| matched_id_as_string, |
| base::StringPrintf( |
| "(%s: %s)", |
| scrubbed_match.empty() ? pattern.alias : scrubbed_match.c_str(), |
| base::NumberToString(identifier_space->size() + 1).c_str())); |
| if (detected != nullptr) { |
| (*detected)[pattern.pii_type].insert(matched_id_as_string); |
| } |
| |
| result += redacted_pair->second; |
| metrics_recorder_->RecordPIIRedactedHistogram(pattern.pii_type); |
| } |
| result.append(text); |
| |
| return result; |
| } |
| |
| RedactionToolContainer::RedactionToolContainer( |
| scoped_refptr<base::SequencedTaskRunner> task_runner, |
| const char* const* first_party_extension_ids) |
| : redactor_(new RedactionTool(first_party_extension_ids)), |
| task_runner_(task_runner) {} |
| |
| RedactionToolContainer::RedactionToolContainer( |
| scoped_refptr<base::SequencedTaskRunner> task_runner, |
| const char* const* first_party_extension_ids, |
| std::unique_ptr<RedactionToolMetricsRecorder> metrics_recorder) |
| : redactor_(new RedactionTool(first_party_extension_ids, |
| std::move(metrics_recorder))), |
| task_runner_(task_runner) {} |
| |
| RedactionToolContainer::~RedactionToolContainer() { |
| task_runner_->DeleteSoon(FROM_HERE, std::move(redactor_)); |
| } |
| |
| RedactionTool* RedactionToolContainer::Get() { |
| DCHECK(task_runner_->RunsTasksInCurrentSequence()); |
| return redactor_.get(); |
| } |
| |
| } // namespace redaction |