| /* |
| * Copyright 2024 WebAssembly Community Group participants |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include <ostream> |
| |
| #include "support/string.h" |
| |
| namespace wasm::String { |
| |
| Split::Split(const std::string& input, const NewLineOr& newLineOrDelim) { |
| auto first = input.find("\n", 0); |
| if (first != std::string::npos && first != input.length() - 1) { |
| split(input, "\n"); |
| } else { |
| split(input, newLineOrDelim.delim); |
| } |
| } |
| |
| void Split::split(const std::string& input, const std::string& delim) { |
| size_t lastEnd = 0; |
| while (lastEnd < input.size()) { |
| auto nextDelim = input.find(delim, lastEnd); |
| if (nextDelim == std::string::npos) { |
| nextDelim = input.size(); |
| } |
| (*this).push_back(input.substr(lastEnd, nextDelim - lastEnd)); |
| lastEnd = nextDelim + delim.size(); |
| } |
| needToHandleBracketingOperations = delim != "\n"; |
| } |
| |
| Split handleBracketingOperators(Split split) { |
| if (!split.needToHandleBracketingOperations) { |
| return split; |
| } |
| |
| Split ret; |
| std::string last; |
| int nesting = 0; |
| auto handlePart = [&](std::string part) { |
| if (part.empty()) { |
| return; |
| } |
| for (const char c : part) { |
| if (c == '(' || c == '<' || c == '[' || c == '{') { |
| nesting++; |
| } else if (c == ')' || c == '>' || c == ']' || c == '}') { |
| nesting--; |
| } |
| } |
| if (last.empty()) { |
| last = part; |
| } else { |
| last += ',' + part; |
| } |
| if (nesting == 0) { |
| ret.push_back(last); |
| last.clear(); |
| } |
| }; |
| for (auto& part : split) { |
| handlePart(part); |
| } |
| handlePart(""); |
| if (nesting != 0) { |
| Fatal() << "Asyncify: failed to parse lists"; |
| } |
| return ret; |
| } |
| |
// Returns whether `value` matches `pattern` in its entirety. Every '*' in
// `pattern` matches any run of characters, including an empty one; every other
// pattern character must match the corresponding character of `value`
// exactly.
//
// The match is done iteratively: on a mismatch we go back to the most recent
// '*' and let it absorb one more character of `value`. Backtracking only to
// the latest '*' is sufficient because earlier stars can never need to grow
// once a later star is available to absorb the difference. This uses no
// recursion and makes no substring copies.
bool wildcardMatch(const std::string& pattern, const std::string& value) {
  size_t p = 0;
  size_t v = 0;
  // Position of the most recent '*' in the pattern, if any, and the position
  // in `value` from which that star's match currently resumes.
  size_t lastStar = std::string::npos;
  size_t resume = 0;
  while (v < value.size()) {
    if (p < pattern.size() && pattern[p] == '*') {
      // Start by letting the star match nothing.
      lastStar = p++;
      resume = v;
    } else if (p < pattern.size() && pattern[p] == value[v]) {
      ++p;
      ++v;
    } else if (lastStar != std::string::npos) {
      // Mismatch: extend the last star by one more character and retry.
      p = lastStar + 1;
      v = ++resume;
    } else {
      return false;
    }
  }
  // `value` is exhausted; the rest of the pattern may only be stars.
  while (p < pattern.size() && pattern[p] == '*') {
    ++p;
  }
  return p == pattern.size();
}
| |
// Returns a copy of `input` with trailing whitespace and trailing NUL
// characters removed. Leading characters are left untouched.
std::string trim(const std::string& input) {
  size_t size = input.size();
  while (size > 0) {
    // isspace() requires a value representable as unsigned char (or EOF);
    // passing a negative plain char (any byte >= 0x80 where char is signed) is
    // undefined behavior, so convert first.
    const auto c = static_cast<unsigned char>(input[size - 1]);
    if (!isspace(c) && c != '\0') {
      break;
    }
    size--;
  }
  return input.substr(0, size);
}
| |
// Writes `str` to `os` surrounded by double quotes and returns `os`.
//
// Tab, newline, carriage return, double quote, single quote and backslash are
// written as the two-character escapes \t \n \r \" \' \\. Other bytes in the
// range [32, 127) are written as-is. Every remaining byte is written as a
// backslash followed by exactly two lowercase hex digits.
//
// The hex digits come from a lookup table rather than from the stream's
// integer formatting, so the output does not depend on the stream's
// formatting flags (base, showbase, uppercase) and those flags are left as
// the caller set them.
std::ostream& printEscaped(std::ostream& os, const std::string_view str) {
  static constexpr char hexDigits[] = "0123456789abcdef";
  os << '"';
  for (const unsigned char c : str) {
    switch (c) {
      case '\t':
        os << "\\t";
        break;
      case '\n':
        os << "\\n";
        break;
      case '\r':
        os << "\\r";
        break;
      case '"':
        os << "\\\"";
        break;
      case '\'':
        os << "\\'";
        break;
      case '\\':
        os << "\\\\";
        break;
      default: {
        if (c >= 32 && c < 127) {
          os << c;
        } else {
          os << '\\' << hexDigits[c >> 4] << hexDigits[c & 0xF];
        }
      }
    }
  }
  return os << '"';
}
| |
// Writes `str` to `os` as a double-quoted JSON string literal and returns
// `os`. The input is decoded as WTF-8, one code point at a time:
//  - `"` and `\`, and the control characters \b \f \n \r \t, are written
//    using their short JSON escapes;
//  - other code points in [32, 127) are written as-is;
//  - everything else is written as `\uXXXX` with lowercase hex digits, using
//    a `\uXXXX\uYYYY` surrogate pair for code points of 0x10000 and above.
//
// Malformed input (a bad leading or trailing byte, a sequence cut off by the
// end of the string, or a decoded value above 0x10FFFF) is replaced with
// U+FFFD and a warning is printed to std::cerr. Note that after a bad
// trailing byte the full declared length of the sequence is still skipped. A
// leading surrogate immediately followed by a trailing surrogate only
// produces a warning; both are still emitted.
//
// The `\u` hex digits come from a lookup table, so the output does not depend
// on the formatting flags of `os`, and those flags are left unchanged.
std::ostream& printEscapedJSON(std::ostream& os, const std::string_view str) {
  os << '"';
  constexpr uint32_t replacementCharacter = 0xFFFD;
  constexpr uint32_t maxCodePoint = 0x10FFFF;
  bool lastWasLeadingSurrogate = false;
  for (size_t i = 0; i < str.size();) {
    // Decode from WTF-8 into a unicode code point.
    uint8_t leading = str[i];
    size_t trailingBytes;
    uint32_t u;
    if ((leading & 0b10000000) == 0b00000000) {
      // 0xxxxxxx
      trailingBytes = 0;
      u = leading;
    } else if ((leading & 0b11100000) == 0b11000000) {
      // 110xxxxx 10xxxxxx
      trailingBytes = 1;
      u = (leading & 0b00011111) << 6;
    } else if ((leading & 0b11110000) == 0b11100000) {
      // 1110xxxx 10xxxxxx 10xxxxxx
      trailingBytes = 2;
      u = (leading & 0b00001111) << 12;
    } else if ((leading & 0b11111000) == 0b11110000) {
      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      trailingBytes = 3;
      u = (leading & 0b00000111) << 18;
    } else {
      std::cerr << "warning: Bad WTF-8 leading byte (" << std::hex
                << int(leading) << std::dec << "). Replacing.\n";
      trailingBytes = 0;
      u = replacementCharacter;
    }

    ++i;

    if (i + trailingBytes > str.size()) {
      std::cerr << "warning: Unexpected end of string. Replacing.\n";
      u = replacementCharacter;
    } else {
      for (size_t j = 0; j < trailingBytes; ++j) {
        uint8_t trailing = str[i + j];
        if ((trailing & 0b11000000) != 0b10000000) {
          std::cerr << "warning: Bad WTF-8 trailing byte (" << std::hex
                    << int(trailing) << std::dec << "). Replacing.\n";
          u = replacementCharacter;
          break;
        }
        // Shift 6 bits for every remaining trailing byte after this one.
        u |= (trailing & 0b00111111) << (6 * (trailingBytes - j - 1));
      }
    }

    i += trailingBytes;

    // A four-byte sequence can encode values past the end of the Unicode
    // range. Such a value cannot be written as a surrogate pair, so handle it
    // like any other malformed input instead of asserting on input data.
    if (u > maxCodePoint) {
      std::cerr << "warning: Code point out of range (" << std::hex << u
                << std::dec << "). Replacing.\n";
      u = replacementCharacter;
    }

    bool isLeadingSurrogate = 0xD800 <= u && u <= 0xDBFF;
    bool isTrailingSurrogate = 0xDC00 <= u && u <= 0xDFFF;
    if (lastWasLeadingSurrogate && isTrailingSurrogate) {
      std::cerr << "warning: Invalid surrogate sequence in WTF-8.\n";
    }
    lastWasLeadingSurrogate = isLeadingSurrogate;

    // Encode unicode code point into JSON.
    switch (u) {
      case '"':
        os << "\\\"";
        continue;
      case '\\':
        os << "\\\\";
        continue;
      case '\b':
        os << "\\b";
        continue;
      case '\f':
        os << "\\f";
        continue;
      case '\n':
        os << "\\n";
        continue;
      case '\r':
        os << "\\r";
        continue;
      case '\t':
        os << "\\t";
        continue;
      default:
        break;
    }

    // TODO: To minimize size, consider additionally escaping only other control
    // characters (u <= 0x1F) and surrogates, emitting everything else directly
    // assuming a UTF-8 encoding of the JSON text. We don't do this now because
    // Print.cpp would consider the contents unprintable, messing up our test.
    bool isNaivelyPrintable = 32 <= u && u < 127;
    if (isNaivelyPrintable) {
      assert(u < 0x80 && "need additional logic to emit valid UTF-8");
      os << uint8_t(u);
      continue;
    }

    // Escape as '\uXXXX` for code points less than 0x10000 or as a
    // '\uXXXX\uYYYY' surrogate pair otherwise.
    auto printEscape = [&os](uint32_t codePoint) {
      assert(codePoint < 0x10000);
      constexpr char hexDigits[] = "0123456789abcdef";
      os << "\\u";
      os << hexDigits[(codePoint >> 12) & 0xF];
      os << hexDigits[(codePoint >> 8) & 0xF];
      os << hexDigits[(codePoint >> 4) & 0xF];
      os << hexDigits[codePoint & 0xF];
    };
    if (u < 0x10000) {
      printEscape(u);
    } else {
      assert(u <= maxCodePoint && "unexpectedly high code point");
      printEscape(0xD800 + ((u - 0x10000) >> 10));
      printEscape(0xDC00 + ((u - 0x10000) & 0x3FF));
    }
  }
  return os << '"';
}
| |
| } // namespace wasm::String |