| #include <stdbool.h> |
| |
| #include <Python.h> |
| #include "pycore_bytesobject.h" // _PyBytes_DecodeEscape() |
| #include "pycore_unicodeobject.h" // _PyUnicode_DecodeUnicodeEscapeInternal() |
| |
| #include "lexer/state.h" |
| #include "pegen.h" |
| #include "string_parser.h" |
| |
| //// STRING HANDLING FUNCTIONS //// |
| |
| static int |
| warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t) |
| { |
| if (p->call_invalid_rules) { |
| // Do not report warnings if we are in the second pass of the parser |
| // to avoid showing the warning twice. |
| return 0; |
| } |
| unsigned char c = (unsigned char)*first_invalid_escape; |
| if ((t->type == FSTRING_MIDDLE || t->type == FSTRING_END) && (c == '{' || c == '}')) { |
| // in this case the tokenizer has already emitted a warning, |
| // see Parser/tokenizer/helpers.c:warn_invalid_escape_sequence |
| return 0; |
| } |
| |
| int octal = ('4' <= c && c <= '7'); |
| PyObject *msg = |
| octal |
| ? PyUnicode_FromFormat("invalid octal escape sequence '\\%.3s'", |
| first_invalid_escape) |
| : PyUnicode_FromFormat("invalid escape sequence '\\%c'", c); |
| if (msg == NULL) { |
| return -1; |
| } |
| PyObject *category; |
| if (p->feature_version >= 12) { |
| category = PyExc_SyntaxWarning; |
| } |
| else { |
| category = PyExc_DeprecationWarning; |
| } |
| if (PyErr_WarnExplicitObject(category, msg, p->tok->filename, |
| t->lineno, NULL, NULL) < 0) { |
| if (PyErr_ExceptionMatches(category)) { |
| /* Replace the Syntax/DeprecationWarning exception with a SyntaxError |
| to get a more accurate error report */ |
| PyErr_Clear(); |
| |
| /* This is needed, in order for the SyntaxError to point to the token t, |
| since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the |
| error location, if p->known_err_token is not set. */ |
| p->known_err_token = t; |
| if (octal) { |
| RAISE_SYNTAX_ERROR("invalid octal escape sequence '\\%.3s'", |
| first_invalid_escape); |
| } |
| else { |
| RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", c); |
| } |
| } |
| Py_DECREF(msg); |
| return -1; |
| } |
| Py_DECREF(msg); |
| return 0; |
| } |
| |
| static PyObject * |
| decode_utf8(const char **sPtr, const char *end) |
| { |
| const char *s; |
| const char *t; |
| t = s = *sPtr; |
| while (s < end && (*s & 0x80)) { |
| s++; |
| } |
| *sPtr = s; |
| return PyUnicode_DecodeUTF8(t, s - t, NULL); |
| } |
| |
| static PyObject * |
| decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t) |
| { |
| PyObject *v; |
| PyObject *u; |
| char *buf; |
| char *p; |
| const char *end; |
| |
| /* check for integer overflow */ |
| if (len > (size_t)PY_SSIZE_T_MAX / 6) { |
| return NULL; |
| } |
| /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5 |
| "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */ |
| u = PyBytes_FromStringAndSize((char *)NULL, (Py_ssize_t)len * 6); |
| if (u == NULL) { |
| return NULL; |
| } |
| p = buf = PyBytes_AsString(u); |
| if (p == NULL) { |
| return NULL; |
| } |
| end = s + len; |
| while (s < end) { |
| if (*s == '\\') { |
| *p++ = *s++; |
| if (s >= end || *s & 0x80) { |
| strcpy(p, "u005c"); |
| p += 5; |
| if (s >= end) { |
| break; |
| } |
| } |
| } |
| if (*s & 0x80) { |
| PyObject *w; |
| int kind; |
| const void *data; |
| Py_ssize_t w_len; |
| Py_ssize_t i; |
| w = decode_utf8(&s, end); |
| if (w == NULL) { |
| Py_DECREF(u); |
| return NULL; |
| } |
| kind = PyUnicode_KIND(w); |
| data = PyUnicode_DATA(w); |
| w_len = PyUnicode_GET_LENGTH(w); |
| for (i = 0; i < w_len; i++) { |
| Py_UCS4 chr = PyUnicode_READ(kind, data, i); |
| sprintf(p, "\\U%08x", chr); |
| p += 10; |
| } |
| /* Should be impossible to overflow */ |
| assert(p - buf <= PyBytes_GET_SIZE(u)); |
| Py_DECREF(w); |
| } |
| else { |
| *p++ = *s++; |
| } |
| } |
| len = (size_t)(p - buf); |
| s = buf; |
| |
| const char *first_invalid_escape; |
| v = _PyUnicode_DecodeUnicodeEscapeInternal(s, (Py_ssize_t)len, NULL, NULL, &first_invalid_escape); |
| |
| // HACK: later we can simply pass the line no, since we don't preserve the tokens |
| // when we are decoding the string but we preserve the line numbers. |
| if (v != NULL && first_invalid_escape != NULL && t != NULL) { |
| if (warn_invalid_escape_sequence(parser, first_invalid_escape, t) < 0) { |
| /* We have not decref u before because first_invalid_escape points |
| inside u. */ |
| Py_XDECREF(u); |
| Py_DECREF(v); |
| return NULL; |
| } |
| } |
| Py_XDECREF(u); |
| return v; |
| } |
| |
| static PyObject * |
| decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t) |
| { |
| const char *first_invalid_escape; |
| PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape); |
| if (result == NULL) { |
| return NULL; |
| } |
| |
| if (first_invalid_escape != NULL) { |
| if (warn_invalid_escape_sequence(p, first_invalid_escape, t) < 0) { |
| Py_DECREF(result); |
| return NULL; |
| } |
| } |
| return result; |
| } |
| |
| PyObject * |
| _PyPegen_decode_string(Parser *p, int raw, const char *s, size_t len, Token *t) |
| { |
| if (raw) { |
| return PyUnicode_DecodeUTF8Stateful(s, (Py_ssize_t)len, NULL, NULL); |
| } |
| return decode_unicode_with_escapes(p, s, len, t); |
| } |
| |
| /* s must include the bracketing quote characters, and r, b &/or f prefixes |
| (if any), and embedded escape sequences (if any). (f-strings are handled by the parser) |
| _PyPegen_parse_string parses it, and returns the decoded Python string object. */ |
| PyObject * |
| _PyPegen_parse_string(Parser *p, Token *t) |
| { |
| const char *s = PyBytes_AsString(t->bytes); |
| if (s == NULL) { |
| return NULL; |
| } |
| |
| size_t len; |
| int quote = Py_CHARMASK(*s); |
| int bytesmode = 0; |
| int rawmode = 0; |
| |
| if (Py_ISALPHA(quote)) { |
| while (!bytesmode || !rawmode) { |
| if (quote == 'b' || quote == 'B') { |
| quote =(unsigned char)*++s; |
| bytesmode = 1; |
| } |
| else if (quote == 'u' || quote == 'U') { |
| quote = (unsigned char)*++s; |
| } |
| else if (quote == 'r' || quote == 'R') { |
| quote = (unsigned char)*++s; |
| rawmode = 1; |
| } |
| else { |
| break; |
| } |
| } |
| } |
| |
| if (quote != '\'' && quote != '\"') { |
| PyErr_BadInternalCall(); |
| return NULL; |
| } |
| |
| /* Skip the leading quote char. */ |
| s++; |
| len = strlen(s); |
| // gh-120155: 's' contains at least the trailing quote, |
| // so the code '--len' below is safe. |
| assert(len >= 1); |
| |
| if (len > INT_MAX) { |
| PyErr_SetString(PyExc_OverflowError, "string to parse is too long"); |
| return NULL; |
| } |
| if (s[--len] != quote) { |
| /* Last quote char must match the first. */ |
| PyErr_BadInternalCall(); |
| return NULL; |
| } |
| if (len >= 4 && s[0] == quote && s[1] == quote) { |
| /* A triple quoted string. We've already skipped one quote at |
| the start and one at the end of the string. Now skip the |
| two at the start. */ |
| s += 2; |
| len -= 2; |
| /* And check that the last two match. */ |
| if (s[--len] != quote || s[--len] != quote) { |
| PyErr_BadInternalCall(); |
| return NULL; |
| } |
| } |
| |
| /* Avoid invoking escape decoding routines if possible. */ |
| rawmode = rawmode || strchr(s, '\\') == NULL; |
| if (bytesmode) { |
| /* Disallow non-ASCII characters. */ |
| const char *ch; |
| for (ch = s; *ch; ch++) { |
| if (Py_CHARMASK(*ch) >= 0x80) { |
| RAISE_SYNTAX_ERROR_KNOWN_LOCATION( |
| t, |
| "bytes can only contain ASCII " |
| "literal characters"); |
| return NULL; |
| } |
| } |
| if (rawmode) { |
| return PyBytes_FromStringAndSize(s, (Py_ssize_t)len); |
| } |
| return decode_bytes_with_escapes(p, s, (Py_ssize_t)len, t); |
| } |
| return _PyPegen_decode_string(p, rawmode, s, len, t); |
| } |