| #include <Python.h> |
| #include <errcode.h> |
| |
| #include "pycore_pyerrors.h" // _PyErr_ProgramDecodedTextObject() |
| #include "lexer/state.h" |
| #include "lexer/lexer.h" |
| #include "pegen.h" |
| |
| // TOKENIZER ERRORS |
| |
| void |
| _PyPegen_raise_tokenizer_init_error(PyObject *filename) |
| { |
| if (!(PyErr_ExceptionMatches(PyExc_LookupError) |
| || PyErr_ExceptionMatches(PyExc_SyntaxError) |
| || PyErr_ExceptionMatches(PyExc_ValueError) |
| || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) { |
| return; |
| } |
| PyObject *errstr = NULL; |
| PyObject *tuple = NULL; |
| PyObject *type; |
| PyObject *value; |
| PyObject *tback; |
| PyErr_Fetch(&type, &value, &tback); |
| errstr = PyObject_Str(value); |
| if (!errstr) { |
| goto error; |
| } |
| |
| PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None); |
| if (!tmp) { |
| goto error; |
| } |
| |
| tuple = PyTuple_Pack(2, errstr, tmp); |
| Py_DECREF(tmp); |
| if (!value) { |
| goto error; |
| } |
| PyErr_SetObject(PyExc_SyntaxError, tuple); |
| |
| error: |
| Py_XDECREF(type); |
| Py_XDECREF(value); |
| Py_XDECREF(tback); |
| Py_XDECREF(errstr); |
| Py_XDECREF(tuple); |
| } |
| |
| static inline void |
| raise_unclosed_parentheses_error(Parser *p) { |
| int error_lineno = p->tok->parenlinenostack[p->tok->level-1]; |
| int error_col = p->tok->parencolstack[p->tok->level-1]; |
| RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, |
| error_lineno, error_col, error_lineno, -1, |
| "'%c' was never closed", |
| p->tok->parenstack[p->tok->level-1]); |
| } |
| |
| int |
| _Pypegen_tokenizer_error(Parser *p) |
| { |
| if (PyErr_Occurred()) { |
| return -1; |
| } |
| |
| const char *msg = NULL; |
| PyObject* errtype = PyExc_SyntaxError; |
| Py_ssize_t col_offset = -1; |
| p->error_indicator = 1; |
| switch (p->tok->done) { |
| case E_TOKEN: |
| msg = "invalid token"; |
| break; |
| case E_EOF: |
| if (p->tok->level) { |
| raise_unclosed_parentheses_error(p); |
| } else { |
| RAISE_SYNTAX_ERROR("unexpected EOF while parsing"); |
| } |
| return -1; |
| case E_DEDENT: |
| RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level"); |
| return -1; |
| case E_INTR: |
| if (!PyErr_Occurred()) { |
| PyErr_SetNone(PyExc_KeyboardInterrupt); |
| } |
| return -1; |
| case E_NOMEM: |
| PyErr_NoMemory(); |
| return -1; |
| case E_TABSPACE: |
| errtype = PyExc_TabError; |
| msg = "inconsistent use of tabs and spaces in indentation"; |
| break; |
| case E_TOODEEP: |
| errtype = PyExc_IndentationError; |
| msg = "too many levels of indentation"; |
| break; |
| case E_LINECONT: { |
| col_offset = p->tok->cur - p->tok->buf - 1; |
| msg = "unexpected character after line continuation character"; |
| break; |
| } |
| case E_COLUMNOVERFLOW: |
| PyErr_SetString(PyExc_OverflowError, |
| "Parser column offset overflow - source line is too big"); |
| return -1; |
| default: |
| msg = "unknown parsing error"; |
| } |
| |
| RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno, |
| col_offset >= 0 ? col_offset : 0, |
| p->tok->lineno, -1, msg); |
| return -1; |
| } |
| |
| int |
| _Pypegen_raise_decode_error(Parser *p) |
| { |
| assert(PyErr_Occurred()); |
| const char *errtype = NULL; |
| if (PyErr_ExceptionMatches(PyExc_UnicodeError)) { |
| errtype = "unicode error"; |
| } |
| else if (PyErr_ExceptionMatches(PyExc_ValueError)) { |
| errtype = "value error"; |
| } |
| if (errtype) { |
| PyObject *type; |
| PyObject *value; |
| PyObject *tback; |
| PyObject *errstr; |
| PyErr_Fetch(&type, &value, &tback); |
| errstr = PyObject_Str(value); |
| if (errstr) { |
| RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr); |
| Py_DECREF(errstr); |
| } |
| else { |
| PyErr_Clear(); |
| RAISE_SYNTAX_ERROR("(%s) unknown error", errtype); |
| } |
| Py_XDECREF(type); |
| Py_XDECREF(value); |
| Py_XDECREF(tback); |
| } |
| |
| return -1; |
| } |
| |
| static int |
| _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) { |
| // Tokenize the whole input to see if there are any tokenization |
| // errors such as mismatching parentheses. These will get priority |
| // over generic syntax errors only if the line number of the error is |
| // before the one that we had for the generic error. |
| |
| // We don't want to tokenize to the end for interactive input |
| if (p->tok->prompt != NULL) { |
| return 0; |
| } |
| |
| PyObject *type, *value, *traceback; |
| PyErr_Fetch(&type, &value, &traceback); |
| |
| Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1]; |
| Py_ssize_t current_err_line = current_token->lineno; |
| |
| int ret = 0; |
| struct token new_token; |
| _PyToken_Init(&new_token); |
| |
| for (;;) { |
| switch (_PyTokenizer_Get(p->tok, &new_token)) { |
| case ERRORTOKEN: |
| if (PyErr_Occurred()) { |
| ret = -1; |
| goto exit; |
| } |
| if (p->tok->level != 0) { |
| int error_lineno = p->tok->parenlinenostack[p->tok->level-1]; |
| if (current_err_line > error_lineno) { |
| raise_unclosed_parentheses_error(p); |
| ret = -1; |
| goto exit; |
| } |
| } |
| break; |
| case ENDMARKER: |
| break; |
| default: |
| continue; |
| } |
| break; |
| } |
| |
| |
| exit: |
| _PyToken_Free(&new_token); |
| // If we're in an f-string, we want the syntax error in the expression part |
| // to propagate, so that tokenizer errors (like expecting '}') that happen afterwards |
| // do not swallow it. |
| if (PyErr_Occurred() && p->tok->tok_mode_stack_index <= 0) { |
| Py_XDECREF(value); |
| Py_XDECREF(type); |
| Py_XDECREF(traceback); |
| } else { |
| PyErr_Restore(type, value, traceback); |
| } |
| return ret; |
| } |
| |
| // PARSER ERRORS |
| |
| void * |
| _PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *errmsg, ...) |
| { |
| // Bail out if we already have an error set. |
| if (p->error_indicator && PyErr_Occurred()) { |
| return NULL; |
| } |
| if (p->fill == 0) { |
| va_list va; |
| va_start(va, errmsg); |
| _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va); |
| va_end(va); |
| return NULL; |
| } |
| if (use_mark && p->mark == p->fill && _PyPegen_fill_token(p) < 0) { |
| p->error_indicator = 1; |
| return NULL; |
| } |
| Token *t = p->known_err_token != NULL |
| ? p->known_err_token |
| : p->tokens[use_mark ? p->mark : p->fill - 1]; |
| Py_ssize_t col_offset; |
| Py_ssize_t end_col_offset = -1; |
| if (t->col_offset == -1) { |
| if (p->tok->cur == p->tok->buf) { |
| col_offset = 0; |
| } else { |
| const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf; |
| col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int); |
| } |
| } else { |
| col_offset = t->col_offset + 1; |
| } |
| |
| if (t->end_col_offset != -1) { |
| end_col_offset = t->end_col_offset + 1; |
| } |
| |
| va_list va; |
| va_start(va, errmsg); |
| _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va); |
| va_end(va); |
| |
| return NULL; |
| } |
| |
| static PyObject * |
| get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno) |
| { |
| /* If the file descriptor is interactive, the source lines of the current |
| * (multi-line) statement are stored in p->tok->interactive_src_start. |
| * If not, we're parsing from a string, which means that the whole source |
| * is stored in p->tok->str. */ |
| assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp != NULL); |
| |
| char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str; |
| if (cur_line == NULL) { |
| assert(p->tok->fp_interactive); |
| // We can reach this point if the tokenizer buffers for interactive source have not been |
| // initialized because we failed to decode the original source with the given locale. |
| return Py_GetConstant(Py_CONSTANT_EMPTY_STR); |
| } |
| |
| Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno; |
| const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp; |
| |
| if (buf_end < cur_line) { |
| buf_end = cur_line + strlen(cur_line); |
| } |
| |
| for (int i = 0; i < relative_lineno - 1; i++) { |
| char *new_line = strchr(cur_line, '\n'); |
| // The assert is here for debug builds but the conditional that |
| // follows is there so in release builds we do not crash at the cost |
| // to report a potentially wrong line. |
| assert(new_line != NULL && new_line + 1 < buf_end); |
| if (new_line == NULL || new_line + 1 > buf_end) { |
| break; |
| } |
| cur_line = new_line + 1; |
| } |
| |
| char *next_newline; |
| if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line |
| next_newline = cur_line + strlen(cur_line); |
| } |
| return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace"); |
| } |
| |
| void * |
| _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype, |
| Py_ssize_t lineno, Py_ssize_t col_offset, |
| Py_ssize_t end_lineno, Py_ssize_t end_col_offset, |
| const char *errmsg, va_list va) |
| { |
| // Bail out if we already have an error set. |
| if (p->error_indicator && PyErr_Occurred()) { |
| return NULL; |
| } |
| PyObject *value = NULL; |
| PyObject *errstr = NULL; |
| PyObject *error_line = NULL; |
| PyObject *tmp = NULL; |
| p->error_indicator = 1; |
| |
| if (end_lineno == CURRENT_POS) { |
| end_lineno = p->tok->lineno; |
| } |
| if (end_col_offset == CURRENT_POS) { |
| end_col_offset = p->tok->cur - p->tok->line_start; |
| } |
| |
| errstr = PyUnicode_FromFormatV(errmsg, va); |
| if (!errstr) { |
| goto error; |
| } |
| |
| if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) { |
| error_line = get_error_line_from_tokenizer_buffers(p, lineno); |
| } |
| else if (p->start_rule == Py_file_input) { |
| error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename, |
| (int) lineno, p->tok->encoding); |
| } |
| |
| if (!error_line) { |
| /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called, |
| then we need to find the error line from some other source, because |
| p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly |
| failed or we're parsing from a string or the REPL. There's a third edge case where |
| we're actually parsing from a file, which has an E_EOF SyntaxError and in that case |
| `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which |
| does not physically exist */ |
| assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF); |
| |
| if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) { |
| Py_ssize_t size = p->tok->inp - p->tok->buf; |
| error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace"); |
| } |
| else if (p->tok->fp == NULL || p->tok->fp == stdin) { |
| error_line = get_error_line_from_tokenizer_buffers(p, lineno); |
| } |
| else { |
| error_line = Py_GetConstant(Py_CONSTANT_EMPTY_STR); |
| } |
| if (!error_line) { |
| goto error; |
| } |
| } |
| |
| Py_ssize_t col_number = col_offset; |
| Py_ssize_t end_col_number = end_col_offset; |
| |
| col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset); |
| if (col_number < 0) { |
| goto error; |
| } |
| |
| if (end_col_offset > 0) { |
| end_col_number = _PyPegen_byte_offset_to_character_offset(error_line, end_col_offset); |
| if (end_col_number < 0) { |
| goto error; |
| } |
| } |
| |
| tmp = Py_BuildValue("(OnnNnn)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number); |
| if (!tmp) { |
| goto error; |
| } |
| value = PyTuple_Pack(2, errstr, tmp); |
| Py_DECREF(tmp); |
| if (!value) { |
| goto error; |
| } |
| PyErr_SetObject(errtype, value); |
| |
| Py_DECREF(errstr); |
| Py_DECREF(value); |
| return NULL; |
| |
| error: |
| Py_XDECREF(errstr); |
| Py_XDECREF(error_line); |
| return NULL; |
| } |
| |
| void |
| _Pypegen_set_syntax_error(Parser* p, Token* last_token) { |
| // Existing syntax error |
| if (PyErr_Occurred()) { |
| // Prioritize tokenizer errors to custom syntax errors raised |
| // on the second phase only if the errors come from the parser. |
| int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK); |
| if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) { |
| _PyPegen_tokenize_full_source_to_check_for_errors(p); |
| } |
| // Propagate the existing syntax error. |
| return; |
| } |
| // Initialization error |
| if (p->fill == 0) { |
| RAISE_SYNTAX_ERROR("error at start before reading any input"); |
| } |
| // Parser encountered EOF (End of File) unexpectedtly |
| if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) { |
| if (p->tok->level) { |
| raise_unclosed_parentheses_error(p); |
| } else { |
| RAISE_SYNTAX_ERROR("unexpected EOF while parsing"); |
| } |
| return; |
| } |
| // Indentation error in the tokenizer |
| if (last_token->type == INDENT || last_token->type == DEDENT) { |
| RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent"); |
| return; |
| } |
| // Unknown error (generic case) |
| |
| // Use the last token we found on the first pass to avoid reporting |
| // incorrect locations for generic syntax errors just because we reached |
| // further away when trying to find specific syntax errors in the second |
| // pass. |
| RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax"); |
| // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing |
| // generic SyntaxError we just raised if errors are found. |
| _PyPegen_tokenize_full_source_to_check_for_errors(p); |
| } |
| |
| void |
| _Pypegen_stack_overflow(Parser *p) |
| { |
| p->error_indicator = 1; |
| PyErr_SetString(PyExc_MemoryError, |
| "Parser stack overflowed - Python source too complex to parse"); |
| } |