|  | #include "Python.h" | 
|  | #include "errcode.h" | 
|  | #include "../Parser/tokenizer.h" | 
|  | #include "../Parser/pegen.h"      // _PyPegen_byte_offset_to_character_offset() | 
|  | #include "../Parser/pegen.h"      // _PyPegen_byte_offset_to_character_offset() | 
|  |  | 
|  | static struct PyModuleDef _tokenizemodule; | 
|  |  | 
|  | typedef struct { | 
|  | PyTypeObject *TokenizerIter; | 
|  | } tokenize_state; | 
|  |  | 
|  | static tokenize_state * | 
|  | get_tokenize_state(PyObject *module) { | 
|  | return (tokenize_state *)PyModule_GetState(module); | 
|  | } | 
|  |  | 
/* Module state lookup starting from a type object: resolve the module that
   was created from _tokenizemodule for `type`, then fetch its state. */
#define _tokenize_get_state_by_type(type) \
    get_tokenize_state(PyType_GetModuleByDef((type), &_tokenizemodule))
|  |  | 
|  | #include "pycore_runtime.h" | 
|  | #include "clinic/Python-tokenize.c.h" | 
|  |  | 
|  | /*[clinic input] | 
|  | module _tokenizer | 
|  | class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter" | 
|  | [clinic start generated code]*/ | 
|  | /*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/ | 
|  |  | 
|  | typedef struct | 
|  | { | 
|  | PyObject_HEAD struct tok_state *tok; | 
|  | int done; | 
|  | } tokenizeriterobject; | 
|  |  | 
|  | /*[clinic input] | 
|  | @classmethod | 
|  | _tokenizer.tokenizeriter.__new__ as tokenizeriter_new | 
|  |  | 
|  | readline: object | 
|  | / | 
|  | * | 
|  | extra_tokens: bool | 
|  | encoding: str(c_default="NULL") = 'utf-8' | 
|  | [clinic start generated code]*/ | 
|  |  | 
|  | static PyObject * | 
|  | tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline, | 
|  | int extra_tokens, const char *encoding) | 
|  | /*[clinic end generated code: output=7501a1211683ce16 input=f7dddf8a613ae8bd]*/ | 
|  | { | 
|  | tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0); | 
|  | if (self == NULL) { | 
|  | return NULL; | 
|  | } | 
|  | PyObject *filename = PyUnicode_FromString("<string>"); | 
|  | if (filename == NULL) { | 
|  | return NULL; | 
|  | } | 
|  | self->tok = _PyTokenizer_FromReadline(readline, encoding, 1, 1); | 
|  | if (self->tok == NULL) { | 
|  | Py_DECREF(filename); | 
|  | return NULL; | 
|  | } | 
|  | self->tok->filename = filename; | 
|  | if (extra_tokens) { | 
|  | self->tok->tok_extra_tokens = 1; | 
|  | } | 
|  | self->done = 0; | 
|  | return (PyObject *)self; | 
|  | } | 
|  |  | 
|  | static int | 
|  | _tokenizer_error(struct tok_state *tok) | 
|  | { | 
|  | if (PyErr_Occurred()) { | 
|  | return -1; | 
|  | } | 
|  |  | 
|  | const char *msg = NULL; | 
|  | PyObject* errtype = PyExc_SyntaxError; | 
|  | switch (tok->done) { | 
|  | case E_TOKEN: | 
|  | msg = "invalid token"; | 
|  | break; | 
|  | case E_EOF: | 
|  | PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement"); | 
|  | PyErr_SyntaxLocationObject(tok->filename, tok->lineno, | 
|  | tok->inp - tok->buf < 0 ? 0 : (int)(tok->inp - tok->buf)); | 
|  | return -1; | 
|  | case E_DEDENT: | 
|  | msg = "unindent does not match any outer indentation level"; | 
|  | errtype = PyExc_IndentationError; | 
|  | break; | 
|  | case E_INTR: | 
|  | if (!PyErr_Occurred()) { | 
|  | PyErr_SetNone(PyExc_KeyboardInterrupt); | 
|  | } | 
|  | return -1; | 
|  | case E_NOMEM: | 
|  | PyErr_NoMemory(); | 
|  | return -1; | 
|  | case E_TABSPACE: | 
|  | errtype = PyExc_TabError; | 
|  | msg = "inconsistent use of tabs and spaces in indentation"; | 
|  | break; | 
|  | case E_TOODEEP: | 
|  | errtype = PyExc_IndentationError; | 
|  | msg = "too many levels of indentation"; | 
|  | break; | 
|  | case E_LINECONT: { | 
|  | msg = "unexpected character after line continuation character"; | 
|  | break; | 
|  | } | 
|  | default: | 
|  | msg = "unknown tokenization error"; | 
|  | } | 
|  |  | 
|  | PyObject* errstr = NULL; | 
|  | PyObject* error_line = NULL; | 
|  | PyObject* tmp = NULL; | 
|  | PyObject* value = NULL; | 
|  | int result = 0; | 
|  |  | 
|  | Py_ssize_t size = tok->inp - tok->buf; | 
|  | assert(tok->buf[size-1] == '\n'); | 
|  | size -= 1; // Remove the newline character from the end of the line | 
|  | error_line = PyUnicode_DecodeUTF8(tok->buf, size, "replace"); | 
|  | if (!error_line) { | 
|  | result = -1; | 
|  | goto exit; | 
|  | } | 
|  |  | 
|  | Py_ssize_t offset = _PyPegen_byte_offset_to_character_offset(error_line, tok->inp - tok->buf); | 
|  | if (offset == -1) { | 
|  | result = -1; | 
|  | goto exit; | 
|  | } | 
|  | tmp = Py_BuildValue("(OnnOOO)", tok->filename, tok->lineno, offset, error_line, Py_None, Py_None); | 
|  | if (!tmp) { | 
|  | result = -1; | 
|  | goto exit; | 
|  | } | 
|  |  | 
|  | errstr = PyUnicode_FromString(msg); | 
|  | if (!errstr) { | 
|  | result = -1; | 
|  | goto exit; | 
|  | } | 
|  |  | 
|  | value = PyTuple_Pack(2, errstr, tmp); | 
|  | if (!value) { | 
|  | result = -1; | 
|  | goto exit; | 
|  | } | 
|  |  | 
|  | PyErr_SetObject(errtype, value); | 
|  |  | 
|  | exit: | 
|  | Py_XDECREF(errstr); | 
|  | Py_XDECREF(error_line); | 
|  | Py_XDECREF(tmp); | 
|  | Py_XDECREF(value); | 
|  | return result; | 
|  | } | 
|  |  | 
|  | static PyObject * | 
|  | tokenizeriter_next(tokenizeriterobject *it) | 
|  | { | 
|  | PyObject* result = NULL; | 
|  | struct token token; | 
|  | _PyToken_Init(&token); | 
|  |  | 
|  | int type = _PyTokenizer_Get(it->tok, &token); | 
|  | if (type == ERRORTOKEN) { | 
|  | if(!PyErr_Occurred()) { | 
|  | _tokenizer_error(it->tok); | 
|  | assert(PyErr_Occurred()); | 
|  | } | 
|  | goto exit; | 
|  | } | 
|  | if (it->done || type == ERRORTOKEN) { | 
|  | PyErr_SetString(PyExc_StopIteration, "EOF"); | 
|  | it->done = 1; | 
|  | goto exit; | 
|  | } | 
|  | PyObject *str = NULL; | 
|  | if (token.start == NULL || token.end == NULL) { | 
|  | str = PyUnicode_FromString(""); | 
|  | } | 
|  | else { | 
|  | str = PyUnicode_FromStringAndSize(token.start, token.end - token.start); | 
|  | } | 
|  | if (str == NULL) { | 
|  | goto exit; | 
|  | } | 
|  |  | 
|  | int is_trailing_token = 0; | 
|  | if (type == ENDMARKER || (type == DEDENT && it->tok->done == E_EOF)) { | 
|  | is_trailing_token = 1; | 
|  | } | 
|  |  | 
|  | const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start; | 
|  | PyObject* line = NULL; | 
|  | if (it->tok->tok_extra_tokens && is_trailing_token) { | 
|  | line = PyUnicode_FromString(""); | 
|  | } else { | 
|  | Py_ssize_t size = it->tok->inp - line_start; | 
|  | if (size >= 1 && it->tok->implicit_newline) { | 
|  | size -= 1; | 
|  | } | 
|  | line = PyUnicode_DecodeUTF8(line_start, size, "replace"); | 
|  | } | 
|  | if (line == NULL) { | 
|  | Py_DECREF(str); | 
|  | goto exit; | 
|  | } | 
|  |  | 
|  | Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno; | 
|  | Py_ssize_t end_lineno = it->tok->lineno; | 
|  | Py_ssize_t col_offset = -1; | 
|  | Py_ssize_t end_col_offset = -1; | 
|  | if (token.start != NULL && token.start >= line_start) { | 
|  | col_offset = _PyPegen_byte_offset_to_character_offset(line, token.start - line_start); | 
|  | } | 
|  | if (token.end != NULL && token.end >= it->tok->line_start) { | 
|  | end_col_offset = _PyPegen_byte_offset_to_character_offset(line, token.end - it->tok->line_start); | 
|  | } | 
|  |  | 
|  | if (it->tok->tok_extra_tokens) { | 
|  | if (is_trailing_token) { | 
|  | lineno = end_lineno = lineno + 1; | 
|  | col_offset = end_col_offset = 0; | 
|  | } | 
|  | // Necessary adjustments to match the original Python tokenize | 
|  | // implementation | 
|  | if (type > DEDENT && type < OP) { | 
|  | type = OP; | 
|  | } | 
|  | else if (type == NEWLINE) { | 
|  | Py_DECREF(str); | 
|  | if (!it->tok->implicit_newline) { | 
|  | if (it->tok->start[0] == '\r') { | 
|  | str = PyUnicode_FromString("\r\n"); | 
|  | } else { | 
|  | str = PyUnicode_FromString("\n"); | 
|  | } | 
|  | } | 
|  | end_col_offset++; | 
|  | } | 
|  | else if (type == NL) { | 
|  | if (it->tok->implicit_newline) { | 
|  | Py_DECREF(str); | 
|  | str = PyUnicode_FromString(""); | 
|  | } | 
|  | } | 
|  |  | 
|  | if (str == NULL) { | 
|  | Py_DECREF(line); | 
|  | goto exit; | 
|  | } | 
|  | } | 
|  |  | 
|  | result = Py_BuildValue("(iN(nn)(nn)N)", type, str, lineno, col_offset, end_lineno, end_col_offset, line); | 
|  | exit: | 
|  | _PyToken_Free(&token); | 
|  | if (type == ENDMARKER) { | 
|  | it->done = 1; | 
|  | } | 
|  | return result; | 
|  | } | 
|  |  | 
|  | static void | 
|  | tokenizeriter_dealloc(tokenizeriterobject *it) | 
|  | { | 
|  | PyTypeObject *tp = Py_TYPE(it); | 
|  | _PyTokenizer_Free(it->tok); | 
|  | tp->tp_free(it); | 
|  | Py_DECREF(tp); | 
|  | } | 
|  |  | 
|  | static PyType_Slot tokenizeriter_slots[] = { | 
|  | {Py_tp_new, tokenizeriter_new}, | 
|  | {Py_tp_dealloc, tokenizeriter_dealloc}, | 
|  | {Py_tp_getattro, PyObject_GenericGetAttr}, | 
|  | {Py_tp_iter, PyObject_SelfIter}, | 
|  | {Py_tp_iternext, tokenizeriter_next}, | 
|  | {0, NULL}, | 
|  | }; | 
|  |  | 
|  | static PyType_Spec tokenizeriter_spec = { | 
|  | .name = "_tokenize.TokenizerIter", | 
|  | .basicsize = sizeof(tokenizeriterobject), | 
|  | .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE), | 
|  | .slots = tokenizeriter_slots, | 
|  | }; | 
|  |  | 
|  | static int | 
|  | tokenizemodule_exec(PyObject *m) | 
|  | { | 
|  | tokenize_state *state = get_tokenize_state(m); | 
|  | if (state == NULL) { | 
|  | return -1; | 
|  | } | 
|  |  | 
|  | state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(m, &tokenizeriter_spec, NULL); | 
|  | if (state->TokenizerIter == NULL) { | 
|  | return -1; | 
|  | } | 
|  | if (PyModule_AddType(m, state->TokenizerIter) < 0) { | 
|  | return -1; | 
|  | } | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static PyMethodDef tokenize_methods[] = { | 
|  | {NULL, NULL, 0, NULL} /* Sentinel */ | 
|  | }; | 
|  |  | 
|  | static PyModuleDef_Slot tokenizemodule_slots[] = { | 
|  | {Py_mod_exec, tokenizemodule_exec}, | 
|  | {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED}, | 
|  | {0, NULL} | 
|  | }; | 
|  |  | 
|  | static int | 
|  | tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg) | 
|  | { | 
|  | tokenize_state *state = get_tokenize_state(m); | 
|  | Py_VISIT(state->TokenizerIter); | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static int | 
|  | tokenizemodule_clear(PyObject *m) | 
|  | { | 
|  | tokenize_state *state = get_tokenize_state(m); | 
|  | Py_CLEAR(state->TokenizerIter); | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static void | 
|  | tokenizemodule_free(void *m) | 
|  | { | 
|  | tokenizemodule_clear((PyObject *)m); | 
|  | } | 
|  |  | 
|  | static struct PyModuleDef _tokenizemodule = { | 
|  | PyModuleDef_HEAD_INIT, | 
|  | .m_name = "_tokenize", | 
|  | .m_size = sizeof(tokenize_state), | 
|  | .m_slots = tokenizemodule_slots, | 
|  | .m_methods = tokenize_methods, | 
|  | .m_traverse = tokenizemodule_traverse, | 
|  | .m_clear = tokenizemodule_clear, | 
|  | .m_free = tokenizemodule_free, | 
|  | }; | 
|  |  | 
|  | PyMODINIT_FUNC | 
|  | PyInit__tokenize(void) | 
|  | { | 
|  | return PyModuleDef_Init(&_tokenizemodule); | 
|  | } |