| /* |
| |
| Unicode implementation based on original code by Fredrik Lundh, |
| modified by Marc-Andre Lemburg <mal@lemburg.com>. |
| |
| Major speed upgrades to the method implementations at the Reykjavik |
| NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. |
| |
| Copyright (c) Corporation for National Research Initiatives. |
| |
| -------------------------------------------------------------------- |
| The original string type implementation is: |
| |
| Copyright (c) 1999 by Secret Labs AB |
| Copyright (c) 1999 by Fredrik Lundh |
| |
| By obtaining, using, and/or copying this software and/or its |
| associated documentation, you agree that you have read, understood, |
| and will comply with the following terms and conditions: |
| |
| Permission to use, copy, modify, and distribute this software and its |
| associated documentation for any purpose and without fee is hereby |
| granted, provided that the above copyright notice appears in all |
| copies, and that both that copyright notice and this permission notice |
| appear in supporting documentation, and that the name of Secret Labs |
| AB or the author not be used in advertising or publicity pertaining to |
| distribution of the software without specific, written prior |
| permission. |
| |
| SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO |
| THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND |
| FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR |
| ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
| WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
| ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT |
| OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
| -------------------------------------------------------------------- |
| |
| */ |
| |
| #define PY_SSIZE_T_CLEAN |
| #include "Python.h" |
| #include "pycore_abstract.h" // _PyIndex_Check() |
| #include "pycore_atomic_funcs.h" // _Py_atomic_size_get() |
| #include "pycore_bytesobject.h" // _PyBytes_Repeat() |
| #include "pycore_bytes_methods.h" // _Py_bytes_lower() |
| #include "pycore_format.h" // F_LJUST |
| #include "pycore_initconfig.h" // _PyStatus_OK() |
| #include "pycore_interp.h" // PyInterpreterState.fs_codec |
| #include "pycore_long.h" // _PyLong_FormatWriter() |
| #include "pycore_object.h" // _PyObject_GC_TRACK(), _Py_FatalRefcountError() |
| #include "pycore_pathconfig.h" // _Py_DumpPathConfig() |
| #include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding() |
| #include "pycore_pystate.h" // _PyInterpreterState_GET() |
| #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI |
| #include "pycore_unicodeobject.h" // struct _Py_unicode_state |
| #include "pycore_unicodeobject_generated.h" // _PyUnicode_InitStaticStrings() |
| #include "stringlib/eq.h" // unicode_eq() |
| #include <stddef.h> // ptrdiff_t |
| |
| #ifdef MS_WINDOWS |
| #include <windows.h> |
| #endif |
| |
| #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION |
| # include "pycore_fileutils.h" // _Py_LocaleUsesNonUnicodeWchar() |
| #endif |
| |
| /* Uncomment to display statistics on interned strings at exit |
| in _PyUnicode_ClearInterned(). */ |
| /* #define INTERNED_STATS 1 */ |
| |
| |
| /*[clinic input] |
| class str "PyObject *" "&PyUnicode_Type" |
| [clinic start generated code]*/ |
| /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/ |
| |
| /*[python input] |
| class Py_UCS4_converter(CConverter): |
| type = 'Py_UCS4' |
| converter = 'convert_uc' |
| |
| def converter_init(self): |
| if self.default is not unspecified: |
| self.c_default = ascii(self.default) |
| if len(self.c_default) > 4 or self.c_default[0] != "'": |
| self.c_default = hex(ord(self.default)) |
| |
| [python start generated code]*/ |
| /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/ |
| |
| /* --- Globals ------------------------------------------------------------ |
| |
| NOTE: In the interpreter's initialization phase, some globals are currently |
| initialized dynamically as needed. In the process Unicode objects may |
| be created before the Unicode type is ready. |
| |
| */ |
| |
| |
| #ifdef __cplusplus |
| extern "C" { |
| #endif |
| |
/* Largest code point (U+10FFFF, i.e. 1,114,111), as of Unicode 6.0.
   fileutils.c carries the same constant; the two values must agree. */
#define MAX_UNICODE 0x10ffff

/* Object sanity check: a consistency check (without content scan) in debug
   builds, a plain type check otherwise. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
| |
/* Raw access to the utf8 field of the PyCompactUnicodeObject header. */
#define _PyUnicode_UTF8(op) (_PyCompactUnicodeObject_CAST(op)->utf8)

/* UTF-8 buffer of a string: for a compact ASCII object this is the memory
   located right after the PyASCIIObject header, otherwise the utf8 field. */
#define PyUnicode_UTF8(op)                                      \
    (assert(_PyUnicode_CHECK(op)),                              \
     PyUnicode_IS_COMPACT_ASCII(op)                             \
         ? ((char*)(_PyASCIIObject_CAST(op) + 1))               \
         : _PyUnicode_UTF8(op))

/* Raw access to the utf8_length field of the PyCompactUnicodeObject header. */
#define _PyUnicode_UTF8_LENGTH(op) (_PyCompactUnicodeObject_CAST(op)->utf8_length)

/* Length of the UTF-8 buffer: the string length for compact ASCII objects,
   otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     PyUnicode_IS_COMPACT_ASCII(op)                             \
         ? _PyASCIIObject_CAST(op)->length                      \
         : _PyUnicode_UTF8_LENGTH(op))

/* Unchecked accessors for fields of the PyASCIIObject header. */
#define _PyUnicode_LENGTH(op) (_PyASCIIObject_CAST(op)->length)
#define _PyUnicode_STATE(op)  (_PyASCIIObject_CAST(op)->state)
#define _PyUnicode_HASH(op)   (_PyASCIIObject_CAST(op)->hash)

/* Checked accessors: assert that the object passes _PyUnicode_CHECK() before
   reading the field. */
#define _PyUnicode_KIND(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     _PyASCIIObject_CAST(op)->state.kind)
#define _PyUnicode_GET_LENGTH(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     _PyASCIIObject_CAST(op)->length)

/* Raw access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op) (_PyUnicodeObject_CAST(op)->data.any)

/* True if the utf8 pointer is the very same pointer as the character data.
   Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),                   \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): not compact ASCII, utf8 is non-NULL and
   utf8 differs from the character data pointer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                          \
    (!PyUnicode_IS_COMPACT_ASCII(op)                            \
     && _PyUnicode_UTF8(op)                                     \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))
| |
/* Element-wise converting copy between character arrays of different width.

   from_type / to_type : valid type names of the source / destination items
   begin, end          : pointers delimiting the source items ("from_type *")
   to                  : destination buffer ("to_type *") receiving one
                         converted item per source item

   Each argument is evaluated exactly once. The copy is done four items per
   iteration first, then item by item for what is left. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        to_type *_to = (to_type *)(to);                              \
        const from_type *_iter = (const from_type *)(begin);         \
        const from_type *_end = (const from_type *)(end);            \
        Py_ssize_t n = _end - _iter;                                 \
        const from_type *_unrolled_end =                             \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);                       \
        for (; _iter < _unrolled_end; _iter += 4, _to += 4) {        \
            _to[0] = (to_type) _iter[0];                             \
            _to[1] = (to_type) _iter[1];                             \
            _to[2] = (to_type) _iter[2];                             \
            _to[3] = (to_type) _iter[3];                             \
        }                                                            \
        for (; _iter < _end; ++_iter, ++_to) {                       \
            *_to = (to_type) *_iter;                                 \
        }                                                            \
    } while (0)
| |
/* Map a Latin-1 code point (0..255) to its statically allocated
   single-character string: entries below 128 come from the `ascii` table,
   the others from the `latin1` table (indexed from 128).

   The argument is parenthesized so that any expression can be passed, but
   it is still evaluated more than once: do not pass an expression with side
   effects. */
#define LATIN1(ch)  \
    ((ch) < 128 \
     ? (PyObject*)&_Py_SINGLETON(strings).ascii[(ch)] \
     : (PyObject*)&_Py_SINGLETON(strings).latin1[(ch) - 128])
| |
/* Divisor used when over-allocating: a factor of N means "grow by 1/N". */
#ifndef MS_WINDOWS
   /* On Linux, overallocate by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocate by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#endif
| |
| /* Forward declaration */ |
| static inline int |
| _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch); |
| static inline void |
| _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer); |
| static PyObject * |
| unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler, |
| const char *errors); |
| static PyObject * |
| unicode_decode_utf8(const char *s, Py_ssize_t size, |
| _Py_error_handler error_handler, const char *errors, |
| Py_ssize_t *consumed); |
| #ifdef Py_DEBUG |
| static inline int unicode_is_finalizing(void); |
| static int unicode_is_singleton(PyObject *unicode); |
| #endif |
| |
| |
| // Return a borrowed reference to the empty string singleton. |
| static inline PyObject* unicode_get_empty(void) |
| { |
| _Py_DECLARE_STR(empty, ""); |
| return &_Py_STR(empty); |
| } |
| |
| |
| // Return a strong reference to the empty string singleton. |
| static inline PyObject* unicode_new_empty(void) |
| { |
| PyObject *empty = unicode_get_empty(); |
| return Py_NewRef(empty); |
| } |
| |
| /* This dictionary holds all interned unicode strings. Note that references |
| to strings in this dictionary are *not* counted in the string's ob_refcnt. |
| When the interned string reaches a refcnt of 0 the string deallocation |
| function will delete the reference from this dictionary. |
| */ |
| static inline PyObject *get_interned_dict(PyInterpreterState *interp) |
| { |
| return _Py_INTERP_CACHED_OBJECT(interp, interned_strings); |
| } |
| |
| Py_ssize_t |
| _PyUnicode_InternedSize() |
| { |
| return PyObject_Length(get_interned_dict(_PyInterpreterState_GET())); |
| } |
| |
| static int |
| init_interned_dict(PyInterpreterState *interp) |
| { |
| assert(get_interned_dict(interp) == NULL); |
| PyObject *interned = interned = PyDict_New(); |
| if (interned == NULL) { |
| return -1; |
| } |
| _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = interned; |
| return 0; |
| } |
| |
| static void |
| clear_interned_dict(PyInterpreterState *interp) |
| { |
| PyObject *interned = get_interned_dict(interp); |
| if (interned != NULL) { |
| PyDict_Clear(interned); |
| Py_DECREF(interned); |
| _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL; |
| } |
| } |
| |
/* Statement macro: return a new reference to the empty string from the
   enclosing function. */
#define _Py_RETURN_UNICODE_EMPTY() \
    do { return unicode_new_empty(); } while (0)
| |
| static inline void |
| unicode_fill(int kind, void *data, Py_UCS4 value, |
| Py_ssize_t start, Py_ssize_t length) |
| { |
| assert(0 <= start); |
| switch (kind) { |
| case PyUnicode_1BYTE_KIND: { |
| assert(value <= 0xff); |
| Py_UCS1 ch = (unsigned char)value; |
| Py_UCS1 *to = (Py_UCS1 *)data + start; |
| memset(to, ch, length); |
| break; |
| } |
| case PyUnicode_2BYTE_KIND: { |
| assert(value <= 0xffff); |
| Py_UCS2 ch = (Py_UCS2)value; |
| Py_UCS2 *to = (Py_UCS2 *)data + start; |
| const Py_UCS2 *end = to + length; |
| for (; to < end; ++to) *to = ch; |
| break; |
| } |
| case PyUnicode_4BYTE_KIND: { |
| assert(value <= MAX_UNICODE); |
| Py_UCS4 ch = value; |
| Py_UCS4 * to = (Py_UCS4 *)data + start; |
| const Py_UCS4 *end = to + length; |
| for (; to < end; ++to) *to = ch; |
| break; |
| } |
| default: Py_UNREACHABLE(); |
| } |
| } |
| |
| |
/* Fast detection of the most frequent whitespace characters.

   128-entry lookup table indexed by an ASCII code: 1 for whitespace,
   0 otherwise. The entries set to 1 are:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F */  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F */  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27 */  1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x2F */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x37 */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x38 - 0x3F */  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x47 */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x48 - 0x4F */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x57 */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x58 - 0x5F */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x67 */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x68 - 0x6F */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x77 */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x78 - 0x7F */  0, 0, 0, 0, 0, 0, 0, 0
};
| |
| /* forward */ |
| static PyObject* get_latin1_char(unsigned char ch); |
| static int unicode_modifiable(PyObject *unicode); |
| |
| |
| static PyObject * |
| _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size); |
| static PyObject * |
| _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size); |
| static PyObject * |
| _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size); |
| |
| static PyObject * |
| unicode_encode_call_errorhandler(const char *errors, |
| PyObject **errorHandler,const char *encoding, const char *reason, |
| PyObject *unicode, PyObject **exceptionObject, |
| Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); |
| |
| static void |
| raise_encode_exception(PyObject **exceptionObject, |
| const char *encoding, |
| PyObject *unicode, |
| Py_ssize_t startpos, Py_ssize_t endpos, |
| const char *reason); |
| |
/* Fast detection of ASCII line breaks.

   128-entry lookup table indexed by an ASCII code: 1 for a line break,
   0 otherwise. The entries set to 1 are:
     0x0A LINE FEED              0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F */  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F */  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x27 */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x2F */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x37 */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x38 - 0x3F */  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x47 */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x48 - 0x4F */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x57 */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x58 - 0x5F */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x67 */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x68 - 0x6F */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x77 */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x78 - 0x7F */  0, 0, 0, 0, 0, 0, 0, 0
};
| |
| static int convert_uc(PyObject *obj, void *addr); |
| |
| struct encoding_map; |
| #include "clinic/unicodeobject.c.h" |
| |
| _Py_error_handler |
| _Py_GetErrorHandler(const char *errors) |
| { |
| if (errors == NULL || strcmp(errors, "strict") == 0) { |
| return _Py_ERROR_STRICT; |
| } |
| if (strcmp(errors, "surrogateescape") == 0) { |
| return _Py_ERROR_SURROGATEESCAPE; |
| } |
| if (strcmp(errors, "replace") == 0) { |
| return _Py_ERROR_REPLACE; |
| } |
| if (strcmp(errors, "ignore") == 0) { |
| return _Py_ERROR_IGNORE; |
| } |
| if (strcmp(errors, "backslashreplace") == 0) { |
| return _Py_ERROR_BACKSLASHREPLACE; |
| } |
| if (strcmp(errors, "surrogatepass") == 0) { |
| return _Py_ERROR_SURROGATEPASS; |
| } |
| if (strcmp(errors, "xmlcharrefreplace") == 0) { |
| return _Py_ERROR_XMLCHARREFREPLACE; |
| } |
| return _Py_ERROR_OTHER; |
| } |
| |
| |
| static _Py_error_handler |
| get_error_handler_wide(const wchar_t *errors) |
| { |
| if (errors == NULL || wcscmp(errors, L"strict") == 0) { |
| return _Py_ERROR_STRICT; |
| } |
| if (wcscmp(errors, L"surrogateescape") == 0) { |
| return _Py_ERROR_SURROGATEESCAPE; |
| } |
| if (wcscmp(errors, L"replace") == 0) { |
| return _Py_ERROR_REPLACE; |
| } |
| if (wcscmp(errors, L"ignore") == 0) { |
| return _Py_ERROR_IGNORE; |
| } |
| if (wcscmp(errors, L"backslashreplace") == 0) { |
| return _Py_ERROR_BACKSLASHREPLACE; |
| } |
| if (wcscmp(errors, L"surrogatepass") == 0) { |
| return _Py_ERROR_SURROGATEPASS; |
| } |
| if (wcscmp(errors, L"xmlcharrefreplace") == 0) { |
| return _Py_ERROR_XMLCHARREFREPLACE; |
| } |
| return _Py_ERROR_OTHER; |
| } |
| |
| |
| static inline int |
| unicode_check_encoding_errors(const char *encoding, const char *errors) |
| { |
| if (encoding == NULL && errors == NULL) { |
| return 0; |
| } |
| |
| PyInterpreterState *interp = _PyInterpreterState_GET(); |
| #ifndef Py_DEBUG |
| /* In release mode, only check in development mode (-X dev) */ |
| if (!_PyInterpreterState_GetConfig(interp)->dev_mode) { |
| return 0; |
| } |
| #else |
| /* Always check in debug mode */ |
| #endif |
| |
| /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the |
| codec registry is ready: before_PyUnicode_InitEncodings() is called. */ |
| if (!interp->unicode.fs_codec.encoding) { |
| return 0; |
| } |
| |
| /* Disable checks during Python finalization. For example, it allows to |
| call _PyObject_Dump() during finalization for debugging purpose. */ |
| if (interp->finalizing) { |
| return 0; |
| } |
| |
| if (encoding != NULL |
| // Fast path for the most common built-in encodings. Even if the codec |
| // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to |
| // create a temporary Unicode string (the key in the cache). |
| && strcmp(encoding, "utf-8") != 0 |
| && strcmp(encoding, "utf8") != 0 |
| && strcmp(encoding, "ascii") != 0) |
| { |
| PyObject *handler = _PyCodec_Lookup(encoding); |
| if (handler == NULL) { |
| return -1; |
| } |
| Py_DECREF(handler); |
| } |
| |
| if (errors != NULL |
| // Fast path for the most common built-in error handlers. |
| && strcmp(errors, "strict") != 0 |
| && strcmp(errors, "ignore") != 0 |
| && strcmp(errors, "replace") != 0 |
| && strcmp(errors, "surrogateescape") != 0 |
| && strcmp(errors, "surrogatepass") != 0) |
| { |
| PyObject *handler = PyCodec_LookupError(errors); |
| if (handler == NULL) { |
| return -1; |
| } |
| Py_DECREF(handler); |
| } |
| return 0; |
| } |
| |
| |
| int |
| _PyUnicode_CheckConsistency(PyObject *op, int check_content) |
| { |
| #define CHECK(expr) \ |
| do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0) |
| |
| assert(op != NULL); |
| CHECK(PyUnicode_Check(op)); |
| |
| PyASCIIObject *ascii = _PyASCIIObject_CAST(op); |
| int kind = ascii->state.kind; |
| |
| if (ascii->state.ascii == 1 && ascii->state.compact == 1) { |
| CHECK(kind == PyUnicode_1BYTE_KIND); |
| } |
| else { |
| PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op); |
| void *data; |
| |
| if (ascii->state.compact == 1) { |
| data = compact + 1; |
| CHECK(kind == PyUnicode_1BYTE_KIND |
| || kind == PyUnicode_2BYTE_KIND |
| || kind == PyUnicode_4BYTE_KIND); |
| CHECK(ascii->state.ascii == 0); |
| CHECK(compact->utf8 != data); |
| } |
| else { |
| PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op); |
| |
| data = unicode->data.any; |
| CHECK(kind == PyUnicode_1BYTE_KIND |
| || kind == PyUnicode_2BYTE_KIND |
| || kind == PyUnicode_4BYTE_KIND); |
| CHECK(ascii->state.compact == 0); |
| CHECK(data != NULL); |
| if (ascii->state.ascii) { |
| CHECK(compact->utf8 == data); |
| CHECK(compact->utf8_length == ascii->length); |
| } |
| else { |
| CHECK(compact->utf8 != data); |
| } |
| } |
| |
| if (compact->utf8 == NULL) |
| CHECK(compact->utf8_length == 0); |
| } |
| |
| /* check that the best kind is used: O(n) operation */ |
| if (check_content) { |
| Py_ssize_t i; |
| Py_UCS4 maxchar = 0; |
| const void *data; |
| Py_UCS4 ch; |
| |
| data = PyUnicode_DATA(ascii); |
| for (i=0; i < ascii->length; i++) |
| { |
| ch = PyUnicode_READ(kind, data, i); |
| if (ch > maxchar) |
| maxchar = ch; |
| } |
| if (kind == PyUnicode_1BYTE_KIND) { |
| if (ascii->state.ascii == 0) { |
| CHECK(maxchar >= 128); |
| CHECK(maxchar <= 255); |
| } |
| else |
| CHECK(maxchar < 128); |
| } |
| else if (kind == PyUnicode_2BYTE_KIND) { |
| CHECK(maxchar >= 0x100); |
| CHECK(maxchar <= 0xFFFF); |
| } |
| else { |
| CHECK(maxchar >= 0x10000); |
| CHECK(maxchar <= MAX_UNICODE); |
| } |
| CHECK(PyUnicode_READ(kind, data, ascii->length) == 0); |
| } |
| return 1; |
| |
| #undef CHECK |
| } |
| |
| static PyObject* |
| unicode_result(PyObject *unicode) |
| { |
| assert(_PyUnicode_CHECK(unicode)); |
| |
| Py_ssize_t length = PyUnicode_GET_LENGTH(unicode); |
| if (length == 0) { |
| PyObject *empty = unicode_get_empty(); |
| if (unicode != empty) { |
| Py_DECREF(unicode); |
| Py_INCREF(empty); |
| } |
| return empty; |
| } |
| |
| if (length == 1) { |
| int kind = PyUnicode_KIND(unicode); |
| if (kind == PyUnicode_1BYTE_KIND) { |
| const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode); |
| Py_UCS1 ch = data[0]; |
| PyObject *latin1_char = LATIN1(ch); |
| if (unicode != latin1_char) { |
| Py_INCREF(latin1_char); |
| Py_DECREF(unicode); |
| } |
| return latin1_char; |
| } |
| } |
| |
| assert(_PyUnicode_CheckConsistency(unicode, 1)); |
| return unicode; |
| } |
| |
| static PyObject* |
| unicode_result_unchanged(PyObject *unicode) |
| { |
| if (PyUnicode_CheckExact(unicode)) { |
| return Py_NewRef(unicode); |
| } |
| else |
| /* Subtype -- return genuine unicode string with the same value. */ |
| return _PyUnicode_Copy(unicode); |
| } |
| |
| /* Implementation of the "backslashreplace" error handler for 8-bit encodings: |
| ASCII, Latin1, UTF-8, etc. */ |
| static char* |
| backslashreplace(_PyBytesWriter *writer, char *str, |
| PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend) |
| { |
| Py_ssize_t size, i; |
| Py_UCS4 ch; |
| int kind; |
| const void *data; |
| |
| kind = PyUnicode_KIND(unicode); |
| data = PyUnicode_DATA(unicode); |
| |
| size = 0; |
| /* determine replacement size */ |
| for (i = collstart; i < collend; ++i) { |
| Py_ssize_t incr; |
| |
| ch = PyUnicode_READ(kind, data, i); |
| if (ch < 0x100) |
| incr = 2+2; |
| else if (ch < 0x10000) |
| incr = 2+4; |
| else { |
| assert(ch <= MAX_UNICODE); |
| incr = 2+8; |
| } |
| if (size > PY_SSIZE_T_MAX - incr) { |
| PyErr_SetString(PyExc_OverflowError, |
| "encoded result is too long for a Python string"); |
| return NULL; |
| } |
| size += incr; |
| } |
| |
| str = _PyBytesWriter_Prepare(writer, str, size); |
| if (str == NULL) |
| return NULL; |
| |
| /* generate replacement */ |
| for (i = collstart; i < collend; ++i) { |
| ch = PyUnicode_READ(kind, data, i); |
| *str++ = '\\'; |
| if (ch >= 0x00010000) { |
| *str++ = 'U'; |
| *str++ = Py_hexdigits[(ch>>28)&0xf]; |
| *str++ = Py_hexdigits[(ch>>24)&0xf]; |
| *str++ = Py_hexdigits[(ch>>20)&0xf]; |
| *str++ = Py_hexdigits[(ch>>16)&0xf]; |
| *str++ = Py_hexdigits[(ch>>12)&0xf]; |
| *str++ = Py_hexdigits[(ch>>8)&0xf]; |
| } |
| else if (ch >= 0x100) { |
| *str++ = 'u'; |
| *str++ = Py_hexdigits[(ch>>12)&0xf]; |
| *str++ = Py_hexdigits[(ch>>8)&0xf]; |
| } |
| else |
| *str++ = 'x'; |
| *str++ = Py_hexdigits[(ch>>4)&0xf]; |
| *str++ = Py_hexdigits[ch&0xf]; |
| } |
| return str; |
| } |
| |
| /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings: |
| ASCII, Latin1, UTF-8, etc. */ |
| static char* |
| xmlcharrefreplace(_PyBytesWriter *writer, char *str, |
| PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend) |
| { |
| Py_ssize_t size, i; |
| Py_UCS4 ch; |
| int kind; |
| const void *data; |
| |
| kind = PyUnicode_KIND(unicode); |
| data = PyUnicode_DATA(unicode); |
| |
| size = 0; |
| /* determine replacement size */ |
| for (i = collstart; i < collend; ++i) { |
| Py_ssize_t incr; |
| |
| ch = PyUnicode_READ(kind, data, i); |
| if (ch < 10) |
| incr = 2+1+1; |
| else if (ch < 100) |
| incr = 2+2+1; |
| else if (ch < 1000) |
| incr = 2+3+1; |
| else if (ch < 10000) |
| incr = 2+4+1; |
| else if (ch < 100000) |
| incr = 2+5+1; |
| else if (ch < 1000000) |
| incr = 2+6+1; |
| else { |
| assert(ch <= MAX_UNICODE); |
| incr = 2+7+1; |
| } |
| if (size > PY_SSIZE_T_MAX - incr) { |
| PyErr_SetString(PyExc_OverflowError, |
| "encoded result is too long for a Python string"); |
| return NULL; |
| } |
| size += incr; |
| } |
| |
| str = _PyBytesWriter_Prepare(writer, str, size); |
| if (str == NULL) |
| return NULL; |
| |
| /* generate replacement */ |
| for (i = collstart; i < collend; ++i) { |
| size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); |
| if (size < 0) { |
| return NULL; |
| } |
| str += size; |
| } |
| return str; |
| } |
| |
| /* --- Bloom Filters ----------------------------------------------------- */ |
| |
| /* stuff to implement simple "bloom filters" for Unicode characters. |
| to keep things simple, we use a single bitmask, using the least 5 |
| bits from each unicode characters as the bit index. */ |
| |
| /* the linebreak mask is set up by _PyUnicode_Init() below */ |
| |
| #if LONG_BIT >= 128 |
| #define BLOOM_WIDTH 128 |
| #elif LONG_BIT >= 64 |
| #define BLOOM_WIDTH 64 |
| #elif LONG_BIT >= 32 |
| #define BLOOM_WIDTH 32 |
| #else |
| #error "LONG_BIT is smaller than 32" |
| #endif |
| |
| #define BLOOM_MASK unsigned long |
| |
| static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0; |
| |
| #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) |
| |
| #define BLOOM_LINEBREAK(ch) \ |
| ((ch) < 128U ? ascii_linebreak[(ch)] : \ |
| (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) |
| |
| static inline BLOOM_MASK |
| make_bloom_mask(int kind, const void* ptr, Py_ssize_t len) |
| { |
| #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \ |
| do { \ |
| TYPE *data = (TYPE *)PTR; \ |
| TYPE *end = data + LEN; \ |
| Py_UCS4 ch; \ |
| for (; data != end; data++) { \ |
| ch = *data; \ |
| MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \ |
| } \ |
| break; \ |
| } while (0) |
| |
| /* calculate simple bloom-style bitmask for a given unicode string */ |
| |
| BLOOM_MASK mask; |
| |
| mask = 0; |
| switch (kind) { |
| case PyUnicode_1BYTE_KIND: |
| BLOOM_UPDATE(Py_UCS1, mask, ptr, len); |
| break; |
| case PyUnicode_2BYTE_KIND: |
| BLOOM_UPDATE(Py_UCS2, mask, ptr, len); |
| break; |
| case PyUnicode_4BYTE_KIND: |
| BLOOM_UPDATE(Py_UCS4, mask, ptr, len); |
| break; |
| default: |
| Py_UNREACHABLE(); |
| } |
| return mask; |
| |
| #undef BLOOM_UPDATE |
| } |
| |
| static int |
| ensure_unicode(PyObject *obj) |
| { |
| if (!PyUnicode_Check(obj)) { |
| PyErr_Format(PyExc_TypeError, |
| "must be str, not %.100s", |
| Py_TYPE(obj)->tp_name); |
| return -1; |
| } |
| return 0; |
| } |
| |
| /* Compilation of templated routines */ |
| |
| #define STRINGLIB_GET_EMPTY() unicode_get_empty() |
| |
| #include "stringlib/asciilib.h" |
| #include "stringlib/fastsearch.h" |
| #include "stringlib/partition.h" |
| #include "stringlib/split.h" |
| #include "stringlib/count.h" |
| #include "stringlib/find.h" |
| #include "stringlib/find_max_char.h" |
| #include "stringlib/undef.h" |
| |
| #include "stringlib/ucs1lib.h" |
| #include "stringlib/fastsearch.h" |
| #include "stringlib/partition.h" |
| #include "stringlib/split.h" |
| #include "stringlib/count.h" |
| #include "stringlib/find.h" |
| #include "stringlib/replace.h" |
| #include "stringlib/find_max_char.h" |
| #include "stringlib/undef.h" |
| |
| #include "stringlib/ucs2lib.h" |
| #include "stringlib/fastsearch.h" |
| #include "stringlib/partition.h" |
| #include "stringlib/split.h" |
| #include "stringlib/count.h" |
| #include "stringlib/find.h" |
| #include "stringlib/replace.h" |
| #include "stringlib/find_max_char.h" |
| #include "stringlib/undef.h" |
| |
| #include "stringlib/ucs4lib.h" |
| #include "stringlib/fastsearch.h" |
| #include "stringlib/partition.h" |
| #include "stringlib/split.h" |
| #include "stringlib/count.h" |
| #include "stringlib/find.h" |
| #include "stringlib/replace.h" |
| #include "stringlib/find_max_char.h" |
| #include "stringlib/undef.h" |
| |
| #undef STRINGLIB_GET_EMPTY |
| |
| /* --- Unicode Object ----------------------------------------------------- */ |
| |
| static inline Py_ssize_t |
| findchar(const void *s, int kind, |
| Py_ssize_t size, Py_UCS4 ch, |
| int direction) |
| { |
| switch (kind) { |
| case PyUnicode_1BYTE_KIND: |
| if ((Py_UCS1) ch != ch) |
| return -1; |
| if (direction > 0) |
| return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch); |
| else |
| return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch); |
| case PyUnicode_2BYTE_KIND: |
| if ((Py_UCS2) ch != ch) |
| return -1; |
| if (direction > 0) |
| return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch); |
| else |
| return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch); |
| case PyUnicode_4BYTE_KIND: |
| if (direction > 0) |
| return ucs4lib_find_char((const Py_UCS4 *) s, size, ch); |
| else |
| return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch); |
| default: |
| Py_UNREACHABLE(); |
| } |
| } |
| |
#ifdef Py_DEBUG
/* Debug helper: overwrite the part of a string that was added by a resize
   with 0xff bytes, to detect bugs earlier (use of characters that were
   never written).

   Items [old_length, current length) are filled; nothing is done when the
   string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind value doubles as the item size in bytes */
    int kind = PyUnicode_KIND(unicode);
    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t length = _PyUnicode_LENGTH(unicode);

    if (length > old_length) {
        Py_UCS1 *first_new = data + old_length * kind;
        memset(first_new, 0xff, (length - old_length) * kind);
    }
}
#endif
| |
| static PyObject* |
| resize_compact(PyObject *unicode, Py_ssize_t length) |
| { |
| Py_ssize_t char_size; |
| Py_ssize_t struct_size; |
| Py_ssize_t new_size; |
| PyObject *new_unicode; |
| #ifdef Py_DEBUG |
| Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); |
| #endif |
| |
| assert(unicode_modifiable(unicode)); |
| assert(PyUnicode_IS_COMPACT(unicode)); |
| |
| char_size = PyUnicode_KIND(unicode); |
| if (PyUnicode_IS_ASCII(unicode)) |
| struct_size = sizeof(PyASCIIObject); |
| else |
| struct_size = sizeof(PyCompactUnicodeObject); |
| |
| if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { |
| PyErr_NoMemory(); |
| return NULL; |
| } |
| new_size = (struct_size + (length + 1) * char_size); |
| |
| if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) { |
| PyObject_Free(_PyUnicode_UTF8(unicode)); |
| _PyUnicode_UTF8(unicode) = NULL; |
| _PyUnicode_UTF8_LENGTH(unicode) = 0; |
| } |
| #ifdef Py_TRACE_REFS |
| _Py_ForgetReference(unicode); |
| #endif |
| |
| new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size); |
| if (new_unicode == NULL) { |
| _Py_NewReferenceNoTotal(unicode); |
| PyErr_NoMemory(); |
| return NULL; |
| } |
| unicode = new_unicode; |
| _Py_NewReferenceNoTotal(unicode); |
| |
| _PyUnicode_LENGTH(unicode) = length; |
| #ifdef Py_DEBUG |
| unicode_fill_invalid(unicode, old_length); |
| #endif |
| PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), |
| length, 0); |
| assert(_PyUnicode_CheckConsistency(unicode, 0)); |
| return unicode; |
| } |
| |
| static int |
| resize_inplace(PyObject *unicode, Py_ssize_t length) |
| { |
| assert(!PyUnicode_IS_COMPACT(unicode)); |
| assert(Py_REFCNT(unicode) == 1); |
| |
| Py_ssize_t new_size; |
| Py_ssize_t char_size; |
| int share_utf8; |
| void *data; |
| #ifdef Py_DEBUG |
| Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); |
| #endif |
| |
| data = _PyUnicode_DATA_ANY(unicode); |
| char_size = PyUnicode_KIND(unicode); |
| share_utf8 = _PyUnicode_SHARE_UTF8(unicode); |
| |
| if (length > (PY_SSIZE_T_MAX / char_size - 1)) { |
| PyErr_NoMemory(); |
| return -1; |
| } |
| new_size = (length + 1) * char_size; |
| |
| if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) |
| { |
| PyObject_Free(_PyUnicode_UTF8(unicode)); |
| _PyUnicode_UTF8(unicode) = NULL; |
| _PyUnicode_UTF8_LENGTH(unicode) = 0; |
| } |
| |
| data = (PyObject *)PyObject_Realloc(data, new_size); |
| if (data == NULL) { |
| PyErr_NoMemory(); |
| return -1; |
| } |
| _PyUnicode_DATA_ANY(unicode) = data; |
| if (share_utf8) { |
| _PyUnicode_UTF8(unicode) = data; |
| _PyUnicode_UTF8_LENGTH(unicode) = length; |
| } |
| _PyUnicode_LENGTH(unicode) = length; |
| PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); |
| #ifdef Py_DEBUG |
| unicode_fill_invalid(unicode, old_length); |
| #endif |
| |
| /* check for integer overflow */ |
| if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) { |
| PyErr_NoMemory(); |
| return -1; |
| } |
| assert(_PyUnicode_CheckConsistency(unicode, 0)); |
| return 0; |
| } |
| |
| static PyObject* |
| resize_copy(PyObject *unicode, Py_ssize_t length) |
| { |
| Py_ssize_t copy_length; |
| PyObject *copy; |
| |
| copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); |
| if (copy == NULL) |
| return NULL; |
| |
| copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); |
| _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length); |
| return copy; |
| } |
| |
| static const char* |
| unicode_kind_name(PyObject *unicode) |
| { |
| /* don't check consistency: unicode_kind_name() is called from |
| _PyUnicode_Dump() */ |
| if (!PyUnicode_IS_COMPACT(unicode)) |
| { |
| switch (PyUnicode_KIND(unicode)) |
| { |
| case PyUnicode_1BYTE_KIND: |
| if (PyUnicode_IS_ASCII(unicode)) |
| return "legacy ascii"; |
| else |
| return "legacy latin1"; |
| case PyUnicode_2BYTE_KIND: |
| return "legacy UCS2"; |
| case PyUnicode_4BYTE_KIND: |
| return "legacy UCS4"; |
| default: |
| return "<legacy invalid kind>"; |
| } |
| } |
| switch (PyUnicode_KIND(unicode)) { |
| case PyUnicode_1BYTE_KIND: |
| if (PyUnicode_IS_ASCII(unicode)) |
| return "ascii"; |
| else |
| return "latin1"; |
| case PyUnicode_2BYTE_KIND: |
| return "UCS2"; |
| case PyUnicode_4BYTE_KIND: |
| return "UCS4"; |
| default: |
| return "<invalid compact kind>"; |
| } |
| } |
| |
| #ifdef Py_DEBUG |
| /* Functions wrapping macros for use in debugger */ |
| const char *_PyUnicode_utf8(void *unicode_raw){ |
| PyObject *unicode = _PyObject_CAST(unicode_raw); |
| return PyUnicode_UTF8(unicode); |
| } |
| |
| const void *_PyUnicode_compact_data(void *unicode_raw) { |
| PyObject *unicode = _PyObject_CAST(unicode_raw); |
| return _PyUnicode_COMPACT_DATA(unicode); |
| } |
| const void *_PyUnicode_data(void *unicode_raw) { |
| PyObject *unicode = _PyObject_CAST(unicode_raw); |
| printf("obj %p\n", (void*)unicode); |
| printf("compact %d\n", PyUnicode_IS_COMPACT(unicode)); |
| printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode)); |
| printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1)); |
| printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1)); |
| printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode)); |
| return PyUnicode_DATA(unicode); |
| } |
| |
| void |
| _PyUnicode_Dump(PyObject *op) |
| { |
| PyASCIIObject *ascii = _PyASCIIObject_CAST(op); |
| PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op); |
| PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op); |
| const void *data; |
| |
| if (ascii->state.compact) |
| { |
| if (ascii->state.ascii) |
| data = (ascii + 1); |
| else |
| data = (compact + 1); |
| } |
| else |
| data = unicode->data.any; |
| printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length); |
| |
| if (!ascii->state.ascii) { |
| printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length); |
| } |
| printf(", data=%p\n", data); |
| } |
| #endif |
| |
| |
| PyObject * |
| PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) |
| { |
| /* Optimization for empty strings */ |
| if (size == 0) { |
| return unicode_new_empty(); |
| } |
| |
| PyObject *obj; |
| PyCompactUnicodeObject *unicode; |
| void *data; |
| int kind; |
| int is_ascii; |
| Py_ssize_t char_size; |
| Py_ssize_t struct_size; |
| |
| is_ascii = 0; |
| struct_size = sizeof(PyCompactUnicodeObject); |
| if (maxchar < 128) { |
| kind = PyUnicode_1BYTE_KIND; |
| char_size = 1; |
| is_ascii = 1; |
| struct_size = sizeof(PyASCIIObject); |
| } |
| else if (maxchar < 256) { |
| kind = PyUnicode_1BYTE_KIND; |
| char_size = 1; |
| } |
| else if (maxchar < 65536) { |
| kind = PyUnicode_2BYTE_KIND; |
| char_size = 2; |
| } |
| else { |
| if (maxchar > MAX_UNICODE) { |
| PyErr_SetString(PyExc_SystemError, |
| "invalid maximum character passed to PyUnicode_New"); |
| return NULL; |
| } |
| kind = PyUnicode_4BYTE_KIND; |
| char_size = 4; |
| } |
| |
| /* Ensure we won't overflow the size. */ |
| if (size < 0) { |
| PyErr_SetString(PyExc_SystemError, |
| "Negative size passed to PyUnicode_New"); |
| return NULL; |
| } |
| if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) |
| return PyErr_NoMemory(); |
| |
| /* Duplicated allocation code from _PyObject_New() instead of a call to |
| * PyObject_New() so we are able to allocate space for the object and |
| * it's data buffer. |
| */ |
| obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size); |
| if (obj == NULL) { |
| return PyErr_NoMemory(); |
| } |
| _PyObject_Init(obj, &PyUnicode_Type); |
| |
| unicode = (PyCompactUnicodeObject *)obj; |
| if (is_ascii) |
| data = ((PyASCIIObject*)obj) + 1; |
| else |
| data = unicode + 1; |
| _PyUnicode_LENGTH(unicode) = size; |
| _PyUnicode_HASH(unicode) = -1; |
| _PyUnicode_STATE(unicode).interned = 0; |
| _PyUnicode_STATE(unicode).kind = kind; |
| _PyUnicode_STATE(unicode).compact = 1; |
| _PyUnicode_STATE(unicode).ascii = is_ascii; |
| if (is_ascii) { |
| ((char*)data)[size] = 0; |
| } |
| else if (kind == PyUnicode_1BYTE_KIND) { |
| ((char*)data)[size] = 0; |
| unicode->utf8 = NULL; |
| unicode->utf8_length = 0; |
| } |
| else { |
| unicode->utf8 = NULL; |
| unicode->utf8_length = 0; |
| if (kind == PyUnicode_2BYTE_KIND) |
| ((Py_UCS2*)data)[size] = 0; |
| else /* kind == PyUnicode_4BYTE_KIND */ |
| ((Py_UCS4*)data)[size] = 0; |
| } |
| #ifdef Py_DEBUG |
| unicode_fill_invalid((PyObject*)unicode, 0); |
| #endif |
| assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0)); |
| return obj; |
| } |
| |
| #if SIZEOF_WCHAR_T == 2 |
| /* Helper function to convert a 16-bits wchar_t representation to UCS4, this |
| will decode surrogate pairs, the other conversions are implemented as macros |
| for efficiency. |
| |
| This function assumes that unicode can hold one more code point than wstr |
| characters for a terminating null character. */ |
| static void |
| unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, |
| PyObject *unicode) |
| { |
| const wchar_t *iter; |
| Py_UCS4 *ucs4_out; |
| |
| assert(unicode != NULL); |
| assert(_PyUnicode_CHECK(unicode)); |
| assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); |
| ucs4_out = PyUnicode_4BYTE_DATA(unicode); |
| |
| for (iter = begin; iter < end; ) { |
| assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + |
| _PyUnicode_GET_LENGTH(unicode))); |
| if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) |
| && (iter+1) < end |
| && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) |
| { |
| *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); |
| iter += 2; |
| } |
| else { |
| *ucs4_out++ = *iter; |
| iter++; |
| } |
| } |
| assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + |
| _PyUnicode_GET_LENGTH(unicode))); |
| |
| } |
| #endif |
| |
| static int |
| unicode_check_modifiable(PyObject *unicode) |
| { |
| if (!unicode_modifiable(unicode)) { |
| PyErr_SetString(PyExc_SystemError, |
| "Cannot modify a string currently used"); |
| return -1; |
| } |
| return 0; |
| } |
| |
| static int |
| _copy_characters(PyObject *to, Py_ssize_t to_start, |
| PyObject *from, Py_ssize_t from_start, |
| Py_ssize_t how_many, int check_maxchar) |
| { |
| int from_kind, to_kind; |
| const void *from_data; |
| void *to_data; |
| |
| assert(0 <= how_many); |
| assert(0 <= from_start); |
| assert(0 <= to_start); |
| assert(PyUnicode_Check(from)); |
| assert(from_start + how_many <= PyUnicode_GET_LENGTH(from)); |
| |
| assert(PyUnicode_Check(to)); |
| assert(to_start + how_many <= PyUnicode_GET_LENGTH(to)); |
| |
| if (how_many == 0) |
| return 0; |
| |
| from_kind = PyUnicode_KIND(from); |
| from_data = PyUnicode_DATA(from); |
| to_kind = PyUnicode_KIND(to); |
| to_data = PyUnicode_DATA(to); |
| |
| #ifdef Py_DEBUG |
| if (!check_maxchar |
| && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)) |
| { |
| Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); |
| Py_UCS4 ch; |
| Py_ssize_t i; |
| for (i=0; i < how_many; i++) { |
| ch = PyUnicode_READ(from_kind, from_data, from_start + i); |
| assert(ch <= to_maxchar); |
| } |
| } |
| #endif |
| |
| if (from_kind == to_kind) { |
| if (check_maxchar |
| && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)) |
| { |
| /* Writing Latin-1 characters into an ASCII string requires to |
| check that all written characters are pure ASCII */ |
| Py_UCS4 max_char; |
| max_char = ucs1lib_find_max_char(from_data, |
| (const Py_UCS1*)from_data + how_many); |
| if (max_char >= 128) |
| return -1; |
| } |
| memcpy((char*)to_data + to_kind * to_start, |
| (const char*)from_data + from_kind * from_start, |
| to_kind * how_many); |
| } |
| else if (from_kind == PyUnicode_1BYTE_KIND |
| && to_kind == PyUnicode_2BYTE_KIND) |
| { |
| _PyUnicode_CONVERT_BYTES( |
| Py_UCS1, Py_UCS2, |
| PyUnicode_1BYTE_DATA(from) + from_start, |
| PyUnicode_1BYTE_DATA(from) + from_start + how_many, |
| PyUnicode_2BYTE_DATA(to) + to_start |
| ); |
| } |
| else if (from_kind == PyUnicode_1BYTE_KIND |
| && to_kind == PyUnicode_4BYTE_KIND) |
| { |
| _PyUnicode_CONVERT_BYTES( |
| Py_UCS1, Py_UCS4, |
| PyUnicode_1BYTE_DATA(from) + from_start, |
| PyUnicode_1BYTE_DATA(from) + from_start + how_many, |
| PyUnicode_4BYTE_DATA(to) + to_start |
| ); |
| } |
| else if (from_kind == PyUnicode_2BYTE_KIND |
| && to_kind == PyUnicode_4BYTE_KIND) |
| { |
| _PyUnicode_CONVERT_BYTES( |
| Py_UCS2, Py_UCS4, |
| PyUnicode_2BYTE_DATA(from) + from_start, |
| PyUnicode_2BYTE_DATA(from) + from_start + how_many, |
| PyUnicode_4BYTE_DATA(to) + to_start |
| ); |
| } |
| else { |
| assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)); |
| |
| if (!check_maxchar) { |
| if (from_kind == PyUnicode_2BYTE_KIND |
| && to_kind == PyUnicode_1BYTE_KIND) |
| { |
| _PyUnicode_CONVERT_BYTES( |
| Py_UCS2, Py_UCS1, |
| PyUnicode_2BYTE_DATA(from) + from_start, |
| PyUnicode_2BYTE_DATA(from) + from_start + how_many, |
| PyUnicode_1BYTE_DATA(to) + to_start |
| ); |
| } |
| else if (from_kind == PyUnicode_4BYTE_KIND |
| && to_kind == PyUnicode_1BYTE_KIND) |
| { |
| _PyUnicode_CONVERT_BYTES( |
| Py_UCS4, Py_UCS1, |
| PyUnicode_4BYTE_DATA(from) + from_start, |
| PyUnicode_4BYTE_DATA(from) + from_start + how_many, |
| PyUnicode_1BYTE_DATA(to) + to_start |
| ); |
| } |
| else if (from_kind == PyUnicode_4BYTE_KIND |
| && to_kind == PyUnicode_2BYTE_KIND) |
| { |
| _PyUnicode_CONVERT_BYTES( |
| Py_UCS4, Py_UCS2, |
| PyUnicode_4BYTE_DATA(from) + from_start, |
| PyUnicode_4BYTE_DATA(from) + from_start + how_many, |
| PyUnicode_2BYTE_DATA(to) + to_start |
| ); |
| } |
| else { |
| Py_UNREACHABLE(); |
| } |
| } |
| else { |
| const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); |
| Py_UCS4 ch; |
| Py_ssize_t i; |
| |
| for (i=0; i < how_many; i++) { |
| ch = PyUnicode_READ(from_kind, from_data, from_start + i); |
| if (ch > to_maxchar) |
| return -1; |
| PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); |
| } |
| } |
| } |
| return 0; |
| } |
| |
| void |
| _PyUnicode_FastCopyCharacters( |
| PyObject *to, Py_ssize_t to_start, |
| PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many) |
| { |
| (void)_copy_characters(to, to_start, from, from_start, how_many, 0); |
| } |
| |
| Py_ssize_t |
| PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, |
| PyObject *from, Py_ssize_t from_start, |
| Py_ssize_t how_many) |
| { |
| int err; |
| |
| if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { |
| PyErr_BadInternalCall(); |
| return -1; |
| } |
| |
| if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) { |
| PyErr_SetString(PyExc_IndexError, "string index out of range"); |
| return -1; |
| } |
| if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) { |
| PyErr_SetString(PyExc_IndexError, "string index out of range"); |
| return -1; |
| } |
| if (how_many < 0) { |
| PyErr_SetString(PyExc_SystemError, "how_many cannot be negative"); |
| return -1; |
| } |
| how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many); |
| if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { |
| PyErr_Format(PyExc_SystemError, |
| "Cannot write %zi characters at %zi " |
| "in a string of %zi characters", |
| how_many, to_start, PyUnicode_GET_LENGTH(to)); |
| return -1; |
| } |
| |
| if (how_many == 0) |
| return 0; |
| |
| if (unicode_check_modifiable(to)) |
| return -1; |
| |
| err = _copy_characters(to, to_start, from, from_start, how_many, 1); |
| if (err) { |
| PyErr_Format(PyExc_SystemError, |
| "Cannot copy %s characters " |
| "into a string of %s characters", |
| unicode_kind_name(from), |
| unicode_kind_name(to)); |
| return -1; |
| } |
| return how_many; |
| } |
| |
| /* Find the maximum code point and count the number of surrogate pairs so a |
| correct string length can be computed before converting a string to UCS4. |
| This function counts single surrogates as a character and not as a pair. |
| |
| Return 0 on success, or -1 on error. */ |
| static int |
| find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, |
| Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) |
| { |
| const wchar_t *iter; |
| Py_UCS4 ch; |
| |
| assert(num_surrogates != NULL && maxchar != NULL); |
| *num_surrogates = 0; |
| *maxchar = 0; |
| |
| for (iter = begin; iter < end; ) { |
| #if SIZEOF_WCHAR_T == 2 |
| if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) |
| && (iter+1) < end |
| && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) |
| { |
| ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); |
| ++(*num_surrogates); |
| iter += 2; |
| } |
| else |
| #endif |
| { |
| ch = *iter; |
| iter++; |
| } |
| if (ch > *maxchar) { |
| *maxchar = ch; |
| if (*maxchar > MAX_UNICODE) { |
| PyErr_Format(PyExc_ValueError, |
| "character U+%x is not in range [U+0000; U+%x]", |
| ch, MAX_UNICODE); |
| return -1; |
| } |
| } |
| } |
| return 0; |
| } |
| |
| static void |
| unicode_dealloc(PyObject *unicode) |
| { |
| #ifdef Py_DEBUG |
| if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) { |
| _Py_FatalRefcountError("deallocating an Unicode singleton"); |
| } |
| #endif |
| /* This should never get called, but we also don't want to SEGV if |
| * we accidentally decref an immortal string out of existence. Since |
| * the string is an immortal object, just re-set the reference count. |
| */ |
| if (PyUnicode_CHECK_INTERNED(unicode)) { |
| _Py_SetImmortal(unicode); |
| return; |
| } |
| if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) { |
| PyObject_Free(_PyUnicode_UTF8(unicode)); |
| } |
| if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) { |
| PyObject_Free(_PyUnicode_DATA_ANY(unicode)); |
| } |
| |
| Py_TYPE(unicode)->tp_free(unicode); |
| } |
| |
| #ifdef Py_DEBUG |
| static int |
| unicode_is_singleton(PyObject *unicode) |
| { |
| if (unicode == &_Py_STR(empty)) { |
| return 1; |
| } |
| |
| PyASCIIObject *ascii = _PyASCIIObject_CAST(unicode); |
| if (ascii->length == 1) { |
| Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); |
| if (ch < 256 && LATIN1(ch) == unicode) { |
| return 1; |
| } |
| } |
| return 0; |
| } |
| #endif |
| |
| static int |
| unicode_modifiable(PyObject *unicode) |
| { |
| assert(_PyUnicode_CHECK(unicode)); |
| if (Py_REFCNT(unicode) != 1) |
| return 0; |
| if (_PyUnicode_HASH(unicode) != -1) |
| return 0; |
| if (PyUnicode_CHECK_INTERNED(unicode)) |
| return 0; |
| if (!PyUnicode_CheckExact(unicode)) |
| return 0; |
| #ifdef Py_DEBUG |
| /* singleton refcount is greater than 1 */ |
| assert(!unicode_is_singleton(unicode)); |
| #endif |
| return 1; |
| } |
| |
| static int |
| unicode_resize(PyObject **p_unicode, Py_ssize_t length) |
| { |
| PyObject *unicode; |
| Py_ssize_t old_length; |
| |
| assert(p_unicode != NULL); |
| unicode = *p_unicode; |
| |
| assert(unicode != NULL); |
| assert(PyUnicode_Check(unicode)); |
| assert(0 <= length); |
| |
| old_length = PyUnicode_GET_LENGTH(unicode); |
| if (old_length == length) |
| return 0; |
| |
| if (length == 0) { |
| PyObject *empty = unicode_new_empty(); |
| Py_SETREF(*p_unicode, empty); |
| return 0; |
| } |
| |
| if (!unicode_modifiable(unicode)) { |
| PyObject *copy = resize_copy(unicode, length); |
| if (copy == NULL) |
| return -1; |
| Py_SETREF(*p_unicode, copy); |
| return 0; |
| } |
| |
| if (PyUnicode_IS_COMPACT(unicode)) { |
| PyObject *new_unicode = resize_compact(unicode, length); |
| if (new_unicode == NULL) |
| return -1; |
| *p_unicode = new_unicode; |
| return 0; |
| } |
| return resize_inplace(unicode, length); |
| } |
| |
| int |
| PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) |
| { |
| PyObject *unicode; |
| if (p_unicode == NULL) { |
| PyErr_BadInternalCall(); |
| return -1; |
| } |
| unicode = *p_unicode; |
| if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0) |
| { |
| PyErr_BadInternalCall(); |
| return -1; |
| } |
| return unicode_resize(p_unicode, length); |
| } |
| |
| /* Copy an ASCII or latin1 char* string into a Python Unicode string. |
| |
| WARNING: The function doesn't copy the terminating null character and |
| doesn't check the maximum character (may write a latin1 character in an |
| ASCII string). */ |
| static void |
| unicode_write_cstr(PyObject *unicode, Py_ssize_t index, |
| const char *str, Py_ssize_t len) |
| { |
| int kind = PyUnicode_KIND(unicode); |
| const void *data = PyUnicode_DATA(unicode); |
| const char *end = str + len; |
| |
| assert(index + len <= PyUnicode_GET_LENGTH(unicode)); |
| switch (kind) { |
| case PyUnicode_1BYTE_KIND: { |
| #ifdef Py_DEBUG |
| if (PyUnicode_IS_ASCII(unicode)) { |
| Py_UCS4 maxchar = ucs1lib_find_max_char( |
| (const Py_UCS1*)str, |
| (const Py_UCS1*)str + len); |
| assert(maxchar < 128); |
| } |
| #endif |
| memcpy((char *) data + index, str, len); |
| break; |
| } |
| case PyUnicode_2BYTE_KIND: { |
| Py_UCS2 *start = (Py_UCS2 *)data + index; |
| Py_UCS2 *ucs2 = start; |
| |
| for (; str < end; ++ucs2, ++str) |
| *ucs2 = (Py_UCS2)*str; |
| |
| assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode)); |
| break; |
| } |
| case PyUnicode_4BYTE_KIND: { |
| Py_UCS4 *start = (Py_UCS4 *)data + index; |
| Py_UCS4 *ucs4 = start; |
| |
| for (; str < end; ++ucs4, ++str) |
| *ucs4 = (Py_UCS4)*str; |
| |
| assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode)); |
| break; |
| } |
| default: |
| Py_UNREACHABLE(); |
| } |
| } |
| |
| static PyObject* |
| get_latin1_char(Py_UCS1 ch) |
| { |
| return Py_NewRef(LATIN1(ch)); |
| } |
| |
| static PyObject* |
| unicode_char(Py_UCS4 ch) |
| { |
| PyObject *unicode; |
| |
| assert(ch <= MAX_UNICODE); |
| |
| if (ch < 256) { |
| return get_latin1_char(ch); |
| } |
| |
| unicode = PyUnicode_New(1, ch); |
| if (unicode == NULL) |
| return NULL; |
| |
| assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND); |
| if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { |
| PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch; |
| } else { |
| assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); |
| PyUnicode_4BYTE_DATA(unicode)[0] = ch; |
| } |
| assert(_PyUnicode_CheckConsistency(unicode, 1)); |
| return unicode; |
| } |
| |
| PyObject * |
| PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size) |
| { |
| PyObject *unicode; |
| Py_UCS4 maxchar = 0; |
| Py_ssize_t num_surrogates; |
| |
| if (u == NULL && size != 0) { |
| PyErr_BadInternalCall(); |
| return NULL; |
| } |
| |
| if (size == -1) { |
| size = wcslen(u); |
| } |
| |
| /* If the Unicode data is known at construction time, we can apply |
| some optimizations which share commonly used objects. */ |
| |
| /* Optimization for empty strings */ |
| if (size == 0) |
| _Py_RETURN_UNICODE_EMPTY(); |
| |
| #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION |
| /* Oracle Solaris uses non-Unicode internal wchar_t form for |
| non-Unicode locales and hence needs conversion to UCS-4 first. */ |
| if (_Py_LocaleUsesNonUnicodeWchar()) { |
| wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size); |
| if (!converted) { |
| return NULL; |
| } |
| PyObject *unicode = _PyUnicode_FromUCS4(converted, size); |
| PyMem_Free(converted); |
| return unicode; |
| } |
| #endif |
| |
| /* Single character Unicode objects in the Latin-1 range are |
| shared when using this constructor */ |
| if (size == 1 && (Py_UCS4)*u < 256) |
| return get_latin1_char((unsigned char)*u); |
| |
| /* If not empty and not single character, copy the Unicode data |
| into the new object */ |
| if (find_maxchar_surrogates(u, u + size, |
| &maxchar, &num_surrogates) == -1) |
| return NULL; |
| |
| unicode = PyUnicode_New(size - num_surrogates, maxchar); |
| if (!unicode) |
| return NULL; |
| |
| switch (PyUnicode_KIND(unicode)) { |
| case PyUnicode_1BYTE_KIND: |
| _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char, |
| u, u + size, PyUnicode_1BYTE_DATA(unicode)); |
| break; |
| case PyUnicode_2BYTE_KIND: |
| #if Py_UNICODE_SIZE == 2 |
| memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2); |
| #else |
| _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2, |
| u, u + size, PyUnicode_2BYTE_DATA(unicode)); |
| #endif |
| break; |
| case PyUnicode_4BYTE_KIND: |
| #if SIZEOF_WCHAR_T == 2 |
| /* This is the only case which has to process surrogates, thus |
| a simple copy loop is not enough and we need a function. */ |
| unicode_convert_wchar_to_ucs4(u, u + size, unicode); |
| #else |
| assert(num_surrogates == 0); |
| memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4); |
| #endif |
| break; |
| default: |
| Py_UNREACHABLE(); |
| } |
| |
| return unicode_result(unicode); |
| } |
| |
| PyObject * |
| PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) |
| { |
| if (size < 0) { |
| PyErr_SetString(PyExc_SystemError, |
| "Negative size passed to PyUnicode_FromStringAndSize"); |
| return NULL; |
| } |
| if (u != NULL) { |
| return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL); |
| } |
| if (size > 0) { |
| PyErr_SetString(PyExc_SystemError, |
| "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize"); |
| return NULL; |
| } |
| return unicode_new_empty(); |
| } |
| |
| PyObject * |
| PyUnicode_FromString(const char *u) |
| { |
| size_t size = strlen(u); |
| if (size > PY_SSIZE_T_MAX) { |
| PyErr_SetString(PyExc_OverflowError, "input too long"); |
| return NULL; |
| } |
| return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL); |
| } |
| |
| |
| PyObject * |
| _PyUnicode_FromId(_Py_Identifier *id) |
| { |
| PyInterpreterState *interp = _PyInterpreterState_GET(); |
| struct _Py_unicode_ids *ids = &interp->unicode.ids; |
| |
| Py_ssize_t index = _Py_atomic_size_get(&id->index); |
| if (index < 0) { |
| struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_state.ids; |
| |
| PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK); |
| // Check again to detect concurrent access. Another thread can have |
| // initialized the index while this thread waited for the lock. |
| index = _Py_atomic_size_get(&id->index); |
| if (index < 0) { |
| assert(rt_ids->next_index < PY_SSIZE_T_MAX); |
| index = rt_ids->next_index; |
| rt_ids->next_index++; |
| _Py_atomic_size_set(&id->index, index); |
| } |
| PyThread_release_lock(rt_ids->lock); |
| } |
| assert(index >= 0); |
| |
| PyObject *obj; |
| if (index < ids->size) { |
| obj = ids->array[index]; |
| if (obj) { |
| // Return a borrowed reference |
| return obj; |
| } |
| } |
| |
| obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string), |
| NULL, NULL); |
| if (!obj) { |
| return NULL; |
| } |
| PyUnicode_InternInPlace(&obj); |
| |
| if (index >= ids->size) { |
| // Overallocate to reduce the number of realloc |
| Py_ssize_t new_size = Py_MAX(index * 2, 16); |
| Py_ssize_t item_size = sizeof(ids->array[0]); |
| PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size); |
| if (new_array == NULL) { |
| PyErr_NoMemory(); |
| return NULL; |
| } |
| memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size); |
| ids->array = new_array; |
| ids->size = new_size; |
| } |
| |
| // The array stores a strong reference |
| ids->array[index] = obj; |
| |
| // Return a borrowed reference |
| return obj; |
| } |
| |
| |
| static void |
| unicode_clear_identifiers(struct _Py_unicode_state *state) |
| { |
| struct _Py_unicode_ids *ids = &state->ids; |
| for (Py_ssize_t i=0; i < ids->size; i++) { |
| Py_XDECREF(ids->array[i]); |
| } |
| ids->size = 0; |
| PyMem_Free(ids->array); |
| ids->array = NULL; |
| // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid |
| // after Py_Finalize(). |
| } |
| |
| |
| /* Internal function, doesn't check maximum character */ |
| |
| PyObject* |
| _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size) |
| { |
| const unsigned char *s = (const unsigned char *)buffer; |
| PyObject *unicode; |
| if (size == 1) { |
| #ifdef Py_DEBUG |
| assert((unsigned char)s[0] < 128); |
| #endif |
| return get_latin1_char(s[0]); |
| } |
| unicode = PyUnicode_New(size, 127); |
| if (!unicode) |
| return NULL; |
| memcpy(PyUnicode_1BYTE_DATA(unicode), s, size); |
| assert(_PyUnicode_CheckConsistency(unicode, 1)); |
| return unicode; |
| } |
| |
| static Py_UCS4 |
| kind_maxchar_limit(int kind) |
| { |
| switch (kind) { |
| case PyUnicode_1BYTE_KIND: |
| return 0x80; |
| case PyUnicode_2BYTE_KIND: |
| return 0x100; |
| case PyUnicode_4BYTE_KIND: |
| return 0x10000; |
| default: |
| Py_UNREACHABLE(); |
| } |
| } |
| |
| static PyObject* |
| _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size) |
| { |
| PyObject *res; |
| unsigned char max_char; |
| |
| if (size == 0) { |
| _Py_RETURN_UNICODE_EMPTY(); |
| } |
| assert(size > 0); |
| if (size == 1) { |
| return get_latin1_char(u[0]); |
| } |
| |
| max_char = ucs1lib_find_max_char(u, u + size); |
| res = PyUnicode_New(size, max_char); |
| if (!res) |
| return NULL; |
| memcpy(PyUnicode_1BYTE_DATA(res), u, size); |
| assert(_PyUnicode_CheckConsistency(res, 1)); |
| return res; |
| } |
| |
| static PyObject* |
| _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) |
| { |
| PyObject *res; |
| Py_UCS2 max_char; |
| |
| if (size == 0) |
| _Py_RETURN_UNICODE_EMPTY(); |
| assert(size > 0); |
| if (size == 1) |
| return unicode_char(u[0]); |
| |
| max_char = ucs2lib_find_max_char(u, u + size); |
| res = PyUnicode_New(size, max_char); |
| if (!res) |
| return NULL; |
| if (max_char >= 256) |
| memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); |
| else { |
| _PyUnicode_CONVERT_BYTES( |
| Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res)); |
| } |
| assert(_PyUnicode_CheckConsistency(res, 1)); |
| return res; |
| } |
| |
| static PyObject* |
| _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) |
| { |
| PyObject *res; |
| Py_UCS4 max_char; |
| |
| if (size == 0) |
| _Py_RETURN_UNICODE_EMPTY(); |
| assert(size > 0); |
| if (size == 1) |
| return unicode_char(u[0]); |
| |
| max_char = ucs4lib_find_max_char(u, u + size); |
| res = PyUnicode_New(size, max_char); |
| if (!res) |
| return NULL; |
| if (max_char < 256) |
| _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size, |
| PyUnicode_1BYTE_DATA(res)); |
| else if (max_char < 0x10000) |
| _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size, |
| PyUnicode_2BYTE_DATA(res)); |
| else |
| memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); |
| assert(_PyUnicode_CheckConsistency(res, 1)); |
| return res; |
| } |
| |
| PyObject* |
| PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) |
| { |
| if (size < 0) { |
| PyErr_SetString(PyExc_ValueError, "size must be positive"); |
| return NULL; |
| } |
| switch (kind) { |
| case PyUnicode_1BYTE_KIND: |
| return _PyUnicode_FromUCS1(buffer, size); |
| case PyUnicode_2BYTE_KIND: |
| return _PyUnicode_FromUCS2(buffer, size); |
| case PyUnicode_4BYTE_KIND: |
| return _PyUnicode_FromUCS4(buffer, size); |
| default: |
| PyErr_SetString(PyExc_SystemError, "invalid kind"); |
| return NULL; |
| } |
| } |
| |
| Py_UCS4 |
| _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end) |
| { |
| int kind; |
| const void *startptr, *endptr; |
| |
| assert(0 <= start); |
| assert(end <= PyUnicode_GET_LENGTH(unicode)); |
| assert(start <= end); |
| |
| if (start == 0 && end == PyUnicode_GET_LENGTH(unicode)) |
| return PyUnicode_MAX_CHAR_VALUE(unicode); |
| |
| if (start == end) |
| return 127; |
| |
| if (PyUnicode_IS_ASCII(unicode)) |
| return 127; |
| |
| kind = PyUnicode_KIND(unicode); |
| startptr = PyUnicode_DATA(unicode); |
| endptr = (char *)startptr + end * kind; |
| startptr = (char *)startptr + start * kind; |
| switch(kind) { |
| case PyUnicode_1BYTE_KIND: |
| return ucs1lib_find_max_char(startptr, endptr); |
| case PyUnicode_2BYTE_KIND: |
| return ucs2lib_find_max_char(startptr, endptr); |
| case PyUnicode_4BYTE_KIND: |
| return ucs4lib_find_max_char(startptr, endptr); |
| default: |
| Py_UNREACHABLE(); |
| } |
| } |
| |
| /* Ensure that a string uses the most efficient storage, if it is not the |
| case: create a new string with of the right kind. Write NULL into *p_unicode |
| on error. */ |
| static void |
| unicode_adjust_maxchar(PyObject **p_unicode) |
| { |
| PyObject *unicode, *copy; |
| Py_UCS4 max_char; |
| Py_ssize_t len; |
| int kind; |
| |
| assert(p_unicode != NULL); |
| unicode = *p_unicode; |
| if (PyUnicode_IS_ASCII(unicode)) |
| return; |
| |
| len = PyUnicode_GET_LENGTH(unicode); |
| kind = PyUnicode_KIND(unicode); |
| if (kind == PyUnicode_1BYTE_KIND) { |
| const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode); |
| max_char = ucs1lib_find_max_char(u, u + len); |
| if (max_char >= 128) |
| return; |
| } |
| else if (kind == PyUnicode_2BYTE_KIND) { |
| const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode); |
| max_char = ucs2lib_find_max_char(u, u + len); |
| if (max_char >= 256) |
| return; |
| } |
| else if (kind == PyUnicode_4BYTE_KIND) { |
| const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode); |
| max_char = ucs4lib_find_max_char(u, u + len); |
| if (max_char >= 0x10000) |
| return; |
| } |
| else |
| Py_UNREACHABLE(); |
| |
| copy = PyUnicode_New(len, max_char); |
| if (copy != NULL) |
| _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len); |
| Py_DECREF(unicode); |
| *p_unicode = copy; |
| } |
| |
| PyObject* |
| _PyUnicode_Copy(PyObject *unicode) |
| { |
| Py_ssize_t length; |
| PyObject *copy; |
| |
| if (!PyUnicode_Check(unicode)) { |
| PyErr_BadInternalCall(); |
| return NULL; |
| } |
| |
| length = PyUnicode_GET_LENGTH(unicode); |
| copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); |
| if (!copy) |
| return NULL; |
| assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode)); |
| |
| memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode), |
| length * PyUnicode_KIND(unicode)); |
| assert(_PyUnicode_CheckConsistency(copy, 1)); |
| return copy; |
| } |
| |
| |
| /* Widen Unicode objects to larger buffers. Don't write terminating null |
| character. Return NULL on error. */ |
| |
| static void* |
| unicode_askind(int skind, void const *data, Py_ssize_t len, int kind) |
| { |
| void *result; |
| |
| assert(skind < kind); |
| switch (kind) { |
| case PyUnicode_2BYTE_KIND: |
| result = PyMem_New(Py_UCS2, len); |
| if (!result) |
| return PyErr_NoMemory(); |
| assert(skind == PyUnicode_1BYTE_KIND); |
| _PyUnicode_CONVERT_BYTES( |
| Py_UCS1, Py_UCS2, |
| (const Py_UCS1 *)data, |
| ((const Py_UCS1 *)data) + len, |
| result); |
| return result; |
| case PyUnicode_4BYTE_KIND: |
| result = PyMem_New(Py_UCS4, len); |
| if (!result) |
| return PyErr_NoMemory(); |
| if (skind == PyUnicode_2BYTE_KIND) { |
| _PyUnicode_CONVERT_BYTES( |
| Py_UCS2, Py_UCS4, |
| (const Py_UCS2 *)data, |
| ((const Py_UCS2 *)data) + len, |
| result); |
| } |
| else { |
| assert(skind == PyUnicode_1BYTE_KIND); |
| _PyUnicode_CONVERT_BYTES( |
| Py_UCS1, Py_UCS4, |
| (const Py_UCS1 *)data, |
| ((const Py_UCS1 *)data) + len, |
| result); |
| } |
| return result; |
| default: |
| Py_UNREACHABLE(); |
| return NULL; |
| } |
| } |
| |
| static Py_UCS4* |
| as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, |
| int copy_null) |
| { |
| int kind; |
| const void *data; |
| Py_ssize_t len, targetlen; |
| kind = PyUnicode_KIND(string); |
| data = PyUnicode_DATA(string); |
| len = PyUnicode_GET_LENGTH(string); |
| targetlen = len; |
| if (copy_null) |
| targetlen++; |
| if (!target) { |
| target = PyMem_New(Py_UCS4, targetlen); |
| if (!target) { |
| PyErr_NoMemory(); |
| return NULL; |
| } |
| } |
| else { |
| if (targetsize < targetlen) { |
| PyErr_Format(PyExc_SystemError, |
| "string is longer than the buffer"); |
| if (copy_null && 0 < targetsize) |
| target[0] = 0; |
| return NULL; |
| } |
| } |
| if (kind == PyUnicode_1BYTE_KIND) { |
| const Py_UCS1 *start = (const Py_UCS1 *) data; |
| _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target); |
| } |
| else if (kind == PyUnicode_2BYTE_KIND) { |
| const Py_UCS2 *start = (const Py_UCS2 *) data; |
| _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target); |
| } |
| else if (kind == PyUnicode_4BYTE_KIND) { |
| memcpy(target, data, len * sizeof(Py_UCS4)); |
| } |
| else { |
| Py_UNREACHABLE(); |
| } |
| if (copy_null) |
| target[len] = 0; |
| return target; |
| } |
| |
| Py_UCS4* |
| PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, |
| int copy_null) |
| { |
| if (target == NULL || targetsize < 0) { |
| PyErr_BadInternalCall(); |
| return NULL; |
| } |
| return as_ucs4(string, target, targetsize, copy_null); |
| } |
| |
| Py_UCS4* |
| PyUnicode_AsUCS4Copy(PyObject *string) |
| { |
| return as_ucs4(string, NULL, 0, 1); |
| } |
| |
/* Size of the scratch buffers that receive the output of %jo, %jd or %p.
   Octal is the widest case: each digit carries 3 bits, so an intmax_t needs
   at most ceil(8*sizeof(intmax_t) / 3) digits.  On top of that we reserve
   1 character for the sign, 2 for the "0x" prefix (for %p) and 1 for the
   terminating NUL. */
#define MAX_INTMAX_CHARS (((sizeof(intmax_t) * 8 - 1) / 3) + 5)
| |
| static int |
| unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str, |
| Py_ssize_t width, Py_ssize_t precision, int flags) |
| { |
| Py_ssize_t length, fill, arglen; |
| Py_UCS4 maxchar; |
| |
| length = PyUnicode_GET_LENGTH(str); |
| if ((precision == -1 || precision >= length) |
| && width <= length) |
| return _PyUnicodeWriter_WriteStr(writer, str); |
| |
| if (precision != -1) |
| length = Py_MIN(precision, length); |
| |
| arglen = Py_MAX(length, width); |
| if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) |
| maxchar = _PyUnicode_FindMaxChar(str, 0, length); |
| else |
| maxchar = writer->maxchar; |
| |
| if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1) |
| return -1; |
| |
| fill = Py_MAX(width - length, 0); |
| if (fill && !(flags & F_LJUST)) { |
| if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1) |
| return -1; |
| writer->pos += fill; |
| } |
| |
| _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, |
| str, 0, length); |
| writer->pos += length; |
| |
| if (fill && (flags & F_LJUST)) { |
| if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1) |
| return -1; |
| writer->pos += fill; |
| } |
| |
| return 0; |
| } |
| |
| static int |
| unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str, |
| Py_ssize_t width, Py_ssize_t precision, int flags) |
| { |
| /* UTF-8 */ |
| Py_ssize_t length; |
| PyObject *unicode; |
| int res; |
| |
| if (precision == -1) { |
| length = strlen(str); |
| } |
| else { |
| length = 0; |
| while (length < precision && str[length]) { |
| length++; |
| } |
| } |
| unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL); |
| if (unicode == NULL) |
| return -1; |
| |
| res = unicode_fromformat_write_str(writer, unicode, width, -1, flags); |
| Py_DECREF(unicode); |
| return res; |
| } |
| |
| static int |
| unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str, |
| Py_ssize_t width, Py_ssize_t precision, int flags) |
| { |
| /* UTF-8 */ |
| Py_ssize_t length; |
| PyObject *unicode; |
| int res; |
| |
| if (precision == -1) { |
| length = wcslen(str); |
| } |
| else { |
| length = 0; |
| while (length < precision && str[length]) { |
| length++; |
| } |
| } |
| unicode = PyUnicode_FromWideChar(str, length); |
| if (unicode == NULL) |
| return -1; |
| |
| res = unicode_fromformat_write_str(writer, unicode, width, -1, flags); |
| Py_DECREF(unicode); |
| return res; |
| } |
| |
| #define F_LONG 1 |
| #define F_LONGLONG 2 |
| #define F_SIZE 3 |
| #define F_PTRDIFF 4 |
| #define F_INTMAX 5 |
| static const char * const formats[] = {"%d", "%ld", "%lld", "%zd", "%td", "%jd"}; |
| static const char * const formats_o[] = {"%o", "%lo", "%llo", "%zo", "%to", "%jo"}; |
| static const char * const formats_u[] = {"%u", "%lu", "%llu", "%zu", "%tu", "%ju"}; |
| static const char * const formats_x[] = {"%x", "%lx", "%llx", "%zx", "%tx", "%jx"}; |
| static const char * const formats_X[] = {"%X", "%lX", "%llX", "%zX", "%tX", "%jX"}; |
| |
| static const char* |
| unicode_fromformat_arg(_PyUnicodeWriter *writer, |
| const char *f, va_list *vargs) |
| { |
| const char *p; |
| Py_ssize_t len; |
| int flags = 0; |
| Py_ssize_t width; |
| Py_ssize_t precision; |
| |
| p = f; |
| f++; |
| if (*f == '%') { |
| if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0) |
| return NULL; |
| f++; |
| return f; |
| } |
| |
| /* Parse flags. Example: "%-i" => flags=F_LJUST. */ |
| /* Flags '+', ' ' and '#' are not particularly useful. |
| * They are not worth the implementation and maintenance costs. |
| * In addition, '#' should add "0" for "o" conversions for compatibility |
| * with printf, but it would confuse Python users. */ |
| while (1) { |
| switch (*f++) { |
| case '-': flags |= F_LJUST; continue; |
| case '0': flags |= F_ZERO; continue; |
| } |
| f--; |
| break; |
| } |
| |
| /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ |
| width = -1; |
| if (*f == '*') { |
| width = va_arg(*vargs, int); |
| if (width < 0) { |
| flags |= F_LJUST; |
| width = -width; |
| } |
| f++; |
| } |
| else if (Py_ISDIGIT((unsigned)*f)) { |
| width = *f - '0'; |
| f++; |
| while (Py_ISDIGIT((unsigned)*f)) { |
| if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) { |
| PyErr_SetString(PyExc_ValueError, |
| "width too big"); |
| return NULL; |
| } |
| width = (width * 10) + (*f - '0'); |
| f++; |
| } |
| } |
| precision = -1; |
| if (*f == '.') { |
| f++; |
| if (*f == '*') { |
| precision = va_arg(*vargs, int); |
| if (precision < 0) { |
| precision = -2; |
| } |
| f++; |
| } |
| else if (Py_ISDIGIT((unsigned)*f)) { |
| precision = (*f - '0'); |
| f++; |
| while (Py_ISDIGIT((unsigned)*f)) { |
| if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) { |
| PyErr_SetString(PyExc_ValueError, |
| "precision too big"); |
| return NULL; |
| } |
| precision = (precision * 10) + (*f - '0'); |
| f++; |
| } |
| } |
| } |
| |
| int sizemod = 0; |
| if (*f == 'l') { |
| if (f[1] == 'l') { |
| sizemod = F_LONGLONG; |
| f += 2; |
| } |
| else { |
| sizemod = F_LONG; |
| ++f; |
| } |
| } |
| else if (*f == 'z') { |
| sizemod = F_SIZE; |
| ++f; |
| } |
| else if (*f == 't') { |
| sizemod = F_PTRDIFF; |
| ++f; |
| } |
| else if (*f == 'j') { |
| sizemod = F_INTMAX; |
| ++f; |
| } |
| if (f[0] != '\0' && f[1] == '\0') |
| writer->overallocate = 0; |
| |
| switch (*f) { |
| case 'd': case 'i': case 'o': case 'u': case 'x': case 'X': |
| break; |
| case 'c': case 'p': |
| if (sizemod || width >= 0 || precision >= 0) goto invalid_format; |
| break; |
| case 's': |
| case 'V': |
| if (sizemod && sizemod != F_LONG) goto invalid_format; |
| break; |
| default: |
| if (sizemod) goto invalid_format; |
| break; |
| } |
| |
| switch (*f) { |
| case 'c': |
| { |
| int ordinal = va_arg(*vargs, int); |
| if (ordinal < 0 || ordinal > MAX_UNICODE) { |
| PyErr_SetString(PyExc_OverflowError, |
| "character argument not in range(0x110000)"); |
| return NULL; |
| } |
| if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0) |
| return NULL; |
| break; |
| } |
| |
| case 'd': case 'i': |
| case 'o': case 'u': case 'x': case 'X': |
| { |
| /* used by sprintf */ |
| char buffer[MAX_INTMAX_CHARS]; |
| const char *fmt = NULL; |
| switch (*f) { |
| case 'o': fmt = formats_o[sizemod]; break; |
| case 'u': fmt = formats_u[sizemod]; break; |
| case 'x': fmt = formats_x[sizemod]; break; |
| case 'X': fmt = formats_X[sizemod]; break; |
| default: fmt = formats[sizemod]; break; |
| } |
| int issigned = (*f == 'd' || *f == 'i'); |
| switch (sizemod) { |
| case F_LONG: |
| len = issigned ? |
| sprintf(buffer, fmt, va_arg(*vargs, long)) : |
| sprintf(buffer, fmt, va_arg(*vargs, unsigned long)); |
| break; |
| case F_LONGLONG: |
| len = issigned ? |
| sprintf(buffer, fmt, va_arg(*vargs, long long)) : |
| sprintf(buffer, fmt, va_arg(*vargs, unsigned long long)); |
| break; |
| case F_SIZE: |
| len = issigned ? |
| sprintf(buffer, fmt, va_arg(*vargs, Py_ssize_t)) : |
| sprintf(buffer, fmt, va_arg(*vargs, size_t)); |
| break; |
| case F_PTRDIFF: |
| len = sprintf(buffer, fmt, va_arg(*vargs, ptrdiff_t)); |
| break; |
| case F_INTMAX: |
| len = issigned ? |
| sprintf(buffer, fmt, va_arg(*vargs, intmax_t)) : |
| sprintf(buffer, fmt, va_arg(*vargs, uintmax_t)); |
| break; |
| default: |
| len = issigned ? |
| sprintf(buffer, fmt, va_arg(*vargs, int)) : |
| sprintf(buffer, fmt, va_arg(*vargs, unsigned int)); |
| break; |
| } |
| assert(len >= 0); |
| |
| int sign = (buffer[0] == '-'); |
| len -= sign; |
| |
| precision = Py_MAX(precision, len); |
| width = Py_MAX(width, precision + sign); |
| if ((flags & F_ZERO) && !(flags & F_LJUST)) { |
| precision = width - sign; |
| } |
| |
| Py_ssize_t spacepad = Py_MAX(width - precision - sign, 0); |
| Py_ssize_t zeropad = Py_MAX(precision - len, 0); |
| |
| if (_PyUnicodeWriter_Prepare(writer, width, 127) == -1) |
| return NULL; |
| |
| if (spacepad && !(flags & F_LJUST)) { |
| if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1) |
| return NULL; |
| writer->pos += spacepad; |
| } |
| |
| if (sign) { |
| if (_PyUnicodeWriter_WriteChar(writer, '-') == -1) |
| return NULL; |
| } |
| |
| if (zeropad) { |
| if (PyUnicode_Fill(writer->buffer, writer->pos, zeropad, '0') == -1) |
| return NULL; |
| writer->pos += zeropad; |
| } |
| |
| if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[sign], len) < 0) |
| return NULL; |
| |
| if (spacepad && (flags & F_LJUST)) { |
| if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1) |
| return NULL; |
| writer->pos += spacepad; |
| } |
| break; |
| } |
| |
| case 'p': |
| { |
| char number[MAX_INTMAX_CHARS]; |
| |
| len = sprintf(number, "%p", va_arg(*vargs, void*)); |
| assert(len >= 0); |
| |
| /* %p is ill-defined: ensure leading 0x. */ |
| if (number[1] == 'X') |
| number[1] = 'x'; |
| else if (number[1] != 'x') { |
| memmove(number + 2, number, |
| strlen(number) + 1); |
| number[0] = '0'; |
| number[1] = 'x'; |
| len += 2; |
| } |
| |
| if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0) |
| return NULL; |
| break; |
| } |
| |
| case 's': |
| { |
| if (sizemod) { |
| const wchar_t *s = va_arg(*vargs, const wchar_t*); |
| if (unicode_fromformat_write_wcstr(writer, s, width, precision, flags) < 0) |
| return NULL; |
| } |
| else { |
| /* UTF-8 */ |
| const char *s = va_arg(*vargs, const char*); |
| if (unicode_fromformat_write_cstr(writer, s, width, precision, flags) < 0) |
| return NULL; |
| } |
| break; |
| } |
| |
| case 'U': |
| { |
| PyObject *obj = va_arg(*vargs, PyObject *); |
| assert(obj && _PyUnicode_CHECK(obj)); |
| |
| if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1) |
| return NULL; |
| break; |
| } |
| |
| case 'V': |
| { |
| PyObject *obj = va_arg(*vargs, PyObject *); |
| const char *str; |
| const wchar_t *wstr; |
| if (sizemod) { |
| wstr = va_arg(*vargs, const wchar_t*); |
| } |
| else { |
| str = va_arg(*vargs, const char *); |
| } |
| if (obj) { |
| assert(_PyUnicode_CHECK(obj)); |
| if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1) |
| return NULL; |
| } |
| else if (sizemod) { |
| assert(wstr != NULL); |
| if (unicode_fromformat_write_wcstr(writer, wstr, width, precision, flags) < 0) |
| return NULL; |
| } |
| else { |
| assert(str != NULL); |
| if (unicode_fromformat_write_cstr(writer, str, width, precision, flags) < 0) |
| return NULL; |
| } |
| break; |
| } |
| |
| case 'S': |
| { |
| PyObject *obj = va_arg(*vargs, PyObject *); |
| PyObject *str; |
| assert(obj); |
| str = PyObject_Str(obj); |
| if (!str) |
| return NULL; |
| if (unicode_fromformat_write_str(writer, str, width, precision, flags) == -1) { |
| Py_DECREF(str); |
| return NULL; |
| } |
| Py_DECREF(str); |
| break; |
| } |
| |
| case 'R': |
| { |
| PyObject *obj = va_arg(*vargs, PyObject *); |
| PyObject *repr; |
| assert(obj); |
| repr = PyObject_Repr(obj); |
| if (!repr) |
| return NULL; |
| if (unicode_fromformat_write_str(writer, repr, width, precision, flags) == -1) { |
| Py_DECREF(repr); |
| return NULL; |
| } |
| Py_DECREF(repr); |
| break; |
| } |
| |
| case 'A': |
| { |
| PyObject *obj = va_arg(*vargs, PyObject *); |
| PyObject *ascii; |
| assert(obj); |
| ascii = PyObject_ASCII(obj); |
| if (!ascii) |
| return NULL; |
| if (unicode_fromformat_write_str(writer, ascii, width, precision, flags) == -1) { |
| Py_DECREF(ascii); |
| return NULL; |
| } |
| Py_DECREF(ascii); |
| break; |
| } |
| |
| default: |
| invalid_format: |
| PyErr_Format(PyExc_SystemError, "invalid format string: %s", p); |
| return NULL; |
| } |
| |
| f++; |
| return f; |
| } |
| |
| PyObject * |
| PyUnicode_FromFormatV(const char *format, va_list vargs) |
| { |
| va_list vargs2; |
| const char *f; |
| _PyUnicodeWriter writer; |
| |
| _PyUnicodeWriter_Init(&writer); |
| writer.min_length = strlen(format) + 100; |
| writer.overallocate = 1; |
| |
| // Copy varags to be able to pass a reference to a subfunction. |
| va_copy(vargs2, vargs); |
| |
| for (f = format; *f; ) { |
| if (*f == '%') { |
| f = unicode_fromformat_arg(&writer, f, &vargs2); |
| if (f == NULL) |
| goto fail; |
| } |
| else { |
| const char *p; |
| Py_ssize_t len; |
| |
| p = f; |
| do |
| { |
| if ((unsigned char)*p > 127) { |
| PyErr_Format(PyExc_ValueError, |
| "PyUnicode_FromFormatV() expects an ASCII-encoded format " |
| "string, got a non-ASCII byte: 0x%02x", |
| (unsigned char)*p); |
| goto fail; |
| } |
| p++; |
| } |
| while (*p != '\0' && *p != '%'); |
| len = p - f; |
| |
| if (*p == '\0') |
| writer.overallocate = 0; |
| |
| if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0) |
| goto fail; |
| |
| f = p; |
| } |
| } |
| va_end(vargs2); |
| return _PyUnicodeWriter_Finish(&writer); |
| |
| fail: |
| va_end(vargs2); |
| _PyUnicodeWriter_Dealloc(&writer); |
| return NULL; |
| } |
| |
| PyObject * |
| PyUnicode_FromFormat(const char *format, ...) |
| { |
| PyObject* ret; |
| va_list vargs; |
| |
| va_start(vargs, format); |
| ret = PyUnicode_FromFormatV(format, vargs); |
| va_end(vargs); |
| return ret; |
| } |
| |
| static Py_ssize_t |
| unicode_get_widechar_size(PyObject *unicode) |
| { |
| Py_ssize_t res; |
| |
| assert(unicode != NULL); |
| assert(_PyUnicode_CHECK(unicode)); |
| |
| res = _PyUnicode_LENGTH(unicode); |
| #if SIZEOF_WCHAR_T == 2 |
| if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) { |
| const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode); |
| const Py_UCS4 *end = s + res; |
| for (; s < end; ++s) { |
| if (*s > 0xFFFF) { |
| ++res; |
| } |
| } |
| } |
| #endif |
| return res; |
| } |
| |
| static void |
| unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size) |
| { |
| assert(unicode != NULL); |
| assert(_PyUnicode_CHECK(unicode)); |
| |
| if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) { |
| memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t)); |
| return; |
| } |
| |
| if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) { |
| const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode); |
| for (; size--; ++s, ++w) { |
| *w = *s; |
| } |
| } |
| else { |
| #if SIZEOF_WCHAR_T == 4 |
| assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND); |
| const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode); |
| for (; size--; ++s, ++w) { |
| *
|