#include #include #include #include typedef unsigned short Py_UCS2; typedef unsigned char Py_UCS1; typedef unsigned int Py_UCS4; typedef ssize_t Py_ssize_t; typedef Py_ssize_t Py_hash_t; #define _PyObject_HEAD_EXTRA \ struct _object *_ob_next; \ struct _object *_ob_prev; typedef struct _object { _PyObject_HEAD_EXTRA Py_ssize_t ob_refcnt; struct _typeobject *ob_type; } PyObject; #define PyObject_HEAD PyObject ob_base; /* ASCII-only strings created through PyUnicode_New use the PyASCIIObject structure. state.ascii and state.compact are set, and the data immediately follow the structure. utf8_length and wstr_length can be found in the length field; the utf8 pointer is equal to the data pointer. */ typedef struct { /* There are 4 forms of Unicode strings: - compact ascii: * structure = PyASCIIObject * kind = PyUnicode_1BYTE_KIND * compact = 1 * ascii = 1 * ready = 1 * (length is the length of the utf8 and wstr strings) * (data starts just after the structure) * (since ASCII is decoded from UTF-8, the utf8 string are the data) - compact: * structure = PyCompactUnicodeObject * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND * compact = 1 * ready = 1 * ascii = 0 * utf8 is not shared with data * utf8_length = 0 if utf8 is NULL * wstr is shared with data and wstr_length=length if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4 * wstr_length = 0 if wstr is NULL * (data starts just after the structure) - legacy string, not ready: * structure = PyUnicodeObject * kind = PyUnicode_WCHAR_KIND * compact = 0 * ascii = 0 * ready = 0 * wstr is not NULL * data.any is NULL * utf8 is NULL * utf8_length = 0 * interned = SSTATE_NOT_INTERNED - legacy string, ready: * structure = PyUnicodeObject structure * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND * compact = 0 * ready = 1 * data.any is not NULL * utf8 is shared and utf8_length = length with data.any if ascii = 1 * utf8_length = 0 if utf8 is NULL * wstr is shared and wstr_length = length with data.any if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4 * wstr_length = 0 if wstr is NULL Compact strings use only one memory block (structure + characters), whereas legacy strings use one block for the structure and one block for characters. Legacy strings are created by PyUnicode_FromUnicode() and PyUnicode_FromStringAndSize(NULL, size) functions. They become ready when PyUnicode_READY() is called. See also _PyUnicode_CheckConsistency(). */ PyObject_HEAD Py_ssize_t length; /* Number of code points in the string */ Py_hash_t hash; /* Hash value; -1 if not set */ struct { /* SSTATE_NOT_INTERNED (0) SSTATE_INTERNED_MORTAL (1) SSTATE_INTERNED_IMMORTAL (2) If interned != SSTATE_NOT_INTERNED, the two references from the dictionary to this object are *not* counted in ob_refcnt. */ unsigned int interned:2; /* Character size: - PyUnicode_WCHAR_KIND (0): * character type = wchar_t (16 or 32 bits, depending on the platform) - PyUnicode_1BYTE_KIND (1): * character type = Py_UCS1 (8 bits, unsigned) * if ascii is set, all characters must be in range U+0000-U+007F, otherwise at least one character must be in range U+0080-U+00FF - PyUnicode_2BYTE_KIND (2): * character type = Py_UCS2 (16 bits, unsigned) * at least one character must be in range U+0100-U+FFFF - PyUnicode_4BYTE_KIND (4): * character type = Py_UCS4 (32 bits, unsigned) * at least one character must be in range U+10000-U+10FFFF */ unsigned int kind:3; /* Compact is with respect to the allocation scheme. Compact unicode objects only require one memory block while non-compact objects use one block for the PyUnicodeObject struct and another for its data buffer. */ unsigned int compact:1; /* The string only contains characters in range U+0000-U+007F (ASCII) and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is set, use the PyASCIIObject structure. */ unsigned int ascii:1; /* The ready flag indicates whether the object layout is initialized completely. This means that this is either a compact object, or the data pointer is filled out. The bit is redundant, and helps to minimize the test in PyUnicode_IS_READY(). */ unsigned int ready:1; } state; wchar_t *wstr; /* wchar_t representation (null-terminated) */ } PyASCIIObject; /* Non-ASCII strings allocated through PyUnicode_New use the PyCompactUnicodeObject structure. state.compact is set, and the data immediately follow the structure. */ typedef struct { PyASCIIObject _base; Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the * terminating 0円. */ char *utf8; /* UTF-8 representation (null-terminated) */ Py_ssize_t wstr_length; /* Number of code points in wstr, possible * surrogates count as two code points. */ } PyCompactUnicodeObject; /* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the PyUnicodeObject structure. The actual string data is initially in the wstr block, and copied into the data block using _PyUnicode_Ready. */ typedef struct { PyCompactUnicodeObject _base; union { void *any; Py_UCS1 *latin1; Py_UCS2 *ucs2; Py_UCS4 *ucs4; } data; /* Canonical, smallest-form Unicode buffer */ } PyUnicodeObject; #define PyUnicode_IS_ASCII(op) \ (((PyASCIIObject*)op)->state.ascii) /* Return true if the string is compact or 0 if not. No type checks or Ready calls are performed. */ #define PyUnicode_IS_COMPACT(op) \ (((PyASCIIObject*)(op))->state.compact) /* Return true if the string is a compact ASCII string (use PyASCIIObject structure), or 0 if not. No type checks or Ready calls are performed. */ #define PyUnicode_IS_COMPACT_ASCII(op) \ (PyUnicode_IS_ASCII(op) && PyUnicode_IS_COMPACT(op)) enum PyUnicode_Kind { /* String contains only wstr byte characters. This is only possible when the string was created with a legacy API and _PyUnicode_Ready() has not been called yet. */ PyUnicode_WCHAR_KIND = 0, /* Return values of the PyUnicode_KIND() macro: */ PyUnicode_1BYTE_KIND = 1, PyUnicode_2BYTE_KIND = 2, PyUnicode_4BYTE_KIND = 4 }; /* Return pointers to the canonical representation cast to unsigned char, Py_UCS2, or Py_UCS4 for direct character access. No checks are performed, use PyUnicode_KIND() before to ensure these will work correctly. */ #define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op)) #define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op)) #define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op)) /* Return one of the PyUnicode_*_KIND values defined above. */ #define PyUnicode_KIND(op) \ (assert(PyUnicode_IS_READY(op)), \ ((PyASCIIObject *)(op))->state.kind) /* Return a void pointer to the raw unicode buffer. */ #define _PyUnicode_COMPACT_DATA(op) \ (PyUnicode_IS_COMPACT_ASCII(op) ? \ ((void*)((PyASCIIObject*)(op) + 1)) : \ ((void*)((PyCompactUnicodeObject*)(op) + 1))) #define _PyUnicode_NONCOMPACT_DATA(op) \ (assert(((PyUnicodeObject*)(op))->data.any), \ ((((PyUnicodeObject *)(op))->data.any))) #define PyUnicode_DATA(op) \ (PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \ _PyUnicode_NONCOMPACT_DATA(op)) int main() { PyASCIIObject *ascii; PyCompactUnicodeObject *compact; char *data; compact = malloc(sizeof(PyCompactUnicodeObject) + 2); ascii = (PyASCIIObject*)compact; ascii->state.kind = 1; ascii->state.compact = 1; assert(sizeof(Py_UCS2) == 2); assert(sizeof(Py_UCS4) == 4); printf("is ascii? %i\n", PyUnicode_IS_ASCII(compact)); printf("is compact? %i\n", PyUnicode_IS_COMPACT(compact)); printf("is compact ascii? %i\n", PyUnicode_IS_COMPACT_ASCII(compact)); data = _PyUnicode_COMPACT_DATA(compact); printf("_PyUnicode_COMPACT_DATA: %u vs %u\n", (unsigned)(data - (char*)compact), sizeof(PyCompactUnicodeObject)); data = PyUnicode_DATA(compact); printf("PyUnicode_DATA: %u vs %u\n", (unsigned)(data - (char*)compact), sizeof(PyCompactUnicodeObject)); data = (char*)((PyCompactUnicodeObject*)compact + 1); printf("explicit cast: %u vs %u\n", (unsigned)(data - (char*)compact), sizeof(PyCompactUnicodeObject)); return 0; }

AltStyle によって変換されたページ (->オリジナル) /