summary refs log tree commit diff
path: root/Modules/unicodedata.c
diff options
context:
space:
mode:
Diffstat (limited to 'Modules/unicodedata.c')
-rw-r--r--  Modules/unicodedata.c  324
1 files changed, 205 insertions, 119 deletions
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
index 4db5087f11..9fb1191fc5 100644
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -93,16 +93,13 @@ new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4)
static Py_UCS4 getuchar(PyUnicodeObject *obj)
{
- Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);
-
- if (PyUnicode_GET_SIZE(obj) == 1)
- return *v;
-#ifndef Py_UNICODE_WIDE
- else if ((PyUnicode_GET_SIZE(obj) == 2) &&
- (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
- (0xDC00 <= v[1] && v[1] <= 0xDFFF))
- return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
-#endif
+ if (PyUnicode_READY(obj))
+ return (Py_UCS4)-1;
+ if (PyUnicode_GET_LENGTH(obj) == 1) {
+ if (PyUnicode_READY(obj))
+ return (Py_UCS4)-1;
+ return PyUnicode_READ_CHAR(obj, 0);
+ }
PyErr_SetString(PyExc_TypeError,
"need a single Unicode character as parameter");
return (Py_UCS4)-1;
@@ -443,7 +440,7 @@ unicodedata_decomposition(PyObject *self, PyObject *args)
from Tools/unicode/makeunicodedata.py, it should not be possible
to overflow decomp_prefix. */
prefix_index = decomp_data[index] & 255;
- assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
+ assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
/* copy prefix */
i = strlen(decomp_prefix[prefix_index]);
@@ -498,36 +495,54 @@ static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
PyObject *result;
- Py_UNICODE *i, *end, *o;
+ Py_UCS4 *output;
+ Py_ssize_t i, o, osize;
+ int kind;
+ void *data;
/* Longest decomposition in Unicode 3.2: U+FDFA */
- Py_UNICODE stack[20];
+ Py_UCS4 stack[20];
Py_ssize_t space, isize;
int index, prefix, count, stackptr;
unsigned char prev, cur;
stackptr = 0;
- isize = PyUnicode_GET_SIZE(input);
- /* Overallocate atmost 10 characters. */
- space = (isize > 10 ? 10 : isize) + isize;
- result = PyUnicode_FromUnicode(NULL, space);
- if (!result)
+ isize = PyUnicode_GET_LENGTH(input);
+ space = isize;
+ /* Overallocate at most 10 characters. */
+ if (space > 10) {
+ if (space <= PY_SSIZE_T_MAX - 10)
+ space += 10;
+ }
+ else {
+ space *= 2;
+ }
+ osize = space;
+ output = PyMem_NEW(Py_UCS4, space);
+ if (!output) {
+ PyErr_NoMemory();
return NULL;
- i = PyUnicode_AS_UNICODE(input);
- end = i + isize;
- o = PyUnicode_AS_UNICODE(result);
+ }
+ i = o = 0;
+ kind = PyUnicode_KIND(input);
+ data = PyUnicode_DATA(input);
- while (i < end) {
- stack[stackptr++] = *i++;
+ while (i < isize) {
+ stack[stackptr++] = PyUnicode_READ(kind, data, i++);
while(stackptr) {
- Py_UNICODE code = stack[--stackptr];
+ Py_UCS4 code = stack[--stackptr];
/* Hangul Decomposition adds three characters in
- a single step, so we need atleast that much room. */
+ a single step, so we need at least that much room. */
if (space < 3) {
- Py_ssize_t newsize = PyUnicode_GET_SIZE(result) + 10;
+ Py_UCS4 *new_output;
+ osize += 10;
space += 10;
- if (PyUnicode_Resize(&result, newsize) == -1)
+ new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
+ if (new_output == NULL) {
+ PyMem_Free(output);
+ PyErr_NoMemory();
return NULL;
- o = PyUnicode_AS_UNICODE(result) + newsize - space;
+ }
+ output = new_output;
}
/* Hangul Decomposition. */
if (SBase <= code && code < (SBase+SCount)) {
@@ -535,11 +550,11 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
int L = LBase + SIndex / NCount;
int V = VBase + (SIndex % NCount) / TCount;
int T = TBase + SIndex % TCount;
- *o++ = L;
- *o++ = V;
+ output[o++] = L;
+ output[o++] = V;
space -= 2;
if (T != TBase) {
- *o++ = T;
+ output[o++] = T;
space --;
}
continue;
@@ -559,7 +574,7 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
/* Copy character if it is not decomposable, or has a
compatibility decomposition, but we do NFD. */
if (!count || (prefix && !k)) {
- *o++ = code;
+ output[o++] = code;
space--;
continue;
}
@@ -572,15 +587,20 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
}
}
- /* Drop overallocation. Cannot fail. */
- PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
+ result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
+ output, o);
+ PyMem_Free(output);
+ if (!result)
+ return NULL;
+ /* result is guaranteed to be ready, as it is compact. */
+ kind = PyUnicode_KIND(result);
+ data = PyUnicode_DATA(result);
/* Sort canonically. */
- i = PyUnicode_AS_UNICODE(result);
- prev = _getrecord_ex(*i)->combining;
- end = i + PyUnicode_GET_SIZE(result);
- for (i++; i < end; i++) {
- cur = _getrecord_ex(*i)->combining;
+ i = 0;
+ prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
+ for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
+ cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
if (prev == 0 || cur == 0 || prev <= cur) {
prev = cur;
continue;
@@ -588,31 +608,32 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
/* Non-canonical order. Need to switch *i with previous. */
o = i - 1;
while (1) {
- Py_UNICODE tmp = o[1];
- o[1] = o[0];
- o[0] = tmp;
+ Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
+ PyUnicode_WRITE(kind, data, o+1,
+ PyUnicode_READ(kind, data, o));
+ PyUnicode_WRITE(kind, data, o, tmp);
o--;
- if (o < PyUnicode_AS_UNICODE(result))
+ if (o < 0)
break;
- prev = _getrecord_ex(*o)->combining;
+ prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
if (prev == 0 || prev <= cur)
break;
}
- prev = _getrecord_ex(*i)->combining;
+ prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
}
return result;
}
static int
-find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
+find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
{
- int index;
+ unsigned int index;
for (index = 0; nfc[index].start; index++) {
- int start = nfc[index].start;
+ unsigned int start = nfc[index].start;
if (code < start)
return -1;
if (code <= start + nfc[index].count) {
- int delta = code - start;
+ unsigned int delta = code - start;
return nfc[index].index + delta;
}
}
@@ -623,27 +644,36 @@ static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
PyObject *result;
- Py_UNICODE *i, *i1, *o, *end;
+ int kind;
+ void *data;
+ Py_UCS4 *output;
+ Py_ssize_t i, i1, o, len;
int f,l,index,index1,comb;
- Py_UNICODE code;
- Py_UNICODE *skipped[20];
+ Py_UCS4 code;
+ Py_ssize_t skipped[20];
int cskipped = 0;
result = nfd_nfkd(self, input, k);
if (!result)
return NULL;
-
- /* We are going to modify result in-place.
- If nfd_nfkd is changed to sometimes return the input,
- this code needs to be reviewed. */
- assert(result != input);
-
- i = PyUnicode_AS_UNICODE(result);
- end = i + PyUnicode_GET_SIZE(result);
- o = PyUnicode_AS_UNICODE(result);
+ /* result will be "ready". */
+ kind = PyUnicode_KIND(result);
+ data = PyUnicode_DATA(result);
+ len = PyUnicode_GET_LENGTH(result);
+
+ /* We allocate a buffer for the output.
+ If we find that we made no changes, we still return
+ the NFD result. */
+ output = PyMem_NEW(Py_UCS4, len);
+ if (!output) {
+ PyErr_NoMemory();
+ Py_DECREF(result);
+ return 0;
+ }
+ i = o = 0;
again:
- while (i < end) {
+ while (i < len) {
for (index = 0; index < cskipped; index++) {
if (skipped[index] == i) {
/* *i character is skipped.
@@ -656,33 +686,41 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
}
/* Hangul Composition. We don't need to check for <LV,T>
pairs, since we always have decomposed data. */
- if (LBase <= *i && *i < (LBase+LCount) &&
- i + 1 < end &&
- VBase <= i[1] && i[1] <= (VBase+VCount)) {
+ code = PyUnicode_READ(kind, data, i);
+ if (LBase <= code && code < (LBase+LCount) &&
+ i + 1 < len &&
+ VBase <= PyUnicode_READ(kind, data, i+1) &&
+ PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
int LIndex, VIndex;
- LIndex = i[0] - LBase;
- VIndex = i[1] - VBase;
+ LIndex = code - LBase;
+ VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
code = SBase + (LIndex*VCount+VIndex)*TCount;
i+=2;
- if (i < end &&
- TBase <= *i && *i <= (TBase+TCount)) {
- code += *i-TBase;
+ if (i < len &&
+ TBase <= PyUnicode_READ(kind, data, i) &&
+ PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
+ code += PyUnicode_READ(kind, data, i)-TBase;
i++;
}
- *o++ = code;
+ output[o++] = code;
continue;
}
- f = find_nfc_index(self, nfc_first, *i);
+ /* code is still input[i] here */
+ f = find_nfc_index(self, nfc_first, code);
if (f == -1) {
- *o++ = *i++;
+ output[o++] = code;
+ i++;
continue;
}
/* Find next unblocked character. */
i1 = i+1;
comb = 0;
- while (i1 < end) {
- int comb1 = _getrecord_ex(*i1)->combining;
+ /* output base character for now; might be updated later. */
+ output[o] = PyUnicode_READ(kind, data, i);
+ while (i1 < len) {
+ Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
+ int comb1 = _getrecord_ex(code1)->combining;
if (comb) {
if (comb1 == 0)
break;
@@ -692,8 +730,8 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
continue;
}
}
- l = find_nfc_index(self, nfc_last, *i1);
- /* *i1 cannot be combined with *i. If *i1
+ l = find_nfc_index(self, nfc_last, code1);
+ /* i1 cannot be combined with i. If i1
is a starter, we don't need to look further.
Otherwise, record the combining class. */
if (l == -1) {
@@ -712,19 +750,28 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
goto not_combinable;
/* Replace the original character. */
- *i = code;
+ output[o] = code;
/* Mark the second character unused. */
assert(cskipped < 20);
skipped[cskipped++] = i1;
i1++;
- f = find_nfc_index(self, nfc_first, *i);
+ f = find_nfc_index(self, nfc_first, output[o]);
if (f == -1)
break;
}
- *o++ = *i++;
+ /* Output character was already written.
+ Just advance the indices. */
+ o++; i++;
}
- if (o != end)
- PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
+ if (o == len) {
+ /* No changes. Return original string. */
+ PyMem_Free(output);
+ return result;
+ }
+ Py_DECREF(result);
+ result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
+ output, o);
+ PyMem_Free(output);
return result;
}
@@ -732,7 +779,9 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
static int
is_normalized(PyObject *self, PyObject *input, int nfc, int k)
{
- Py_UNICODE *i, *end;
+ Py_ssize_t i, len;
+ int kind;
+ void *data;
unsigned char prev_combining = 0, quickcheck_mask;
/* An older version of the database is requested, quickchecks must be
@@ -744,10 +793,13 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
as described in http://unicode.org/reports/tr15/#Annex8. */
quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
- i = PyUnicode_AS_UNICODE(input);
- end = i + PyUnicode_GET_SIZE(input);
- while (i < end) {
- const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++);
+ i = 0;
+ kind = PyUnicode_KIND(input);
+ data = PyUnicode_DATA(input);
+ len = PyUnicode_GET_LENGTH(input);
+ while (i < len) {
+ Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
+ const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
unsigned char combining = record->combining;
unsigned char quickcheck = record->normalization_quick_check;
@@ -776,7 +828,10 @@ unicodedata_normalize(PyObject *self, PyObject *args)
&form, &PyUnicode_Type, &input))
return NULL;
- if (PyUnicode_GetSize(input) == 0) {
+ if (PyUnicode_READY(input) == -1)
+ return NULL;
+
+ if (PyUnicode_GET_LENGTH(input) == 0) {
/* Special case empty input strings, since resizing
them later would cause internal errors. */
Py_INCREF(input);
@@ -876,15 +931,25 @@ is_unified_ideograph(Py_UCS4 code)
{
return
(0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
- (0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph */
+ (0x4E00 <= code && code <= 0x9FCC) || /* CJK Ideograph */
(0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
(0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
(0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
}
+/* macros used to determine if the given codepoint is in the PUA range that
+ * we are using to store aliases and named sequences */
+#define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end))
+#define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \
+ (cp < named_sequences_end))
+
static int
-_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
+_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
+ int with_alias_and_seq)
{
+ /* Find the name associated with the given codepoint.
+ * If with_alias_and_seq is 1, check for names in the Private Use Area 15
+ * that we are using for aliases and named sequences. */
int offset;
int i;
int word;
@@ -893,8 +958,16 @@ _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
if (code >= 0x110000)
return 0;
+ /* XXX should we just skip all the codepoints in the PUAs here? */
+ if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
+ return 0;
+
if (self && UCD_Check(self)) {
- const change_record *old = get_old_record(self, code);
+ /* in 3.2.0 there are no aliases and named sequences */
+ const change_record *old;
+ if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
+ return 0;
+ old = get_old_record(self, code);
if (old->category_changed == 0) {
/* unassigned */
return 0;
@@ -978,7 +1051,7 @@ _cmpname(PyObject *self, int code, const char* name, int namelen)
/* check if code corresponds to the given name */
int i;
char buffer[NAME_MAXLEN];
- if (!_getucname(self, code, buffer, sizeof(buffer)))
+ if (!_getucname(self, code, buffer, sizeof(buffer), 1))
return 0;
for (i = 0; i < namelen; i++) {
if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
@@ -994,7 +1067,7 @@ find_syllable(const char *str, int *len, int *pos, int count, int column)
*len = -1;
for (i = 0; i < count; i++) {
char *s = hangul_syllables[i][column];
- len1 = strlen(s);
+ len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
if (len1 <= *len)
continue;
if (strncmp(str, s, len1) == 0) {
@@ -1008,8 +1081,28 @@ find_syllable(const char *str, int *len, int *pos, int count, int column)
}
static int
-_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
+_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
+{
+ /* check if named sequences are allowed */
+ if (!with_named_seq && IS_NAMED_SEQ(cp))
+ return 0;
+ /* if the codepoint is in the PUA range that we use for aliases,
+ * convert it to obtain the right codepoint */
+ if (IS_ALIAS(cp))
+ *code = name_aliases[cp-aliases_start];
+ else
+ *code = cp;
+ return 1;
+}
+
+static int
+_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
+ int with_named_seq)
{
+ /* Return the codepoint associated with the given name.
+ * Named aliases are resolved too (unless self != NULL (i.e. we are using
+ * 3.2.0)). If with_named_seq is 1, returns the PUA codepoint that we are
+ * using for the named sequence, and the caller must then convert it. */
unsigned int h, v;
unsigned int mask = code_size-1;
unsigned int i, incr;
@@ -1065,10 +1158,8 @@ _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
v = code_hash[i];
if (!v)
return 0;
- if (_cmpname(self, v, name, namelen)) {
- *code = v;
- return 1;
- }
+ if (_cmpname(self, v, name, namelen))
+ return _check_alias_and_seq(v, code, with_named_seq);
incr = (h ^ (h >> 3)) & mask;
if (!incr)
incr = mask;
@@ -1077,10 +1168,8 @@ _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
v = code_hash[i];
if (!v)
return 0;
- if (_cmpname(self, v, name, namelen)) {
- *code = v;
- return 1;
- }
+ if (_cmpname(self, v, name, namelen))
+ return _check_alias_and_seq(v, code, with_named_seq);
incr = incr << 1;
if (incr > mask)
incr = incr ^ code_poly;
@@ -1118,7 +1207,7 @@ unicodedata_name(PyObject* self, PyObject* args)
if (c == (Py_UCS4)-1)
return NULL;
- if (!_getucname(self, c, name, sizeof(name))) {
+ if (!_getucname(self, c, name, sizeof(name), 0)) {
if (defobj == NULL) {
PyErr_SetString(PyExc_ValueError, "no such name");
return NULL;
@@ -1143,28 +1232,26 @@ static PyObject *
unicodedata_lookup(PyObject* self, PyObject* args)
{
Py_UCS4 code;
- Py_UNICODE str[2];
char* name;
int namelen;
+ unsigned int index;
if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
return NULL;
- if (!_getcode(self, name, namelen, &code)) {
- PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
- name);
+ if (!_getcode(self, name, namelen, &code, 1)) {
+ PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
return NULL;
}
-
-#ifndef Py_UNICODE_WIDE
- if (code >= 0x10000) {
- str[0] = 0xd800 + ((code - 0x10000) >> 10);
- str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
- return PyUnicode_FromUnicode(str, 2);
+ /* check if code is in the PUA range that we use for named sequences
+ and convert it */
+ if (IS_NAMED_SEQ(code)) {
+ index = code-named_sequences_start;
+ return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
+ named_sequences[index].seq,
+ named_sequences[index].seqlen);
}
-#endif
- str[0] = (Py_UNICODE) code;
- return PyUnicode_FromUnicode(str, 1);
+ return PyUnicode_FromOrdinal(code);
}
/* XXX Add doc strings. */
@@ -1242,11 +1329,10 @@ PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
-6.0.0 which is publically available from ftp://ftp.unicode.org/.\n\
+" UNIDATA_VERSION " which is publically available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
-UnicodeData File Format 6.0.0 (see\n\
-http://www.unicode.org/reports/tr44/tr44-6.html).");
+UnicodeData File Format " UNIDATA_VERSION ".");
static struct PyModuleDef unicodedatamodule = {