diff options
author | Benjamin Peterson <benjamin@python.org> | 2015-03-02 11:18:40 -0500 |
---|---|---|
committer | Benjamin Peterson <benjamin@python.org> | 2015-03-02 11:18:40 -0500 |
commit | d6407b4e5f83ab386a1722de125f0fdc0d181d94 (patch) | |
tree | ee24f62a6af8962ebb1c11b102afee10ba34010c /Modules/unicodedata.c | |
parent | a7f410fcb001feedbe9d66a2b67fc9cae810ca63 (diff) | |
parent | 004c700927e153000f8a0931d304ac602956a50f (diff) | |
download | cpython-d6407b4e5f83ab386a1722de125f0fdc0d181d94.tar.gz |
merge 3.3 (#23367)
Diffstat (limited to 'Modules/unicodedata.c')
-rw-r--r-- | Modules/unicodedata.c | 101 |
1 files changed, 75 insertions, 26 deletions
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 9fb1191fc5..507cef3ba1 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -13,10 +13,18 @@ ------------------------------------------------------------------------ */ +#define PY_SSIZE_T_CLEAN + #include "Python.h" #include "ucnhash.h" #include "structmember.h" +/*[clinic input] +module unicodedata +class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type' +[clinic start generated code]*/ +/*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/ + /* character properties */ typedef struct { @@ -107,25 +115,63 @@ static Py_UCS4 getuchar(PyUnicodeObject *obj) /* --- Module API --------------------------------------------------------- */ -PyDoc_STRVAR(unicodedata_decimal__doc__, -"decimal(unichr[, default])\n\ -\n\ -Returns the decimal value assigned to the Unicode character unichr\n\ -as integer. If no such value is defined, default is returned, or, if\n\ -not given, ValueError is raised."); +/*[clinic input] + +unicodedata.UCD.decimal + + unichr: object(type='PyUnicodeObject *', subclass_of='&PyUnicode_Type') + default: object=NULL + / + +Converts a Unicode character into its equivalent decimal value. + +Returns the decimal value assigned to the Unicode character unichr +as integer. If no such value is defined, default is returned, or, if +not given, ValueError is raised. +[clinic start generated code]*/ + +PyDoc_STRVAR(unicodedata_UCD_decimal__doc__, +"decimal($self, unichr, default=None, /)\n" +"--\n" +"\n" +"Converts a Unicode character into its equivalent decimal value.\n" +"\n" +"Returns the decimal value assigned to the Unicode character unichr\n" +"as integer. If no such value is defined, default is returned, or, if\n" +"not given, ValueError is raised."); + +#define UNICODEDATA_UCD_DECIMAL_METHODDEF \ + {"decimal", (PyCFunction)unicodedata_UCD_decimal, METH_VARARGS, unicodedata_UCD_decimal__doc__}, static PyObject * -unicodedata_decimal(PyObject *self, PyObject *args) +unicodedata_UCD_decimal_impl(PreviousDBVersion *self, PyUnicodeObject *unichr, PyObject *default_value); + +static PyObject * +unicodedata_UCD_decimal(PreviousDBVersion *self, PyObject *args) +{ + PyObject *return_value = NULL; + PyUnicodeObject *unichr; + PyObject *default_value = NULL; + + if (!PyArg_ParseTuple(args, + "O!|O:decimal", + &PyUnicode_Type, &unichr, &default_value)) + goto exit; + return_value = unicodedata_UCD_decimal_impl(self, unichr, default_value); + +exit: + return return_value; +} + +static PyObject * +unicodedata_UCD_decimal_impl(PreviousDBVersion *self, PyUnicodeObject *unichr, PyObject *default_value) +/*[clinic end generated code: output=8689669896d293df input=c25c9d2b4de076b1]*/ { - PyUnicodeObject *v; - PyObject *defobj = NULL; int have_old = 0; long rc; Py_UCS4 c; - if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj)) - return NULL; - c = getuchar(v); + c = getuchar(unichr); if (c == (Py_UCS4)-1) return NULL; @@ -145,14 +191,14 @@ unicodedata_decimal(PyObject *self, PyObject *args) if (!have_old) rc = Py_UNICODE_TODECIMAL(c); if (rc < 0) { - if (defobj == NULL) { + if (default_value == NULL) { PyErr_SetString(PyExc_ValueError, "not a decimal"); return NULL; } else { - Py_INCREF(defobj); - return defobj; + Py_INCREF(default_value); + return default_value; } } return PyLong_FromLong(rc); @@ -937,7 +983,7 @@ is_unified_ideograph(Py_UCS4 code) (0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */ } -/* macros used to determine if the given codepoint is in the PUA range that +/* macros used to determine if the given code point is in the PUA range that * we are using to store aliases and named sequences */ #define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end)) #define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \ @@ -947,7 +993,7 @@ static int _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq) { - /* Find the name associated with the given codepoint. + /* Find the name associated with the given code point. * If with_alias_and_seq is 1, check for names in the Private Use Area 15 * that we are using for aliases and named sequences. */ int offset; @@ -958,7 +1004,7 @@ _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen, if (code >= 0x110000) return 0; - /* XXX should we just skip all the codepoints in the PUAs here? */ + /* XXX should we just skip all the code points in the PUAs here? */ if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code))) return 0; @@ -1086,8 +1132,8 @@ _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq) /* check if named sequences are allowed */ if (!with_named_seq && IS_NAMED_SEQ(cp)) return 0; - /* if the codepoint is in the PUA range that we use for aliases, - * convert it to obtain the right codepoint */ + /* if the code point is in the PUA range that we use for aliases, + * convert it to obtain the right code point */ if (IS_ALIAS(cp)) *code = name_aliases[cp-aliases_start]; else @@ -1099,9 +1145,9 @@ static int _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code, int with_named_seq) { - /* Return the codepoint associated with the given name. + /* Return the code point associated with the given name. * Named aliases are resolved too (unless self != NULL (i.e. we are using - * 3.2.0)). If with_named_seq is 1, returns the PUA codepoint that we are + * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are * using for the named sequence, and the caller must then convert it. */ unsigned int h, v; unsigned int mask = code_size-1; @@ -1234,12 +1280,16 @@ unicodedata_lookup(PyObject* self, PyObject* args) Py_UCS4 code; char* name; - int namelen; + Py_ssize_t namelen; unsigned int index; if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen)) return NULL; + if (namelen > INT_MAX) { + PyErr_SetString(PyExc_KeyError, "name too long"); + return NULL; + } - if (!_getcode(self, name, namelen, &code, 1)) { + if (!_getcode(self, name, (int)namelen, &code, 1)) { PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name); return NULL; } @@ -1257,7 +1307,7 @@ unicodedata_lookup(PyObject* self, PyObject* args) /* XXX Add doc strings. */ static PyMethodDef unicodedata_functions[] = { - {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__}, + UNICODEDATA_UCD_DECIMAL_METHODDEF {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__}, {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__}, {"category", unicodedata_category, METH_VARARGS, @@ -1334,7 +1384,6 @@ this database is based on the UnicodeData.txt file version\n\ The module uses the same names and symbols as defined by the\n\ UnicodeData File Format " UNIDATA_VERSION "."); - static struct PyModuleDef unicodedatamodule = { PyModuleDef_HEAD_INIT, "unicodedata", |