summary | refs | log | tree | commit | diff
path: root/Modules/unicodedata.c
diff options
context:
space:
mode:
author Benjamin Peterson <benjamin@python.org> 2015-03-02 11:18:40 -0500
committer Benjamin Peterson <benjamin@python.org> 2015-03-02 11:18:40 -0500
commit d6407b4e5f83ab386a1722de125f0fdc0d181d94 (patch)
tree ee24f62a6af8962ebb1c11b102afee10ba34010c /Modules/unicodedata.c
parent a7f410fcb001feedbe9d66a2b67fc9cae810ca63 (diff)
parent 004c700927e153000f8a0931d304ac602956a50f (diff)
download cpython-d6407b4e5f83ab386a1722de125f0fdc0d181d94.tar.gz
merge 3.3 (#23367)
Diffstat (limited to 'Modules/unicodedata.c')
-rw-r--r-- Modules/unicodedata.c 101
1 files changed, 75 insertions, 26 deletions
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
index 9fb1191fc5..507cef3ba1 100644
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -13,10 +13,18 @@
------------------------------------------------------------------------ */
+#define PY_SSIZE_T_CLEAN
+
#include "Python.h"
#include "ucnhash.h"
#include "structmember.h"
+/*[clinic input]
+module unicodedata
+class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
+[clinic start generated code]*/
+/*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/
+
/* character properties */
typedef struct {
@@ -107,25 +115,63 @@ static Py_UCS4 getuchar(PyUnicodeObject *obj)
/* --- Module API --------------------------------------------------------- */
-PyDoc_STRVAR(unicodedata_decimal__doc__,
-"decimal(unichr[, default])\n\
-\n\
-Returns the decimal value assigned to the Unicode character unichr\n\
-as integer. If no such value is defined, default is returned, or, if\n\
-not given, ValueError is raised.");
+/*[clinic input]
+
+unicodedata.UCD.decimal
+
+ unichr: object(type='PyUnicodeObject *', subclass_of='&PyUnicode_Type')
+ default: object=NULL
+ /
+
+Converts a Unicode character into its equivalent decimal value.
+
+Returns the decimal value assigned to the Unicode character unichr
+as integer. If no such value is defined, default is returned, or, if
+not given, ValueError is raised.
+[clinic start generated code]*/
+
+PyDoc_STRVAR(unicodedata_UCD_decimal__doc__,
+"decimal($self, unichr, default=None, /)\n"
+"--\n"
+"\n"
+"Converts a Unicode character into its equivalent decimal value.\n"
+"\n"
+"Returns the decimal value assigned to the Unicode character unichr\n"
+"as integer. If no such value is defined, default is returned, or, if\n"
+"not given, ValueError is raised.");
+
+#define UNICODEDATA_UCD_DECIMAL_METHODDEF \
+ {"decimal", (PyCFunction)unicodedata_UCD_decimal, METH_VARARGS, unicodedata_UCD_decimal__doc__},
static PyObject *
-unicodedata_decimal(PyObject *self, PyObject *args)
+unicodedata_UCD_decimal_impl(PreviousDBVersion *self, PyUnicodeObject *unichr, PyObject *default_value);
+
+static PyObject *
+unicodedata_UCD_decimal(PreviousDBVersion *self, PyObject *args)
+{
+ PyObject *return_value = NULL;
+ PyUnicodeObject *unichr;
+ PyObject *default_value = NULL;
+
+ if (!PyArg_ParseTuple(args,
+ "O!|O:decimal",
+ &PyUnicode_Type, &unichr, &default_value))
+ goto exit;
+ return_value = unicodedata_UCD_decimal_impl(self, unichr, default_value);
+
+exit:
+ return return_value;
+}
+
+static PyObject *
+unicodedata_UCD_decimal_impl(PreviousDBVersion *self, PyUnicodeObject *unichr, PyObject *default_value)
+/*[clinic end generated code: output=8689669896d293df input=c25c9d2b4de076b1]*/
{
- PyUnicodeObject *v;
- PyObject *defobj = NULL;
int have_old = 0;
long rc;
Py_UCS4 c;
- if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
- return NULL;
- c = getuchar(v);
+ c = getuchar(unichr);
if (c == (Py_UCS4)-1)
return NULL;
@@ -145,14 +191,14 @@ unicodedata_decimal(PyObject *self, PyObject *args)
if (!have_old)
rc = Py_UNICODE_TODECIMAL(c);
if (rc < 0) {
- if (defobj == NULL) {
+ if (default_value == NULL) {
PyErr_SetString(PyExc_ValueError,
"not a decimal");
return NULL;
}
else {
- Py_INCREF(defobj);
- return defobj;
+ Py_INCREF(default_value);
+ return default_value;
}
}
return PyLong_FromLong(rc);
@@ -937,7 +983,7 @@ is_unified_ideograph(Py_UCS4 code)
(0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
}
-/* macros used to determine if the given codepoint is in the PUA range that
+/* macros used to determine if the given code point is in the PUA range that
* we are using to store aliases and named sequences */
#define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end))
#define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \
@@ -947,7 +993,7 @@ static int
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
int with_alias_and_seq)
{
- /* Find the name associated with the given codepoint.
+ /* Find the name associated with the given code point.
* If with_alias_and_seq is 1, check for names in the Private Use Area 15
* that we are using for aliases and named sequences. */
int offset;
@@ -958,7 +1004,7 @@ _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
if (code >= 0x110000)
return 0;
- /* XXX should we just skip all the codepoints in the PUAs here? */
+ /* XXX should we just skip all the code points in the PUAs here? */
if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
return 0;
@@ -1086,8 +1132,8 @@ _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
/* check if named sequences are allowed */
if (!with_named_seq && IS_NAMED_SEQ(cp))
return 0;
- /* if the codepoint is in the PUA range that we use for aliases,
- * convert it to obtain the right codepoint */
+ /* if the code point is in the PUA range that we use for aliases,
+ * convert it to obtain the right code point */
if (IS_ALIAS(cp))
*code = name_aliases[cp-aliases_start];
else
@@ -1099,9 +1145,9 @@ static int
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
int with_named_seq)
{
- /* Return the codepoint associated with the given name.
+ /* Return the code point associated with the given name.
* Named aliases are resolved too (unless self != NULL (i.e. we are using
- * 3.2.0)). If with_named_seq is 1, returns the PUA codepoint that we are
+ * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
* using for the named sequence, and the caller must then convert it. */
unsigned int h, v;
unsigned int mask = code_size-1;
@@ -1234,12 +1280,16 @@ unicodedata_lookup(PyObject* self, PyObject* args)
Py_UCS4 code;
char* name;
- int namelen;
+ Py_ssize_t namelen;
unsigned int index;
if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
return NULL;
+ if (namelen > INT_MAX) {
+ PyErr_SetString(PyExc_KeyError, "name too long");
+ return NULL;
+ }
- if (!_getcode(self, name, namelen, &code, 1)) {
+ if (!_getcode(self, name, (int)namelen, &code, 1)) {
PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
return NULL;
}
@@ -1257,7 +1307,7 @@ unicodedata_lookup(PyObject* self, PyObject* args)
/* XXX Add doc strings. */
static PyMethodDef unicodedata_functions[] = {
- {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
+ UNICODEDATA_UCD_DECIMAL_METHODDEF
{"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
{"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
{"category", unicodedata_category, METH_VARARGS,
@@ -1334,7 +1384,6 @@ this database is based on the UnicodeData.txt file version\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format " UNIDATA_VERSION ".");
-
static struct PyModuleDef unicodedatamodule = {
PyModuleDef_HEAD_INIT,
"unicodedata",