summary | refs | log | tree | commit | diff
path: root/Objects/unicodectype.c
diff options
context:
space:
mode:
Diffstat (limited to 'Objects/unicodectype.c')
-rw-r--r-- Objects/unicodectype.c 125
1 files changed, 100 insertions, 25 deletions
diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c
index 9f6ac89b9f..a572c12bcc 100644
--- a/Objects/unicodectype.c
+++ b/Objects/unicodectype.c
@@ -21,13 +21,20 @@
#define XID_START_MASK 0x100
#define XID_CONTINUE_MASK 0x200
#define PRINTABLE_MASK 0x400
-#define NODELTA_MASK 0x800
-#define NUMERIC_MASK 0x1000
+#define NUMERIC_MASK 0x800 /* moves to the bit previously used by the removed NODELTA_MASK */
+#define CASE_IGNORABLE_MASK 0x1000 /* tested by _PyUnicode_IsCaseIgnorable() */
+#define CASED_MASK 0x2000 /* tested by _PyUnicode_IsCased() */
+#define EXTENDED_CASE_MASK 0x4000 /* upper/lower/title reference _PyUnicode_ExtendedCase instead of holding a delta */
typedef struct {
- const Py_UCS4 upper;
- const Py_UCS4 lower;
- const Py_UCS4 title;
+ /*
+ These are either deltas to the character or offsets in
+ _PyUnicode_ExtendedCase.
+ */
+ const int upper;
+ const int lower;
+ const int title;
+ /* Note if more flag space is needed, decimal and digit could be unified. */
const unsigned char decimal;
const unsigned char digit;
const unsigned short flags;
@@ -57,15 +64,10 @@ gettyperecord(Py_UCS4 code)
Py_UCS4 _PyUnicode_ToTitlecase(register Py_UCS4 ch)
{
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
- int delta = ctype->title;
- if (ctype->flags & NODELTA_MASK)
- return delta;
-
- if (delta >= 32768)
- delta -= 65536;
-
- return ch + delta;
+ if (ctype->flags & EXTENDED_CASE_MASK) /* title is a table reference, not a delta */
+ return _PyUnicode_ExtendedCase[ctype->title & 0xFFFF]; /* low 16 bits are the table index; the single entry at that index is returned */
+ return ch + ctype->title; /* otherwise title is a signed (int) delta added to ch */
}
/* Returns 1 for Unicode characters having the category 'Lt', 0
@@ -188,12 +190,10 @@ int _PyUnicode_IsUppercase(Py_UCS4 ch)
Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
{
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
- int delta = ctype->upper;
- if (ctype->flags & NODELTA_MASK)
- return delta;
- if (delta >= 32768)
- delta -= 65536;
- return ch + delta;
+ /* Single-code-point result: a table entry for extended records, otherwise ch plus a delta. */
+ if (ctype->flags & EXTENDED_CASE_MASK) /* upper is a table reference, not a delta */
+ return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFF]; /* low 16 bits are the table index */
+ return ch + ctype->upper; /* upper is a signed (int) delta added to ch */
}
/* Returns the lowercase Unicode characters corresponding to ch or just
@@ -202,12 +202,87 @@ Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
{
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
- int delta = ctype->lower;
- if (ctype->flags & NODELTA_MASK)
- return delta;
- if (delta >= 32768)
- delta -= 65536;
- return ch + delta;
+ /* Single-code-point result: a table entry for extended records, otherwise ch plus a delta. */
+ if (ctype->flags & EXTENDED_CASE_MASK) /* lower is a table reference, not a delta */
+ return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFF]; /* low 16 bits are the table index */
+ return ch + ctype->lower; /* lower is a signed (int) delta added to ch */
+}
+
+int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res)
+{
+ const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
+ /* Writes the lowercase mapping of ch into res and returns the number of entries written; res is not bounds-checked here. */
+ if (ctype->flags & EXTENDED_CASE_MASK) {
+ int index = ctype->lower & 0xFFFF; /* low 16 bits: start offset in _PyUnicode_ExtendedCase */
+ int n = ctype->lower >> 24; /* bits 24 and up: number of entries to copy */
+ int i;
+ for (i = 0; i < n; i++)
+ res[i] = _PyUnicode_ExtendedCase[index + i];
+ return n;
+ }
+ res[0] = ch + ctype->lower; /* non-extended record: lower is a delta added to ch */
+ return 1;
+}
+
+int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res)
+{
+ const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
+ /* Writes the titlecase mapping of ch into res and returns the number of entries written; res is not bounds-checked here. */
+ if (ctype->flags & EXTENDED_CASE_MASK) {
+ int index = ctype->title & 0xFFFF; /* low 16 bits: start offset in _PyUnicode_ExtendedCase */
+ int n = ctype->title >> 24; /* bits 24 and up: number of entries to copy */
+ int i;
+ for (i = 0; i < n; i++)
+ res[i] = _PyUnicode_ExtendedCase[index + i];
+ return n;
+ }
+ res[0] = ch + ctype->title; /* non-extended record: title is a delta added to ch */
+ return 1;
+}
+
+int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
+{
+ const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
+ /* Writes the uppercase mapping of ch into res and returns the number of entries written; res is not bounds-checked here. */
+ if (ctype->flags & EXTENDED_CASE_MASK) {
+ int index = ctype->upper & 0xFFFF; /* low 16 bits: start offset in _PyUnicode_ExtendedCase */
+ int n = ctype->upper >> 24; /* bits 24 and up: number of entries to copy */
+ int i;
+ for (i = 0; i < n; i++)
+ res[i] = _PyUnicode_ExtendedCase[index + i];
+ return n;
+ }
+ res[0] = ch + ctype->upper; /* non-extended record: upper is a delta added to ch */
+ return 1;
+}
+
+int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res)
+{
+ const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
+ /* Writes the folded mapping of ch (presumably Unicode case folding) into res and returns the entry count; res is not bounds-checked here. */
+ if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) { /* bits 20-22 of lower: folding length, 0 means no separate folding */
+ int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24); /* start right after the lowercase entries (their offset plus their count) */
+ int n = (ctype->lower >> 20) & 7; /* same 3-bit length tested above */
+ int i;
+ for (i = 0; i < n; i++)
+ res[i] = _PyUnicode_ExtendedCase[index + i];
+ return n;
+ }
+ return _PyUnicode_ToLowerFull(ch, res); /* no separate folding stored: reuse the lowercase mapping */
+}
+
+int _PyUnicode_IsCased(Py_UCS4 ch)
+{
+ const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
+ /* Returns 1 if the type record of ch has CASED_MASK set in flags, 0 otherwise. */
+ return (ctype->flags & CASED_MASK) != 0;
+}
+
+int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch)
+{
+ const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
+ /* Returns 1 if the type record of ch has CASE_IGNORABLE_MASK set in flags, 0 otherwise. */
+ return (ctype->flags & CASE_IGNORABLE_MASK) != 0;
}
/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',