1 files changed, 100 insertions, 25 deletions
diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c
index 9f6ac89b9f..a572c12bcc 100644
--- a/Objects/unicodectype.c
+++ b/Objects/unicodectype.c
@@ -21,13 +21,20 @@
 #define XID_START_MASK 0x100
 #define XID_CONTINUE_MASK 0x200
 #define PRINTABLE_MASK 0x400
-#define NODELTA_MASK 0x800
-#define NUMERIC_MASK 0x1000
+#define NUMERIC_MASK 0x800
+#define CASE_IGNORABLE_MASK 0x1000  /* tested by _PyUnicode_IsCaseIgnorable */
+#define CASED_MASK 0x2000           /* tested by _PyUnicode_IsCased */
+#define EXTENDED_CASE_MASK 0x4000   /* upper/lower/title are table offsets */
 
 typedef struct {
-    const Py_UCS4 upper;
-    const Py_UCS4 lower;
-    const Py_UCS4 title;
+    /*
+       Each field is either a delta from the character or, when flags has
+       EXTENDED_CASE_MASK set, a packed offset into _PyUnicode_ExtendedCase.
+    */
+    const int upper;
+    const int lower;
+    const int title;
+    /* Note: if more flag space is needed, decimal and digit could be unified. */
     const unsigned char decimal;
     const unsigned char digit;
     const unsigned short flags;
@@ -57,15 +64,10 @@ gettyperecord(Py_UCS4 code)
 Py_UCS4 _PyUnicode_ToTitlecase(register Py_UCS4 ch)
 {
     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
-    int delta = ctype->title;
 
-    if (ctype->flags & NODELTA_MASK)
-        return delta;
-
-    if (delta >= 32768)
-            delta -= 65536;
-
-    return ch + delta;
+    if (ctype->flags & EXTENDED_CASE_MASK)  /* low 16 bits index table */
+        return _PyUnicode_ExtendedCase[ctype->title & 0xFFFF];
+    return ch + ctype->title;  /* otherwise title is a delta from ch */
 }
 
 /* Returns 1 for Unicode characters having the category 'Lt', 0
@@ -188,12 +190,10 @@ int _PyUnicode_IsUppercase(Py_UCS4 ch)
 Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
 {
     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
-    int delta = ctype->upper;
-    if (ctype->flags & NODELTA_MASK)
-        return delta;
-    if (delta >= 32768)
-            delta -= 65536;
-    return ch + delta;
+
+    if (ctype->flags & EXTENDED_CASE_MASK)  /* low 16 bits index table */
+        return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFF];
+    return ch + ctype->upper;  /* otherwise upper is a delta from ch */
 }
 
 /* Returns the lowercase Unicode characters corresponding to ch or just
@@ -202,12 +202,87 @@ Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
 Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
 {
     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
-    int delta = ctype->lower;
-    if (ctype->flags & NODELTA_MASK)
-        return delta;
-    if (delta >= 32768)
-            delta -= 65536;
-    return ch + delta;
+
+    if (ctype->flags & EXTENDED_CASE_MASK)  /* low 16 bits index table */
+        return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFF];
+    return ch + ctype->lower;  /* otherwise lower is a delta from ch */
+}
+
+int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res)
+{
+    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
+
+    if (ctype->flags & EXTENDED_CASE_MASK) {
+        int index = ctype->lower & 0xFFFF;  /* low 16 bits: table offset */
+        int n = ctype->lower >> 24;         /* bits 24 and up: entry count */
+        int i;
+        for (i = 0; i < n; i++)  /* no bounds check: res must hold n items */
+            res[i] = _PyUnicode_ExtendedCase[index + i];
+        return n;  /* number of code points stored in res */
+    }
+    res[0] = ch + ctype->lower;  /* simple record: lower is a delta */
+    return 1;
+}
+
+int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res)
+{
+    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
+
+    if (ctype->flags & EXTENDED_CASE_MASK) {
+        int index = ctype->title & 0xFFFF;  /* low 16 bits: table offset */
+        int n = ctype->title >> 24;         /* bits 24 and up: entry count */
+        int i;
+        for (i = 0; i < n; i++)  /* no bounds check: res must hold n items */
+            res[i] = _PyUnicode_ExtendedCase[index + i];
+        return n;  /* number of code points stored in res */
+    }
+    res[0] = ch + ctype->title;  /* simple record: title is a delta */
+    return 1;
+}
+
+int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
+{
+    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
+
+    if (ctype->flags & EXTENDED_CASE_MASK) {
+        int index = ctype->upper & 0xFFFF;  /* low 16 bits: table offset */
+        int n = ctype->upper >> 24;         /* bits 24 and up: entry count */
+        int i;
+        for (i = 0; i < n; i++)  /* no bounds check: res must hold n items */
+            res[i] = _PyUnicode_ExtendedCase[index + i];
+        return n;  /* number of code points stored in res */
+    }
+    res[0] = ch + ctype->upper;  /* simple record: upper is a delta */
+    return 1;
+}
+
+int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res)
+{
+    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
+
+    if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) {
+        int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24);
+        int n = (ctype->lower >> 20) & 7;  /* bits 20-22: entry count */
+        int i;  /* index = offset + (lower >> 24): skips that many entries */
+        for (i = 0; i < n; i++)  /* res must hold n items; n <= 7 here */
+            res[i] = _PyUnicode_ExtendedCase[index + i];
+        return n;  /* number of code points stored in res */
+    }
+    return _PyUnicode_ToLowerFull(ch, res);  /* else: use lowercase mapping */
+}
+
+int _PyUnicode_IsCased(Py_UCS4 ch)
+{
+    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
+
+    return (ctype->flags & CASED_MASK) != 0;  /* 1 if flag set, else 0 */
+}
+
+int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch)
+{
+    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
+
+    return (ctype->flags & CASE_IGNORABLE_MASK) != 0;  /* 1 if set, else 0 */
 }
 
 /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',