diff options
Diffstat (limited to 'Objects/unicodectype.c')
-rw-r--r-- | Objects/unicodectype.c | 125 |
1 files changed, 100 insertions, 25 deletions
diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index 9f6ac89b9f..a572c12bcc 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -21,13 +21,20 @@ #define XID_START_MASK 0x100 #define XID_CONTINUE_MASK 0x200 #define PRINTABLE_MASK 0x400 -#define NODELTA_MASK 0x800 -#define NUMERIC_MASK 0x1000 +#define NUMERIC_MASK 0x800 +#define CASE_IGNORABLE_MASK 0x1000 +#define CASED_MASK 0x2000 +#define EXTENDED_CASE_MASK 0x4000 typedef struct { - const Py_UCS4 upper; - const Py_UCS4 lower; - const Py_UCS4 title; + /* + These are either deltas to the character or offsets in + _PyUnicode_ExtendedCase. + */ + const int upper; + const int lower; + const int title; + /* Note if more flag space is needed, decimal and digit could be unified. */ const unsigned char decimal; const unsigned char digit; const unsigned short flags; @@ -57,15 +64,10 @@ gettyperecord(Py_UCS4 code) Py_UCS4 _PyUnicode_ToTitlecase(register Py_UCS4 ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); - int delta = ctype->title; - if (ctype->flags & NODELTA_MASK) - return delta; - - if (delta >= 32768) - delta -= 65536; - - return ch + delta; + if (ctype->flags & EXTENDED_CASE_MASK) + return _PyUnicode_ExtendedCase[ctype->title & 0xFFFF]; + return ch + ctype->title; } /* Returns 1 for Unicode characters having the category 'Lt', 0 @@ -188,12 +190,10 @@ int _PyUnicode_IsUppercase(Py_UCS4 ch) Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); - int delta = ctype->upper; - if (ctype->flags & NODELTA_MASK) - return delta; - if (delta >= 32768) - delta -= 65536; - return ch + delta; + + if (ctype->flags & EXTENDED_CASE_MASK) + return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFF]; + return ch + ctype->upper; } /* Returns the lowercase Unicode characters corresponding to ch or just @@ -202,12 +202,87 @@ Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch) Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); - int delta = ctype->lower; - if (ctype->flags & NODELTA_MASK) - return delta; - if (delta >= 32768) - delta -= 65536; - return ch + delta; + + if (ctype->flags & EXTENDED_CASE_MASK) + return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFF]; + return ch + ctype->lower; +} + +int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res) +{ + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + + if (ctype->flags & EXTENDED_CASE_MASK) { + int index = ctype->lower & 0xFFFF; + int n = ctype->lower >> 24; + int i; + for (i = 0; i < n; i++) + res[i] = _PyUnicode_ExtendedCase[index + i]; + return n; + } + res[0] = ch + ctype->lower; + return 1; +} + +int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res) +{ + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + + if (ctype->flags & EXTENDED_CASE_MASK) { + int index = ctype->title & 0xFFFF; + int n = ctype->title >> 24; + int i; + for (i = 0; i < n; i++) + res[i] = _PyUnicode_ExtendedCase[index + i]; + return n; + } + res[0] = ch + ctype->title; + return 1; +} + +int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res) +{ + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + + if (ctype->flags & EXTENDED_CASE_MASK) { + int index = ctype->upper & 0xFFFF; + int n = ctype->upper >> 24; + int i; + for (i = 0; i < n; i++) + res[i] = _PyUnicode_ExtendedCase[index + i]; + return n; + } + res[0] = ch + ctype->upper; + return 1; +} + +int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res) +{ + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + + if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) { + int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24); + int n = (ctype->lower >> 20) & 7; + int i; + for (i = 0; i < n; i++) + res[i] = _PyUnicode_ExtendedCase[index + i]; + return n; + } + return _PyUnicode_ToLowerFull(ch, res); +} + +int _PyUnicode_IsCased(Py_UCS4 ch) +{ + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + + return (ctype->flags & CASED_MASK) != 0; +} + +int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch) +{ + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + + return (ctype->flags & CASE_IGNORABLE_MASK) != 0; } /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt', |