diff options
author | nigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2007-02-24 21:41:21 +0000 |
---|---|---|
committer | nigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2007-02-24 21:41:21 +0000 |
commit | ced1f145fdf26ec7df4b9048a9da0ef17e9618f2 (patch) | |
tree | 371f88a16cfb5ac0a176622bcd424aa6c28c4cc8 /ucpinternal.h | |
parent | 2550303b1f255c525d802f94d9c4411a0ccc630f (diff) | |
download | pcre-ced1f145fdf26ec7df4b9048a9da0ef17e9618f2.tar.gz |
Load pcre-6.5 into code/trunk.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@87 2f5784b3-3f2a-0410-8824-cb99058d5e15
Diffstat (limited to 'ucpinternal.h')
-rw-r--r-- | ucpinternal.h | 116 |
1 files changed, 56 insertions, 60 deletions
diff --git a/ucpinternal.h b/ucpinternal.h index faefb03..16dfbe9 100644 --- a/ucpinternal.h +++ b/ucpinternal.h @@ -1,91 +1,87 @@ /************************************************* -* libucp - Unicode Property Table handler * +* Unicode Property Table handler * *************************************************/ -/* Internal header file defining the layout of compact nodes in the tree. */ +/* Internal header file defining the layout of the bits in each pair of 32-bit +words that form a data item in the table. */ typedef struct cnode { - unsigned short int f0; - unsigned short int f1; - unsigned short int f2; + pcre_uint32 f0; + pcre_uint32 f1; } cnode; /* Things for the f0 field */ -#define f0_leftexists 0x8000 /* Left child exists */ -#define f0_typemask 0x3f00 /* Type bits */ -#define f0_typeshift 8 /* Type shift */ -#define f0_chhmask 0x00ff /* Character high bits */ +#define f0_scriptmask 0xff000000 /* Mask for script field */ +#define f0_scriptshift 24 /* Shift for script value */ +#define f0_rangeflag 0x00f00000 /* Flag for a range item */ +#define f0_charmask 0x001fffff /* Mask for code point value */ -/* Things for the f2 field */ +/* Things for the f1 field */ -#define f2_rightmask 0xf000 /* Mask for right offset bits */ -#define f2_rightshift 12 /* Shift for right offset */ -#define f2_casemask 0x0fff /* Mask for case offset */ +#define f1_typemask 0xfc000000 /* Mask for char type field */ +#define f1_typeshift 26 /* Shift for the type field */ +#define f1_rangemask 0x0000ffff /* Mask for a range offset */ +#define f1_casemask 0x0000ffff /* Mask for a case offset */ +#define f1_caseneg 0xffff8000 /* Bits for negation */ -/* The tree consists of a vector of structures of type cnode, with the root -node as the first element. The three short ints (16-bits) are used as follows: +/* The data consists of a vector of structures of type cnode. The two unsigned +32-bit integers are used as follows: -(f0) (1) The 0x8000 bit of f0 is set if a left child exists. The child's node - is the next node in the vector. - (2) The 0x4000 bits of f0 is spare. - (3) The 0x3f00 bits of f0 contain the character type; this is a number - defined by the enumeration in ucp.h (e.g. ucp_Lu). - (4) The bottom 8 bits of f0 contain the most significant byte of the - character's 24-bit codepoint. +(f0) (1) The most significant byte holds the script number. The numbers are + defined by the enum in ucp.h. -(f1) (1) The f1 field contains the two least significant bytes of the - codepoint. + (2) The 0x00800000 bit is set if this entry defines a range of characters. + It is not set if this entry defines a single character -(f2) (1) The 0xf000 bits of f2 contain zero if there is no right child of this - node. Otherwise, they contain one plus the exponent of the power of - two of the offset to the right node (e.g. a value of 3 means 8). The - units of the offset are node items. + (3) The 0x00600000 bits are spare. - (2) The 0x0fff bits of f2 contain the signed offset from this character to - its alternate cased value. They are zero if there is no such - character. + (4) The 0x001fffff bits contain the code point. No Unicode code point will + ever be greater than 0x0010ffff, so this should be OK for ever. +(f1) (1) The 0xfc000000 bits contain the character type number. The numbers are + defined by an enum in ucp.h. ------------------------------------------------------------------------------ -||.|.| type (6) | ms char (8) || ls char (16) ||....| case offset (12) || ------------------------------------------------------------------------------ - | | | - | |-> spare | - | exponent of right - |-> left child exists child offset + (2) The 0x03ff0000 bits are spare. + (3) The 0x0000ffff bits contain EITHER the unsigned offset to the top of + range if this entry defines a range, OR the *signed* offset to the + character's "other case" partner if this entry defines a single + character. There is no partner if the value is zero. -The upper/lower casing information is set only for characters that come in -pairs. There are (at present) four non-one-to-one mappings in the Unicode data. -These are ignored. They are: +------------------------------------------------------------------------------- +| script (8) |.|.|.| codepoint (21) || type (6) |.|.| spare (8) | offset (16) | +------------------------------------------------------------------------------- + | | | | | + | | |-> spare | |-> spare + | | | + | |-> spare |-> spare + | + |-> range flag - 1FBE Greek Prosgegrammeni (lower, with upper -> capital iota) - 2126 Ohm - 212A Kelvin - 212B Angstrom +The upper/lower casing information is set only for characters that come in +pairs. The non-one-to-one mappings in the Unicode data are ignored. -Certainly for the last three, having an alternate case would seem to be a -mistake. I don't know any Greek, so cannot comment on the first one. +When searching the data, proceed as follows: +(1) Set up for a binary chop search. -When searching the tree, proceed as follows: +(2) If the top is not greater than the bottom, the character is not in the + table. Its type must therefore be "Cn" ("Undefined"). -(1) Start at the first node. +(3) Find the middle vector element. -(2) Extract the character value from f1 and the bottom 8 bits of f0; +(4) Extract the code point and compare. If equal, we are done. -(3) Compare with the character being sought. If equal, we are done. +(5) If the test character is smaller, set the top to the current point, and + goto (2). -(4) If the test character is smaller, inspect the f0_leftexists flag. If it is - not set, the character is not in the tree. If it is set, move to the next - node, and go to (2). +(6) If the current entry defines a range, compute the last character by adding + the offset, and see if the test character is within the range. If it is, + we are done. -(5) If the test character is bigger, extract the f2_rightmask bits from f2, and - shift them right by f2_rightshift. If the result is zero, the character is - not in the tree. Otherwise, calculate the number of nodes to skip by - shifting the value 1 left by this number minus one. Go to (2). +(7) Otherwise, set the bottom to one element past the current point and goto + (2). */ - -/* End of internal.h */ +/* End of ucpinternal.h */ |