summaryrefslogtreecommitdiff
path: root/ucpinternal.h
diff options
context:
space:
mode:
authornigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15>2007-02-24 21:41:21 +0000
committernigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15>2007-02-24 21:41:21 +0000
commitced1f145fdf26ec7df4b9048a9da0ef17e9618f2 (patch)
tree371f88a16cfb5ac0a176622bcd424aa6c28c4cc8 /ucpinternal.h
parent2550303b1f255c525d802f94d9c4411a0ccc630f (diff)
downloadpcre-ced1f145fdf26ec7df4b9048a9da0ef17e9618f2.tar.gz
Load pcre-6.5 into code/trunk.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@87 2f5784b3-3f2a-0410-8824-cb99058d5e15
Diffstat (limited to 'ucpinternal.h')
-rw-r--r--ucpinternal.h116
1 files changed, 56 insertions, 60 deletions
diff --git a/ucpinternal.h b/ucpinternal.h
index faefb03..16dfbe9 100644
--- a/ucpinternal.h
+++ b/ucpinternal.h
@@ -1,91 +1,87 @@
/*************************************************
-* libucp - Unicode Property Table handler *
+* Unicode Property Table handler *
*************************************************/
-/* Internal header file defining the layout of compact nodes in the tree. */
+/* Internal header file defining the layout of the bits in each pair of 32-bit
+words that form a data item in the table. */
typedef struct cnode {
- unsigned short int f0;
- unsigned short int f1;
- unsigned short int f2;
+ pcre_uint32 f0;
+ pcre_uint32 f1;
} cnode;
/* Things for the f0 field */
-#define f0_leftexists 0x8000 /* Left child exists */
-#define f0_typemask 0x3f00 /* Type bits */
-#define f0_typeshift 8 /* Type shift */
-#define f0_chhmask 0x00ff /* Character high bits */
+#define f0_scriptmask 0xff000000 /* Mask for script field */
+#define f0_scriptshift 24 /* Shift for script value */
+#define f0_rangeflag 0x00f00000 /* Flag for a range item */
+#define f0_charmask 0x001fffff /* Mask for code point value */
-/* Things for the f2 field */
+/* Things for the f1 field */
-#define f2_rightmask 0xf000 /* Mask for right offset bits */
-#define f2_rightshift 12 /* Shift for right offset */
-#define f2_casemask 0x0fff /* Mask for case offset */
+#define f1_typemask 0xfc000000 /* Mask for char type field */
+#define f1_typeshift 26 /* Shift for the type field */
+#define f1_rangemask 0x0000ffff /* Mask for a range offset */
+#define f1_casemask 0x0000ffff /* Mask for a case offset */
+#define f1_caseneg 0xffff8000 /* Bits for negation */
-/* The tree consists of a vector of structures of type cnode, with the root
-node as the first element. The three short ints (16-bits) are used as follows:
+/* The data consists of a vector of structures of type cnode. The two unsigned
+32-bit integers are used as follows:
-(f0) (1) The 0x8000 bit of f0 is set if a left child exists. The child's node
- is the next node in the vector.
- (2) The 0x4000 bits of f0 is spare.
- (3) The 0x3f00 bits of f0 contain the character type; this is a number
- defined by the enumeration in ucp.h (e.g. ucp_Lu).
- (4) The bottom 8 bits of f0 contain the most significant byte of the
- character's 24-bit codepoint.
+(f0) (1) The most significant byte holds the script number. The numbers are
+ defined by the enum in ucp.h.
-(f1) (1) The f1 field contains the two least significant bytes of the
- codepoint.
+ (2) The 0x00800000 bit is set if this entry defines a range of characters.
+ It is not set if this entry defines a single character
-(f2) (1) The 0xf000 bits of f2 contain zero if there is no right child of this
- node. Otherwise, they contain one plus the exponent of the power of
- two of the offset to the right node (e.g. a value of 3 means 8). The
- units of the offset are node items.
+ (3) The 0x00600000 bits are spare.
- (2) The 0x0fff bits of f2 contain the signed offset from this character to
- its alternate cased value. They are zero if there is no such
- character.
+ (4) The 0x001fffff bits contain the code point. No Unicode code point will
+ ever be greater than 0x0010ffff, so this should be OK for ever.
+(f1) (1) The 0xfc000000 bits contain the character type number. The numbers are
+ defined by an enum in ucp.h.
------------------------------------------------------------------------------
-||.|.| type (6) | ms char (8) || ls char (16) ||....| case offset (12) ||
------------------------------------------------------------------------------
- | | |
- | |-> spare |
- | exponent of right
- |-> left child exists child offset
+ (2) The 0x03ff0000 bits are spare.
+ (3) The 0x0000ffff bits contain EITHER the unsigned offset to the top of
+ range if this entry defines a range, OR the *signed* offset to the
+ character's "other case" partner if this entry defines a single
+ character. There is no partner if the value is zero.
-The upper/lower casing information is set only for characters that come in
-pairs. There are (at present) four non-one-to-one mappings in the Unicode data.
-These are ignored. They are:
+-------------------------------------------------------------------------------
+| script (8) |.|.|.| codepoint (21) || type (6) |.|.| spare (8) | offset (16) |
+-------------------------------------------------------------------------------
+ | | | | |
+ | | |-> spare | |-> spare
+ | | |
+ | |-> spare |-> spare
+ |
+ |-> range flag
- 1FBE Greek Prosgegrammeni (lower, with upper -> capital iota)
- 2126 Ohm
- 212A Kelvin
- 212B Angstrom
+The upper/lower casing information is set only for characters that come in
+pairs. The non-one-to-one mappings in the Unicode data are ignored.
-Certainly for the last three, having an alternate case would seem to be a
-mistake. I don't know any Greek, so cannot comment on the first one.
+When searching the data, proceed as follows:
+(1) Set up for a binary chop search.
-When searching the tree, proceed as follows:
+(2) If the top is not greater than the bottom, the character is not in the
+ table. Its type must therefore be "Cn" ("Undefined").
-(1) Start at the first node.
+(3) Find the middle vector element.
-(2) Extract the character value from f1 and the bottom 8 bits of f0;
+(4) Extract the code point and compare. If equal, we are done.
-(3) Compare with the character being sought. If equal, we are done.
+(5) If the test character is smaller, set the top to the current point, and
+ goto (2).
-(4) If the test character is smaller, inspect the f0_leftexists flag. If it is
- not set, the character is not in the tree. If it is set, move to the next
- node, and go to (2).
+(6) If the current entry defines a range, compute the last character by adding
+ the offset, and see if the test character is within the range. If it is,
+ we are done.
-(5) If the test character is bigger, extract the f2_rightmask bits from f2, and
- shift them right by f2_rightshift. If the result is zero, the character is
- not in the tree. Otherwise, calculate the number of nodes to skip by
- shifting the value 1 left by this number minus one. Go to (2).
+(7) Otherwise, set the bottom to one element past the current point and goto
+ (2).
*/
-
-/* End of internal.h */
+/* End of ucpinternal.h */