summaryrefslogtreecommitdiff
path: root/study.c
diff options
context:
space:
mode:
author nigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2007-02-24 21:40:24 +0000
committer nigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2007-02-24 21:40:24 +0000
commit 4af6fcff808e079ca1aa09104d6146baa932af47 (patch)
tree dc14f3624835dd1275c31159a4c365ed439f3df7 /study.c
parent f08d5b6354f668c0047281d81eda8d0fd2a9e82d (diff)
download pcre-4af6fcff808e079ca1aa09104d6146baa932af47.tar.gz
Load pcre-4.4 into code/trunk.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@71 2f5784b3-3f2a-0410-8824-cb99058d5e15
Diffstat (limited to 'study.c')
-rw-r--r-- study.c 45
1 files changed, 38 insertions, 7 deletions
diff --git a/study.c b/study.c
index 4320bd2..5f0f196 100644
--- a/study.c
+++ b/study.c
@@ -9,7 +9,7 @@ the file Tech.Notes for some information on the internals.
Written by: Philip Hazel <ph10@cam.ac.uk>
- Copyright (c) 1997-2002 University of Cambridge
+ Copyright (c) 1997-2003 University of Cambridge
-----------------------------------------------------------------------------
Permission is granted to anyone to use this software for any purpose on any
@@ -297,19 +297,50 @@ do
/* Character class where all the information is in a bit map: set the
bits and either carry on or not, according to the repeat count. If it was
a negative class, and we are operating with UTF-8 characters, any byte
- with the top-bit set is a potentially valid starter because it may start
- a character with a value > 255. (This is sub-optimal in that the
- character may be in the range 128-255, and those characters might be
- unwanted, but that's as far as we go for the moment.) */
+ with a value >= 0xc4 is a potentially valid starter because it starts a
+ character with a value > 255. */
case OP_NCLASS:
- if (utf8) memset(start_bits+16, 0xff, 16);
+ if (utf8)
+ {
+ start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc7 (upper half of the 0xc0 - 0xc7 map byte) */
+ memset(start_bits+25, 0xff, 7); /* Bits for 0xc8 - 0xff */
+ }
/* Fall through */
case OP_CLASS:
{
tcode++;
- for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
+
+ /* In UTF-8 mode, the bits in a bit map correspond to character
+ values, not to byte values. However, the bit map we are constructing is
+ for byte values. So we have to do a conversion for characters whose
+ value is > 127. In fact, there are only two possible starting bytes for
+ characters in the range 128 - 255. */
+
+ if (utf8)
+ {
+ for (c = 0; c < 16; c++) start_bits[c] |= tcode[c]; /* Values 0 - 127 need no conversion */
+ for (c = 128; c < 256; c++)
+ {
+ if ((tcode[c/8] & (1 << (c&7))) != 0) /* Bitwise test of this character's own bit */
+ {
+ int d = (c >> 6) | 0xc0; /* Set bit for this starter */
+ start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
+ c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
+ }
+ }
+ }
+
+ /* In non-UTF-8 mode, the two bit maps are completely compatible. */
+
+ else
+ {
+ for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
+ }
+
+ /* Advance past the bit map, and act on what follows */
+
tcode += 32;
switch (*tcode)
{