summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2007-11-26 17:03:45 +0000
committer ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2007-11-26 17:03:45 +0000
commit f5a7a91a3b28530c5a9a601f6aa3576033edabda (patch)
tree 7e403e008a8fde5bcfa5c9e5abc951f56455e68c
parent 2d3c19ac8ce3b6766824b517cd518c37a7c84ee1 (diff)
download pcre-f5a7a91a3b28530c5a9a601f6aa3576033edabda.tar.gz
Fix incorrect script bug in Unicode character table.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@277 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- ChangeLog 21
-rwxr-xr-x maint/Builducptable 10
-rw-r--r-- testdata/testinput6 63
-rw-r--r-- testdata/testoutput6 112
-rw-r--r-- ucptable.h 50
5 files changed, 238 insertions, 18 deletions
diff --git a/ChangeLog b/ChangeLog
index 329e860..32a5914 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -44,6 +44,27 @@ Version 7.5 12-Nov-07
10. Remove two redundant lines of code that can never be obeyed (their function
was moved elsewhere).
+
+11. The program that makes PCRE's Unicode character property table had a bug
+ which caused it to generate incorrect table entries for sequences of
+ characters that have the same character type, but are in different scripts.
+ It amalgamated them into a single range, with the script of the first of
+ them. In other words, some characters were in the wrong script. There were
+ thirteen such cases, affecting characters in the following ranges:
+
+ U+002b0 - U+002c1
+ U+0060c - U+0060d
+ U+0061e - U+0061f
+ U+0064b - U+0065e
+ U+0074d - U+0076d
+ U+01800 - U+01805
+ U+01d00 - U+01d77
+ U+01d9b - U+01dbf
+ U+0200b - U+0200f
+ U+030fc - U+030fe
+ U+03260 - U+0327f
+ U+0fb46 - U+0fbb1
+ U+10450 - U+1049d
Version 7.4 21-Sep-07
diff --git a/maint/Builducptable b/maint/Builducptable
index 359ab08..3c317d0 100755
--- a/maint/Builducptable
+++ b/maint/Builducptable
@@ -8,7 +8,9 @@
# The script is rather slow because it just searches linearly through the
# Scripts data in order to find the script for each character or character
# range. It could be made faster by sorting that data, or something, but hey,
-# it is only ever run once in a blue moon.
+# it is only ever run once in a blue moon. (It's even slower after I mended the
+# "forgot to check for script number before amalgamation" bug, but even so,
+# the effort of improving it isn't worth it.)
# Subroutine: Given a character number, return the script number. The
# Scripts.txt file has been read into an array, keeping just the codepoints
@@ -164,7 +166,8 @@ while (<IN>)
}
else
- {
+ {
+ my($startscript) = script($cp);
my($ncp) = $cp + 1;
while (<IN>)
{
@@ -172,7 +175,8 @@ while (<IN>)
last if (hex($fields[0]) != $ncp ||
$fields[2] ne $gc ||
$fields[12] ne "" ||
- $fields[13] ne "");
+ $fields[13] ne "" ||
+ script($ncp) != $startscript);
$ncp++;
}
diff --git a/testdata/testinput6 b/testdata/testinput6
index 53d2b32..6bed743 100644
--- a/testdata/testinput6
+++ b/testdata/testinput6
@@ -832,4 +832,67 @@ was broken in all cases./
/(\p{Yi}{0,3}+\277)*/
+/^[\p{Arabic}]/8
+ \x{60e}
+ \x{656}
+ \x{657}
+ \x{658}
+ \x{659}
+ \x{65a}
+ \x{65b}
+ \x{65c}
+ \x{65d}
+ \x{65e}
+ \x{66a}
+ \x{6e9}
+ \x{6ef}
+ \x{6fa}
+ ** Failers
+ \x{600}
+ \x{650}
+ \x{651}
+ \x{652}
+ \x{653}
+ \x{654}
+ \x{655}
+ \x{65f}
+
+/^\p{Cyrillic}/8
+ \x{1d2b}
+
+/^\p{Common}/8
+ \x{589}
+ \x{60c}
+ \x{61f}
+ \x{964}
+ \x{965}
+ \x{970}
+
+/^\p{Inherited}/8
+ \x{64b}
+ \x{654}
+ \x{655}
+ \x{200c}
+ ** Failers
+ \x{64a}
+ \x{656}
+
+/^\p{Shavian}/8
+ \x{10450}
+ \x{1047f}
+
+/^\p{Deseret}/8
+ \x{10400}
+ \x{1044f}
+
+/^\p{Osmanya}/8
+ \x{10480}
+ \x{1049d}
+ \x{104a0}
+ \x{104a9}
+ ** Failers
+ \x{1049e}
+ \x{1049f}
+ \x{104aa}
+
/ End of testinput6 /
diff --git a/testdata/testoutput6 b/testdata/testoutput6
index 0a58b84..049d1a3 100644
--- a/testdata/testoutput6
+++ b/testdata/testoutput6
@@ -1522,4 +1522,116 @@ No match
/(\p{Yi}{0,3}+\277)*/
+/^[\p{Arabic}]/8
+ \x{60e}
+ 0: \x{60e}
+ \x{656}
+ 0: \x{656}
+ \x{657}
+ 0: \x{657}
+ \x{658}
+ 0: \x{658}
+ \x{659}
+ 0: \x{659}
+ \x{65a}
+ 0: \x{65a}
+ \x{65b}
+ 0: \x{65b}
+ \x{65c}
+ 0: \x{65c}
+ \x{65d}
+ 0: \x{65d}
+ \x{65e}
+ 0: \x{65e}
+ \x{66a}
+ 0: \x{66a}
+ \x{6e9}
+ 0: \x{6e9}
+ \x{6ef}
+ 0: \x{6ef}
+ \x{6fa}
+ 0: \x{6fa}
+ ** Failers
+No match
+ \x{600}
+No match
+ \x{650}
+No match
+ \x{651}
+No match
+ \x{652}
+No match
+ \x{653}
+No match
+ \x{654}
+No match
+ \x{655}
+No match
+ \x{65f}
+No match
+
+/^\p{Cyrillic}/8
+ \x{1d2b}
+ 0: \x{1d2b}
+
+/^\p{Common}/8
+ \x{589}
+ 0: \x{589}
+ \x{60c}
+ 0: \x{60c}
+ \x{61f}
+ 0: \x{61f}
+ \x{964}
+ 0: \x{964}
+ \x{965}
+ 0: \x{965}
+ \x{970}
+ 0: \x{970}
+
+/^\p{Inherited}/8
+ \x{64b}
+ 0: \x{64b}
+ \x{654}
+ 0: \x{654}
+ \x{655}
+ 0: \x{655}
+ \x{200c}
+ 0: \x{200c}
+ ** Failers
+No match
+ \x{64a}
+No match
+ \x{656}
+No match
+
+/^\p{Shavian}/8
+ \x{10450}
+ 0: \x{10450}
+ \x{1047f}
+ 0: \x{1047f}
+
+/^\p{Deseret}/8
+ \x{10400}
+ 0: \x{10400}
+ \x{1044f}
+ 0: \x{1044f}
+
+/^\p{Osmanya}/8
+ \x{10480}
+ 0: \x{10480}
+ \x{1049d}
+ 0: \x{1049d}
+ \x{104a0}
+ 0: \x{104a0}
+ \x{104a9}
+ 0: \x{104a9}
+ ** Failers
+No match
+ \x{1049e}
+No match
+ \x{1049f}
+No match
+ \x{104aa}
+No match
+
/ End of testinput6 /
diff --git a/ucptable.h b/ucptable.h
index 07eaced..a274d44 100644
--- a/ucptable.h
+++ b/ucptable.h
@@ -539,7 +539,8 @@ static const cnode ucp_table[] = {
{ 0x21000293, 0x14000000 },
{ 0x21000294, 0x1c000000 },
{ 0x21800295, 0x1400001a },
- { 0x218002b0, 0x18000011 },
+ { 0x218002b0, 0x18000008 },
+ { 0x098002b9, 0x18000008 },
{ 0x098002c2, 0x60000003 },
{ 0x098002c6, 0x1800000b },
{ 0x098002d2, 0x6000000d },
@@ -1039,15 +1040,18 @@ static const cnode ucp_table[] = {
{ 0x198005f3, 0x54000001 },
{ 0x09800600, 0x04000003 },
{ 0x0000060b, 0x5c000000 },
- { 0x0980060c, 0x54000001 },
+ { 0x0900060c, 0x54000000 },
+ { 0x0000060d, 0x54000000 },
{ 0x0080060e, 0x68000001 },
{ 0x00800610, 0x30000005 },
{ 0x0900061b, 0x54000000 },
- { 0x0080061e, 0x54000001 },
+ { 0x0000061e, 0x54000000 },
+ { 0x0900061f, 0x54000000 },
{ 0x00800621, 0x1c000019 },
{ 0x09000640, 0x18000000 },
{ 0x00800641, 0x1c000009 },
- { 0x1b80064b, 0x30000013 },
+ { 0x1b80064b, 0x3000000a },
+ { 0x00800656, 0x30000008 },
{ 0x09800660, 0x34000009 },
{ 0x0080066a, 0x54000003 },
{ 0x0080066e, 0x1c000001 },
@@ -1074,7 +1078,8 @@ static const cnode ucp_table[] = {
{ 0x31000711, 0x30000000 },
{ 0x31800712, 0x1c00001d },
{ 0x31800730, 0x3000001a },
- { 0x3180074d, 0x1c000020 },
+ { 0x3180074d, 0x1c000002 },
+ { 0x00800750, 0x1c00001d },
{ 0x37800780, 0x1c000025 },
{ 0x378007a6, 0x3000000a },
{ 0x370007b1, 0x1c000000 },
@@ -1460,7 +1465,10 @@ static const cnode ucp_table[] = {
{ 0x1f0017dd, 0x30000000 },
{ 0x1f8017e0, 0x34000009 },
{ 0x1f8017f0, 0x3c000009 },
- { 0x25801800, 0x54000005 },
+ { 0x25801800, 0x54000001 },
+ { 0x09801802, 0x54000001 },
+ { 0x25001804, 0x54000000 },
+ { 0x09001805, 0x54000000 },
{ 0x25001806, 0x44000000 },
{ 0x25801807, 0x54000003 },
{ 0x2580180b, 0x30000002 },
@@ -1513,14 +1521,20 @@ static const cnode ucp_table[] = {
{ 0x3d801b61, 0x68000009 },
{ 0x3d801b6b, 0x30000008 },
{ 0x3d801b74, 0x68000008 },
- { 0x21801d00, 0x1400002b },
- { 0x21801d2c, 0x18000035 },
- { 0x21801d62, 0x14000015 },
+ { 0x21801d00, 0x14000025 },
+ { 0x13801d26, 0x14000004 },
+ { 0x0c001d2b, 0x14000000 },
+ { 0x21801d2c, 0x18000030 },
+ { 0x13801d5d, 0x18000004 },
+ { 0x21801d62, 0x14000003 },
+ { 0x13801d66, 0x14000004 },
+ { 0x21801d6b, 0x1400000c },
{ 0x0c001d78, 0x18000000 },
{ 0x21801d79, 0x14000003 },
{ 0x21001d7d, 0x14000ee6 },
{ 0x21801d7e, 0x1400001c },
- { 0x21801d9b, 0x18000024 },
+ { 0x21801d9b, 0x18000023 },
+ { 0x13001dbf, 0x18000000 },
{ 0x1b801dc0, 0x3000000a },
{ 0x1b801dfe, 0x30000001 },
{ 0x21001e00, 0x24000001 },
@@ -1982,7 +1996,9 @@ static const cnode ucp_table[] = {
{ 0x13001ffc, 0x2000fff7 },
{ 0x13801ffd, 0x60000001 },
{ 0x09802000, 0x7400000a },
- { 0x0980200b, 0x04000004 },
+ { 0x0900200b, 0x04000000 },
+ { 0x1b80200c, 0x04000001 },
+ { 0x0980200e, 0x04000001 },
{ 0x09802010, 0x44000005 },
{ 0x09802016, 0x54000001 },
{ 0x09002018, 0x50000000 },
@@ -2615,7 +2631,8 @@ static const cnode ucp_table[] = {
{ 0x090030a0, 0x44000000 },
{ 0x1d8030a1, 0x1c000059 },
{ 0x090030fb, 0x54000000 },
- { 0x098030fc, 0x18000002 },
+ { 0x090030fc, 0x18000000 },
+ { 0x1d8030fd, 0x18000001 },
{ 0x1d0030ff, 0x1c000000 },
{ 0x03803105, 0x1c000027 },
{ 0x17803131, 0x1c00005d },
@@ -2630,7 +2647,8 @@ static const cnode ucp_table[] = {
{ 0x0980322a, 0x68000019 },
{ 0x09003250, 0x68000000 },
{ 0x09803251, 0x3c00000e },
- { 0x17803260, 0x6800001f },
+ { 0x17803260, 0x6800001d },
+ { 0x0980327e, 0x68000001 },
{ 0x09803280, 0x3c000009 },
{ 0x0980328a, 0x68000026 },
{ 0x098032b1, 0x3c00000e },
@@ -2678,7 +2696,8 @@ static const cnode ucp_table[] = {
{ 0x1900fb3e, 0x1c000000 },
{ 0x1980fb40, 0x1c000001 },
{ 0x1980fb43, 0x1c000001 },
- { 0x1980fb46, 0x1c00006b },
+ { 0x1980fb46, 0x1c000009 },
+ { 0x0080fb50, 0x1c000061 },
{ 0x0080fbd3, 0x1c00016a },
{ 0x0900fd3e, 0x58000000 },
{ 0x0900fd3f, 0x48000000 },
@@ -2944,7 +2963,8 @@ static const cnode ucp_table[] = {
{ 0x0d01044d, 0x1400ffd8 },
{ 0x0d01044e, 0x1400ffd8 },
{ 0x0d01044f, 0x1400ffd8 },
- { 0x2e810450, 0x1c00004d },
+ { 0x2e810450, 0x1c00002f },
+ { 0x2c810480, 0x1c00001d },
{ 0x2c8104a0, 0x34000009 },
{ 0x0b810800, 0x1c000005 },
{ 0x0b010808, 0x1c000000 },