Tidy pcretest source code and some 8/16 messages. Add "16" error codes.

git-svn-id: svn://vcs.exim.org/pcre/code/branches/pcre16@823 2f5784b3-3f2a-0410-8824-cb99058d5e15
author: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2011-12-24 17:43:22 +0000
committer: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2011-12-24 17:43:22 +0000
commit: 5a61c28fd404ea8390c9e4c035e0a35c782536d3 (patch)
tree: d2c97299d0b409136f7e0f287e15eba502222b44
parent: 8d2226da6db1b735e0639479d609810455444476 (diff)
download: pcre-5a61c28fd404ea8390c9e4c035e0a35c782536d3.tar.gz
7 files changed, 223 insertions, 158 deletions
diff --git a/pcre.h.in b/pcre.h.in
index a2e2408..4034f35 100644
--- a/pcre.h.in
+++ b/pcre.h.in
@@ -146,36 +146,39 @@ compiling). */
 
 /* Exec-time and get/set-time error codes */
 
-#define PCRE_ERROR_NOMATCH         (-1)
-#define PCRE_ERROR_NULL            (-2)
-#define PCRE_ERROR_BADOPTION       (-3)
-#define PCRE_ERROR_BADMAGIC        (-4)
-#define PCRE_ERROR_UNKNOWN_OPCODE  (-5)
-#define PCRE_ERROR_UNKNOWN_NODE    (-5)  /* For backward compatibility */
-#define PCRE_ERROR_NOMEMORY        (-6)
-#define PCRE_ERROR_NOSUBSTRING     (-7)
-#define PCRE_ERROR_MATCHLIMIT      (-8)
-#define PCRE_ERROR_CALLOUT         (-9)  /* Never used by PCRE itself */
-#define PCRE_ERROR_BADUTF8        (-10)
-#define PCRE_ERROR_BADUTF8_OFFSET (-11)
-#define PCRE_ERROR_PARTIAL        (-12)
-#define PCRE_ERROR_BADPARTIAL     (-13)
-#define PCRE_ERROR_INTERNAL       (-14)
-#define PCRE_ERROR_BADCOUNT       (-15)
-#define PCRE_ERROR_DFA_UITEM      (-16)
-#define PCRE_ERROR_DFA_UCOND      (-17)
-#define PCRE_ERROR_DFA_UMLIMIT    (-18)
-#define PCRE_ERROR_DFA_WSSIZE     (-19)
-#define PCRE_ERROR_DFA_RECURSE    (-20)
-#define PCRE_ERROR_RECURSIONLIMIT (-21)
-#define PCRE_ERROR_NULLWSLIMIT    (-22)  /* No longer actually used */
-#define PCRE_ERROR_BADNEWLINE     (-23)
-#define PCRE_ERROR_BADOFFSET      (-24)
-#define PCRE_ERROR_SHORTUTF8      (-25)
-#define PCRE_ERROR_RECURSELOOP    (-26)
-#define PCRE_ERROR_JIT_STACKLIMIT (-27)
-#define PCRE_ERROR_BADMODE        (-28)
-#define PCRE_ERROR_BADENDIANNESS  (-29)
+#define PCRE_ERROR_NOMATCH          (-1)
+#define PCRE_ERROR_NULL             (-2)
+#define PCRE_ERROR_BADOPTION        (-3)
+#define PCRE_ERROR_BADMAGIC         (-4)
+#define PCRE_ERROR_UNKNOWN_OPCODE   (-5)
+#define PCRE_ERROR_UNKNOWN_NODE     (-5)  /* For backward compatibility */
+#define PCRE_ERROR_NOMEMORY         (-6)
+#define PCRE_ERROR_NOSUBSTRING      (-7)
+#define PCRE_ERROR_MATCHLIMIT       (-8)
+#define PCRE_ERROR_CALLOUT          (-9)  /* Never used by PCRE itself */
+#define PCRE_ERROR_BADUTF8         (-10)  /* Same for 8/16 */
+#define PCRE_ERROR_BADUTF16        (-10)  /* Same for 8/16 */
+#define PCRE_ERROR_BADUTF8_OFFSET  (-11)  /* Same for 8/16 */
+#define PCRE_ERROR_BADUTF16_OFFSET (-11)  /* Same for 8/16 */
+#define PCRE_ERROR_PARTIAL         (-12)
+#define PCRE_ERROR_BADPARTIAL      (-13)
+#define PCRE_ERROR_INTERNAL        (-14)
+#define PCRE_ERROR_BADCOUNT        (-15)
+#define PCRE_ERROR_DFA_UITEM       (-16)
+#define PCRE_ERROR_DFA_UCOND       (-17)
+#define PCRE_ERROR_DFA_UMLIMIT     (-18)
+#define PCRE_ERROR_DFA_WSSIZE      (-19)
+#define PCRE_ERROR_DFA_RECURSE     (-20)
+#define PCRE_ERROR_RECURSIONLIMIT  (-21)
+#define PCRE_ERROR_NULLWSLIMIT     (-22)  /* No longer actually used */
+#define PCRE_ERROR_BADNEWLINE      (-23)
+#define PCRE_ERROR_BADOFFSET       (-24)
+#define PCRE_ERROR_SHORTUTF8       (-25)
+#define PCRE_ERROR_SHORTUTF16      (-25)  /* Same for 8/16 */
+#define PCRE_ERROR_RECURSELOOP     (-26)
+#define PCRE_ERROR_JIT_STACKLIMIT  (-27)
+#define PCRE_ERROR_BADMODE         (-28)
+#define PCRE_ERROR_BADENDIANNESS   (-29)
 
 /* Specific error codes for UTF-8 validity checks */
 
diff --git a/pcre_exec.c b/pcre_exec.c
index c6bdaf6..fa8bc77 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -6078,8 +6078,13 @@ if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
       offsets[0] = erroroffset;
       offsets[1] = errorcode;
       }
+#ifdef COMPILE_PCRE16
+    return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
+      PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
+#else
     return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
       PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
+#endif       
     }
 
   /* Check that a start_offset points to the start of a UTF character. */
diff --git a/pcretest.c b/pcretest.c
index 398ea48..a387717 100644
--- a/pcretest.c
+++ b/pcretest.c
@@ -36,6 +36,16 @@ POSSIBILITY OF SUCH DAMAGE.
 -----------------------------------------------------------------------------
 */
 
+/* This program now supports the testing of both the 8-bit and 16-bit PCRE 
+libraries in a single program. This is different from the modules such as 
+pcre_compile.c in the library itself, which are compiled separately for each 
+mode. If both modes are enabled, for example, pcre_compile.c is compiled twice 
+(the second time with COMPILE_PCRE16 defined). By contrast, pcretest.c is 
+compiled only once. Therefore, it must not make use of any of the macros from 
+pcre_internal.h that depend on COMPILE_PCRE8 or COMPILE_PCRE16. It does, 
+however, make use of SUPPORT_PCRE8 and SUPPORT_PCRE16 to ensure that it calls 
+only supported library functions. */ 
+
 
 #ifdef HAVE_CONFIG_H
 #include "config.h"
@@ -161,13 +171,13 @@ Makefile. */
 #endif
 
 /* It is also possible, originally for the benefit of a version that was
-imported into Exim, to build pcretest without support for UTF8 (define NOUTF8),
-without the interface to the DFA matcher (NODFA). In fact, we automatically cut
-out the UTF8 support if PCRE is built without it. */
+imported into Exim, to build pcretest without support for UTF8 or UTF16 (define
+NOUTF), or without the interface to the DFA matcher (define NODFA). In fact, we
+automatically cut out the UTF support if PCRE is built without it. */
 
-#ifndef SUPPORT_UTF8
-#ifndef NOUTF8
-#define NOUTF8
+#ifndef SUPPORT_UTF
+#ifndef NOUTF
+#define NOUTF
 #endif
 #endif
 
@@ -177,7 +187,13 @@ only from one place and is handled differently). I couldn't dream up any way of
 using a single macro to do this in a generic way, because of the many different
 argument requirements. We know that at least one of SUPPORT_PCRE8 and
 SUPPORT_PCRE16 must be set. First define macros for each individual mode; then
-use these in the definitions of generic macros. */
+use these in the definitions of generic macros. 
+
+**** Special note about the PCHARSxxx macros: the address of the string to be 
+printed is always given as two arguments: a base address followed by an offset.
+The base address is cast to the correct data size for 8 or 16 bit data; the
+offset is in units of this size. If the string were given as base+offset in one 
+argument, the casting might be incorrectly applied. */
 
 #ifdef SUPPORT_PCRE8
 
@@ -605,7 +621,6 @@ COMPILE_PCRE16 is *not* set. */
 #endif
 
 static const pcre_uint16 OP_lengths16[] = { OP_LENGTHS };
-
 #endif  /* SUPPORT_PCRE16 */
 
 /* If we have 8-bit support, default use_pcre16 to false; if there is also
@@ -631,8 +646,8 @@ static const char *errtexts[] = {
   NULL,  /* never returned by pcre_exec() or pcre_dfa_exec() */
   "match limit exceeded",
   "callout error code",
-  NULL,  /* BADUTF8 is handled specially */
-  "bad UTF-8 offset",
+  NULL,  /* BADUTF8/16 is handled specially */
+  NULL,  /* BADUTF8/16 offset is handled specially */
   NULL,  /* PARTIAL is handled specially */
   "not used - internal error",
   "internal error - pattern overwritten?",
@@ -646,7 +661,7 @@ static const char *errtexts[] = {
   "not used - internal error",
   "invalid combination of newline options",
   "bad offset value",
-  NULL,  /* SHORTUTF8 is handled specially */
+  NULL,  /* SHORTUTF8/16 is handled specially */
   "nested recursion at the same subject position",
   "JIT stack limit reached",
   "pattern compiled in wrong mode (8-bit/16-bit error)"
@@ -1011,6 +1026,7 @@ return (pcre_jit_stack *)arg;
 }
 
 
+#if !defined NOUTF
 /*************************************************
 *            Convert UTF-8 string to value       *
 *************************************************/
@@ -1026,8 +1042,6 @@ Returns:      >  0 => the number of bytes consumed
               -6 to 0 => malformed UTF-8 character at offset = (-return)
 */
 
-#if !defined NOUTF8
-
 static int
 utf82ord(pcre_uint8 *utf8bytes, int *vptr)
 {
@@ -1068,11 +1082,11 @@ if (j != i) return -(i+1);
 *vptr = d;
 return i+1;
 }
-
-#endif
+#endif  /* NOUTF */
 
 
 
+#if !defined NOUTF
 /*************************************************
 *       Convert character value to UTF-8         *
 *************************************************/
@@ -1087,8 +1101,6 @@ Arguments:
 Returns:     number of characters placed in the buffer
 */
 
-#if !defined NOUTF8
-
 static int
 ord2utf8(int cvalue, pcre_uint8 *utf8bytes)
 {
@@ -1104,7 +1116,6 @@ for (j = i; j > 0; j--)
 *utf8bytes = utf8_table2[i] | cvalue;
 return i + 1;
 }
-
 #endif
 
 
@@ -1120,6 +1131,10 @@ double, because up to 0xffff uses no more than 3 bytes in UTF-8 but possibly 4
 in UTF-16. Higher values use 4 bytes in UTF-8 and up to 4 bytes in UTF-16. The
 result is always left in buffer16.
 
+Note that this function does not object to surrogate values. This is 
+deliberate; it makes it possible to construct UTF-16 strings that are invalid, 
+for the purpose of testing that they are correctly faulted.
+
 Arguments:
   p          points to a byte string
   utf        true if UTF-8 (to be converted to UTF-16)
@@ -1127,6 +1142,7 @@ Arguments:
 
 Returns:     number of 16-bit data items used (excluding trailing zero)
              OR -1 if a UTF-8 string is malformed
+             OR -2 if a value > 0x10ffff is encountered 
 */
 
 static int
@@ -1160,6 +1176,7 @@ else
     {
     int chlen = utf82ord(p, &c);
     if (chlen <= 0) return -1;
+    if (c > 0x10ffff) return -2; 
     p += chlen;
     len -= chlen;
     if (c < 0x10000) *pp++ = c; else
@@ -1365,7 +1382,7 @@ if (length < 0)
 
 while (length-- > 0)
   {
-#if !defined NOUTF8
+#if !defined NOUTF
   if (use_utf)
     {
     int rc = utf82ord(p, &c);
@@ -1399,9 +1416,10 @@ int len = 0;
 while (*p++ != 0) len++;
 return len;
 }
+#endif  /* SUPPORT_PCRE16 */
 
 
-
+#ifdef SUPPORT_PCRE16
 /*************************************************
 *           Print 16-bit character string        *
 *************************************************/
@@ -1419,7 +1437,7 @@ if (length < 0)
 while (length-- > 0)
   {
   int c = *p++ & 0xffff;
-#if !defined NOUTF8
+#if !defined NOUTF
   if (use_utf && c >= 0xD800 && c < 0xDC00 && length > 0)
     {
     int d = *p & 0xffff;
@@ -1436,7 +1454,7 @@ while (length-- > 0)
 
 return yield;
 }
-#endif
+#endif  /* SUPPORT_PCRE16 */
 
 
 
@@ -1462,7 +1480,7 @@ if (pcre_get_stringnumber(re, (char *)(*pp)) < 0)
 *pp = npp; 
 return p;
 }
-#endif
+#endif  /* SUPPORT_PCRE8 */
 
 
 
@@ -1489,7 +1507,7 @@ if (pcre16_get_stringnumber(re, (PCRE_SPTR16)(*pp)) < 0)
 *pp = npp;   
 return p;
 }
-#endif
+#endif  /* SUPPORT_PCRE16 */
 
 
 
@@ -1680,8 +1698,8 @@ if (rc < 0) fprintf(outfile, "Error %d from pcre%s_fullinfo(%d)\n", rc,
 *             Swap byte functions                *
 *************************************************/
 
-/* The following functions swap the bytes of a pcre_uint16
-and pcre_uint32 value.
+/* The following functions swap the bytes of a pcre_uint16 and pcre_uint32
+value, respectively.
 
 Arguments:
   value        any number
@@ -1721,9 +1739,8 @@ static void
 regexflip(pcre *ere, pcre_extra *extra)
 {
 real_pcre *re = (real_pcre *)ere;
-int op;
-
 #ifdef SUPPORT_PCRE16
+int op;
 pcre_uint16 *ptr = (pcre_uint16 *)re + re->name_table_offset;
 int length = re->name_count * re->name_entry_size;
 #ifdef SUPPORT_UTF
@@ -2128,7 +2145,7 @@ _setmode( _fileno( stdout ), _O_BINARY );
 #endif
 
 /* Get the version number: both pcre_version() and pcre16_version() give the
-same answer. We just need to ensure that we call one that is availab.e */
+same answer. We just need to ensure that we call one that is available. */
 
 #ifdef SUPPORT_PCRE8
 version = pcre_version();
@@ -2706,11 +2723,20 @@ while (!done)
 #ifdef SUPPORT_PCRE16
     if (use_pcre16)
       {
-      if (to16(p, options & PCRE_UTF8, (int)strlen((char *)p)) < 0)
+      switch(to16(p, options & PCRE_UTF8, (int)strlen((char *)p)))
         {
+        case -1: 
         fprintf(outfile, "**Failed: invalid UTF-8 string cannot be "
           "converted to UTF-16\n");
         goto SKIP_DATA;
+         
+        case -2:
+        fprintf(outfile, "**Failed: character value greater than 0x10ffff "
+          "cannot be converted to UTF-16\n");
+        goto SKIP_DATA;
+         
+        default:
+        break;    
         }
       p = (pcre_uint8 *)buffer16;
       }
@@ -3231,7 +3257,7 @@ while (!done)
         while (i++ < 2 && isdigit(*p) && *p != '8' && *p != '9')
           c = c * 8 + *p++ - '0';
 
-#if !defined NOUTF8
+#if !defined NOUTF
         if (use_utf && c > 255)
           {
           pcre_uint8 buff8[8];
@@ -3247,7 +3273,7 @@ while (!done)
 
         /* Handle \x{..} specially - new Perl thing for utf8 */
 
-#if !defined NOUTF8
+#if !defined NOUTF
         if (*p == '{')
           {
           pcre_uint8 *pt = p;
@@ -3593,11 +3619,20 @@ while (!done)
     if (use_pcre16)
       {
       len = to16(bptr, (((real_pcre *)re)->options) & PCRE_UTF8, len);
-      if (len < 0)
+      switch(len)
         {
+        case -1: 
         fprintf(outfile, "**Failed: invalid UTF-8 string cannot be "
           "converted to UTF-16\n");
         goto NEXT_DATA;
+         
+        case -2:
+        fprintf(outfile, "**Failed: character value greater than 0x10ffff "
+          "cannot be converted to UTF-16\n");
+        goto NEXT_DATA;
+         
+        default:
+        break;    
         }
       bptr = (pcre_uint8 *)buffer16;
       }
@@ -4021,13 +4056,19 @@ while (!done)
 
             case PCRE_ERROR_BADUTF8:
             case PCRE_ERROR_SHORTUTF8:
-            fprintf(outfile, "Error %d (%s UTF-8 string)", count,
-              (count == PCRE_ERROR_BADUTF8)? "bad" : "short");
+            fprintf(outfile, "Error %d (%s UTF-%s string)", count,
+              (count == PCRE_ERROR_BADUTF8)? "bad" : "short",
+              use_pcre16? "16" : "8");
             if (use_size_offsets >= 2)
               fprintf(outfile, " offset=%d reason=%d", use_offsets[0],
                 use_offsets[1]);
             fprintf(outfile, "\n");
             break;
+            
+            case PCRE_ERROR_BADUTF8_OFFSET:
+            fprintf(outfile, "Error %d (bad UTF-%s offset)\n", count,
+              use_pcre16? "16" : "8");
+            break;   
 
             default:
             if (count < 0 && (-count) < sizeof(errtexts)/sizeof(const char *))
diff --git a/testdata/testinput17 b/testdata/testinput17
index 6d4a63d..2479fe5 100644
--- a/testdata/testinput17
+++ b/testdata/testinput17
@@ -1,5 +1,6 @@
 /-- This set of tests is for the 16-bit library's basic (non-UTF-16) features 
-    that are not compatible with the 8-bit library. --/
+    that are not compatible with the 8-bit library, or which give different 
+    output in 16-bit mode. --/
 
 /a\Cb/
     aXb
diff --git a/testdata/testinput18 b/testdata/testinput18
index dac88d6..263f077 100644
--- a/testdata/testinput18
+++ b/testdata/testinput18
@@ -11,46 +11,46 @@
 
 /X(\C{4})/8
     X\x{11234}YZ
-    
+
 /X\C*/8
     XYZabcdce
-    
+
 /X\C*?/8
     XYZabcde
-    
+
 /X\C{3,5}/8
-    Xabcdefg   
-    X\x{11234}Y 
+    Xabcdefg
+    X\x{11234}Y
     X\x{11234}YZ
-    X\x{11234}\x{512}  
+    X\x{11234}\x{512}
     X\x{11234}\x{512}YZ
     X\x{11234}\x{512}\x{11234}Z
 
 /X\C{3,5}?/8
-    Xabcdefg   
-    X\x{11234}Y 
+    Xabcdefg
+    X\x{11234}Y
     X\x{11234}YZ
-    X\x{11234}\x{512}YZ  
+    X\x{11234}\x{512}YZ
     *** Failers
     X\x{11234}
 
 /a\Cb/8
     aXb
     a\nb
-    
-/a\C\Cb/8 
+
+/a\C\Cb/8
     a\x{12257}b
-    ** Failers 
+    ** Failers
     a\x{100}b
 
 /ab\Cde/8
     abXde
-    
+
 /-- Check maximum character size --/
 
 /\x{ffff}/8DZ
 
-/\x{10000}/8DZ 
+/\x{10000}/8DZ
 
 /\x{100}/8DZ
 
@@ -70,8 +70,8 @@
 
 /\xff/8DZ
 
-/\x{D55c}\x{ad6d}\x{C5B4}/DZ8 
-    \x{D55c}\x{ad6d}\x{C5B4} 
+/\x{D55c}\x{ad6d}\x{C5B4}/DZ8
+    \x{D55c}\x{ad6d}\x{C5B4}
 
 /\x{65e5}\x{672c}\x{8a9e}/DZ8
     \x{65e5}\x{672c}\x{8a9e}
@@ -89,26 +89,26 @@
 /-- This one is here not because it's different to Perl, but because the way
 the captured single-byte is displayed. (In Perl it becomes a character, and you
 can't tell the difference.) --/
-    
+
 /X(\C)(.*)/8
     X\x{1234}
-    X\nabc 
+    X\nabc
 
-/-- This one is here because Perl gives out a grumbly error message (quite 
+/-- This one is here because Perl gives out a grumbly error message (quite
 correctly, but that messes up comparisons). --/
-    
+
 /a\Cb/8
-    *** Failers 
-    a\x{100}b 
-    
+    *** Failers
+    a\x{100}b
+
 /[^ab\xC0-\xF0]/8SDZ
     \x{f1}
     \x{bf}
     \x{100}
-    \x{1000}   
+    \x{1000}
     *** Failers
-    \x{c0} 
-    \x{f0} 
+    \x{c0}
+    \x{f0}
 
 /Ā{3,4}/8SDZ
   \x{100}\x{100}\x{100}\x{100\x{100}
@@ -133,7 +133,7 @@ correctly, but that messes up comparisons). --/
     \x{100}
     Z\x{100}
     \x{100}Z
-    *** Failers 
+    *** Failers
 
 /[\xff]/DZ8
     >\x{ff}<
@@ -144,8 +144,8 @@ correctly, but that messes up comparisons). --/
 
 /\777/8I
   \x{1ff}
-  \777 
-  
+  \777
+
 /\x{100}+\x{200}/8DZ
 
 /\x{100}+X/8DZ
@@ -160,12 +160,9 @@ correctly, but that messes up comparisons). --/
     \x{da00}\?
     \x{dfff}
     \x{dfff}\?
-    \x{110000}    
-    \x{110000}\?    
-    \x{2000000} 
-    \x{2000000}\? 
-    \x{7fffffff} 
-    \x{7fffffff}\? 
+    \x{110000}
+    \x{d800}\x{1234}
+    \x{fffe}
 
 /(*UTF16)\x{11234}/
   abcd\x{11234}pqr
@@ -179,9 +176,9 @@ correctly, but that messes up comparisons). --/
     ABC\x{1680}
     ABC\x{180e}
     ABC\x{2000}
-    ABC\x{202f} 
-    ABC\x{205f} 
-    ABC\x{3000} 
+    ABC\x{202f}
+    ABC\x{205f}
+    ABC\x{3000}
 
 /\v/SI8
     ABC\x{0a}
@@ -193,7 +190,7 @@ correctly, but that messes up comparisons). --/
 
 /\h*A/SI8
     CDBABC
-    
+
 /\v+A/SI8
 
 /\s?xxx\s/8SI
@@ -203,8 +200,8 @@ correctly, but that messes up comparisons). --/
     AB\x{a0}xxx\x{85}XYZ
 
 /\S \S/I8ST1
-    \x{a2} \x{84} 
-    A Z 
+    \x{a2} \x{84}
+    A Z
 
 /a+/8
     a\x{123}aa\>1
@@ -228,4 +225,13 @@ correctly, but that messes up comparisons). --/
 
 /\R/SI8
 
+/-- Check bad offset --/
+
+/a/8
+    \x{10000}\>1
+    \x{10000}ab\>2
+    \x{10000}ab\>3
+    \x{10000}ab\>4
+    \x{10000}ab\>5
+
 /-- End of testinput18 --/
diff --git a/testdata/testoutput17 b/testdata/testoutput17
index e277ce1..0e754a3 100644
--- a/testdata/testoutput17
+++ b/testdata/testoutput17
@@ -1,5 +1,6 @@
 /-- This set of tests is for the 16-bit library's basic (non-UTF-16) features 
-    that are not compatible with the 8-bit library. --/
+    that are not compatible with the 8-bit library, or which give different 
+    output in 16-bit mode. --/
 
 /a\Cb/
     aXb
diff --git a/testdata/testoutput18 b/testdata/testoutput18
index 5b3fb43..b7b9630 100644
--- a/testdata/testoutput18
+++ b/testdata/testoutput18
@@ -17,23 +17,23 @@
     X\x{11234}YZ
  0: X\x{11234}YZ
  1: \x{11234}YZ
-    
+
 /X\C*/8
     XYZabcdce
  0: XYZabcdce
-    
+
 /X\C*?/8
     XYZabcde
  0: X
-    
+
 /X\C{3,5}/8
-    Xabcdefg   
+    Xabcdefg
  0: Xabcde
-    X\x{11234}Y 
+    X\x{11234}Y
  0: X\x{11234}Y
     X\x{11234}YZ
  0: X\x{11234}YZ
-    X\x{11234}\x{512}  
+    X\x{11234}\x{512}
  0: X\x{11234}\x{512}
     X\x{11234}\x{512}YZ
  0: X\x{11234}\x{512}YZ
@@ -41,13 +41,13 @@
  0: X\x{11234}\x{512}\x{11234}
 
 /X\C{3,5}?/8
-    Xabcdefg   
+    Xabcdefg
  0: Xabc
-    X\x{11234}Y 
+    X\x{11234}Y
  0: X\x{11234}Y
     X\x{11234}YZ
  0: X\x{11234}Y
-    X\x{11234}\x{512}YZ  
+    X\x{11234}\x{512}YZ
  0: X\x{11234}\x{512}
     *** Failers
 No match
@@ -59,11 +59,11 @@ No match
  0: aXb
     a\nb
  0: a\x{0a}b
-    
-/a\C\Cb/8 
+
+/a\C\Cb/8
     a\x{12257}b
  0: a\x{12257}b
-    ** Failers 
+    ** Failers
 No match
     a\x{100}b
 No match
@@ -71,7 +71,7 @@ No match
 /ab\Cde/8
     abXde
  0: abXde
-    
+
 /-- Check maximum character size --/
 
 /\x{ffff}/8DZ
@@ -86,7 +86,7 @@ Options: utf
 First char = \x{ffff}
 No need char
 
-/\x{10000}/8DZ 
+/\x{10000}/8DZ
 ------------------------------------------------------------------
         Bra
         \x{10000}
@@ -206,7 +206,7 @@ Options: utf
 First char = \x{ff}
 No need char
 
-/\x{D55c}\x{ad6d}\x{C5B4}/DZ8 
+/\x{D55c}\x{ad6d}\x{C5B4}/DZ8
 ------------------------------------------------------------------
         Bra
         \x{d55c}\x{ad6d}\x{c5b4}
@@ -217,7 +217,7 @@ Capturing subpattern count = 0
 Options: utf
 First char = \x{d55c}
 Need char = \x{c5b4}
-    \x{D55c}\x{ad6d}\x{C5B4} 
+    \x{D55c}\x{ad6d}\x{C5B4}
  0: \x{d55c}\x{ad6d}\x{c5b4}
 
 /\x{65e5}\x{672c}\x{8a9e}/DZ8
@@ -297,26 +297,26 @@ Need char = \x{deab}
 /-- This one is here not because it's different to Perl, but because the way
 the captured single-byte is displayed. (In Perl it becomes a character, and you
 can't tell the difference.) --/
-    
+
 /X(\C)(.*)/8
     X\x{1234}
  0: X\x{1234}
  1: \x{1234}
  2: 
-    X\nabc 
+    X\nabc
  0: X\x{0a}abc
  1: \x{0a}
  2: abc
 
-/-- This one is here because Perl gives out a grumbly error message (quite 
+/-- This one is here because Perl gives out a grumbly error message (quite
 correctly, but that messes up comparisons). --/
-    
+
 /a\Cb/8
-    *** Failers 
+    *** Failers
 No match
-    a\x{100}b 
+    a\x{100}b
  0: a\x{100}b
-    
+
 /[^ab\xC0-\xF0]/8SDZ
 ------------------------------------------------------------------
         Bra
@@ -346,13 +346,13 @@ Starting byte set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a
  0: \x{bf}
     \x{100}
  0: \x{100}
-    \x{1000}   
+    \x{1000}
  0: \x{1000}
     *** Failers
  0: *
-    \x{c0} 
+    \x{c0}
 No match
-    \x{f0} 
+    \x{f0}
 No match
 
 /Ā{3,4}/8SDZ
@@ -515,7 +515,7 @@ No need char
  0: \x{100}
     \x{100}Z
  0: \x{100}
-    *** Failers 
+    *** Failers
 No match
 
 /[\xff]/DZ8
@@ -567,9 +567,9 @@ First char = \x{1ff}
 No need char
   \x{1ff}
  0: \x{1ff}
-  \777 
+  \777
  0: \x{1ff}
-  
+
 /\x{100}+\x{200}/8DZ
 ------------------------------------------------------------------
         Bra
@@ -603,29 +603,23 @@ Failed: missing terminating ] for character class at offset 13
     \x{0}\x{d7ff}\x{e000}\x{10ffff}
 No match
     \x{d800}
-Error -10 (bad UTF-8 string) offset=0 reason=1
+Error -10 (bad UTF-16 string) offset=0 reason=1
     \x{d800}\?
 No match
     \x{da00}
-Error -10 (bad UTF-8 string) offset=0 reason=1
+Error -10 (bad UTF-16 string) offset=0 reason=1
     \x{da00}\?
 No match
     \x{dfff}
-Error -10 (bad UTF-8 string) offset=0 reason=3
+Error -10 (bad UTF-16 string) offset=0 reason=3
     \x{dfff}\?
 No match
-    \x{110000}    
-Error -10 (bad UTF-8 string) offset=0 reason=3
-    \x{110000}\?    
-No match
-    \x{2000000} 
-Error -10 (bad UTF-8 string) offset=1 reason=3
-    \x{2000000}\? 
-No match
-    \x{7fffffff} 
-Error -10 (bad UTF-8 string) offset=1 reason=3
-    \x{7fffffff}\? 
-No match
+    \x{110000}
+**Failed: character value greater than 0x10ffff cannot be converted to UTF-16
+    \x{d800}\x{1234}
+Error -10 (bad UTF-16 string) offset=1 reason=2
+    \x{fffe}
+Error -10 (bad UTF-16 string) offset=0 reason=4
 
 /(*UTF16)\x{11234}/
   abcd\x{11234}pqr
@@ -657,11 +651,11 @@ Starting byte set: \x09 \x20 \xa0 \xff
  0: \x{180e}
     ABC\x{2000}
  0: \x{2000}
-    ABC\x{202f} 
+    ABC\x{202f}
  0: \x{202f}
-    ABC\x{205f} 
+    ABC\x{205f}
  0: \x{205f}
-    ABC\x{3000} 
+    ABC\x{3000}
  0: \x{3000}
 
 /\v/SI8
@@ -693,7 +687,7 @@ Subject length lower bound = 1
 Starting byte set: \x09 \x20 A \xa0 
     CDBABC
  0: A
-    
+
 /\v+A/SI8
 Capturing subpattern count = 0
 Options: utf
@@ -742,9 +736,9 @@ Starting byte set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0b \x0e
   \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee 
   \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd 
   \xfe \xff 
-    \x{a2} \x{84} 
+    \x{a2} \x{84}
  0: \x{a2} \x{84}
-    A Z 
+    A Z
  0: A Z
 
 /a+/8
@@ -826,4 +820,18 @@ No need char
 Subject length lower bound = 1
 Starting byte set: \x0a \x0b \x0c \x0d \x85 \xff 
 
+/-- Check bad offset --/
+
+/a/8
+    \x{10000}\>1
+Error -11 (bad UTF-16 offset)
+    \x{10000}ab\>2
+ 0: a
+    \x{10000}ab\>3
+No match
+    \x{10000}ab\>4
+No match
+    \x{10000}ab\>5
+Error -24 (bad offset value)
+
 /-- End of testinput18 --/
author	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2011-12-24 17:43:22 +0000
committer	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2011-12-24 17:43:22 +0000
commit	5a61c28fd404ea8390c9e4c035e0a35c782536d3 (patch)
tree	d2c97299d0b409136f7e0f287e15eba502222b44
parent	8d2226da6db1b735e0639479d609810455444476 (diff)
download	pcre-5a61c28fd404ea8390c9e4c035e0a35c782536d3.tar.gz