summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2011-12-19 13:34:10 +0000
committer ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2011-12-19 13:34:10 +0000
commit 2576d4686e427f362d435c24606db9e5a76b6339 (patch)
tree bc0b18e9e98a0d093c8aa2eef4b94707c8e53408
parent 1183e193897ab3e03c2cebec978a1a053fcb179d (diff)
download pcre-2576d4686e427f362d435c24606db9e5a76b6339.tar.gz
A lot more work on pcretest; now runs many (but not all) tests.
git-svn-id: svn://vcs.exim.org/pcre/code/branches/pcre16@810 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rwxr-xr-x RunTest 129
-rw-r--r-- pcre_printint.c 5
-rw-r--r-- pcretest.c 172
-rw-r--r-- testdata/testinput1 3
-rw-r--r-- testdata/testinput16 238
-rw-r--r-- testdata/testinput17 282
-rw-r--r-- testdata/testinput2 4
-rw-r--r-- testdata/testinput4 44
-rw-r--r-- testdata/testinput5 246
-rw-r--r-- testdata/testoutput1 3
-rw-r--r-- testdata/testoutput10 10
-rw-r--r-- testdata/testoutput13 22
-rw-r--r-- testdata/testoutput16 819
-rw-r--r-- testdata/testoutput17 907
-rw-r--r-- testdata/testoutput2 6
-rw-r--r-- testdata/testoutput4 65
-rw-r--r-- testdata/testoutput5 919
-rw-r--r-- testdata/testoutput8 8
18 files changed, 2545 insertions, 1337 deletions
diff --git a/RunTest b/RunTest
index 49cce8d..973b53a 100755
--- a/RunTest
+++ b/RunTest
@@ -18,7 +18,10 @@
# two tests for JIT-specific features, one to be run when JIT support is
# available, and one when it is not.
-# The arguments for this script can be individual test numbers, or the word
+# Whichever of the 8-bit and 16-bit libraries exist are tested. It is also
+# possible to select which to test by the arguments -8 or -16.
+
+# Other arguments for this script can be individual test numbers, or the word
# "valgrind", or "sim" followed by an argument to run cross-compiled
# executables under a simulator, for example:
#
@@ -26,6 +29,8 @@
valgrind=
sim=
+arg8=
+arg16=
# Select which tests to run; for those that are explicitly requested, check
# that the necessary optional facilities are available.
@@ -45,6 +50,8 @@ do12=no
do13=no
do14=no
do15=no
+do16=no
+do17=no
while [ $# -gt 0 ] ; do
case $1 in
@@ -63,6 +70,10 @@ while [ $# -gt 0 ] ; do
13) do13=yes;;
14) do14=yes;;
15) do15=yes;;
+ 16) do16=yes;;
+ 17) do17=yes;;
+ -8) arg8=yes;;
+ -16) arg16=yes;;
valgrind) valgrind="valgrind -q --smc-check=all";;
sim) shift; sim=$1;;
*) echo "Unknown test number $1"; exit 1;;
@@ -107,12 +118,26 @@ $sim ./pcretest -C | $sim ./pcregrep '8-bit and 16-bit support' >/dev/null
if [ $? -eq 0 ] ; then
test8=
test16=-16
+ if [ "$arg8" = yes -a "$arg16" != yes ] ; then
+ test16=skip
+ fi
+ if [ "$arg16" = yes -a "$arg8" != yes ] ; then
+ test8=skip
+ fi
else
$sim ./pcretest -C | $sim ./pcregrep '8-bit support' >/dev/null
if [ $? -eq 0 ] ; then
+ if [ "$arg16" = yes ] ; then
+ echo "Cannot run 16-bit library tests: 16-bit library not compiled"
+ exit 1
+ fi
test8=
test16=skip
else
+ if [ "$arg8" = yes ] ; then
+ echo "Cannot run 8-bit library tests: 8-bit library not compiled"
+ exit 1
+ fi
test8=skip
test16=-16
fi
@@ -135,6 +160,20 @@ if [ $jit -ne 0 ] ; then
jitopt=-s+
fi
+if [ "$test8" = skip ] ; then
+ if [ $do17 = yes ] ; then
+ echo "Can't run test 17 because the 8-bit library is not built"
+ exit 1
+ fi
+fi
+
+if [ "$test16" = skip ] ; then
+ if [ $do16 = yes ] ; then
+ echo "Can't run test 16 because the 16-bit library is not built"
+ exit 1
+ fi
+fi
+
if [ $utf -eq 0 ] ; then
if [ $do4 = yes ] ; then
echo "Can't run test 4 because UTF support is not configured"
@@ -152,6 +191,12 @@ if [ $utf -eq 0 ] ; then
echo "Can't run test 12 because UTF support is not configured"
exit 1
fi
+ if [ $do16 = yes ] ; then
+ echo "Can't run test 16 because UTF support is not configured"
+ fi
+ if [ $do17 = yes ] ; then
+ echo "Can't run test 17 because UTF support is not configured"
+ fi
fi
if [ $ucp -eq 0 ] ; then
@@ -198,7 +243,8 @@ fi
if [ $do1 = no -a $do2 = no -a $do3 = no -a $do4 = no -a \
$do5 = no -a $do6 = no -a $do7 = no -a $do8 = no -a \
$do9 = no -a $do10 = no -a $do11 = no -a $do12 = no -a \
- $do13 = no -a $do14 = no -a $do15 = no ] ; then
+ $do13 = no -a $do14 = no -a $do15 = no -a $do16 = no -a \
+ $do17 = no ] ; then
do1=yes
do2=yes
do3=yes
@@ -214,6 +260,8 @@ if [ $do1 = no -a $do2 = no -a $do3 = no -a $do4 = no -a \
do13=yes
do14=yes
do15=yes
+ do16=yes
+ do17=yes
fi
# Show which release and which test data
@@ -226,8 +274,8 @@ for bmode in "$test8" "$test16"; do
case "$bmode" in
skip) continue;;
-16) if [ "$test8" != "skip" ] ; then echo ""; fi
- echo "---- Testing 16-bit library ----"; echo "";;
- *) echo "---- Testing 8-bit library ----"; echo "";;
+ bits=16; echo "---- Testing 16-bit library ----"; echo "";;
+ *) bits=8; echo "---- Testing 8-bit library ----"; echo "";;
esac
# Primary test, compatible with JIT and all versions of Perl >= 5.8
@@ -251,7 +299,7 @@ fi
# PCRE tests that are not JIT or Perl-compatible: API, errors, internals
if [ $do2 = yes ] ; then
- echo "Test 2: API, errors, internals, and non-Perl stuff (not UTF-8/16)"
+ echo "Test 2: API, errors, internals, and non-Perl stuff (not UTF-$bits)"
for opt in "" "-s" $jitopt; do
$sim $valgrind ./pcretest -q $bmode $opt $testdata/testinput2 testtry
if [ $? = 0 ] ; then
@@ -336,9 +384,9 @@ fi
# Additional tests for UTF support
if [ $do4 = yes ] ; then
- echo "Test 4: UTF-8/16 support (Compatible with Perl >= 5.8)"
+ echo "Test 4: UTF-$bits support (Compatible with Perl >= 5.8)"
if [ $utf -eq 0 ] ; then
- echo " Skipped because UTF support is not available"
+ echo " Skipped because UTF-$bits support is not available"
else
for opt in "" "-s" $jitopt; do
$sim $valgrind ./pcretest -q $bmode $opt $testdata/testinput4 testtry
@@ -356,9 +404,9 @@ if [ $do4 = yes ] ; then
fi
if [ $do5 = yes ] ; then
- echo "Test 5: API, internals, and non-Perl stuff for UTF-8/16 support"
+ echo "Test 5: API, internals, and non-Perl stuff for UTF-$bits support"
if [ $utf -eq 0 ] ; then
- echo " Skipped because UTF support is not available"
+ echo " Skipped because UTF-$bits support is not available"
else
for opt in "" "-s" $jitopt; do
$sim $valgrind ./pcretest -q $bmode $opt $testdata/testinput5 testtry
@@ -411,9 +459,9 @@ if [ $do7 = yes ] ; then
fi
if [ $do8 = yes ] ; then
- echo "Test 8: DFA matching with UTF-8 or UTF-16"
+ echo "Test 8: DFA matching with UTF-$bits"
if [ $utf -eq 0 ] ; then
- echo " Skipped because UTF support is not available"
+ echo " Skipped because UTF-$bits support is not available"
else
for opt in "" "-s"; do
$sim $valgrind ./pcretest -q $bmode $opt -dfa $testdata/testinput8 testtry
@@ -469,10 +517,10 @@ if [ $do10 = yes ] ; then
fi
fi
-# Test of Perl >= 5.10 features without UTF8 support
+# Test of Perl >= 5.10 features without UTF support
if [ $do11 = yes ] ; then
- echo "Test 11: Features from Perl >= 5.10 without UTF8 support"
+ echo "Test 11: Features from Perl >= 5.10 without UTF-$bits support"
for opt in "" "-s" $jitopt; do
$sim $valgrind ./pcretest -q $bmode $opt $testdata/testinput11 testtry
if [ $? = 0 ] ; then
@@ -487,12 +535,12 @@ if [ $do11 = yes ] ; then
done
fi
-# Test of Perl >= 5.10 features with UTF8 support
+# Test of Perl >= 5.10 features with UTF support
if [ $do12 = yes ] ; then
- echo "Test 12: Features from Perl >= 5.10 with UTF-8 or UTF-16 support"
+ echo "Test 12: Features from Perl >= 5.10 with UTF-$bits support"
if [ $utf -eq 0 ] ; then
- echo " Skipped because UTF support is not available"
+ echo " Skipped because UTF-$bits support is not available"
else
for opt in "" "-s" $jitopt; do
$sim $valgrind ./pcretest -q $bmode $opt $testdata/testinput12 testtry
@@ -565,8 +613,55 @@ if [ $do15 = yes ] ; then
fi
fi
-# End of loop for 8-bit/16-bit tests
+# Tests for 16-bit-specific features (needs UTF-8 support)
+
+if [ $do16 = yes ] ; then
+ echo "Test 16: specials for the 16-bit library"
+ if [ "$bits" = "8" ] ; then
+ echo " Skipped when running 8-bit tests"
+ elif [ $utf -eq 0 ] ; then
+ echo " Skipped because UTF-$bits support is not available"
+ else
+ for opt in "" "-s" $jitopt; do
+ $sim $valgrind ./pcretest -q $bmode $opt $testdata/testinput16 testtry
+ if [ $? = 0 ] ; then
+ $cf $testdata/testoutput16 testtry
+ if [ $? != 0 ] ; then exit 1; fi
+ else exit 1
+ fi
+ if [ "$opt" = "-s" ] ; then echo " OK with study"
+ elif [ "$opt" = "-s+" ] ; then echo " OK with JIT study"
+ else echo " OK"
+ fi
+ done
+ fi
+fi
+
+# Tests for 8-bit-specific features (needs UTF-8 support)
+
+if [ $do17 = yes ] ; then
+ echo "Test 17: specials for the 8-bit library"
+ if [ "$bits" = "16" ] ; then
+ echo " Skipped when running 16-bit tests"
+ elif [ $utf -eq 0 ] ; then
+ echo " Skipped because UTF-$bits support is not available"
+ else
+ for opt in "" "-s" $jitopt; do
+ $sim $valgrind ./pcretest -q $bmode $opt $testdata/testinput17 testtry
+ if [ $? = 0 ] ; then
+ $cf $testdata/testoutput17 testtry
+ if [ $? != 0 ] ; then exit 1; fi
+ else exit 1
+ fi
+ if [ "$opt" = "-s" ] ; then echo " OK with study"
+ elif [ "$opt" = "-s+" ] ; then echo " OK with JIT study"
+ else echo " OK"
+ fi
+ done
+ fi
+fi
+# End of loop for 8-bit/16-bit tests
done
# End
diff --git a/pcre_printint.c b/pcre_printint.c
index 2fcf985..8d504ce 100644
--- a/pcre_printint.c
+++ b/pcre_printint.c
@@ -473,7 +473,10 @@ for(;;)
case OP_NOT:
c = code[1];
if (PRINTABLE(c)) fprintf(f, " %s [^%c]", flag, c);
- else fprintf(f, " %s [^\\x%02x]", flag, c);
+ else if (utf || c > 0xff)
+ fprintf(f, " %s [^\\x{%02x}]", flag, c);
+ else
+ fprintf(f, " %s [^\\x%02x]", flag, c);
break;
case OP_NOTSTARI:
diff --git a/pcretest.c b/pcretest.c
index b99ae41..6bd66bc 100644
--- a/pcretest.c
+++ b/pcretest.c
@@ -219,12 +219,12 @@ use these in the definitions of generic macros. */
count = pcre16_exec(re, extra, (PCRE_SPTR16)bptr, len, start_offset, \
options, offsets, size_offsets)
-#define PCRE_STUDY16(extra, re, options, error) \
- extra = pcre16_study(re, options, error)
-
#define PCRE_FREE_STUDY16(extra) \
pcre16_free_study(extra)
+#define PCRE_STUDY16(extra, re, options, error) \
+ extra = pcre16_study(re, options, error)
+
#endif /* SUPPORT_PCRE16 */
@@ -259,18 +259,18 @@ use these in the definitions of generic macros. */
PCRE_EXEC8(count, re, extra, bptr, len, start_offset, options, \
offsets, size_offsets)
-#define PCRE_STUDY(extra, re, options, error) \
- if (use_pcre16) \
- PCRE_STUDY16(extra, re, options, error); \
- else \
- PCRE_STUDY8(extra, re, options, error)
-
#define PCRE_FREE_STUDY(extra) \
if (use_pcre16) \
PCRE_FREE_STUDY16(extra); \
else \
PCRE_FREE_STUDY8(extra)
+#define PCRE_STUDY(extra, re, options, error) \
+ if (use_pcre16) \
+ PCRE_STUDY16(extra, re, options, error); \
+ else \
+ PCRE_STUDY8(extra, re, options, error)
+
/* ----- Only 8-bit mode is supported ----- */
#elif defined SUPPORT_PCRE8
@@ -278,8 +278,8 @@ use these in the definitions of generic macros. */
#define PCHARSV PCHARSV8
#define PCRE_COMPILE PCRE_COMPILE8
#define PCRE_EXEC PCRE_EXEC8
-#define PCRE_STUDY PCRE_STUDY8
#define PCRE_FREE_STUDY PCRE_FREE_STUDY8
+#define PCRE_STUDY PCRE_STUDY8
/* ----- Only 16-bit mode is supported ----- */
@@ -288,8 +288,8 @@ use these in the definitions of generic macros. */
#define PCHARSV PCHARSV16
#define PCRE_COMPILE PCRE_COMPILE16
#define PCRE_EXEC PCRE_EXEC16
-#define PCRE_STUDY PCRE_STUDY16
#define PCRE_FREE_STUDY PCRE_FREE_STUDY16
+#define PCRE_STUDY PCRE_STUDY16
#endif
/* ----- End of mode-specific function call macros ----- */
@@ -321,7 +321,7 @@ static int debug_lengths;
static int first_callout;
static int locale_set = 0;
static int show_malloc;
-static int use_utf8;
+static int use_utf;
static size_t gotten_store;
static size_t first_gotten_store = 0;
static const unsigned char *last_callout_mark = NULL;
@@ -848,7 +848,16 @@ return i + 1;
8-bit size. For a UTF-8 string, the size needed for UTF-16 is no more than
double, because up to 0xffff uses no more than 3 bytes in UTF-8 but possibly 4
in UTF-16. Higher values use 4 bytes in UTF-8 and up to 4 bytes in UTF-16. The
-result is always left in buffer16. */
+result is always left in buffer16.
+
+Arguments:
+ p points to a byte string
+ utf true if UTF-8 (to be converted to UTF-16)
+ len number of bytes in the string (excluding trailing zero)
+
+Returns: number of 16-bit data items used (excluding trailing zero)
+ OR -1 if a UTF-8 string is malformed
+*/
static int
to16(pcre_uint8 *p, int utf, int len)
@@ -880,6 +889,7 @@ else
while (len > 0)
{
int chlen = utf82ord(p, &c);
+ if (chlen <= 0) return -1;
p += chlen;
len -= chlen;
if (c < 0x10000) *pp++ = c; else
@@ -1030,6 +1040,43 @@ return(result);
+/*************************************************
+* Print one character *
+*************************************************/
+
+/* Print a single character either literally, or as a hex escape. */
+
+static int pchar(int c, FILE *f)
+{
+if (PRINTOK(c))
+ {
+ if (f != NULL) fprintf(f, "%c", c);
+ return 1;
+ }
+
+if (c < 0x100)
+ {
+ if (use_utf)
+ {
+ if (f != NULL) fprintf(f, "\\x{%02x}", c);
+ return 6;
+ }
+ else
+ {
+ if (f != NULL) fprintf(f, "\\x%02x", c);
+ return 4;
+ }
+ }
+
+if (f != NULL) fprintf(f, "\\x{%02x}", c);
+return (c <= 0x000000ff)? 6 :
+ (c <= 0x00000fff)? 7 :
+ (c <= 0x0000ffff)? 8 :
+ (c <= 0x000fffff)? 9 : 10;
+}
+
+
+
#ifdef SUPPORT_PCRE8
/*************************************************
* Print 8-bit character string *
@@ -1046,46 +1093,20 @@ int yield = 0;
while (length-- > 0)
{
#if !defined NOUTF8
- if (use_utf8)
+ if (use_utf)
{
int rc = utf82ord(p, &c);
-
if (rc > 0 && rc <= length + 1) /* Mustn't run over the end */
{
length -= rc - 1;
p += rc;
- if (PRINTOK(c))
- {
- if (f != NULL) fprintf(f, "%c", c);
- yield++;
- }
- else
- {
- int n = 4;
- if (f != NULL) fprintf(f, "\\x{%02x}", c);
- yield += (n <= 0x000000ff)? 2 :
- (n <= 0x00000fff)? 3 :
- (n <= 0x0000ffff)? 4 :
- (n <= 0x000fffff)? 5 : 6;
- }
- continue;
+ yield += pchar(c, f);
+ continue;
}
}
#endif
-
- /* Not UTF-8, or malformed UTF-8 */
-
c = *p++;
- if (PRINTOK(c))
- {
- if (f != NULL) fprintf(f, "%c", c);
- yield++;
- }
- else
- {
- if (f != NULL) fprintf(f, "\\x%02x", c);
- yield += 4;
- }
+ yield += pchar(c, f);
}
return yield;
@@ -1109,9 +1130,8 @@ int yield = 0;
while (length-- > 0)
{
int c = *p++ & 0xffff;
-
#if !defined NOUTF8
- if (use_utf8 && c >= 0xD800 && c < 0xDC00 && length > 0)
+ if (use_utf && c >= 0xD800 && c < 0xDC00 && length > 0)
{
int d = *p & 0xffff;
if (d >= 0xDC00 && d < 0xDFFF)
@@ -1122,28 +1142,7 @@ while (length-- > 0)
}
}
#endif
-
- if (PRINTOK(c))
- {
- if (f != NULL) fprintf(f, "%c", c);
- yield++;
- }
- else
- {
- yield += 4;
- if (c < 0x100)
- {
- if (f != NULL) fprintf(f, "\\x%02x", c);
- }
- else
- {
- if (f != NULL) fprintf(f, "\\x{%02x}", c);
- yield += (c <= 0x000000ff)? 2 :
- (c <= 0x00000fff)? 3 :
- (c <= 0x0000ffff)? 4 :
- (c <= 0x000fffff)? 5 : 6;
- }
- }
+ yield += pchar(c, f);
}
return yield;
@@ -1795,7 +1794,7 @@ while (!done)
int do_flip = 0;
int erroroffset, len, delimiter, poffset;
- use_utf8 = 0;
+ use_utf = 0;
debug_lengths = 1;
if (extend_inputline(infile, buffer, " re> ") == NULL) break;
@@ -1859,7 +1858,7 @@ while (!done)
/* Need to know if UTF-8 for printing data strings */
new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options);
- use_utf8 = (get_options & PCRE_UTF8) != 0;
+ use_utf = (get_options & PCRE_UTF8) != 0;
/* Now see if there is any following study data. */
@@ -2004,7 +2003,7 @@ while (!done)
case 'X': options |= PCRE_EXTRA; break;
case 'Y': options |= PCRE_NO_START_OPTIMISE; break;
case 'Z': debug_lengths = 0; break;
- case '8': options |= PCRE_UTF8; use_utf8 = 1; break;
+ case '8': options |= PCRE_UTF8; use_utf = 1; break;
case '?': options |= PCRE_NO_UTF8_CHECK; break;
case 'T':
@@ -2122,7 +2121,12 @@ while (!done)
#ifdef SUPPORT_PCRE16
if (use_pcre16)
{
- (void)to16(p, options & PCRE_UTF8, (int)strlen((char *)p));
+ if (to16(p, options & PCRE_UTF8, (int)strlen((char *)p)) < 0)
+ {
+ fprintf(outfile, "**Failed: invalid UTF-8 string cannot be "
+ "converted to UTF-16\n");
+ goto SKIP_DATA;
+ }
p = (pcre_uint8 *)buffer16;
}
#endif
@@ -2178,7 +2182,7 @@ while (!done)
lines. */
new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options);
- if ((get_options & PCRE_UTF8) != 0) use_utf8 = 1;
+ if ((get_options & PCRE_UTF8) != 0) use_utf = 1;
/* Extract the size for possible writing before possibly flipping it,
and remember the store that was got. */
@@ -2395,9 +2399,9 @@ while (!done)
((get_options & PCRE_EXTRA) != 0)? " extra" : "",
((get_options & PCRE_UNGREEDY) != 0)? " ungreedy" : "",
((get_options & PCRE_NO_AUTO_CAPTURE) != 0)? " no_auto_capture" : "",
- ((get_options & PCRE_UTF8) != 0)? " utf8" : "",
+ ((get_options & PCRE_UTF8) != 0)? " utf" : "",
((get_options & PCRE_UCP) != 0)? " ucp" : "",
- ((get_options & PCRE_NO_UTF8_CHECK) != 0)? " no_utf8_check" : "",
+ ((get_options & PCRE_NO_UTF8_CHECK) != 0)? " no_utf_check" : "",
((get_options & PCRE_NO_START_OPTIMIZE) != 0)? " no_start_optimize" : "",
((get_options & PCRE_DUPNAMES) != 0)? " dupnames" : "");
@@ -2442,11 +2446,15 @@ while (!done)
const char *caseless =
((((real_pcre *)re)->flags & PCRE_FCH_CASELESS) == 0)?
"" : " (caseless)";
-
+
if (PRINTOK(first_char))
fprintf(outfile, "First char = \'%c\'%s\n", first_char, caseless);
else
- fprintf(outfile, "First char = %d%s\n", first_char, caseless);
+ {
+ fprintf(outfile, "First char = ");
+ pchar(first_char, outfile);
+ fprintf(outfile, "%s\n", caseless);
+ }
}
if (need_char < 0)
@@ -2690,7 +2698,7 @@ while (!done)
c = c * 8 + *p++ - '0';
#if !defined NOUTF8
- if (use_utf8 && c > 255)
+ if (use_utf && c > 255)
{
pcre_uint8 buff8[8];
int ii, utn;
@@ -2722,7 +2730,7 @@ while (!done)
{
pcre_uint8 buff8[8];
int ii, utn;
- if (use_utf8)
+ if (use_utf)
{
utn = ord2utf8(c, buff8);
for (ii = 0; ii < utn - 1; ii++) *q++ = buff8[ii];
@@ -3055,6 +3063,12 @@ while (!done)
if (use_pcre16)
{
len = to16(bptr, (((real_pcre *)re)->options) & PCRE_UTF8, len);
+ if (len < 0)
+ {
+ fprintf(outfile, "**Failed: invalid UTF-8 string cannot be "
+ "converted to UTF-16\n");
+ goto NEXT_DATA;
+ }
bptr = (pcre_uint8 *)buffer16;
}
#endif
@@ -3369,7 +3383,7 @@ while (!done)
bptr[start_offset] == '\r' &&
bptr[start_offset+1] == '\n')
onechar++;
- else if (use_utf8)
+ else if (use_utf)
{
while (start_offset + onechar < len)
{
diff --git a/testdata/testinput1 b/testdata/testinput1
index 36d7028..d9849fe 100644
--- a/testdata/testinput1
+++ b/testdata/testinput1
@@ -1,5 +1,6 @@
/-- This set of tests is for features that are compatible with all versions of
- Perl 5, in non-UTF-8 mode. --/
+ Perl 5, in non-UTF-8 mode. It should run clean for both the 8-bit and
+ 16-bit PCRE libraries. --/
/the quick brown fox/
the quick brown fox
diff --git a/testdata/testinput16 b/testdata/testinput16
new file mode 100644
index 0000000..83cadbe
--- /dev/null
+++ b/testdata/testinput16
@@ -0,0 +1,238 @@
+/-- This set of tests is for UTF-16 support, and is relevant only to the 16-bit
+ library. There are some non-UTF 16-bit tests as well (it doesn't seem
+ worth setting up another test file just for this case). --/
+
+/xxx/8?DZSS
+
+/abc/8
+ ]
+
+/X(\C{3})/8
+ X\x{11234}Y
+
+/X(\C{4})/8
+ X\x{11234}YZ
+
+/X\C*/8
+ XYZabcdce
+
+/X\C*?/8
+ XYZabcde
+
+/X\C{3,5}/8
+ Xabcdefg
+ X\x{11234}Y
+ X\x{11234}YZ
+ X\x{11234}\x{512}
+ X\x{11234}\x{512}YZ
+ X\x{11234}\x{512}\x{11234}Z
+
+/X\C{3,5}?/8
+ Xabcdefg
+ X\x{11234}Y
+ X\x{11234}YZ
+ X\x{11234}\x{512}YZ
+ *** Failers
+ X\x{11234}
+
+/a\Cb/
+ aXb
+ a\nb
+
+/a\Cb/8
+ aXb
+ a\nb
+
+/a\C\Cb/8
+ a\x{12257}b
+ ** Failers
+ a\x{100}b
+
+/ab\Cde/8
+ abXde
+
+/-- Check maximum non-UTF character size --/
+
+/\x{ffff}/
+
+/\x{10000}/
+
+/\x{100}/8DZ
+
+/\x{1000}/8DZ
+
+/\x{10000}/8DZ
+
+/\x{100000}/8DZ
+
+/\x{10ffff}/8DZ
+
+/[\x{ff}]/8DZ
+
+/[\x{100}]/8DZ
+
+/\x80/8DZ
+
+/\xff/8DZ
+
+/\x{D55c}\x{ad6d}\x{C5B4}/DZ8
+ \x{D55c}\x{ad6d}\x{C5B4}
+
+/\x{65e5}\x{672c}\x{8a9e}/DZ8
+ \x{65e5}\x{672c}\x{8a9e}
+
+/\x{80}/DZ8
+
+/\x{084}/DZ8
+
+/\x{104}/DZ8
+
+/\x{861}/DZ8
+
+/\x{212ab}/DZ8
+
+/-- This one is here not because it's different to Perl, but because the way
+the captured single-byte is displayed. (In Perl it becomes a character, and you
+can't tell the difference.) --/
+
+/X(\C)(.*)/8
+ X\x{1234}
+ X\nabc
+
+/-- This one is here because Perl gives out a grumbly error message (quite
+correctly, but that messes up comparisons). --/
+
+/a\Cb/8
+ *** Failers
+ a\x{100}b
+
+/[^ab\xC0-\xF0]/8SDZ
+ \x{f1}
+ \x{bf}
+ \x{100}
+ \x{1000}
+ *** Failers
+ \x{c0}
+ \x{f0}
+
+/Ā{3,4}/8SDZ
+ \x{100}\x{100}\x{100}\x{100\x{100}
+
+/(\x{100}+|x)/8SDZ
+
+/(\x{100}*a|x)/8SDZ
+
+/(\x{100}{0,2}a|x)/8SDZ
+
+/(\x{100}{1,2}a|x)/8SDZ
+
+/\x{100}/8DZ
+
+/a\x{100}\x{101}*/8DZ
+
+/a\x{100}\x{101}+/8DZ
+
+/[^\x{c4}]/DZ
+
+/[\x{100}]/8DZ
+ \x{100}
+ Z\x{100}
+ \x{100}Z
+ *** Failers
+
+/[\xff]/DZ8
+ >\x{ff}<
+
+/[^\xff]/8DZ
+
+/\x{100}abc(xyz(?1))/8DZ
+
+/\777/8I
+ \x{1ff}
+ \777
+
+/\x{100}+\x{200}/8DZ
+
+/\x{100}+X/8DZ
+
+/^[\QĀ\E-\QŐ\E/BZ8
+
+/-- This tests the stricter UTF-8 check according to RFC 3629. --/
+
+/X/8
+ \x{0}\x{d7ff}\x{e000}\x{10ffff}
+ \x{d800}
+ \x{d800}\?
+ \x{da00}
+ \x{da00}\?
+ \x{dfff}
+ \x{dfff}\?
+ \x{110000}
+ \x{110000}\?
+ \x{2000000}
+ \x{2000000}\?
+ \x{7fffffff}
+ \x{7fffffff}\?
+
+/(*UTF16)\x{11234}/
+ abcd\x{11234}pqr
+
+/(*CRLF)(*UTF16)(*BSR_UNICODE)a\Rb/I
+
+/\h/SI8
+ ABC\x{09}
+ ABC\x{20}
+ ABC\x{a0}
+ ABC\x{1680}
+ ABC\x{180e}
+ ABC\x{2000}
+ ABC\x{202f}
+ ABC\x{205f}
+ ABC\x{3000}
+
+/\v/SI8
+ ABC\x{0a}
+ ABC\x{0b}
+ ABC\x{0c}
+ ABC\x{0d}
+ ABC\x{85}
+ ABC\x{2028}
+
+/\h*A/SI8
+ CDBABC
+
+/\v+A/SI8
+
+/\s?xxx\s/8SI
+
+/\sxxx\s/I8ST1
+ AB\x{85}xxx\x{a0}XYZ
+ AB\x{a0}xxx\x{85}XYZ
+
+/\S \S/I8ST1
+ \x{a2} \x{84}
+ A Z
+
+/a+/8
+ a\x{123}aa\>1
+ a\x{123}aa\>2
+ a\x{123}aa\>3
+ a\x{123}aa\>4
+ a\x{123}aa\>5
+ a\x{123}aa\>6
+
+/\x{1234}+/iS8I
+
+/\x{1234}+?/iS8I
+
+/\x{1234}++/iS8I
+
+/\x{1234}{2}/iS8I
+
+/[^\x{c4}]/8DZ
+
+/X+\x{200}/8DZ
+
+/\R/SI8
+
+/-- End of testinput16 --/
diff --git a/testdata/testinput17 b/testdata/testinput17
new file mode 100644
index 0000000..5b16183
--- /dev/null
+++ b/testdata/testinput17
@@ -0,0 +1,282 @@
+/-- This set of tests is for UTF-8 support, and is relevant only to the 8-bit
+ library. --/
+
+/X(\C{3})/8
+ X\x{1234}
+
+/X(\C{4})/8
+ X\x{1234}YZ
+
+/X\C*/8
+ XYZabcdce
+
+/X\C*?/8
+ XYZabcde
+
+/X\C{3,5}/8
+ Xabcdefg
+ X\x{1234}
+ X\x{1234}YZ
+ X\x{1234}\x{512}
+ X\x{1234}\x{512}YZ
+
+/X\C{3,5}?/8
+ Xabcdefg
+ X\x{1234}
+ X\x{1234}YZ
+ X\x{1234}\x{512}
+
+/a\Cb/
+ aXb
+ a\nb
+
+/a\Cb/8
+ aXb
+ a\nb
+
+/a\C\Cb/8
+ a\x{100}b
+
+/ab\Cde/8
+ abXde
+
+/a\C\Cb/8
+ a\x{100}b
+ ** Failers
+ a\x{12257}b
+
+/[]/8
+
+//8
+
+/xxx/8
+
+/xxx/8?DZSS
+
+/abc/8
+ ]
+
+
+ \?
+ \xe1\x88
+ \P\xe1\x88
+ \P\P\xe1\x88
+ XX\xea
+ \O0XX\xea
+ \O1XX\xea
+ \O2XX\xea
+ XX\xf1
+ XX\xf8
+ XX\xfc
+ ZZ\xea\xaf\x20YY
+ ZZ\xfd\xbf\xbf\x2f\xbf\xbfYY
+ ZZ\xfd\xbf\xbf\xbf\x2f\xbfYY
+ ZZ\xfd\xbf\xbf\xbf\xbf\x2fYY
+ ZZ\xffYY
+ ZZ\xfeYY
+
+/anything/8
+ \xc0\x80
+ \xc1\x8f
+ \xe0\x9f\x80
+ \xf0\x8f\x80\x80
+ \xf8\x87\x80\x80\x80
+ \xfc\x83\x80\x80\x80\x80
+ \xfe\x80\x80\x80\x80\x80
+ \xff\x80\x80\x80\x80\x80
+ \xc3\x8f
+ \xe0\xaf\x80
+ \xe1\x80\x80
+ \xf0\x9f\x80\x80
+ \xf1\x8f\x80\x80
+ \xf8\x88\x80\x80\x80
+ \xf9\x87\x80\x80\x80
+ \xfc\x84\x80\x80\x80\x80
+ \xfd\x83\x80\x80\x80\x80
+ \?\xf8\x88\x80\x80\x80
+ \?\xf9\x87\x80\x80\x80
+ \?\xfc\x84\x80\x80\x80\x80
+ \?\xfd\x83\x80\x80\x80\x80
+
+/\x{100}/8DZ
+
+/\x{1000}/8DZ
+
+/\x{10000}/8DZ
+
+/\x{100000}/8DZ
+
+/\x{10ffff}/8DZ
+
+/[\x{ff}]/8DZ
+
+/[\x{100}]/8DZ
+
+/\x80/8DZ
+
+/\xff/8DZ
+
+/\x{D55c}\x{ad6d}\x{C5B4}/DZ8
+ \x{D55c}\x{ad6d}\x{C5B4}
+
+/\x{65e5}\x{672c}\x{8a9e}/DZ8
+ \x{65e5}\x{672c}\x{8a9e}
+
+/\x{80}/DZ8
+
+/\x{084}/DZ8
+
+/\x{104}/DZ8
+
+/\x{861}/DZ8
+
+/\x{212ab}/DZ8
+
+/-- This one is here not because it's different to Perl, but because the way
+the captured single-byte is displayed. (In Perl it becomes a character, and you
+can't tell the difference.) --/
+
+/X(\C)(.*)/8
+ X\x{1234}
+ X\nabc
+
+/-- This one is here because Perl gives out a grumbly error message (quite
+correctly, but that messes up comparisons). --/
+
+/a\Cb/8
+ *** Failers
+ a\x{100}b
+
+/[^ab\xC0-\xF0]/8SDZ
+ \x{f1}
+ \x{bf}
+ \x{100}
+ \x{1000}
+ *** Failers
+ \x{c0}
+ \x{f0}
+
+/Ā{3,4}/8SDZ
+ \x{100}\x{100}\x{100}\x{100\x{100}
+
+/(\x{100}+|x)/8SDZ
+
+/(\x{100}*a|x)/8SDZ
+
+/(\x{100}{0,2}a|x)/8SDZ
+
+/(\x{100}{1,2}a|x)/8SDZ
+
+/\x{100}/8DZ
+
+/a\x{100}\x{101}*/8DZ
+
+/a\x{100}\x{101}+/8DZ
+
+/[^\x{c4}]/DZ
+
+/[\x{100}]/8DZ
+ \x{100}
+ Z\x{100}
+ \x{100}Z
+ *** Failers
+
+/[\xff]/DZ8
+ >\x{ff}<
+
+/[^\xff]/8DZ
+
+/\x{100}abc(xyz(?1))/8DZ
+
+/a\x{1234}b/P8
+ a\x{1234}b
+
+/\777/8I
+ \x{1ff}
+ \777
+
+/\x{100}+\x{200}/8DZ
+
+/\x{100}+X/8DZ
+
+/^[\QĀ\E-\QŐ\E/BZ8
+
+/-- This tests the stricter UTF-8 check according to RFC 3629. --/
+
+/X/8
+ \x{0}\x{d7ff}\x{e000}\x{10ffff}
+ \x{d800}
+ \x{d800}\?
+ \x{da00}
+ \x{da00}\?
+ \x{dfff}
+ \x{dfff}\?
+ \x{110000}
+ \x{110000}\?
+ \x{2000000}
+ \x{2000000}\?
+ \x{7fffffff}
+ \x{7fffffff}\?
+
+/(*UTF8)\x{1234}/
+ abcd\x{1234}pqr
+
+/(*CRLF)(*UTF8)(*BSR_UNICODE)a\Rb/I
+
+/\h/SI8
+ ABC\x{09}
+ ABC\x{20}
+ ABC\x{a0}
+ ABC\x{1680}
+ ABC\x{180e}
+ ABC\x{2000}
+ ABC\x{202f}
+ ABC\x{205f}
+ ABC\x{3000}
+
+/\v/SI8
+ ABC\x{0a}
+ ABC\x{0b}
+ ABC\x{0c}
+ ABC\x{0d}
+ ABC\x{85}
+ ABC\x{2028}
+
+/\h*A/SI8
+ CDBABC
+
+/\v+A/SI8
+
+/\s?xxx\s/8SI
+
+/\sxxx\s/I8ST1
+ AB\x{85}xxx\x{a0}XYZ
+ AB\x{a0}xxx\x{85}XYZ
+
+/\S \S/I8ST1
+ \x{a2} \x{84}
+ A Z
+
+/a+/8
+ a\x{123}aa\>1
+ a\x{123}aa\>2
+ a\x{123}aa\>3
+ a\x{123}aa\>4
+ a\x{123}aa\>5
+ a\x{123}aa\>6
+
+/\x{1234}+/iS8I
+
+/\x{1234}+?/iS8I
+
+/\x{1234}++/iS8I
+
+/\x{1234}{2}/iS8I
+
+/[^\x{c4}]/8DZ
+
+/X+\x{200}/8DZ
+
+/\R/SI8
+
+/-- End of testinput17 --/
diff --git a/testdata/testinput2 b/testdata/testinput2
index 19801ef..7c1e3c5 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -5,8 +5,8 @@
either because PCRE can't be compatible, or there is a possible Perl
bug.
- NOTE: This is a non-UTF-8 set of tests. When UTF-8 is needed, use test
- 5, and if Unicode Property Support is needed, use test 13. --/
+ NOTE: This is a non-UTF set of tests. When UTF support is needed, use
+ test 5, and if Unicode Property Support is needed, use test 13. --/
/-- Originally, the Perl >= 5.10 things were in here too, but now I have
separated many (most?) of them out into test 11. However, there may still
diff --git a/testdata/testinput4 b/testdata/testinput4
index b339f71..3adaa1a 100644
--- a/testdata/testinput4
+++ b/testdata/testinput4
@@ -1,5 +1,6 @@
-/-- This set of tests is for UTF-8 support, excluding Unicode properties. It is
- compatible with all versions of Perl 5. --/
+/-- This set of tests is for UTF support, excluding Unicode properties. It is
+ compatible with all versions of Perl 5 and both the 8-bit and 16-bit PCRE
+ libraries. --/
/a.b/8
acb
@@ -126,31 +127,6 @@
*** Failers
XYZ
-/X(\C{3})/8
- X\x{1234}
-
-/X(\C{4})/8
- X\x{1234}YZ
-
-/X\C*/8
- XYZabcdce
-
-/X\C*?/8
- XYZabcde
-
-/X\C{3,5}/8
- Xabcdefg
- X\x{1234}
- X\x{1234}YZ
- X\x{1234}\x{512}
- X\x{1234}\x{512}YZ
-
-/X\C{3,5}?/8
- Xabcdefg
- X\x{1234}
- X\x{1234}YZ
- X\x{1234}\x{512}
-
/[^a]+/8g
bcd
\x{100}aY\x{256}Z
@@ -456,17 +432,6 @@
\x{150}X
\x{200}X
-/a\Cb/
- aXb
- a\nb
-
-/a\Cb/8
- aXb
- a\nb
-
-/a\C\Cb/8
- a\x{100}b
-
/[z-\x{100}]/8i
z
Z
@@ -650,7 +615,4 @@
/(abc)\1/8
abc
-/ab\Cde/8
- abXde
-
/-- End of testinput4 --/
diff --git a/testdata/testinput5 b/testdata/testinput5
index 9ba5b4b..87f0884 100644
--- a/testdata/testinput5
+++ b/testdata/testinput5
@@ -1,22 +1,9 @@
-/-- This set of tests checks the API, internals, and non-Perl stuff for UTF-8
- support, excluding Unicode properties. --/
-
-/\x{100}/8DZ
-
-/\x{1000}/8DZ
-
-/\x{10000}/8DZ
-
-/\x{100000}/8DZ
-
-/\x{10ffff}/8DZ
+/-- This set of tests checks the API, internals, and non-Perl stuff for UTF
+ support, excluding Unicode properties. However, tests that give different
+ results in 8-bit and 16-bit modes are excluded (see tests 16 and 17). --/
/\x{110000}/8DZ
-/[\x{ff}]/8DZ
-
-/[\x{100}]/8DZ
-
/\x{ffffffff}/8
/\x{100000000}/8
@@ -32,54 +19,18 @@
/^\x{100}a\x{1234}/8
\x{100}a\x{1234}bcd
-/\x80/8DZ
-
-/\xff/8DZ
-
/\x{0041}\x{2262}\x{0391}\x{002e}/DZ8
\x{0041}\x{2262}\x{0391}\x{002e}
-/\x{D55c}\x{ad6d}\x{C5B4}/DZ8
- \x{D55c}\x{ad6d}\x{C5B4}
-
-/\x{65e5}\x{672c}\x{8a9e}/DZ8
- \x{65e5}\x{672c}\x{8a9e}
-
-/\x{80}/DZ8
-
-/\x{084}/DZ8
-
-/\x{104}/DZ8
-
-/\x{861}/DZ8
-
-/\x{212ab}/DZ8
-
/.{3,5}X/DZ8
\x{212ab}\x{212ab}\x{212ab}\x{861}X
-
/.{3,5}?/DZ8
\x{212ab}\x{212ab}\x{212ab}\x{861}
/(?<=\C)X/8
Should produce an error diagnostic
-/-- This one is here not because it's different to Perl, but because the way
-the captured single-byte is displayed. (In Perl it becomes a character, and you
-can't tell the difference.) --/
-
-/X(\C)(.*)/8
- X\x{1234}
- X\nabc
-
-/-- This one is here because Perl gives out a grumbly error message (quite
-correctly, but that messes up comparisons). --/
-
-/a\Cb/8
- *** Failers
- a\x{100}b
-
/^[ab]/8DZ
bar
*** Failers
@@ -94,26 +45,6 @@ correctly, but that messes up comparisons). --/
*** Failers
aaa
-/[^ab\xC0-\xF0]/8SDZ
- \x{f1}
- \x{bf}
- \x{100}
- \x{1000}
- *** Failers
- \x{c0}
- \x{f0}
-
-/Ā{3,4}/8SDZ
- \x{100}\x{100}\x{100}\x{100\x{100}
-
-/(\x{100}+|x)/8SDZ
-
-/(\x{100}*a|x)/8SDZ
-
-/(\x{100}{0,2}a|x)/8SDZ
-
-/(\x{100}{1,2}a|x)/8SDZ
-
/\x{100}*(\d+|"(?1)")/8
1234
"1234"
@@ -124,33 +55,17 @@ correctly, but that messes up comparisons). --/
*** Failers
\x{100}\x{100}abcd
-/\x{100}/8DZ
-
/\x{100}*/8DZ
/a\x{100}*/8DZ
/ab\x{100}*/8DZ
-/a\x{100}\x{101}*/8DZ
-
-/a\x{100}\x{101}+/8DZ
-
/\x{100}*A/8DZ
A
/\x{100}*\d(?R)/8DZ
-/[^\x{c4}]/DZ
-
-/[^\x{c4}]/8DZ
-
-/[\x{100}]/8DZ
- \x{100}
- Z\x{100}
- \x{100}Z
- *** Failers
-
/[Z\x{100}]/8DZ
Z\x{100}
\x{100}
@@ -175,13 +90,8 @@ correctly, but that messes up comparisons). --/
/[\xFF]/DZ
>\xff<
-/[\xff]/DZ8
- >\x{ff}<
-
/[^\xFF]/DZ
-/[^\xff]/8DZ
-
/[Ä-Ü]/8
Ö # Matches without Study
\x{d6}
@@ -198,61 +108,6 @@ correctly, but that messes up comparisons). --/
Ö <-- Same with Study
\x{d6}
-/[]/8
-
-//8
-
-/xxx/8
-
-/xxx/8?DZSS
-
-/abc/8
- ]
-
-
- \?
- \xe1\x88
- \P\xe1\x88
- \P\P\xe1\x88
- XX\xea
- \O0XX\xea
- \O1XX\xea
- \O2XX\xea
- XX\xf1
- XX\xf8
- XX\xfc
- ZZ\xea\xaf\x20YY
- ZZ\xfd\xbf\xbf\x2f\xbf\xbfYY
- ZZ\xfd\xbf\xbf\xbf\x2f\xbfYY
- ZZ\xfd\xbf\xbf\xbf\xbf\x2fYY
- ZZ\xffYY
- ZZ\xfeYY
-
-/anything/8
- \xc0\x80
- \xc1\x8f
- \xe0\x9f\x80
- \xf0\x8f\x80\x80
- \xf8\x87\x80\x80\x80
- \xfc\x83\x80\x80\x80\x80
- \xfe\x80\x80\x80\x80\x80
- \xff\x80\x80\x80\x80\x80
- \xc3\x8f
- \xe0\xaf\x80
- \xe1\x80\x80
- \xf0\x9f\x80\x80
- \xf1\x8f\x80\x80
- \xf8\x88\x80\x80\x80
- \xf9\x87\x80\x80\x80
- \xfc\x84\x80\x80\x80\x80
- \xfd\x83\x80\x80\x80\x80
- \?\xf8\x88\x80\x80\x80
- \?\xf9\x87\x80\x80\x80
- \?\xfc\x84\x80\x80\x80\x80
- \?\xfd\x83\x80\x80\x80\x80
-
-/\x{100}abc(xyz(?1))/8DZ
-
/[^\x{100}]abc(xyz(?1))/8DZ
/[ab\x{100}]abc(xyz(?1))/8DZ
@@ -272,17 +127,10 @@ correctly, but that messes up comparisons). --/
/\w/8
\x{100}X
-/a\x{1234}b/P8
- a\x{1234}b
-
/^\ሴ/8DZ
/\777/I
-/\777/8I
- \x{1ff}
- \777
-
/\x{100}*\d/8DZ
/\x{100}*\s/8DZ
@@ -295,12 +143,6 @@ correctly, but that messes up comparisons). --/
/\x{100}*\W/8DZ
-/\x{100}+\x{200}/8DZ
-
-/\x{100}+X/8DZ
-
-/X+\x{200}/8DZ
-
/()()()()()()()()()()
()()()()()()()()()()
()()()()()()()()()()
@@ -312,8 +154,6 @@ correctly, but that messes up comparisons). --/
/^[\QĀ\E-\QŐ\E]/BZ8
-/^[\QĀ\E-\QŐ\E/BZ8
-
/^abc./mgx8<any>
abc1 \x0aabc2 \x0babc3xx \x0cabc4 \x0dabc5xx \x0d\x0aabc6 \x{0085}abc7 \x{2028}abc8 \x{2029}abc9 JUNK
@@ -408,23 +248,6 @@ correctly, but that messes up comparisons). --/
/.*$/8<any>
\x{1ec5}
-/-- This tests the stricter UTF-8 check according to RFC 3629. --/
-
-/X/8
- \x{0}\x{d7ff}\x{e000}\x{10ffff}
- \x{d800}
- \x{d800}\?
- \x{da00}
- \x{da00}\?
- \x{dfff}
- \x{dfff}\?
- \x{110000}
- \x{110000}\?
- \x{2000000}
- \x{2000000}\?
- \x{7fffffff}
- \x{7fffffff}\?
-
/a\Rb/I8<bsr_anycrlf>
a\rb
a\nb
@@ -488,11 +311,6 @@ correctly, but that messes up comparisons). --/
/X/8f<any>
A\x{1ec5}ABCXYZ
-/(*UTF8)\x{1234}/
- abcd\x{1234}pqr
-
-/(*CRLF)(*UTF8)(*BSR_UNICODE)a\Rb/I
-
/Xa{2,4}b/8
X\P
Xa\P
@@ -776,53 +594,17 @@ correctly, but that messes up comparisons). --/
/\h/SI
-/\h/SI8
- ABC\x{09}
- ABC\x{20}
- ABC\x{a0}
- ABC\x{1680}
- ABC\x{180e}
- ABC\x{2000}
- ABC\x{202f}
- ABC\x{205f}
- ABC\x{3000}
-
/\v/SI
-/\v/SI8
- ABC\x{0a}
- ABC\x{0b}
- ABC\x{0c}
- ABC\x{0d}
- ABC\x{85}
- ABC\x{2028}
-
/\R/SI
-/\R/SI8
-
-/\h*A/SI8
- CDBABC
-
-/\v+A/SI8
-
-/\s?xxx\s/8SI
-
/\sxxx\s/8T1
AB\x{85}xxx\x{a0}XYZ
AB\x{a0}xxx\x{85}XYZ
-/\sxxx\s/I8ST1
- AB\x{85}xxx\x{a0}XYZ
- AB\x{a0}xxx\x{85}XYZ
-
/\S \S/8T1
\x{a2} \x{84}
-/\S \S/I8ST1
- \x{a2} \x{84}
- A Z
-
'A#хц'8x<any>BZ
'A#хц
@@ -838,14 +620,6 @@ correctly, but that messes up comparisons). --/
/\g{A}xxx#bх(?'A'123) (?'A'456)/8x<any>BZ
-/a+/8
- a\x{123}aa\>1
- a\x{123}aa\>2
- a\x{123}aa\>3
- a\x{123}aa\>4
- a\x{123}aa\>5
- a\x{123}aa\>6
-
/^\cģ/8
/(\R*)(.)/s8
@@ -858,14 +632,6 @@ correctly, but that messes up comparisons). --/
\r\r\n\n\r
\r\r\n\n\r\n
-/\x{1234}+/iS8I
-
-/\x{1234}+?/iS8I
-
-/\x{1234}++/iS8I
-
-/\x{1234}{2}/iS8I
-
/[^\x{1234}]+/iS8I
/[^\x{1234}]+?/iS8I
@@ -887,5 +653,11 @@ correctly, but that messes up comparisons). --/
/f.*/8s
\P\Pfor
+
+/\x{d7ff}\x{e000}/8
+
+/\x{d800}/8
+
+/\x{dfff}/8
/-- End of testinput5 --/
diff --git a/testdata/testoutput1 b/testdata/testoutput1
index bc237ab..13c79c5 100644
--- a/testdata/testoutput1
+++ b/testdata/testoutput1
@@ -1,5 +1,6 @@
/-- This set of tests is for features that are compatible with all versions of
- Perl 5, in non-UTF-8 mode. --/
+ Perl 5, in non-UTF-8 mode. It should run clean for both the 8-bit and
+ 16-bit PCRE libraries. --/
/the quick brown fox/
the quick brown fox
diff --git a/testdata/testoutput10 b/testdata/testoutput10
index 62d6f3e..55f10a5 100644
--- a/testdata/testoutput10
+++ b/testdata/testoutput10
@@ -374,7 +374,7 @@ Memory allocation (code space): 18
17 End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: utf8
+Options: utf
First char = 'A'
Need char = '.'
@@ -387,8 +387,8 @@ Memory allocation (code space): 19
18 End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: utf8
-First char = 237
+Options: utf
+First char = \x{ed}
Need char = 180
/\x{65e5}\x{672c}\x{8a9e}/D8M
@@ -400,8 +400,8 @@ Memory allocation (code space): 19
18 End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: utf8
-First char = 230
+Options: utf
+First char = \x{e6}
Need char = 158
/[\x{100}]/8BM
diff --git a/testdata/testoutput13 b/testdata/testoutput13
index 3151699..20d8c0e 100644
--- a/testdata/testoutput13
+++ b/testdata/testoutput13
@@ -57,7 +57,7 @@ No need char
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: utf8
+Options: utf
No first char
No need char
@@ -69,7 +69,7 @@ No need char
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: utf8
+Options: utf
No first char
No need char
1234
@@ -83,7 +83,7 @@ No need char
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: utf8
+Options: utf
No first char
No need char
1234
@@ -105,7 +105,7 @@ No match
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: caseless utf8
+Options: caseless utf
First char = 'A' (caseless)
No need char
@@ -117,7 +117,7 @@ No need char
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: utf8
+Options: utf
First char = 'A'
Need char = 176
@@ -129,7 +129,7 @@ Need char = 176
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: utf8
+Options: utf
First char = 'A'
Need char = 176
@@ -141,7 +141,7 @@ Need char = 176
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: caseless utf8
+Options: caseless utf
First char = 'A' (caseless)
Need char = 'B' (caseless)
@@ -153,7 +153,7 @@ Need char = 'B' (caseless)
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: caseless utf8
+Options: caseless utf
No first char
No need char
\x{104}
@@ -177,7 +177,7 @@ No match
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: caseless utf8
+Options: caseless utf
No first char
No need char
Z
@@ -215,7 +215,7 @@ No match
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: caseless utf8
+Options: caseless utf
No first char
No need char
@@ -1049,7 +1049,7 @@ No match: POSIX code 17: match failed
/\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}/8iSI
Capturing subpattern count = 0
-Options: caseless utf8
+Options: caseless utf
No first char
No need char
Subject length lower bound = 17
diff --git a/testdata/testoutput16 b/testdata/testoutput16
new file mode 100644
index 0000000..2cc97aa
--- /dev/null
+++ b/testdata/testoutput16
@@ -0,0 +1,819 @@
+/-- This set of tests is for UTF-16 support, and is relevant only to the 16-bit
+ library. There are some non-UTF 16-bit tests as well (it doesn't seem
+ worth setting up another test file just for this case). --/
+
+/xxx/8?DZSS
+**Failed: invalid UTF-8 string cannot be converted to UTF-16
+
+/abc/8
+ ]
+**Failed: invalid UTF-8 string cannot be converted to UTF-16
+
+/X(\C{3})/8
+ X\x{11234}Y
+ 0: X\x{11234}Y
+ 1: \x{11234}Y
+
+/X(\C{4})/8
+ X\x{11234}YZ
+ 0: X\x{11234}YZ
+ 1: \x{11234}YZ
+
+/X\C*/8
+ XYZabcdce
+ 0: XYZabcdce
+
+/X\C*?/8
+ XYZabcde
+ 0: X
+
+/X\C{3,5}/8
+ Xabcdefg
+ 0: Xabcde
+ X\x{11234}Y
+ 0: X\x{11234}Y
+ X\x{11234}YZ
+ 0: X\x{11234}YZ
+ X\x{11234}\x{512}
+ 0: X\x{11234}\x{512}
+ X\x{11234}\x{512}YZ
+ 0: X\x{11234}\x{512}YZ
+ X\x{11234}\x{512}\x{11234}Z
+ 0: X\x{11234}\x{512}\x{11234}
+
+/X\C{3,5}?/8
+ Xabcdefg
+ 0: Xabc
+ X\x{11234}Y
+ 0: X\x{11234}Y
+ X\x{11234}YZ
+ 0: X\x{11234}Y
+ X\x{11234}\x{512}YZ
+ 0: X\x{11234}\x{512}
+ *** Failers
+No match
+ X\x{11234}
+No match
+
+/a\Cb/
+ aXb
+ 0: aXb
+ a\nb
+ 0: a\x0ab
+
+/a\Cb/8
+ aXb
+ 0: aXb
+ a\nb
+ 0: a\x{0a}b
+
+/a\C\Cb/8
+ a\x{12257}b
+ 0: a\x{12257}b
+ ** Failers
+No match
+ a\x{100}b
+No match
+
+/ab\Cde/8
+ abXde
+ 0: abXde
+
+/-- Check maximum non-UTF character size --/
+
+/\x{ffff}/
+
+/\x{10000}/
+Failed: character value in \x{...} sequence is too large at offset 8
+
+/\x{100}/8DZ
+------------------------------------------------------------------
+ Bra
+ \x{100}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{100}
+No need char
+
+/\x{1000}/8DZ
+------------------------------------------------------------------
+ Bra
+ \x{1000}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{1000}
+No need char
+
+/\x{10000}/8DZ
+------------------------------------------------------------------
+ Bra
+ \x{10000}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{d800}
+Need char = 56320
+
+/\x{100000}/8DZ
+------------------------------------------------------------------
+ Bra
+ \x{100000}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{dbc0}
+Need char = 56320
+
+/\x{10ffff}/8DZ
+------------------------------------------------------------------
+ Bra
+ \x{10ffff}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{dbff}
+Need char = 57343
+
+/[\x{ff}]/8DZ
+------------------------------------------------------------------
+ Bra
+ \xff
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{ff}
+No need char
+
+/[\x{100}]/8DZ
+------------------------------------------------------------------
+ Bra
+ \x{100}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{100}
+No need char
+
+/\x80/8DZ
+------------------------------------------------------------------
+ Bra
+ \x80
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{80}
+No need char
+
+/\xff/8DZ
+------------------------------------------------------------------
+ Bra
+ \xff
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{ff}
+No need char
+
+/\x{D55c}\x{ad6d}\x{C5B4}/DZ8
+------------------------------------------------------------------
+ Bra
+ \x{d55c}\x{ad6d}\x{c5b4}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{d55c}
+Need char = 50612
+ \x{D55c}\x{ad6d}\x{C5B4}
+ 0: \x{d55c}\x{ad6d}\x{c5b4}
+
+/\x{65e5}\x{672c}\x{8a9e}/DZ8
+------------------------------------------------------------------
+ Bra
+ \x{65e5}\x{672c}\x{8a9e}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{65e5}
+Need char = 35486
+ \x{65e5}\x{672c}\x{8a9e}
+ 0: \x{65e5}\x{672c}\x{8a9e}
+
+/\x{80}/DZ8
+------------------------------------------------------------------
+ Bra
+ \x80
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{80}
+No need char
+
+/\x{084}/DZ8
+------------------------------------------------------------------
+ Bra
+ \x84
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{84}
+No need char
+
+/\x{104}/DZ8
+------------------------------------------------------------------
+ Bra
+ \x{104}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{104}
+No need char
+
+/\x{861}/DZ8
+------------------------------------------------------------------
+ Bra
+ \x{861}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{861}
+No need char
+
+/\x{212ab}/DZ8
+------------------------------------------------------------------
+ Bra
+ \x{212ab}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{d844}
+Need char = 57003
+
+/-- This one is here not because it's different to Perl, but because the way
+the captured single-byte is displayed. (In Perl it becomes a character, and you
+can't tell the difference.) --/
+
+/X(\C)(.*)/8
+ X\x{1234}
+ 0: X\x{1234}
+ 1: \x{1234}
+ 2:
+ X\nabc
+ 0: X\x{0a}abc
+ 1: \x{0a}
+ 2: abc
+
+/-- This one is here because Perl gives out a grumbly error message (quite
+correctly, but that messes up comparisons). --/
+
+/a\Cb/8
+ *** Failers
+No match
+ a\x{100}b
+ 0: a\x{100}b
+
+/[^ab\xC0-\xF0]/8SDZ
+------------------------------------------------------------------
+ Bra
+ [\x00-`c-\xbf\xf1-\xff] (neg)
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a
+ \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19
+ \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4
+ 5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y
+ Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f
+ \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e
+ \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d
+ \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac
+ \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb
+ \xbc \xbd \xbe \xbf \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb
+ \xfc \xfd \xfe \xff
+ \x{f1}
+ 0: \x{f1}
+ \x{bf}
+ 0: \x{bf}
+ \x{100}
+ 0: \x{100}
+ \x{1000}
+ 0: \x{1000}
+ *** Failers
+ 0: *
+ \x{c0}
+No match
+ \x{f0}
+No match
+
+/Ā{3,4}/8SDZ
+------------------------------------------------------------------
+ Bra
+ \x{100}{3}
+ \x{100}?
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{100}
+Need char = 256
+Subject length lower bound = 3
+No set of starting bytes
+ \x{100}\x{100}\x{100}\x{100\x{100}
+ 0: \x{100}\x{100}\x{100}
+
+/(\x{100}+|x)/8SDZ
+------------------------------------------------------------------
+ Bra
+ CBra 1
+ \x{100}+
+ Alt
+ x
+ Ket
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 1
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: x \xff
+
+/(\x{100}*a|x)/8SDZ
+------------------------------------------------------------------
+ Bra
+ CBra 1
+ \x{100}*+
+ a
+ Alt
+ x
+ Ket
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 1
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: a x \xff
+
+/(\x{100}{0,2}a|x)/8SDZ
+------------------------------------------------------------------
+ Bra
+ CBra 1
+ \x{100}{0,2}
+ a
+ Alt
+ x
+ Ket
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 1
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: a x \xff
+
+/(\x{100}{1,2}a|x)/8SDZ
+------------------------------------------------------------------
+ Bra
+ CBra 1
+ \x{100}
+ \x{100}{0,1}
+ a
+ Alt
+ x
+ Ket
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 1
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: x \xff
+
+/\x{100}/8DZ
+------------------------------------------------------------------
+ Bra
+ \x{100}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{100}
+No need char
+
+/a\x{100}\x{101}*/8DZ
+------------------------------------------------------------------
+ Bra
+ a\x{100}
+ \x{101}*
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = 'a'
+Need char = 256
+
+/a\x{100}\x{101}+/8DZ
+------------------------------------------------------------------
+ Bra
+ a\x{100}
+ \x{101}+
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = 'a'
+Need char = 257
+
+/[^\x{c4}]/DZ
+------------------------------------------------------------------
+ Bra
+ [^\xc4]
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+No options
+No first char
+No need char
+
+/[\x{100}]/8DZ
+------------------------------------------------------------------
+ Bra
+ \x{100}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{100}
+No need char
+ \x{100}
+ 0: \x{100}
+ Z\x{100}
+ 0: \x{100}
+ \x{100}Z
+ 0: \x{100}
+ *** Failers
+No match
+
+/[\xff]/DZ8
+------------------------------------------------------------------
+ Bra
+ \xff
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{ff}
+No need char
+ >\x{ff}<
+ 0: \x{ff}
+
+/[^\xff]/8DZ
+------------------------------------------------------------------
+ Bra
+ [^\x{ff}]
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+No first char
+No need char
+
+/\x{100}abc(xyz(?1))/8DZ
+------------------------------------------------------------------
+ Bra
+ \x{100}abc
+ CBra 1
+ xyz
+ Recurse
+ Ket
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 1
+Options: utf
+First char = \x{100}
+Need char = 'z'
+
+/\777/8I
+Capturing subpattern count = 0
+Options: utf
+First char = \x{1ff}
+No need char
+ \x{1ff}
+ 0: \x{1ff}
+ \777
+ 0: \x{1ff}
+
+/\x{100}+\x{200}/8DZ
+------------------------------------------------------------------
+ Bra
+ \x{100}++
+ \x{200}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{100}
+Need char = 512
+
+/\x{100}+X/8DZ
+------------------------------------------------------------------
+ Bra
+ \x{100}++
+ X
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{100}
+Need char = 'X'
+
+/^[\QĀ\E-\QŐ\E/BZ8
+Failed: missing terminating ] for character class at offset 13
+
+/-- This tests the stricter UTF-8 check according to RFC 3629. --/
+
+/X/8
+ \x{0}\x{d7ff}\x{e000}\x{10ffff}
+No match
+ \x{d800}
+Error -10 (bad UTF-8 string) offset=0 reason=1
+ \x{d800}\?
+No match
+ \x{da00}
+Error -10 (bad UTF-8 string) offset=0 reason=1
+ \x{da00}\?
+No match
+ \x{dfff}
+Error -10 (bad UTF-8 string) offset=0 reason=3
+ \x{dfff}\?
+No match
+ \x{110000}
+Error -10 (bad UTF-8 string) offset=0 reason=3
+ \x{110000}\?
+No match
+ \x{2000000}
+Error -10 (bad UTF-8 string) offset=1 reason=3
+ \x{2000000}\?
+No match
+ \x{7fffffff}
+Error -10 (bad UTF-8 string) offset=1 reason=3
+ \x{7fffffff}\?
+No match
+
+/(*UTF16)\x{11234}/
+ abcd\x{11234}pqr
+ 0: \x{11234}
+
+/(*CRLF)(*UTF16)(*BSR_UNICODE)a\Rb/I
+Capturing subpattern count = 0
+Options: bsr_unicode utf
+Forced newline sequence: CRLF
+First char = 'a'
+Need char = 'b'
+
+/\h/SI8
+Capturing subpattern count = 0
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x09 \x20 \xa0 \xff
+ ABC\x{09}
+ 0: \x{09}
+ ABC\x{20}
+ 0:
+ ABC\x{a0}
+ 0: \x{a0}
+ ABC\x{1680}
+ 0: \x{1680}
+ ABC\x{180e}
+ 0: \x{180e}
+ ABC\x{2000}
+ 0: \x{2000}
+ ABC\x{202f}
+ 0: \x{202f}
+ ABC\x{205f}
+ 0: \x{205f}
+ ABC\x{3000}
+ 0: \x{3000}
+
+/\v/SI8
+Capturing subpattern count = 0
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x0a \x0b \x0c \x0d \x85 \xff
+ ABC\x{0a}
+ 0: \x{0a}
+ ABC\x{0b}
+ 0: \x{0b}
+ ABC\x{0c}
+ 0: \x{0c}
+ ABC\x{0d}
+ 0: \x{0d}
+ ABC\x{85}
+ 0: \x{85}
+ ABC\x{2028}
+ 0: \x{2028}
+
+/\h*A/SI8
+Capturing subpattern count = 0
+Options: utf
+No first char
+Need char = 'A'
+Subject length lower bound = 1
+Starting byte set: \x09 \x20 A \xa0
+ CDBABC
+ 0: A
+
+/\v+A/SI8
+Capturing subpattern count = 0
+Options: utf
+No first char
+Need char = 'A'
+Subject length lower bound = 2
+Starting byte set: \x0a \x0b \x0c \x0d \x85 \xff
+
+/\s?xxx\s/8SI
+Capturing subpattern count = 0
+Options: utf
+No first char
+Need char = 'x'
+Subject length lower bound = 4
+Starting byte set: \x09 \x0a \x0c \x0d \x20 x
+
+/\sxxx\s/I8ST1
+Capturing subpattern count = 0
+Options: utf
+No first char
+Need char = 'x'
+Subject length lower bound = 5
+Starting byte set: \x09 \x0a \x0c \x0d \x20 \x85 \xa0
+ AB\x{85}xxx\x{a0}XYZ
+ 0: \x{85}xxx\x{a0}
+ AB\x{a0}xxx\x{85}XYZ
+ 0: \x{a0}xxx\x{85}
+
+/\S \S/I8ST1
+Capturing subpattern count = 0
+Options: utf
+No first char
+Need char = ' '
+Subject length lower bound = 3
+Starting byte set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0b \x0e
+ \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d
+ \x1e \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @
+ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c d e
+ f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 \x81 \x82 \x83
+ \x84 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93
+ \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa1 \xa2 \xa3
+ \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2
+ \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1
+ \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0
+ \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf
+ \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee
+ \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd
+ \xfe \xff
+ \x{a2} \x{84}
+ 0: \x{a2} \x{84}
+ A Z
+ 0: A Z
+
+/a+/8
+ a\x{123}aa\>1
+ 0: aa
+ a\x{123}aa\>2
+ 0: aa
+ a\x{123}aa\>3
+ 0: a
+ a\x{123}aa\>4
+No match
+ a\x{123}aa\>5
+Error -24 (bad offset value)
+ a\x{123}aa\>6
+Error -24 (bad offset value)
+
+/\x{1234}+/iS8I
+Capturing subpattern count = 0
+Options: caseless utf
+First char = \x{1234}
+No need char
+Subject length lower bound = 1
+No set of starting bytes
+
+/\x{1234}+?/iS8I
+Capturing subpattern count = 0
+Options: caseless utf
+First char = \x{1234}
+No need char
+Subject length lower bound = 1
+No set of starting bytes
+
+/\x{1234}++/iS8I
+Capturing subpattern count = 0
+Options: caseless utf
+First char = \x{1234}
+No need char
+Subject length lower bound = 1
+No set of starting bytes
+
+/\x{1234}{2}/iS8I
+Capturing subpattern count = 0
+Options: caseless utf
+First char = \x{1234}
+Need char = 4660
+Subject length lower bound = 2
+No set of starting bytes
+
+/[^\x{c4}]/8DZ
+------------------------------------------------------------------
+ Bra
+ [^\x{c4}]
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+No first char
+No need char
+
+/X+\x{200}/8DZ
+------------------------------------------------------------------
+ Bra
+ X++
+ \x{200}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = 'X'
+Need char = 512
+
+/\R/SI8
+Capturing subpattern count = 0
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x0a \x0b \x0c \x0d \x85 \xff
+
+/-- End of testinput16 --/
diff --git a/testdata/testoutput17 b/testdata/testoutput17
new file mode 100644
index 0000000..23fc980
--- /dev/null
+++ b/testdata/testoutput17
@@ -0,0 +1,907 @@
+/-- This set of tests is for UTF-8 support, and is relevant only to the 8-bit
+ library. --/
+
+/X(\C{3})/8
+ X\x{1234}
+ 0: X\x{1234}
+ 1: \x{1234}
+
+/X(\C{4})/8
+ X\x{1234}YZ
+ 0: X\x{1234}Y
+ 1: \x{1234}Y
+
+/X\C*/8
+ XYZabcdce
+ 0: XYZabcdce
+
+/X\C*?/8
+ XYZabcde
+ 0: X
+
+/X\C{3,5}/8
+ Xabcdefg
+ 0: Xabcde
+ X\x{1234}
+ 0: X\x{1234}
+ X\x{1234}YZ
+ 0: X\x{1234}YZ
+ X\x{1234}\x{512}
+ 0: X\x{1234}\x{512}
+ X\x{1234}\x{512}YZ
+ 0: X\x{1234}\x{512}
+
+/X\C{3,5}?/8
+ Xabcdefg
+ 0: Xabc
+ X\x{1234}
+ 0: X\x{1234}
+ X\x{1234}YZ
+ 0: X\x{1234}
+ X\x{1234}\x{512}
+ 0: X\x{1234}
+
+/a\Cb/
+ aXb
+ 0: aXb
+ a\nb
+ 0: a\x0ab
+
+/a\Cb/8
+ aXb
+ 0: aXb
+ a\nb
+ 0: a\x{0a}b
+
+/a\C\Cb/8
+ a\x{100}b
+ 0: a\x{100}b
+
+/ab\Cde/8
+ abXde
+ 0: abXde
+
+/a\C\Cb/8
+ a\x{100}b
+ 0: a\x{100}b
+ ** Failers
+No match
+ a\x{12257}b
+No match
+
+/[]/8
+Failed: invalid UTF-8 string at offset 1
+
+//8
+Failed: invalid UTF-8 string at offset 0
+
+/xxx/8
+Failed: invalid UTF-8 string at offset 0
+
+/xxx/8?DZSS
+------------------------------------------------------------------
+ Bra
+ \X{c0}\X{c0}\X{c0}xxx
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf no_utf_check
+First char = \x{c3}
+Need char = 'x'
+
+/abc/8
+ ]
+Error -10 (bad UTF-8 string) offset=0 reason=6
+
+Error -10 (bad UTF-8 string) offset=0 reason=1
+
+Error -10 (bad UTF-8 string) offset=0 reason=6
+ \?
+No match
+ \xe1\x88
+Error -10 (bad UTF-8 string) offset=0 reason=1
+ \P\xe1\x88
+Error -10 (bad UTF-8 string) offset=0 reason=1
+ \P\P\xe1\x88
+Error -25 (short UTF-8 string) offset=0 reason=1
+ XX\xea
+Error -10 (bad UTF-8 string) offset=2 reason=2
+ \O0XX\xea
+Error -10 (bad UTF-8 string)
+ \O1XX\xea
+Error -10 (bad UTF-8 string)
+ \O2XX\xea
+Error -10 (bad UTF-8 string) offset=2 reason=2
+ XX\xf1
+Error -10 (bad UTF-8 string) offset=2 reason=3
+ XX\xf8
+Error -10 (bad UTF-8 string) offset=2 reason=4
+ XX\xfc
+Error -10 (bad UTF-8 string) offset=2 reason=5
+ ZZ\xea\xaf\x20YY
+Error -10 (bad UTF-8 string) offset=2 reason=7
+ ZZ\xfd\xbf\xbf\x2f\xbf\xbfYY
+Error -10 (bad UTF-8 string) offset=2 reason=8
+ ZZ\xfd\xbf\xbf\xbf\x2f\xbfYY
+Error -10 (bad UTF-8 string) offset=2 reason=9
+ ZZ\xfd\xbf\xbf\xbf\xbf\x2fYY
+Error -10 (bad UTF-8 string) offset=2 reason=10
+ ZZ\xffYY
+Error -10 (bad UTF-8 string) offset=2 reason=21
+ ZZ\xfeYY
+Error -10 (bad UTF-8 string) offset=2 reason=21
+
+/anything/8
+ \xc0\x80
+Error -10 (bad UTF-8 string) offset=0 reason=15
+ \xc1\x8f
+Error -10 (bad UTF-8 string) offset=0 reason=15
+ \xe0\x9f\x80
+Error -10 (bad UTF-8 string) offset=0 reason=16
+ \xf0\x8f\x80\x80
+Error -10 (bad UTF-8 string) offset=0 reason=17
+ \xf8\x87\x80\x80\x80
+Error -10 (bad UTF-8 string) offset=0 reason=18
+ \xfc\x83\x80\x80\x80\x80
+Error -10 (bad UTF-8 string) offset=0 reason=19
+ \xfe\x80\x80\x80\x80\x80
+Error -10 (bad UTF-8 string) offset=0 reason=21
+ \xff\x80\x80\x80\x80\x80
+Error -10 (bad UTF-8 string) offset=0 reason=21
+ \xc3\x8f
+No match
+ \xe0\xaf\x80
+No match
+ \xe1\x80\x80
+No match
+ \xf0\x9f\x80\x80
+No match
+ \xf1\x8f\x80\x80
+No match
+ \xf8\x88\x80\x80\x80
+Error -10 (bad UTF-8 string) offset=0 reason=11
+ \xf9\x87\x80\x80\x80
+Error -10 (bad UTF-8 string) offset=0 reason=11
+ \xfc\x84\x80\x80\x80\x80
+Error -10 (bad UTF-8 string) offset=0 reason=12
+ \xfd\x83\x80\x80\x80\x80
+Error -10 (bad UTF-8 string) offset=0 reason=12
+ \?\xf8\x88\x80\x80\x80
+No match
+ \?\xf9\x87\x80\x80\x80
+No match
+ \?\xfc\x84\x80\x80\x80\x80
+No match
+ \?\xfd\x83\x80\x80\x80\x80
+No match
+
+/\x{100}/8DZ
+------------------------------------------------------------------
+ Bra
+ \x{100}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c4}
+Need char = 128
+
+/\x{1000}/8DZ
+------------------------------------------------------------------
+ Bra
+ \x{1000}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{e1}
+Need char = 128
+
+/\x{10000}/8DZ
+------------------------------------------------------------------
+ Bra
+ \x{10000}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{f0}
+Need char = 128
+
+/\x{100000}/8DZ
+------------------------------------------------------------------
+ Bra
+ \x{100000}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{f4}
+Need char = 128
+
+/\x{10ffff}/8DZ
+------------------------------------------------------------------
+ Bra
+ \x{10ffff}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{f4}
+Need char = 191
+
+/[\x{ff}]/8DZ
+------------------------------------------------------------------
+ Bra
+ \x{ff}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c3}
+Need char = 191
+
+/[\x{100}]/8DZ
+------------------------------------------------------------------
+ Bra
+ \x{100}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c4}
+Need char = 128
+
+/\x80/8DZ
+------------------------------------------------------------------
+ Bra
+ \x{80}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c2}
+Need char = 128
+
+/\xff/8DZ
+------------------------------------------------------------------
+ Bra
+ \x{ff}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c3}
+Need char = 191
+
+/\x{D55c}\x{ad6d}\x{C5B4}/DZ8
+------------------------------------------------------------------
+ Bra
+ \x{d55c}\x{ad6d}\x{c5b4}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{ed}
+Need char = 180
+ \x{D55c}\x{ad6d}\x{C5B4}
+ 0: \x{d55c}\x{ad6d}\x{c5b4}
+
+/\x{65e5}\x{672c}\x{8a9e}/DZ8
+------------------------------------------------------------------
+ Bra
+ \x{65e5}\x{672c}\x{8a9e}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{e6}
+Need char = 158
+ \x{65e5}\x{672c}\x{8a9e}
+ 0: \x{65e5}\x{672c}\x{8a9e}
+
+/\x{80}/DZ8
+------------------------------------------------------------------
+ Bra
+ \x{80}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c2}
+Need char = 128
+
+/\x{084}/DZ8
+------------------------------------------------------------------
+ Bra
+ \x{84}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c2}
+Need char = 132
+
+/\x{104}/DZ8
+------------------------------------------------------------------
+ Bra
+ \x{104}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c4}
+Need char = 132
+
+/\x{861}/DZ8
+------------------------------------------------------------------
+ Bra
+ \x{861}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{e0}
+Need char = 161
+
+/\x{212ab}/DZ8
+------------------------------------------------------------------
+ Bra
+ \x{212ab}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{f0}
+Need char = 171
+
+/-- This one is here not because it's different to Perl, but because the way
+the captured single-byte is displayed. (In Perl it becomes a character, and you
+can't tell the difference.) --/
+
+/X(\C)(.*)/8
+ X\x{1234}
+ 0: X\x{1234}
+ 1: \x{e1}
+ 2: \x{88}\x{b4}
+ X\nabc
+ 0: X\x{0a}abc
+ 1: \x{0a}
+ 2: abc
+
+/-- This one is here because Perl gives out a grumbly error message (quite
+correctly, but that messes up comparisons). --/
+
+/a\Cb/8
+ *** Failers
+No match
+ a\x{100}b
+No match
+
+/[^ab\xC0-\xF0]/8SDZ
+------------------------------------------------------------------
+ Bra
+ [\x00-`c-\xbf\xf1-\xff] (neg)
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a
+ \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19
+ \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4
+ 5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y
+ Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f
+ \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0
+ \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf
+ \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee
+ \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd
+ \xfe \xff
+ \x{f1}
+ 0: \x{f1}
+ \x{bf}
+ 0: \x{bf}
+ \x{100}
+ 0: \x{100}
+ \x{1000}
+ 0: \x{1000}
+ *** Failers
+ 0: *
+ \x{c0}
+No match
+ \x{f0}
+No match
+
+/Ā{3,4}/8SDZ
+------------------------------------------------------------------
+ Bra
+ \x{100}{3}
+ \x{100}?
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c4}
+Need char = 128
+Subject length lower bound = 3
+No set of starting bytes
+ \x{100}\x{100}\x{100}\x{100\x{100}
+ 0: \x{100}\x{100}\x{100}
+
+/(\x{100}+|x)/8SDZ
+------------------------------------------------------------------
+ Bra
+ CBra 1
+ \x{100}+
+ Alt
+ x
+ Ket
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 1
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: x \xc4
+
+/(\x{100}*a|x)/8SDZ
+------------------------------------------------------------------
+ Bra
+ CBra 1
+ \x{100}*+
+ a
+ Alt
+ x
+ Ket
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 1
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: a x \xc4
+
+/(\x{100}{0,2}a|x)/8SDZ
+------------------------------------------------------------------
+ Bra
+ CBra 1
+ \x{100}{0,2}
+ a
+ Alt
+ x
+ Ket
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 1
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: a x \xc4
+
+/(\x{100}{1,2}a|x)/8SDZ
+------------------------------------------------------------------
+ Bra
+ CBra 1
+ \x{100}
+ \x{100}{0,1}
+ a
+ Alt
+ x
+ Ket
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 1
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: x \xc4
+
+/\x{100}/8DZ
+------------------------------------------------------------------
+ Bra
+ \x{100}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c4}
+Need char = 128
+
+/a\x{100}\x{101}*/8DZ
+------------------------------------------------------------------
+ Bra
+ a\x{100}
+ \x{101}*
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = 'a'
+Need char = 128
+
+/a\x{100}\x{101}+/8DZ
+------------------------------------------------------------------
+ Bra
+ a\x{100}
+ \x{101}+
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = 'a'
+Need char = 129
+
+/[^\x{c4}]/DZ
+------------------------------------------------------------------
+ Bra
+ [^\xc4]
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+No options
+No first char
+No need char
+
+/[\x{100}]/8DZ
+------------------------------------------------------------------
+ Bra
+ \x{100}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c4}
+Need char = 128
+ \x{100}
+ 0: \x{100}
+ Z\x{100}
+ 0: \x{100}
+ \x{100}Z
+ 0: \x{100}
+ *** Failers
+No match
+
+/[\xff]/DZ8
+------------------------------------------------------------------
+ Bra
+ \x{ff}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c3}
+Need char = 191
+ >\x{ff}<
+ 0: \x{ff}
+
+/[^\xff]/8DZ
+------------------------------------------------------------------
+ Bra
+ [\x00-\xfe] (neg)
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+No first char
+No need char
+
+/\x{100}abc(xyz(?1))/8DZ
+------------------------------------------------------------------
+ Bra
+ \x{100}abc
+ CBra 1
+ xyz
+ Recurse
+ Ket
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 1
+Options: utf
+First char = \x{c4}
+Need char = 'z'
+
+/a\x{1234}b/P8
+ a\x{1234}b
+ 0: a\x{1234}b
+
+/\777/8I
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c7}
+Need char = 191
+ \x{1ff}
+ 0: \x{1ff}
+ \777
+ 0: \x{1ff}
+
+/\x{100}+\x{200}/8DZ
+------------------------------------------------------------------
+ Bra
+ \x{100}++
+ \x{200}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c4}
+Need char = 128
+
+/\x{100}+X/8DZ
+------------------------------------------------------------------
+ Bra
+ \x{100}++
+ X
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c4}
+Need char = 'X'
+
+/^[\QĀ\E-\QŐ\E/BZ8
+Failed: missing terminating ] for character class at offset 15
+
+/-- This tests the stricter UTF-8 check according to RFC 3629. --/
+
+/X/8
+ \x{0}\x{d7ff}\x{e000}\x{10ffff}
+No match
+ \x{d800}
+Error -10 (bad UTF-8 string) offset=0 reason=14
+ \x{d800}\?
+No match
+ \x{da00}
+Error -10 (bad UTF-8 string) offset=0 reason=14
+ \x{da00}\?
+No match
+ \x{dfff}
+Error -10 (bad UTF-8 string) offset=0 reason=14
+ \x{dfff}\?
+No match
+ \x{110000}
+Error -10 (bad UTF-8 string) offset=0 reason=13
+ \x{110000}\?
+No match
+ \x{2000000}
+Error -10 (bad UTF-8 string) offset=0 reason=11
+ \x{2000000}\?
+No match
+ \x{7fffffff}
+Error -10 (bad UTF-8 string) offset=0 reason=12
+ \x{7fffffff}\?
+No match
+
+/(*UTF8)\x{1234}/
+ abcd\x{1234}pqr
+ 0: \x{1234}
+
+/(*CRLF)(*UTF8)(*BSR_UNICODE)a\Rb/I
+Capturing subpattern count = 0
+Options: bsr_unicode utf
+Forced newline sequence: CRLF
+First char = 'a'
+Need char = 'b'
+
+/\h/SI8
+Capturing subpattern count = 0
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x09 \x20 \xc2 \xe1 \xe2 \xe3
+ ABC\x{09}
+ 0: \x{09}
+ ABC\x{20}
+ 0:
+ ABC\x{a0}
+ 0: \x{a0}
+ ABC\x{1680}
+ 0: \x{1680}
+ ABC\x{180e}
+ 0: \x{180e}
+ ABC\x{2000}
+ 0: \x{2000}
+ ABC\x{202f}
+ 0: \x{202f}
+ ABC\x{205f}
+ 0: \x{205f}
+ ABC\x{3000}
+ 0: \x{3000}
+
+/\v/SI8
+Capturing subpattern count = 0
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2
+ ABC\x{0a}
+ 0: \x{0a}
+ ABC\x{0b}
+ 0: \x{0b}
+ ABC\x{0c}
+ 0: \x{0c}
+ ABC\x{0d}
+ 0: \x{0d}
+ ABC\x{85}
+ 0: \x{85}
+ ABC\x{2028}
+ 0: \x{2028}
+
+/\h*A/SI8
+Capturing subpattern count = 0
+Options: utf
+No first char
+Need char = 'A'
+Subject length lower bound = 1
+Starting byte set: \x09 \x20 A \xc2 \xe1 \xe2 \xe3
+ CDBABC
+ 0: A
+
+/\v+A/SI8
+Capturing subpattern count = 0
+Options: utf
+No first char
+Need char = 'A'
+Subject length lower bound = 2
+Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2
+
+/\s?xxx\s/8SI
+Capturing subpattern count = 0
+Options: utf
+No first char
+Need char = 'x'
+Subject length lower bound = 4
+Starting byte set: \x09 \x0a \x0c \x0d \x20 x
+
+/\sxxx\s/I8ST1
+Capturing subpattern count = 0
+Options: utf
+No first char
+Need char = 'x'
+Subject length lower bound = 5
+Starting byte set: \x09 \x0a \x0c \x0d \x20 \xc2
+ AB\x{85}xxx\x{a0}XYZ
+ 0: \x{85}xxx\x{a0}
+ AB\x{a0}xxx\x{85}XYZ
+ 0: \x{a0}xxx\x{85}
+
+/\S \S/I8ST1
+Capturing subpattern count = 0
+Options: utf
+No first char
+Need char = ' '
+Subject length lower bound = 3
+Starting byte set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0b \x0e
+ \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d
+ \x1e \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @
+ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c d e
+ f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \xc0 \xc1 \xc2 \xc3
+ \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2
+ \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1
+ \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0
+ \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff
+ \x{a2} \x{84}
+ 0: \x{a2} \x{84}
+ A Z
+ 0: A Z
+
+/a+/8
+ a\x{123}aa\>1
+ 0: aa
+ a\x{123}aa\>2
+Error -11 (bad UTF-8 offset)
+ a\x{123}aa\>3
+ 0: aa
+ a\x{123}aa\>4
+ 0: a
+ a\x{123}aa\>5
+No match
+ a\x{123}aa\>6
+Error -24 (bad offset value)
+
+/\x{1234}+/iS8I
+Capturing subpattern count = 0
+Options: caseless utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \xe1
+
+/\x{1234}+?/iS8I
+Capturing subpattern count = 0
+Options: caseless utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \xe1
+
+/\x{1234}++/iS8I
+Capturing subpattern count = 0
+Options: caseless utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \xe1
+
+/\x{1234}{2}/iS8I
+Capturing subpattern count = 0
+Options: caseless utf
+No first char
+No need char
+Subject length lower bound = 2
+Starting byte set: \xe1
+
+/[^\x{c4}]/8DZ
+------------------------------------------------------------------
+ Bra
+ [\x00-\xc3\xc5-\xff] (neg)
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+No first char
+No need char
+
+/X+\x{200}/8DZ
+------------------------------------------------------------------
+ Bra
+ X++
+ \x{200}
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = 'X'
+Need char = 128
+
+/\R/SI8
+Capturing subpattern count = 0
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2
+
+/-- End of testinput17 --/
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 22a0725..5796f10 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -5,8 +5,8 @@
either because PCRE can't be compatible, or there is a possible Perl
bug.
- NOTE: This is a non-UTF-8 set of tests. When UTF-8 is needed, use test
- 5, and if Unicode Property Support is needed, use test 13. --/
+ NOTE: This is a non-UTF set of tests. When UTF support is needed, use
+ test 5, and if Unicode Property Support is needed, use test 13. --/
/-- Originally, the Perl >= 5.10 things were in here too, but now I have
separated many (most?) of them out into test 11. However, there may still
@@ -6178,7 +6178,7 @@ Failed: character value in \x{...} sequence is too large at offset 6
/\x{0000ff}/I
Capturing subpattern count = 0
No options
-First char = 255
+First char = \xff
No need char
/^((?P<A>a1)|(?P<A>a2)b)/I
diff --git a/testdata/testoutput4 b/testdata/testoutput4
index 2f1b4fd..067d6f5 100644
--- a/testdata/testoutput4
+++ b/testdata/testoutput4
@@ -1,5 +1,6 @@
-/-- This set of tests is for UTF-8 support, excluding Unicode properties. It is
- compatible with all versions of Perl 5. --/
+/-- This set of tests is for UTF support, excluding Unicode properties. It is
+ compatible with all versions of Perl 5 and both the 8-bit and 16-bit PCRE
+ libraries. --/
/a.b/8
acb
@@ -255,46 +256,6 @@ No match
XYZ
No match
-/X(\C{3})/8
- X\x{1234}
- 0: X\x{1234}
- 1: \x{1234}
-
-/X(\C{4})/8
- X\x{1234}YZ
- 0: X\x{1234}Y
- 1: \x{1234}Y
-
-/X\C*/8
- XYZabcdce
- 0: XYZabcdce
-
-/X\C*?/8
- XYZabcde
- 0: X
-
-/X\C{3,5}/8
- Xabcdefg
- 0: Xabcde
- X\x{1234}
- 0: X\x{1234}
- X\x{1234}YZ
- 0: X\x{1234}YZ
- X\x{1234}\x{512}
- 0: X\x{1234}\x{512}
- X\x{1234}\x{512}YZ
- 0: X\x{1234}\x{512}
-
-/X\C{3,5}?/8
- Xabcdefg
- 0: Xabc
- X\x{1234}
- 0: X\x{1234}
- X\x{1234}YZ
- 0: X\x{1234}
- X\x{1234}\x{512}
- 0: X\x{1234}
-
/[^a]+/8g
bcd
0: bcd
@@ -791,22 +752,6 @@ No match
\x{200}X
No match
-/a\Cb/
- aXb
- 0: aXb
- a\nb
- 0: a\x0ab
-
-/a\Cb/8
- aXb
- 0: aXb
- a\nb
- 0: a\x{0a}b
-
-/a\C\Cb/8
- a\x{100}b
- 0: a\x{100}b
-
/[z-\x{100}]/8i
z
0: z
@@ -1136,8 +1081,4 @@ No match
abc
No match
-/ab\Cde/8
- abXde
- 0: abXde
-
/-- End of testinput4 --/
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index 603e55f..9b86dad 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -1,93 +1,10 @@
-/-- This set of tests checks the API, internals, and non-Perl stuff for UTF-8
- support, excluding Unicode properties. --/
-
-/\x{100}/8DZ
-------------------------------------------------------------------
- Bra
- \x{100}
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 196
-Need char = 128
-
-/\x{1000}/8DZ
-------------------------------------------------------------------
- Bra
- \x{1000}
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 225
-Need char = 128
-
-/\x{10000}/8DZ
-------------------------------------------------------------------
- Bra
- \x{10000}
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 240
-Need char = 128
-
-/\x{100000}/8DZ
-------------------------------------------------------------------
- Bra
- \x{100000}
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 244
-Need char = 128
-
-/\x{10ffff}/8DZ
-------------------------------------------------------------------
- Bra
- \x{10ffff}
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 244
-Need char = 191
+/-- This set of tests checks the API, internals, and non-Perl stuff for UTF
+ support, excluding Unicode properties. However, tests that give different
+ results in 8-bit and 16-bit modes are excluded (see tests 16 and 17). --/
/\x{110000}/8DZ
Failed: character value in \x{...} sequence is too large at offset 9
-/[\x{ff}]/8DZ
-------------------------------------------------------------------
- Bra
- \x{ff}
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 195
-Need char = 191
-
-/[\x{100}]/8DZ
-------------------------------------------------------------------
- Bra
- \x{100}
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 196
-Need char = 128
-
/\x{ffffffff}/8
Failed: character value in \x{...} sequence is too large at offset 11
@@ -108,30 +25,6 @@ Failed: disallowed UTF-8/16 code point (>= 0xd800 && <= 0xdfff) at offset 7
\x{100}a\x{1234}bcd
0: \x{100}a\x{1234}
-/\x80/8DZ
-------------------------------------------------------------------
- Bra
- \x{80}
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 194
-Need char = 128
-
-/\xff/8DZ
-------------------------------------------------------------------
- Bra
- \x{ff}
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 195
-Need char = 191
-
/\x{0041}\x{2262}\x{0391}\x{002e}/DZ8
------------------------------------------------------------------
Bra
@@ -140,100 +33,12 @@ Need char = 191
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: utf8
+Options: utf
First char = 'A'
Need char = '.'
\x{0041}\x{2262}\x{0391}\x{002e}
0: A\x{2262}\x{391}.
-/\x{D55c}\x{ad6d}\x{C5B4}/DZ8
-------------------------------------------------------------------
- Bra
- \x{d55c}\x{ad6d}\x{c5b4}
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 237
-Need char = 180
- \x{D55c}\x{ad6d}\x{C5B4}
- 0: \x{d55c}\x{ad6d}\x{c5b4}
-
-/\x{65e5}\x{672c}\x{8a9e}/DZ8
-------------------------------------------------------------------
- Bra
- \x{65e5}\x{672c}\x{8a9e}
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 230
-Need char = 158
- \x{65e5}\x{672c}\x{8a9e}
- 0: \x{65e5}\x{672c}\x{8a9e}
-
-/\x{80}/DZ8
-------------------------------------------------------------------
- Bra
- \x{80}
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 194
-Need char = 128
-
-/\x{084}/DZ8
-------------------------------------------------------------------
- Bra
- \x{84}
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 194
-Need char = 132
-
-/\x{104}/DZ8
-------------------------------------------------------------------
- Bra
- \x{104}
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 196
-Need char = 132
-
-/\x{861}/DZ8
-------------------------------------------------------------------
- Bra
- \x{861}
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 224
-Need char = 161
-
-/\x{212ab}/DZ8
-------------------------------------------------------------------
- Bra
- \x{212ab}
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 240
-Need char = 171
-
/.{3,5}X/DZ8
------------------------------------------------------------------
Bra
@@ -244,13 +49,12 @@ Need char = 171
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: utf8
+Options: utf
No first char
Need char = 'X'
\x{212ab}\x{212ab}\x{212ab}\x{861}X
0: \x{212ab}\x{212ab}\x{212ab}\x{861}X
-
/.{3,5}?/DZ8
------------------------------------------------------------------
Bra
@@ -260,7 +64,7 @@ Need char = 'X'
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: utf8
+Options: utf
No first char
No need char
\x{212ab}\x{212ab}\x{212ab}\x{861}
@@ -269,29 +73,6 @@ No need char
/(?<=\C)X/8
Failed: \C not allowed in lookbehind assertion at offset 6
-/-- This one is here not because it's different to Perl, but because the way
-the captured single-byte is displayed. (In Perl it becomes a character, and you
-can't tell the difference.) --/
-
-/X(\C)(.*)/8
- X\x{1234}
- 0: X\x{1234}
- 1: \xe1
- 2: \x88\xb4
- X\nabc
- 0: X\x{0a}abc
- 1: \x{0a}
- 2: abc
-
-/-- This one is here because Perl gives out a grumbly error message (quite
-correctly, but that messes up comparisons). --/
-
-/a\Cb/8
- *** Failers
-No match
- a\x{100}b
-No match
-
/^[ab]/8DZ
------------------------------------------------------------------
Bra
@@ -301,7 +82,7 @@ No match
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: anchored utf8
+Options: anchored utf
No first char
No need char
bar
@@ -324,7 +105,7 @@ No match
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: anchored utf8
+Options: anchored utf
No first char
No need char
c
@@ -338,136 +119,6 @@ No need char
aaa
No match
-/[^ab\xC0-\xF0]/8SDZ
-------------------------------------------------------------------
- Bra
- [\x00-`c-\xbf\xf1-\xff] (neg)
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-No first char
-No need char
-Subject length lower bound = 1
-Starting byte set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a
- \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19
- \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4
- 5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y
- Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f
- \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0
- \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf
- \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee
- \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd
- \xfe \xff
- \x{f1}
- 0: \x{f1}
- \x{bf}
- 0: \x{bf}
- \x{100}
- 0: \x{100}
- \x{1000}
- 0: \x{1000}
- *** Failers
- 0: *
- \x{c0}
-No match
- \x{f0}
-No match
-
-/Ā{3,4}/8SDZ
-------------------------------------------------------------------
- Bra
- \x{100}{3}
- \x{100}?
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 196
-Need char = 128
-Subject length lower bound = 3
-No set of starting bytes
- \x{100}\x{100}\x{100}\x{100\x{100}
- 0: \x{100}\x{100}\x{100}
-
-/(\x{100}+|x)/8SDZ
-------------------------------------------------------------------
- Bra
- CBra 1
- \x{100}+
- Alt
- x
- Ket
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 1
-Options: utf8
-No first char
-No need char
-Subject length lower bound = 1
-Starting byte set: x \xc4
-
-/(\x{100}*a|x)/8SDZ
-------------------------------------------------------------------
- Bra
- CBra 1
- \x{100}*+
- a
- Alt
- x
- Ket
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 1
-Options: utf8
-No first char
-No need char
-Subject length lower bound = 1
-Starting byte set: a x \xc4
-
-/(\x{100}{0,2}a|x)/8SDZ
-------------------------------------------------------------------
- Bra
- CBra 1
- \x{100}{0,2}
- a
- Alt
- x
- Ket
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 1
-Options: utf8
-No first char
-No need char
-Subject length lower bound = 1
-Starting byte set: a x \xc4
-
-/(\x{100}{1,2}a|x)/8SDZ
-------------------------------------------------------------------
- Bra
- CBra 1
- \x{100}
- \x{100}{0,1}
- a
- Alt
- x
- Ket
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 1
-Options: utf8
-No first char
-No need char
-Subject length lower bound = 1
-Starting byte set: x \xc4
-
/\x{100}*(\d+|"(?1)")/8
1234
0: 1234
@@ -492,18 +143,6 @@ No match
\x{100}\x{100}abcd
No match
-/\x{100}/8DZ
-------------------------------------------------------------------
- Bra
- \x{100}
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 196
-Need char = 128
-
/\x{100}*/8DZ
------------------------------------------------------------------
Bra
@@ -512,7 +151,7 @@ Need char = 128
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: utf8
+Options: utf
No first char
No need char
@@ -525,7 +164,7 @@ No need char
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: utf8
+Options: utf
First char = 'a'
No need char
@@ -538,36 +177,10 @@ No need char
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: utf8
+Options: utf
First char = 'a'
Need char = 'b'
-/a\x{100}\x{101}*/8DZ
-------------------------------------------------------------------
- Bra
- a\x{100}
- \x{101}*
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 'a'
-Need char = 128
-
-/a\x{100}\x{101}+/8DZ
-------------------------------------------------------------------
- Bra
- a\x{100}
- \x{101}+
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 'a'
-Need char = 129
-
/\x{100}*A/8DZ
------------------------------------------------------------------
Bra
@@ -577,7 +190,7 @@ Need char = 129
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: utf8
+Options: utf
No first char
Need char = 'A'
A
@@ -593,54 +206,10 @@ Need char = 'A'
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: utf8
+Options: utf
No first char
No need char
-/[^\x{c4}]/DZ
-------------------------------------------------------------------
- Bra
- [^\xc4]
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-
-/[^\x{c4}]/8DZ
-------------------------------------------------------------------
- Bra
- [\x00-\xc3\xc5-\xff] (neg)
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-No first char
-No need char
-
-/[\x{100}]/8DZ
-------------------------------------------------------------------
- Bra
- \x{100}
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 196
-Need char = 128
- \x{100}
- 0: \x{100}
- Z\x{100}
- 0: \x{100}
- \x{100}Z
- 0: \x{100}
- *** Failers
-No match
-
/[Z\x{100}]/8DZ
------------------------------------------------------------------
Bra
@@ -649,7 +218,7 @@ No match
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: utf8
+Options: utf
No first char
No need char
Z\x{100}
@@ -684,7 +253,7 @@ No match
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: utf8
+Options: utf
No first char
No need char
@@ -696,7 +265,7 @@ No need char
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: utf8
+Options: utf
No first char
No need char
\x{100}
@@ -713,25 +282,11 @@ No need char
------------------------------------------------------------------
Capturing subpattern count = 0
No options
-First char = 255
+First char = \xff
No need char
>\xff<
0: \xff
-/[\xff]/DZ8
-------------------------------------------------------------------
- Bra
- \x{ff}
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 195
-Need char = 191
- >\x{ff}<
- 0: \x{ff}
-
/[^\xFF]/DZ
------------------------------------------------------------------
Bra
@@ -744,18 +299,6 @@ No options
No first char
No need char
-/[^\xff]/8DZ
-------------------------------------------------------------------
- Bra
- [\x00-\xfe] (neg)
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-No first char
-No need char
-
/[Ä-Ü]/8
Ö # Matches without Study
0: \x{d6}
@@ -780,129 +323,6 @@ No need char
\x{d6}
0: \x{d6}
-/[]/8
-Failed: invalid UTF-8 string at offset 1
-
-//8
-Failed: invalid UTF-8 string at offset 0
-
-/xxx/8
-Failed: invalid UTF-8 string at offset 0
-
-/xxx/8?DZSS
-------------------------------------------------------------------
- Bra
- \X{c0}\X{c0}\X{c0}xxx
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8 no_utf8_check
-First char = 195
-Need char = 'x'
-
-/abc/8
- ]
-Error -10 (bad UTF-8 string) offset=0 reason=6
-
-Error -10 (bad UTF-8 string) offset=0 reason=1
-
-Error -10 (bad UTF-8 string) offset=0 reason=6
- \?
-No match
- \xe1\x88
-Error -10 (bad UTF-8 string) offset=0 reason=1
- \P\xe1\x88
-Error -10 (bad UTF-8 string) offset=0 reason=1
- \P\P\xe1\x88
-Error -25 (short UTF-8 string) offset=0 reason=1
- XX\xea
-Error -10 (bad UTF-8 string) offset=2 reason=2
- \O0XX\xea
-Error -10 (bad UTF-8 string)
- \O1XX\xea
-Error -10 (bad UTF-8 string)
- \O2XX\xea
-Error -10 (bad UTF-8 string) offset=2 reason=2
- XX\xf1
-Error -10 (bad UTF-8 string) offset=2 reason=3
- XX\xf8
-Error -10 (bad UTF-8 string) offset=2 reason=4
- XX\xfc
-Error -10 (bad UTF-8 string) offset=2 reason=5
- ZZ\xea\xaf\x20YY
-Error -10 (bad UTF-8 string) offset=2 reason=7
- ZZ\xfd\xbf\xbf\x2f\xbf\xbfYY
-Error -10 (bad UTF-8 string) offset=2 reason=8
- ZZ\xfd\xbf\xbf\xbf\x2f\xbfYY
-Error -10 (bad UTF-8 string) offset=2 reason=9
- ZZ\xfd\xbf\xbf\xbf\xbf\x2fYY
-Error -10 (bad UTF-8 string) offset=2 reason=10
- ZZ\xffYY
-Error -10 (bad UTF-8 string) offset=2 reason=21
- ZZ\xfeYY
-Error -10 (bad UTF-8 string) offset=2 reason=21
-
-/anything/8
- \xc0\x80
-Error -10 (bad UTF-8 string) offset=0 reason=15
- \xc1\x8f
-Error -10 (bad UTF-8 string) offset=0 reason=15
- \xe0\x9f\x80
-Error -10 (bad UTF-8 string) offset=0 reason=16
- \xf0\x8f\x80\x80
-Error -10 (bad UTF-8 string) offset=0 reason=17
- \xf8\x87\x80\x80\x80
-Error -10 (bad UTF-8 string) offset=0 reason=18
- \xfc\x83\x80\x80\x80\x80
-Error -10 (bad UTF-8 string) offset=0 reason=19
- \xfe\x80\x80\x80\x80\x80
-Error -10 (bad UTF-8 string) offset=0 reason=21
- \xff\x80\x80\x80\x80\x80
-Error -10 (bad UTF-8 string) offset=0 reason=21
- \xc3\x8f
-No match
- \xe0\xaf\x80
-No match
- \xe1\x80\x80
-No match
- \xf0\x9f\x80\x80
-No match
- \xf1\x8f\x80\x80
-No match
- \xf8\x88\x80\x80\x80
-Error -10 (bad UTF-8 string) offset=0 reason=11
- \xf9\x87\x80\x80\x80
-Error -10 (bad UTF-8 string) offset=0 reason=11
- \xfc\x84\x80\x80\x80\x80
-Error -10 (bad UTF-8 string) offset=0 reason=12
- \xfd\x83\x80\x80\x80\x80
-Error -10 (bad UTF-8 string) offset=0 reason=12
- \?\xf8\x88\x80\x80\x80
-No match
- \?\xf9\x87\x80\x80\x80
-No match
- \?\xfc\x84\x80\x80\x80\x80
-No match
- \?\xfd\x83\x80\x80\x80\x80
-No match
-
-/\x{100}abc(xyz(?1))/8DZ
-------------------------------------------------------------------
- Bra
- \x{100}abc
- CBra 1
- xyz
- Recurse
- Ket
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 1
-Options: utf8
-First char = 196
-Need char = 'z'
-
/[^\x{100}]abc(xyz(?1))/8DZ
------------------------------------------------------------------
Bra
@@ -916,7 +336,7 @@ Need char = 'z'
End
------------------------------------------------------------------
Capturing subpattern count = 1
-Options: utf8
+Options: utf
No first char
Need char = 'z'
@@ -933,7 +353,7 @@ Need char = 'z'
End
------------------------------------------------------------------
Capturing subpattern count = 1
-Options: utf8
+Options: utf
No first char
Need char = 'z'
@@ -953,7 +373,7 @@ Need char = 'z'
End
------------------------------------------------------------------
Capturing subpattern count = 2
-Options: utf8
+Options: utf
No first char
No need char
@@ -984,7 +404,7 @@ No need char
End
------------------------------------------------------------------
Capturing subpattern count = 2
-Options: utf8
+Options: utf
No first char
No need char
@@ -1004,7 +424,7 @@ No need char
End
------------------------------------------------------------------
Capturing subpattern count = 2
-Options: utf8
+Options: utf
No first char
No need char
@@ -1035,7 +455,7 @@ No need char
End
------------------------------------------------------------------
Capturing subpattern count = 2
-Options: utf8
+Options: utf
No first char
No need char
@@ -1049,10 +469,6 @@ No need char
\x{100}X
0: X
-/a\x{1234}b/P8
- a\x{1234}b
- 0: a\x{1234}b
-
/^\ሴ/8DZ
------------------------------------------------------------------
Bra
@@ -1062,23 +478,13 @@ No need char
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: anchored utf8
+Options: anchored utf
No first char
No need char
/\777/I
Failed: octal value is greater than \377 (not in UTF-8 mode) at offset 3
-/\777/8I
-Capturing subpattern count = 0
-Options: utf8
-First char = 199
-Need char = 191
- \x{1ff}
- 0: \x{1ff}
- \777
- 0: \x{1ff}
-
/\x{100}*\d/8DZ
------------------------------------------------------------------
Bra
@@ -1088,7 +494,7 @@ Need char = 191
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: utf8
+Options: utf
No first char
No need char
@@ -1101,7 +507,7 @@ No need char
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: utf8
+Options: utf
No first char
No need char
@@ -1114,7 +520,7 @@ No need char
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: utf8
+Options: utf
No first char
No need char
@@ -1127,7 +533,7 @@ No need char
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: utf8
+Options: utf
No first char
No need char
@@ -1140,7 +546,7 @@ No need char
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: utf8
+Options: utf
No first char
No need char
@@ -1153,49 +559,10 @@ No need char
End
------------------------------------------------------------------
Capturing subpattern count = 0
-Options: utf8
+Options: utf
No first char
No need char
-/\x{100}+\x{200}/8DZ
-------------------------------------------------------------------
- Bra
- \x{100}++
- \x{200}
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 196
-Need char = 128
-
-/\x{100}+X/8DZ
-------------------------------------------------------------------
- Bra
- \x{100}++
- X
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 196
-Need char = 'X'
-
-/X+\x{200}/8DZ
-------------------------------------------------------------------
- Bra
- X++
- \x{200}
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 'X'
-Need char = 128
-
/()()()()()()()()()()
()()()()()()()()()()
()()()()()()()()()()
@@ -1237,9 +604,6 @@ Matched, but too many substrings
End
------------------------------------------------------------------
-/^[\QĀ\E-\QŐ\E/BZ8
-Failed: missing terminating ] for character class at offset 15
-
/^abc./mgx8<any>
abc1 \x0aabc2 \x0babc3xx \x0cabc4 \x0dabc5xx \x0d\x0aabc6 \x{0085}abc7 \x{2028}abc8 \x{2029}abc9 JUNK
0: abc1
@@ -1442,39 +806,9 @@ No match
\x{1ec5}
0: \x{1ec5}
-/-- This tests the stricter UTF-8 check according to RFC 3629. --/
-
-/X/8
- \x{0}\x{d7ff}\x{e000}\x{10ffff}
-No match
- \x{d800}
-Error -10 (bad UTF-8 string) offset=0 reason=14
- \x{d800}\?
-No match
- \x{da00}
-Error -10 (bad UTF-8 string) offset=0 reason=14
- \x{da00}\?
-No match
- \x{dfff}
-Error -10 (bad UTF-8 string) offset=0 reason=14
- \x{dfff}\?
-No match
- \x{110000}
-Error -10 (bad UTF-8 string) offset=0 reason=13
- \x{110000}\?
-No match
- \x{2000000}
-Error -10 (bad UTF-8 string) offset=0 reason=11
- \x{2000000}\?
-No match
- \x{7fffffff}
-Error -10 (bad UTF-8 string) offset=0 reason=12
- \x{7fffffff}\?
-No match
-
/a\Rb/I8<bsr_anycrlf>
Capturing subpattern count = 0
-Options: bsr_anycrlf utf8
+Options: bsr_anycrlf utf
First char = 'a'
Need char = 'b'
a\rb
@@ -1492,7 +826,7 @@ No match
/a\Rb/I8<bsr_unicode>
Capturing subpattern count = 0
-Options: bsr_unicode utf8
+Options: bsr_unicode utf
First char = 'a'
Need char = 'b'
a\rb
@@ -1514,7 +848,7 @@ No match
/a\R?b/I8<bsr_anycrlf>
Capturing subpattern count = 0
-Options: bsr_anycrlf utf8
+Options: bsr_anycrlf utf
First char = 'a'
Need char = 'b'
a\rb
@@ -1532,7 +866,7 @@ No match
/a\R?b/I8<bsr_unicode>
Capturing subpattern count = 0
-Options: bsr_unicode utf8
+Options: bsr_unicode utf
First char = 'a'
Need char = 'b'
a\rb
@@ -1598,17 +932,6 @@ No match
A\x{1ec5}ABCXYZ
0: X
-/(*UTF8)\x{1234}/
- abcd\x{1234}pqr
- 0: \x{1234}
-
-/(*CRLF)(*UTF8)(*BSR_UNICODE)a\Rb/I
-Capturing subpattern count = 0
-Options: bsr_unicode utf8
-Forced newline sequence: CRLF
-First char = 'a'
-Need char = 'b'
-
/Xa{2,4}b/8
X\P
Partial match: X
@@ -2094,32 +1417,6 @@ No need char
Subject length lower bound = 1
Starting byte set: \x09 \x20 \xa0
-/\h/SI8
-Capturing subpattern count = 0
-Options: utf8
-No first char
-No need char
-Subject length lower bound = 1
-Starting byte set: \x09 \x20 \xc2 \xe1 \xe2 \xe3
- ABC\x{09}
- 0: \x{09}
- ABC\x{20}
- 0:
- ABC\x{a0}
- 0: \x{a0}
- ABC\x{1680}
- 0: \x{1680}
- ABC\x{180e}
- 0: \x{180e}
- ABC\x{2000}
- 0: \x{2000}
- ABC\x{202f}
- 0: \x{202f}
- ABC\x{205f}
- 0: \x{205f}
- ABC\x{3000}
- 0: \x{3000}
-
/\v/SI
Capturing subpattern count = 0
No options
@@ -2128,26 +1425,6 @@ No need char
Subject length lower bound = 1
Starting byte set: \x0a \x0b \x0c \x0d \x85
-/\v/SI8
-Capturing subpattern count = 0
-Options: utf8
-No first char
-No need char
-Subject length lower bound = 1
-Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2
- ABC\x{0a}
- 0: \x{0a}
- ABC\x{0b}
- 0: \x{0b}
- ABC\x{0c}
- 0: \x{0c}
- ABC\x{0d}
- 0: \x{0d}
- ABC\x{85}
- 0: \x{85}
- ABC\x{2028}
- 0: \x{2028}
-
/\R/SI
Capturing subpattern count = 0
No options
@@ -2156,82 +1433,16 @@ No need char
Subject length lower bound = 1
Starting byte set: \x0a \x0b \x0c \x0d \x85
-/\R/SI8
-Capturing subpattern count = 0
-Options: utf8
-No first char
-No need char
-Subject length lower bound = 1
-Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2
-
-/\h*A/SI8
-Capturing subpattern count = 0
-Options: utf8
-No first char
-Need char = 'A'
-Subject length lower bound = 1
-Starting byte set: \x09 \x20 A \xc2 \xe1 \xe2 \xe3
- CDBABC
- 0: A
-
-/\v+A/SI8
-Capturing subpattern count = 0
-Options: utf8
-No first char
-Need char = 'A'
-Subject length lower bound = 2
-Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2
-
-/\s?xxx\s/8SI
-Capturing subpattern count = 0
-Options: utf8
-No first char
-Need char = 'x'
-Subject length lower bound = 4
-Starting byte set: \x09 \x0a \x0c \x0d \x20 x
-
/\sxxx\s/8T1
AB\x{85}xxx\x{a0}XYZ
0: \x{85}xxx\x{a0}
AB\x{a0}xxx\x{85}XYZ
0: \x{a0}xxx\x{85}
-/\sxxx\s/I8ST1
-Capturing subpattern count = 0
-Options: utf8
-No first char
-Need char = 'x'
-Subject length lower bound = 5
-Starting byte set: \x09 \x0a \x0c \x0d \x20 \xc2
- AB\x{85}xxx\x{a0}XYZ
- 0: \x{85}xxx\x{a0}
- AB\x{a0}xxx\x{85}XYZ
- 0: \x{a0}xxx\x{85}
-
/\S \S/8T1
\x{a2} \x{84}
0: \x{a2} \x{84}
-/\S \S/I8ST1
-Capturing subpattern count = 0
-Options: utf8
-No first char
-Need char = ' '
-Subject length lower bound = 3
-Starting byte set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0b \x0e
- \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d
- \x1e \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @
- A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c d e
- f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \xc0 \xc1 \xc2 \xc3
- \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2
- \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1
- \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0
- \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff
- \x{a2} \x{84}
- 0: \x{a2} \x{84}
- A Z
- 0: A Z
-
'A#хц'8x<any>BZ
------------------------------------------------------------------
Bra
@@ -2293,20 +1504,6 @@ Starting byte set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0b \x0e
End
------------------------------------------------------------------
-/a+/8
- a\x{123}aa\>1
- 0: aa
- a\x{123}aa\>2
-Error -11 (bad UTF-8 offset)
- a\x{123}aa\>3
- 0: aa
- a\x{123}aa\>4
- 0: a
- a\x{123}aa\>5
-No match
- a\x{123}aa\>6
-Error -24 (bad offset value)
-
/^\cģ/8
Failed: \c must be followed by an ASCII character at offset 3
@@ -2338,41 +1535,9 @@ Failed: \c must be followed by an ASCII character at offset 3
1: \x{0a}
2: \x{0d}
-/\x{1234}+/iS8I
-Capturing subpattern count = 0
-Options: caseless utf8
-No first char
-No need char
-Subject length lower bound = 1
-Starting byte set: \xe1
-
-/\x{1234}+?/iS8I
-Capturing subpattern count = 0
-Options: caseless utf8
-No first char
-No need char
-Subject length lower bound = 1
-Starting byte set: \xe1
-
-/\x{1234}++/iS8I
-Capturing subpattern count = 0
-Options: caseless utf8
-No first char
-No need char
-Subject length lower bound = 1
-Starting byte set: \xe1
-
-/\x{1234}{2}/iS8I
-Capturing subpattern count = 0
-Options: caseless utf8
-No first char
-No need char
-Subject length lower bound = 2
-Starting byte set: \xe1
-
/[^\x{1234}]+/iS8I
Capturing subpattern count = 0
-Options: caseless utf8
+Options: caseless utf
No first char
No need char
Subject length lower bound = 1
@@ -2380,7 +1545,7 @@ No set of starting bytes
/[^\x{1234}]+?/iS8I
Capturing subpattern count = 0
-Options: caseless utf8
+Options: caseless utf
No first char
No need char
Subject length lower bound = 1
@@ -2388,7 +1553,7 @@ No set of starting bytes
/[^\x{1234}]++/iS8I
Capturing subpattern count = 0
-Options: caseless utf8
+Options: caseless utf
No first char
No need char
Subject length lower bound = 1
@@ -2396,7 +1561,7 @@ No set of starting bytes
/[^\x{1234}]{2}/iS8I
Capturing subpattern count = 0
-Options: caseless utf8
+Options: caseless utf
No first char
No need char
Subject length lower bound = 2
@@ -2420,5 +1585,13 @@ Partial match: for
/f.*/8s
\P\Pfor
Partial match: for
+
+/\x{d7ff}\x{e000}/8
+
+/\x{d800}/8
+Failed: disallowed UTF-8/16 code point (>= 0xd800 && <= 0xdfff) at offset 7
+
+/\x{dfff}/8
+Failed: disallowed UTF-8/16 code point (>= 0xd800 && <= 0xdfff) at offset 7
/-- End of testinput5 --/
diff --git a/testdata/testoutput8 b/testdata/testoutput8
index 0c569b3..29d51e2 100644
--- a/testdata/testoutput8
+++ b/testdata/testoutput8
@@ -1210,7 +1210,7 @@ No match
/a\Rb/I8<bsr_anycrlf>
Capturing subpattern count = 0
-Options: bsr_anycrlf utf8
+Options: bsr_anycrlf utf
First char = 'a'
Need char = 'b'
a\rb
@@ -1228,7 +1228,7 @@ No match
/a\Rb/I8<bsr_unicode>
Capturing subpattern count = 0
-Options: bsr_unicode utf8
+Options: bsr_unicode utf
First char = 'a'
Need char = 'b'
a\rb
@@ -1250,7 +1250,7 @@ No match
/a\R?b/I8<bsr_anycrlf>
Capturing subpattern count = 0
-Options: bsr_anycrlf utf8
+Options: bsr_anycrlf utf
First char = 'a'
Need char = 'b'
a\rb
@@ -1268,7 +1268,7 @@ No match
/a\R?b/I8<bsr_unicode>
Capturing subpattern count = 0
-Options: bsr_unicode utf8
+Options: bsr_unicode utf
First char = 'a'
Need char = 'b'
a\rb